def build_ordinal_regression_input():
    """Collect the train/dev/devtest splits and pass them to ``build_keras_input``.

    The gold SemEval files are loaded with ``load_SemEval``; the older data set
    and the corpus returned by ``additional_data.additional()`` are appended to
    the training split. Every split is then filtered through
    ``remove_unavailable`` and handed, together with the test set, to
    ``build_keras_input(texts, scores, test, new=True)``.

    NOTE(review): in the visible part of the body ``exit()`` is called right
    after ``build_keras_input``, so ``data`` and ``W`` are never used here and
    control does not continue past that call -- looks like a debugging
    short-circuit; confirm before relying on this function's result.
    """
    # Only scores and texts are needed; the first returned element is discarded.
    _, scores_train, texts_train = load_SemEval("./resources/full_tweets/train_gold.tsv")
    _, scores_dev, texts_dev = load_SemEval("./resources/full_tweets/dev_gold.tsv")
    _, scores_devtest, texts_devtest = load_SemEval("./resources/full_tweets/devtest_gold.tsv")

    _, scores_old, texts_old = load_SemEval("./resources/full_tweets/old_data.tsv")

    # The test file is read with a dedicated loader that yields ids, topics and texts.
    ids, topics, texts = load_SemEval_test(
        './resources/TEST data/SemEval2016_Task4_test_datasets/SemEval2016-task4-test.subtask-BCDE.txt')

    # Bundle the test set now: the name `texts` is rebound to a dict further down.
    test = [ids, topics, texts]

    # Shift the old scores by -3 so they can be concatenated with the training scores.
    scores_train = scores_train + [i - 3 for i in scores_old]  # from [1, 5] to [-2, 2]
    texts_train = texts_train + texts_old

    # Use additional data
    # (imported inside the function, so `additional_data` is only needed when this runs)
    from additional_data import additional

    additional_text, additional_scores = additional()

    scores_train = scores_train + additional_scores
    texts_train = texts_train + additional_text

    # Per-split containers keyed by split name.
    keys = ["train", "dev", "devtest"]
    texts, scores = dict(), dict()

    # remove_unavailable returns a (texts, scores) pair for each split
    # (presumably dropping tweets whose text could not be retrieved -- TODO confirm).
    texts[keys[0]], scores[keys[0]] = remove_unavailable(texts_train, scores_train)
    texts[keys[1]], scores[keys[1]] = remove_unavailable(texts_dev, scores_dev)
    texts[keys[2]], scores[keys[2]] = remove_unavailable(texts_devtest, scores_devtest)

    # The third element returned by build_keras_input is ignored.
    data, W, _ = build_keras_input(texts, scores, test, new=True)
    # NOTE(review): stops the interpreter here; nothing below this line executes.
    exit()
    '''
Example #2
0
def testdata_statistics():
    """Load the SemEval-2016 Task 4 BCDE test file and run ``SemEval_statistics`` on it."""
    test_path = (
        "./resources/TEST data/SemEval2016_Task4_test_datasets/"
        "SemEval2016-task4-test.subtask-BCDE.txt"
    )
    # load_SemEval must yield exactly three sequences: topics, scores, texts.
    test_topics, test_scores, test_texts = load_SemEval(test_path)
    SemEval_statistics(test_topics, test_scores, test_texts)
def testdata_statistics():
    """Load the SemEval-2016 Task 4 BCDE test file and run ``SemEval_statistics`` on it."""
    test_path = (
        "./resources/TEST data/SemEval2016_Task4_test_datasets/"
        "SemEval2016-task4-test.subtask-BCDE.txt"
    )
    # load_SemEval must yield exactly three sequences: topics, scores, texts.
    test_topics, test_scores, test_texts = load_SemEval(test_path)
    SemEval_statistics(test_topics, test_scores, test_texts)
Example #4
0
    print("Common terms between dev and devtest set, nb: %s, terms: %s" %
          (len(set(Devtest) & set(Dev)), sorted(set(Devtest) & set(Dev))))


def testdata_statistics():
    """Load the SemEval-2016 Task 4 BCDE test file and run ``SemEval_statistics`` on it."""
    test_path = (
        "./resources/TEST data/SemEval2016_Task4_test_datasets/"
        "SemEval2016-task4-test.subtask-BCDE.txt"
    )
    # load_SemEval must yield exactly three sequences: topics, scores, texts.
    test_topics, test_scores, test_texts = load_SemEval(test_path)
    SemEval_statistics(test_topics, test_scores, test_texts)


if __name__ == "__main__":
    # Script entry point: report statistics for the test file, then stop.
    # `sys.exit` is used instead of the bare `exit` helper, which is injected by
    # the `site` module for interactive sessions and is not guaranteed to exist
    # (e.g. under `python -S`).
    import sys

    testdata_statistics()
    sys.exit()

    # NOTE(review): everything below is unreachable while the sys.exit() call
    # above is in place; it is kept so the per-split report can be re-enabled
    # by removing that call.
    filenames = [
        "dev_gold.tsv", "devtest_gold.tsv", "devtest_input.tsv",
        "train_gold.tsv"
    ]
    file_dir = "./resources/full_tweets/"

    for filename in filenames:
        print(
            "------------------------- Filename: %s -------------------------"
            % filename)
        # Keep the loop variable intact and build the full path separately.
        path = file_dir + filename
        if "devtest_input.tsv" in path:
            # The input-only file is loaded with the extra "input" argument.
            topics, scores, texts = load_SemEval(path, "input")
        else:
            topics, scores, texts = load_SemEval(path)
        SemEval_statistics(topics, scores, texts)

    common_topics()
def common_topics():
    """Print, for each pair of splits, how many topics they share and which ones.

    The topic names of the train, dev and devtest splits are hard-coded below.
    One line is printed per pair (train/dev, train/devtest, dev/devtest) with
    the size of the intersection and its members in sorted order.
    """
    train_topics = {
        '@microsoft', 'ac/dc', 'amazon', 'amazon prime', 'amazon prime day',
        'angela merkel', 'apple', 'apple watch', 'arsenal', 'barca', 'batman',
        'bbc', 'bentley', 'bernie sanders', 'beyonce', 'bob marley',
        'bobby jindal', 'chelsea', 'chris brown', 'conor mcgregor',
        'david beckham', 'david cameron', 'digi', 'disneyland', 'donald trump',
        'erdogan', 'eric church', 'federer', 'fleetwood mac', 'galaxy note',
        'game of thrones', 'google', 'google+', 'grateful dead', 'hannibal',
        'harper', 'harry potter', 'hillary', 'ibm', 'ihop', 'ios', 'ipad',
        'iphone', 'ipod', 'jay-z', 'jeb bush', 'joe biden', 'jurassic park',
        'jurassic world', 'justin', 'juventus', 'kerry', 'kurt cobain',
        'labor day', 'lexus', 'madonna', 'magic mike xxl', 'mariah carey',
        'messi', 'metlife',
    }
    dev_topics = {
        'michael jackson', 'michelle obama', 'minecraft', 'monsanto',
        'netflix', 'nike', 'nintendo', 'nokia', 'obama', 'oracle',
        'planned parenthood', 'pope', 'pride parade', 'ric flair',
        'rick perry', 'sarah palin', 'scotus', 'seinfeld', 'serena',
        'snoop dogg',
    }
    devtest_topics = {
        'sony', 'star wars', 'sting', 't-mobile', 'taylor swift', 'ted 2',
        'teen wolf', 'tgif', 'tiger woods', 'tom cruise', 'tory', 'trump',
        'tsipras', 'ukip', 'valentine', "valentine 's day", 'venice beach',
        'windows 10', 'xbox', 'zlatan',
    }

    # One row per report line: message template plus the two sets to intersect.
    comparisons = (
        ("Common terms between train and dev set, nb: %s, terms: %s",
         train_topics, dev_topics),
        ("Common terms between train and devtest set, nb: %s, terms: %s",
         train_topics, devtest_topics),
        ("Common terms between dev and devtest set, nb: %s, terms: %s",
         devtest_topics, dev_topics),
    )
    for template, left, right in comparisons:
        shared = left & right
        print(template % (len(shared), sorted(shared)))

def testdata_statistics():
    """Load the SemEval-2016 Task 4 BCDE test file and run ``SemEval_statistics`` on it."""
    test_path = (
        "./resources/TEST data/SemEval2016_Task4_test_datasets/"
        "SemEval2016-task4-test.subtask-BCDE.txt"
    )
    # load_SemEval must yield exactly three sequences: topics, scores, texts.
    test_topics, test_scores, test_texts = load_SemEval(test_path)
    SemEval_statistics(test_topics, test_scores, test_texts)


if __name__ == "__main__":
    # Script entry point: report statistics for the test file, then stop.
    # `sys.exit` is used instead of the bare `exit` helper, which is injected by
    # the `site` module for interactive sessions and is not guaranteed to exist
    # (e.g. under `python -S`).
    import sys

    testdata_statistics()
    sys.exit()

    # NOTE(review): everything below is unreachable while the sys.exit() call
    # above is in place; it is kept so the per-split report can be re-enabled
    # by removing that call.
    filenames = ["dev_gold.tsv", "devtest_gold.tsv", "devtest_input.tsv","train_gold.tsv"]
    file_dir = "./resources/full_tweets/"

    for filename in filenames:
        print("------------------------- Filename: %s -------------------------" % filename)
        # Keep the loop variable intact and build the full path separately.
        path = file_dir + filename
        if "devtest_input.tsv" in path:
            # The input-only file is loaded with the extra "input" argument.
            topics, scores, texts = load_SemEval(path, "input")
        else:
            topics, scores, texts = load_SemEval(path)
        SemEval_statistics(topics, scores, texts)

    common_topics()