Example #1
import numpy as np
from sklearn.linear_model import LogisticRegression

def task22Submission_LR(parameters, write=False, output_dir=None):
    print(parameters)

    dataset = 'test'
    df_out = task_to_df(22, dataset)
    txtfile = 'semeval_data/{}/task-2.2.txt'.format(dataset)
    method, _ = parameters
    train_df = task_to_df(22, 'dev')
    train_y = train_df['gold_sense_id']
    train_X = vectorize(method, task=22, dataset='dev')
    test_X = vectorize(method, task=22, dataset='test')

    clf = LogisticRegression(solver='lbfgs', multi_class='auto', max_iter=1000)
    clf.fit(train_X, train_y)
    proba = clf.predict_proba(test_X)
    # Confidence = highest predicted class probability per record.
    confidence = np.amax(proba, axis=1)

    y_pred = clf.predict(test_X)
    df_out['predict_sense_id'] = y_pred
    # ---------------------------------------
    if write:
        # csvfile = '{}/task22_{}_#{}.csv'.format(output_dir, method, ('LogisticRegression'))
        # df_to_csv(test_df, csvfile)
        # print(method + ' csv file is written')
        task22_toTXT(txtfile, df_out, output_dir)
        print('Task-22 txt file is written')
    # ---------------------------------------
    return df_out, confidence
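
A minimal usage sketch, assuming the repo's task_to_df, vectorize, and task22_toTXT helpers are importable and the semeval_data/test files are in place; 'elmo_context' is a hypothetical value for the method element of the tuple:

# Hypothetical call: 'elmo_context' stands in for whatever vectorization
# methods this repo's vectorize() helper actually supports.
df_out, confidence = task22Submission_LR(('elmo_context', None),
                                         write=True,
                                         output_dir='submissions')
print(df_out['predict_sense_id'].head())
print('max confidence:', confidence.max())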
Example #2
import sklearn.preprocessing
from sklearn.cluster import AgglomerativeClustering

def task1Submission(parameters, write=False, output_dir=None):
    print('\nTask-1 submission----------\n')
    print(parameters)

    dataset = 'test'
    df_out = task_to_df(1, dataset)
    txtfile = 'semeval_data/{}/task-1.txt'.format(dataset)

    method, norm, affinity, linkage, n_cluster = parameters

    clusterizer = AgglomerativeClustering(n_clusters=n_cluster,
                                          affinity=affinity,
                                          linkage=linkage)
    vecs = vectorize(method, task=1, dataset='test')

    if norm:
        method = method + '-norm'
        vecs = sklearn.preprocessing.normalize(vecs, norm='l2', axis=1)

    df_out['predict_sense_id'] = clusterizer.fit_predict(vecs)

    if write:
        # csvfile = '{}/task1_{}_#{}#{}#{}.csv'.format(output_dir, method, affinity, linkage, n_cluster)
        # df_to_csv(df_out, csvfile)
        # print(method + ' csv file is written')
        task1_toTXT(txtfile, df_out, output_dir)
        print('Task-1 txt file is written')

    return df_out
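
A sketch of the parameter tuple this function expects; the last three elements map directly onto sklearn's AgglomerativeClustering arguments, and all values below are illustrative rather than tuned settings from the original experiments:

# (method, norm, affinity, linkage, n_cluster) -- hypothetical values.
params = ('elmo_context', True, 'cosine', 'average', 10)
df_out = task1Submission(params, write=False)
print(df_out['predict_sense_id'].value_counts())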
Example #3
import logging

import numpy as np
import tensorflow as tf
import tensorflow_hub as hub

def get_elmo_context_embeddings(task, dataset, ly='default'):

    df = task_to_df(task, dataset)

    sentences = list(df['context'])

    N = len(df)
    print('Total Records--{}'.format(N))

    elmo = hub.Module("https://tfhub.dev/google/elmo/2", trainable=True)

    BATCH = 1000
    embeddings_elmo = np.array([])
    for i in range(0, N, BATCH):
        # Fetch the requested output layer for this batch; the ELMo v2 module
        # exposes 'default', 'elmo', and 'word_emb' among its outputs.
        tensors = elmo(sentences[i:i + BATCH],
                       signature="default",
                       as_dict=True)[ly]

        # Run the graph for this batch; note that opening a fresh session
        # re-initializes the variables on every iteration.
        with tf.Session() as session:
            session.run(tf.global_variables_initializer())
            embeddings = session.run(tensors)

        print('{}-{} records processed...'.format(i, i + BATCH))

        if i == 0:
            embeddings_elmo = embeddings
        else:
            embeddings_elmo = np.vstack((embeddings_elmo, embeddings))

    logging.info('{} layer shape--{}'.format(ly, embeddings_elmo.shape))
    return embeddings_elmo
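
For reference, a sketch of calling this function, assuming TensorFlow 1.x and tensorflow_hub are installed and the task data is available; the 'default' output of the ELMo v2 module is a fixed mean-pooled 1024-dimensional vector per sentence:

# Hypothetical call on the Task-1 dev split.
vecs = get_elmo_context_embeddings(1, 'dev', ly='default')
print(vecs.shape)  # expected: (n_records, 1024)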
Example #4
import numpy as np
from pathlib import Path

def verb_embeddings(task, dataset, model, emb='verb'):

    dir = 'vectors'
    file = '{}/{}/task{}_{}_{}.npy'.format(dir, dataset, task, model, emb)
    if Path(file).exists():
        return np.load(file)
    else:
        # Cache miss: recompute from the dataframe (the result is not
        # written back to disk here).
        df = task_to_df(task, dataset)
        if model == 'w2v':
            return get_w2v_word_embeddings(df['verb'])
        else:
            return get_elmo_word_embeddings(df['verb'])
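
A usage sketch, under the assumption that the vectors/ cache directory follows the naming scheme above:

# Loads vectors/test/task22_w2v_verb.npy if it exists, otherwise recomputes
# the verb embeddings from the dataframe's 'verb' column.
verb_vecs = verb_embeddings(22, 'test', 'w2v')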
Example #5
import pathlib

import numpy as np

def write_vectors():
    dir = 'vectors'

    tasks = ['1', '22']
    datasets = ['dev', 'test']

    for task in tasks:
        for dataset in datasets:

            pathlib.Path('{}/{}'.format(dir, dataset)).mkdir(parents=True,
                                                             exist_ok=True)

            print(dataset, 'task', task)
            df = task_to_df(task, dataset)

            w2v_context = get_w2v_context_embeddings_Default(task, dataset)
            outfile = '{}/{}/task{}_{}.npy'.format(dir, dataset, task,
                                                   'w2v_context')
            np.save(outfile, w2v_context)

            elmo_word = get_elmo_word_embeddings(df['word'])
            outfile = '{}/{}/task{}_{}.npy'.format(dir, dataset, task,
                                                   'elmo_word')
            np.save(outfile, elmo_word)

            elmo_context = load_elmo_context_embeddings(task, dataset)
            outfile = '{}/{}/task{}_{}.npy'.format(dir, dataset, task,
                                                   'elmo_context')
            np.save(outfile, elmo_context)

            if task == '1':
                w2v_word = get_w2v_word_embeddings(df['word'])
                outfile = '{}/{}/task{}_{}.npy'.format(dir, dataset, task,
                                                       'w2v_word')
                np.save(outfile, w2v_word)

            else:
                w2v_word = get_w2v_multiword_embeddings(df['word'])
                outfile = '{}/{}/task{}_{}.npy'.format(dir, dataset, task,
                                                       'w2v_word')
                np.save(outfile, w2v_word)

                w2v_verb = get_w2v_word_embeddings(df['verb'])
                outfile = '{}/{}/task{}_{}.npy'.format(dir, dataset, task,
                                                       'w2v_verb')
                np.save(outfile, w2v_verb)

                elmo_verb = get_elmo_word_embeddings(df['verb'])
                outfile = '{}/{}/task{}_{}.npy'.format(dir, dataset, task,
                                                       'elmo_verb')
                np.save(outfile, elmo_verb)
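
Since write_vectors takes no arguments, a run amounts to a single call; the cache layout sketched below is inferred from the format strings above:

write_vectors()
# Resulting layout (one .npy per embedding type and split), e.g.:
#   vectors/dev/task1_w2v_context.npy
#   vectors/test/task22_elmo_verb.npy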
Example #6
def get_w2v_context_embeddings_Default(task, dataset):
    w2v_file = './input/models/GoogleNews-vectors-negative300.bin'
    tfidf_file = './input/models/sentences_dev.txt'
    df = task_to_df(task, dataset)
    return get_w2v_context_embeddings(df, w2v_file, tfidf_file=tfidf_file)
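
A sketch of using this wrapper, assuming the GoogleNews word2vec binary and the dev sentence file sit at the hard-coded paths:

# Hypothetical call: builds w2v context vectors for the Task-1 dev split.
vecs = get_w2v_context_embeddings_Default(1, 'dev')
print(vecs.shape)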