# Shared imports for the functions below. Helper functions such as
# task_to_df, vectorize, task1_toTXT, task22_toTXT, and the various
# embedding getters are assumed to be defined elsewhere in this repository.
import logging
import pathlib
from pathlib import Path

import numpy as np
import sklearn.preprocessing
import tensorflow as tf
import tensorflow_hub as hub
from sklearn.cluster import AgglomerativeClustering
from sklearn.linear_model import LogisticRegression


def task22Submission_LR(parameters, write=False, output_dir=None):
    print(parameters)
    dataset = 'test'
    df_out = task_to_df(22, dataset)
    txtfile = 'semeval_data/{}/task-2.2.txt'.format(dataset)
    method, _ = parameters

    # Train on the dev split, predict sense ids for the test split.
    train_df = task_to_df(22, 'dev')
    train_y = train_df['gold_sense_id']
    train_X = vectorize(method, task=22, dataset='dev')
    test_X = vectorize(method, task=22, dataset='test')

    clf = LogisticRegression(solver='lbfgs', multi_class='auto', max_iter=1000)
    clf.fit(train_X, train_y)

    # Per-instance confidence: the highest predicted class probability.
    T = clf.predict_proba(test_X)
    confidence = np.amax(T, axis=1)
    y_pred = clf.predict(test_X)
    df_out['predict_sense_id'] = y_pred
    # ---------------------------------------
    if write:
        # csvfile = '{}/task22_{}_#{}.csv'.format(output_dir, method, ('LogisticRegression'))
        # df_to_csv(test_df, csvfile)
        # print(method + ' csv file is written')
        task22_toTXT(txtfile, df_out, output_dir)
        print('Task-22 txt file is written')
    # ---------------------------------------
    return df_out, confidence

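# Minimal usage sketch (hypothetical parameter values; 'elmo_context' is
# assumed to be a method name accepted by vectorize()):
# df_pred, conf = task22Submission_LR(('elmo_context', None),
#                                     write=True, output_dir='submissions')
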
def task1Submission(parameters, write=False, output_dir=None):
    print('\nTask-1 submission----------\n')
    print(parameters)
    dataset = 'test'
    df_out = task_to_df(1, dataset)
    txtfile = 'semeval_data/{}/task-1.txt'.format(dataset)
    method, norm, affinity, linkage, n_cluster = parameters

    clusterizer = AgglomerativeClustering(n_clusters=n_cluster,
                                          affinity=affinity,
                                          linkage=linkage)
    vecs = vectorize(method, task=1, dataset='test')
    if norm:
        # L2-normalize each vector so clustering depends on direction only.
        method = method + '-norm'
        vecs = sklearn.preprocessing.normalize(vecs, norm='l2', axis=1)
    df_out['predict_sense_id'] = clusterizer.fit_predict(vecs)

    if write:
        # csvfile = '{}/task1_{}_#{}#{}#{}.csv'.format(output_dir, method, affinity, linkage, n_cluster)
        # df_to_csv(df_out, csvfile)
        # print(method + ' csv file is written')
        task1_toTXT(txtfile, df_out, output_dir)
        print('Task-1 txt file is written')
    return df_out

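# Minimal usage sketch (hypothetical parameter values); the tuple unpacks as
# (method, norm, affinity, linkage, n_cluster):
# df_pred = task1Submission(('elmo_context', True, 'cosine', 'average', 2),
#                           write=True, output_dir='submissions')
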
def get_elmo_context_embeddings(task, dataset, ly='default'):
    df = task_to_df(task, dataset)
    sentences = list(df['context'])
    N = len(df)
    print('Total Records--{}'.format(N))

    elmo = hub.Module("https://tfhub.dev/google/elmo/2", trainable=True)
    BATCH = 1000
    embeddings_elmo = np.array([])
    for i in range(0, N, BATCH):
        # Select the requested output layer from the module's output dict
        # ('default' is the mean-pooled sentence embedding; keys such as
        # 'elmo' and 'word_emb' are also exposed). The original guarded this
        # line with `if ly == 'default':`, which left `tensors` undefined
        # for every other layer.
        tensors = elmo(sentences[i:i + BATCH], signature="default", as_dict=True)[ly]
        # A fresh session is opened per batch, trading speed for bounded memory.
        with tf.Session() as session:
            session.run(tf.global_variables_initializer())
            embeddings = session.run(tensors)
        print('{}-{} records processed...'.format(i, min(i + BATCH, N)))
        if i == 0:
            embeddings_elmo = embeddings
        else:
            embeddings_elmo = np.vstack((embeddings_elmo, embeddings))
    logging.info('{} layer shape--{}'.format(ly, embeddings_elmo.shape))
    return embeddings_elmo

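# Usage sketch: the 'default' layer of the tfhub ELMo v2 module is a
# mean-pooled 1024-d sentence embedding, so the result has shape (N, 1024).
# ctx = get_elmo_context_embeddings(1, 'dev')
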
def verb_embeddings(task, dataset, model, emb='verb'):
    # Return cached verb vectors from disk if present; otherwise compute them.
    dir = 'vectors'
    file = '{}/{}/task{}_{}_{}.npy'.format(dir, dataset, task, model, emb)
    if Path(file).exists():
        return np.load(file)
    else:
        df = task_to_df(task, dataset)
        if model == 'w2v':
            return get_w2v_word_embeddings(df['verb'])
        else:
            return get_elmo_word_embeddings(df['verb'])

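# Usage sketch: reads vectors/dev/task22_w2v_verb.npy if it was cached by
# write_vectors(), and recomputes the embeddings otherwise.
# verb_vecs = verb_embeddings(22, 'dev', 'w2v')
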
def write_vectors():
    # Precompute and cache every embedding variant used by the tasks.
    dir = 'vectors'
    tasks = ['1', '22']
    datasets = ['dev', 'test']
    for task in tasks:
        for dataset in datasets:
            pathlib.Path('{}/{}'.format(dir, dataset)).mkdir(parents=True, exist_ok=True)
            print(dataset, 'task', task)
            df = task_to_df(task, dataset)

            w2v_context = get_w2v_context_embeddings_Default(task, dataset)
            outfile = '{}/{}/task{}_{}.npy'.format(dir, dataset, task, 'w2v_context')
            np.save(outfile, w2v_context)

            elmo_word = get_elmo_word_embeddings(df['word'])
            outfile = '{}/{}/task{}_{}.npy'.format(dir, dataset, task, 'elmo_word')
            np.save(outfile, elmo_word)

            elmo_context = load_elmo_context_embeddings(task, dataset)
            outfile = '{}/{}/task{}_{}.npy'.format(dir, dataset, task, 'elmo_context')
            np.save(outfile, elmo_context)

            if task == '1':
                # Task 1 targets are single words.
                w2v_word = get_w2v_word_embeddings(df['word'])
                outfile = '{}/{}/task{}_{}.npy'.format(dir, dataset, task, 'w2v_word')
                np.save(outfile, w2v_word)
            else:
                # Task 2.2 targets are multiword expressions; the verb gets
                # its own vectors. The original computed w2v_verb with
                # get_elmo_word_embeddings, which duplicated elmo_verb below;
                # corrected here to the w2v getter, matching verb_embeddings().
                w2v_word = get_w2v_multiword_embeddings(df['word'])
                outfile = '{}/{}/task{}_{}.npy'.format(dir, dataset, task, 'w2v_word')
                np.save(outfile, w2v_word)

                w2v_verb = get_w2v_word_embeddings(df['verb'])
                outfile = '{}/{}/task{}_{}.npy'.format(dir, dataset, task, 'w2v_verb')
                np.save(outfile, w2v_verb)

                elmo_verb = get_elmo_word_embeddings(df['verb'])
                outfile = '{}/{}/task{}_{}.npy'.format(dir, dataset, task, 'elmo_verb')
                np.save(outfile, elmo_verb)

def get_w2v_context_embeddings_Default(task, dataset):
    # Default configuration: pretrained GoogleNews word2vec vectors plus the
    # dev-split sentence file used for tf-idf weighting.
    w2v_file = './input/models/GoogleNews-vectors-negative300.bin'
    tfidf_file = './input/models/sentences_dev.txt'
    df = task_to_df(task, dataset)
    return get_w2v_context_embeddings(df, w2v_file, tfidf_file=tfidf_file)

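# Entry-point sketch: rebuild the full vector cache when this module is run
# directly (a guess at intended usage; the original file has no entry point).
if __name__ == '__main__':
    write_vectors()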