def run(test_file, sense, context, output, wsd_method='sim', filter_ctx=2, lowercase=False, ignore_case=False):
    print("Loading models...")
    vs = SenseGram.load_word2vec_format(sense, binary=True)
    vc = word2vec.Word2Vec.load_word2vec_format(context, binary=True)
    wsd_model = WSD(vs, vc, method=wsd_method, filter_ctx=filter_ctx, ignore_case=ignore_case)

    print("Loading test set...")
    reader = read_csv(test_file, encoding="utf-8", delimiter="\t",
                      dtype={'predict_related': object,
                             'gold_sense_ids': object,
                             'predict_sense_ids': object})
    rows_count = reader.shape[0]
    print(str(rows_count) + " test instances")

    pb = pbar.Pbar(rows_count, 100)
    uncovered_words = []  # target words for which the sense model has zero senses

    print("Start prediction over " + test_file)
    pb.start()
    for i, row in reader.iterrows():
        # Form of prediction: (sense, sense_scores)
        ctx = row.context.lower() if lowercase else row.context
        start, end = [int(x) for x in row.target_position.split(',')]
        prediction = wsd_model.dis_text(ctx, row.target, start, end)
        if prediction:
            sense, sense_scores = prediction
            # store only the sense id, i.e. the part after the "#" delimiter
            reader.set_value(i, 'predict_sense_ids', sense.split("#")[1])
            # neighbours = wsd_model.vs.most_similar(sense, topn=n_neighbours)
            # neighbours = ["%s:%.3f" % (n.split("#")[0], float(sim)) for n, sim in neighbours]
            # reader.set_value(i, 'predict_related', ",".join(neighbours))
        else:
            uncovered_words.append(row.target)
            continue
        pb.update(i)
    pb.finish()

    reader.to_csv(sep='\t', path_or_buf=output, encoding="utf-8", index=False, quoting=QUOTE_NONE)
    print("Saved predictions to " + output)
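# Example invocation (a minimal sketch, not part of the original module; all file
# paths are hypothetical). The test set is assumed to be a UTF-8 TSV with at least
# the columns `context`, `target`, `target_position` ("start,end" character offsets),
# `gold_sense_ids` and `predict_sense_ids`, which is what run() above reads and
# fills in; `sense` and `context` point to binary word2vec-format models.
#
#   run(test_file="eval/dataset.tsv",
#       sense="model/senses.bin",
#       context="model/contexts.bin",
#       output="eval/predictions.tsv",
#       wsd_method="sim", filter_ctx=2, lowercase=True)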
def run(clusters, model, n, output, method='weighted', has_header=True):
    print("Loading original context model...")
    contextvec = word2vec.Word2Vec.load_word2vec_format(model, binary=False)

    print("Initializing new word model...")
    wordvec = initialize(clusters, has_header, contextvec.syn0.shape[1])

    print("Pooling cluster vectors (%s method)..." % method)
    reader = read_clusetrs_file(clusters, has_header)
    pb = pbar.Pbar(wordvec.syn0.shape[0], 100)
    pb.start()
    i = 0
    for chunk in reader:
        if debug:
            print("Column types: %s" % chunk.dtypes)
        for j, row in chunk.iterrows():
            row_word = row.word
            row_cluster = row.cluster

            # process new word: pool the vectors of the top n cluster contexts
            word_cluster = parse_cluster(row_cluster, contextvec)[:n]
            vectors = np.array([contextvec[context] for context, sim in word_cluster])
            sims = np.array([float(sim) for context, sim in word_cluster])
            word_vector = pool_vectors(vectors, sims, method)

            if row_word not in wordvec.vocab:
                wordvec.add_word(row_word, word_vector)

            pb.update(i)
            i += 1
    pb.finish()

    ##### Validation #####
    if wordvec.syn0.shape[0] != len(wordvec.vocab):
        print("Shrinking matrix size from %i to %i" % (wordvec.syn0.shape[0], len(wordvec.vocab)))
        wordvec.syn0 = np.ascontiguousarray(wordvec.syn0[:len(wordvec.vocab)])

    print("Word vectors saved to: " + output)
    wordvec.save_word2vec_format(fname=output, binary=True)
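# Example invocation (sketch; paths are hypothetical). The clusters file is
# assumed to be the TSV consumed by read_clusetrs_file(), with `word` and
# `cluster` columns where each cluster is a "context:sim,..." list; only the
# top n cluster contexts are pooled into each word vector, and the context
# model is loaded from text (non-binary) word2vec format, as in run() above.
#
#   run(clusters="intermediate/context-clusters.tsv",
#       model="model/contexts.txt",
#       n=20,
#       output="model/words.bin",
#       method="weighted")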
def run(clusters, model, output, method='weighted', lowercase=False, inventory=None, has_header=True):
    small_clusters = 0                # clusters skipped because they have too few in-vocabulary words
    sen_count = defaultdict(int)      # number of senses per word
    cluster_sum = defaultdict(int)    # number of cluster words per word

    print("Loading original word model...")
    wordvec = word2vec.Word2Vec.load_word2vec_format(model, binary=True)

    print("Initializing sense model...")
    senvec = initialize(clusters, has_header, wordvec.syn0.shape[1])

    print("Pooling cluster vectors (%s method)..." % method)
    reader = read_clusetrs_file(clusters, has_header)
    pb = pbar.Pbar(senvec.syn0.shape[0], 100)
    pb.start()

    with write_inventory(inventory) as inv_output:
        inv_output.write(inventory_header)
        i = 0
        for chunk in reader:
            if debug:
                print("Column types: %s" % chunk.dtypes)
            for j, row in chunk.iterrows():
                row_word = row.word
                row_cluster = row.cluster
                if lowercase:
                    row_cluster = row_cluster.lower()

                # enumerate word senses from 0
                sen_word = unicode(row_word) + sen_delimiter + unicode(sen_count[row_word])

                # process new sense
                sen_cluster = parse_cluster(row_cluster, wordvec)
                if len(sen_cluster) >= 5:
                    vectors = np.array([wordvec[word] for word, sim in sen_cluster])
                    sims = np.array([float(sim) for word, sim in sen_cluster])
                    sen_vector = pool_vectors(vectors, sims, method)

                    if sen_word not in senvec.vocab:
                        senvec.add_word(sen_word, sen_vector)
                        senvec.probs[sen_word] = len(sen_cluster)  # number of cluster words per sense
                    sen_count[row_word] += 1                   # number of senses per word
                    cluster_sum[row_word] += len(sen_cluster)  # number of cluster words per word

                    # write new sense to the sense inventory
                    if inventory:
                        # join back cluster words (only those that were actually used for the sense vector)
                        cluster = ",".join([word + ":" + sim for word, sim in sen_cluster])
                        inv_output.write(u"%s\t%s\t%s\n" % (sen_word.split(sen_delimiter)[0],
                                                            sen_word.split(sen_delimiter)[1],
                                                            cluster))
                else:
                    small_clusters += 1
                    if debug:
                        print("%s\t%s" % (row_word, row.cid))
                        print(sen_cluster)

                pb.update(i)
                i += 1

    senvec.__normalize_probs__(cluster_sum)
    pb.finish()

    ##### Validation #####
    if senvec.syn0.shape[0] != len(senvec.vocab):
        print("Shrinking matrix size from %i to %i" % (senvec.syn0.shape[0], len(senvec.vocab)))
        senvec.syn0 = np.ascontiguousarray(senvec.syn0[:len(senvec.vocab)])

    print("Sense vectors saved to: " + output)
    senvec.save_word2vec_format(fname=output, binary=True)
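# Example invocation (sketch; paths are hypothetical). The clusters file is
# assumed to provide `word`, `cid` and `cluster` columns ("neighbour:sim,..."
# lists); clusters with fewer than five in-vocabulary words are skipped, and
# the optional inventory file receives one "word<TAB>sense_id<TAB>cluster"
# line per pooled sense, as written by run() above.
#
#   run(clusters="intermediate/sense-clusters.tsv",
#       model="model/words.bin",
#       output="model/senses.bin",
#       method="weighted",
#       lowercase=False,
#       inventory="model/inventory.tsv")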