def get_neighbors(vector, space, n_neighbors=5, pos=None):
    # Optionally restrict the space to rows with the requested part of speech.
    if pos is not None:
        space = space_pos_filter(space, pos)
    targets = space.id2row
    if n_neighbors is None:
        n_neighbors = len(targets)
    n_neighbors = min(n_neighbors, len(targets))
    # Cosine similarity of the query vector to every row of the space, sorted descending.
    sims_to_matrix = CosSimilarity().get_sims_to_matrix(vector, space.cooccurrence_matrix)
    sorted_perm = sims_to_matrix.sorted_permutation(sims_to_matrix.sum, 1)
    return [(space.id2row[i], sims_to_matrix[i, 0]) for i in sorted_perm[:n_neighbors]]
class MixedCompositionalTreeKernel(SyntacticTreeKernel):
    '''
    Mixed Salad Kernel 2 variation
    '''
    kernel_name = "mixed_salad_kernel22"

    def __init__(self, lambda_):
        '''
        Constructor
        '''
        self._lambda = lambda_
        self._measure = CosSimilarity()  # default one

    def dot_product(self, tree1, tree2):
        assert_type(tree1, SemanticTree)
        assert_type(tree2, SemanticTree)
        return super(MixedCompositionalTreeKernel, self).dot_product(tree1, tree2)

    # new delta
    def _delta(self, node1, node2, node2id1, node2id2, delta_matrix):
        if (node1.is_terminal() and node2.is_terminal()
                and node1._label == node2._label
                and node1._word == node2._word):
            delta_matrix[node2id1[node1], node2id2[node2]] = 1
        elif not node1.has_same_production(node2):
            if node1._label != node2._label:
                delta_matrix[node2id1[node1], node2id2[node2]] = 0
            else:
                delta_matrix[node2id1[node1], node2id2[node2]] = self._measure.get_sim(node1._vector, node2._vector)
        else:
            product_children_delta = self._lambda
            for i in xrange(len(node1._children)):
                child1 = node1.get_child(i)
                child2 = node2.get_child(i)
                child_delta = delta_matrix[node2id1[child1], node2id2[child2]]
                if child_delta == -1:
                    raise ValueError("Delta for child pair has not been computed yet")
                else:
                    product_children_delta *= (1 + child_delta)
            sim_children_product = 1
            for i in xrange(len(node1._children)):
                child1 = node1.get_child(i)
                child2 = node2.get_child(i)
                sim_children_product *= self._measure.get_sim(child1._vector, child2._vector)
            final_delta = (product_children_delta
                           + (self._measure.get_sim(node1._vector, node2._vector)
                              - self._lambda * sim_children_product))
            delta_matrix[node2id1[node1], node2id2[node2]] = final_delta
class NaiveCompositionalSemanticTreeKernel(SyntacticTreeKernel):
    """
    Mixed Salad Kernel 1
    """
    kernel_name = "mixed_salad_kernel1"

    NO_COMPATIBILITY = 0
    LABEL_COMPATIBILITY = 1

    def __init__(self, lambda_, compatibility_level=LABEL_COMPATIBILITY):
        """
        Constructor
        """
        self._lambda = lambda_
        self._compatibility_level = compatibility_level
        self._measure = CosSimilarity()

    def dot_product(self, tree1, tree2):
        assert_type(tree1, SemanticTree)
        assert_type(tree2, SemanticTree)
        return super(NaiveCompositionalSemanticTreeKernel, self).dot_product(tree1, tree2)

    def _delta(self, node1, node2, node2id1, node2id2, delta_matrix):
        delta = 0
        if self._compatibility_level == NaiveCompositionalSemanticTreeKernel.NO_COMPATIBILITY or (
                self._compatibility_level == NaiveCompositionalSemanticTreeKernel.LABEL_COMPATIBILITY
                and node1._label == node2._label):
            delta = (self._lambda ** (node1.get_height() + node2.get_height())) * self._measure.get_sim(
                node1._vector, node2._vector)
        delta_matrix[node2id1[node1], node2id2[node2]] = delta
def compute_neighbours(in_file, no_neighbours, out_dir, sim_measure, space_files):
    sim_dict = {"cos": CosSimilarity(),
                "lin": LinSimilarity(),
                "dot_prod": DotProdSimilarity(),
                "euclidean": EuclideanSimilarity()}

    if sim_measure not in sim_dict:
        raise ValueError("Similarity measure:%s not defined" % sim_measure)

    space = io_utils.load(space_files[0], Space)
    space2 = None
    space_descr = ".".join(space_files[0].split("/")[-1].split(".")[0:-1])
    if len(space_files) == 2:
        space2 = io_utils.load(space_files[1], Space)
        space_descr = ".".join([space_descr] + space_files[1].split("/")[-1].split(".")[0:-1])

    sim = sim_dict[sim_measure]

    descr = ".".join(["NEIGHBOURS", in_file.split("/")[-1], space_descr])
    out_file = '%s/%s.%s' % (out_dir, descr, sim_measure)
    io_utils.create_parent_directories(out_file)

    data = io_utils.read_list(in_file)
    print("Computing neighbours: %s" % sim_measure)
    with open(out_file, "w") as out_stream:
        for word in data:
            out_stream.write("%s\n" % word)
            result = space.get_neighbours(word, no_neighbours, sim, space2)
            for neighbour, neighbour_sim in result:
                out_stream.write("\t%s %s\n" % (neighbour, neighbour_sim))
def __init__(self, lambda_, compatibility_level=LABEL_COMPATIBILITY):
    """
    Constructor
    """
    self._lambda = lambda_
    self._compatibility_level = compatibility_level
    self._measure = CosSimilarity()
def functionneighbours(words, number):
    #load a space
    if sys.argv[2] == 'full':
        my_space = io_utils.load("./data/out/thesisfull.pkl")
    if sys.argv[2] == 'nonzero':
        my_space = io_utils.load("./data/out/thesis.pkl")
    return my_space.get_neighbours(words, number, CosSimilarity())
def __init__(self, similarity=None):
    '''
    Constructor
    '''
    if similarity is None:
        self._similarity = CosSimilarity()
    else:
        self._similarity = similarity
def main():
    """
    cosWeeds - as described in:
    A. Lenci and G. Benotto. 2012. Identifying hypernyms in distributional semantic spaces. In *SEM.

    Weeds Precision - as described in:
    J. Weeds and D. Weir. 2003. A general framework for distributional similarity. In EMNLP.
    """

    # Get the arguments
    args = docopt("""Compute cosWeeds Precision for a list of (x, y) pairs and save their scores.

    Usage:
        cosWeeds.py <testset_file> <dsm_prefix> <output_file>

        <testset_file> = a file containing term-pairs, labels and relations, each line in the form of x\ty\tlabel\trelation
        <dsm_prefix> = the prefix for the pkl files for the vector space
        <output_file> = where to save the results: a tab separated file with x\ty\tlabel\trelation\tscore,
                        where the score is cosWeeds (for y as the hypernym of x).
    """)

    testset_file = args['<testset_file>']
    dsm_prefix = args['<dsm_prefix>']
    output_file = args['<output_file>']

    # Load the term-pairs
    with codecs.open(testset_file) as f_in:
        test_set = [tuple(line.strip().split('\t')) for line in f_in]

    # Load the vector space
    vector_space = load_pkl_files(dsm_prefix)
    target_index = {w: i for i, w in enumerate(vector_space.id2row)}
    cooc_mat = vector_space.cooccurrence_matrix

    # Compute the score for each term
    with codecs.open(output_file, 'w', 'utf-8') as f_out:
        for (x, y, label, relation) in test_set:
            x_index, y_index = target_index.get(x, -1), target_index.get(y, -1)
            cosWeeds = 0.0
            if x_index > -1 and y_index > -1:
                x_row, y_row = cooc_mat[x_index, :], cooc_mat[y_index, :]
                score = weeds_prec(x_row, y_row)
                cosine = vector_space.get_sim(x, y, CosSimilarity())
                cosWeeds = math.sqrt(cosine * score)
            print >> f_out, '\t'.join((x, y, label, '%.5f' % cosWeeds))
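# The weeds_prec helper called in main() above is not defined in this snippet. The
# following is only a minimal sketch of the measure it is assumed to compute (Weeds &
# Weir 2003 precision), written against dense 1-D numpy arrays; the actual helper
# presumably operates on DISSECT matrix rows (e.g. via x_row.mat).
import numpy as np

def weeds_prec(x_row, y_row):
    """Share of x's positive feature weight that falls on contexts shared with y."""
    x = np.asarray(x_row, dtype=float).ravel()
    y = np.asarray(y_row, dtype=float).ravel()
    shared_weight = x[(x > 0) & (y > 0)].sum()  # x's weight on contexts y also has
    total_weight = x[x > 0].sum()               # x's total positive weight
    return shared_weight / total_weight if total_weight > 0 else 0.0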
def inspect_representations(path_composed_emb, output_path):
    print('Inspecting representations...')
    composed_space = Space.build(data=path_composed_emb, format='dm')
    f = codecs.open(output_path, 'w', 'utf8')
    word_list = [w for w in composed_space.get_row2id()]
    for j, w in enumerate(word_list):
        if j < 1000:
            neighbours = composed_space.get_neighbours(w, 10, CosSimilarity())
            f.write('Neighbours for ' + w + '\n')
            f.write("\n".join('%s %.6f' % x for x in neighbours))
            f.write('\n----------------------------\n')
    f.close()
def compute_sim(in_file, columns, out_dir, sim_measures, space_files):
    sim_dict = {"cos": CosSimilarity(),
                "lin": LinSimilarity(),
                "dot_prod": DotProdSimilarity(),
                "euclidean": EuclideanSimilarity()}

    if not len(columns) == 2:
        raise ValueError("Column description unrecognized!")
    col0 = int(columns[0]) - 1
    col1 = int(columns[1]) - 1

    try:
        space = io_utils.load(space_files[0], Space)
    except TypeError:
        warn("Not a Space instance in file: %s" % space_files[0])
        return

    space2 = None
    space_descr = ".".join(space_files[0].split("/")[-1].split(".")[0:-1])
    if len(space_files) == 2:
        space2 = io_utils.load(space_files[1], Space)
        space_descr = ".".join([space_descr] + space_files[1].split("/")[-1].split(".")[0:-1])

    descr = ".".join(["SIMS", in_file.split("/")[-1], space_descr])

    for sim_measure in sim_measures:
        print("Computing similarities: %s" % sim_measure)
        if sim_measure not in sim_dict:
            warn("Similarity measure:%s not defined" % sim_measure)
            continue

        sim = sim_dict[sim_measure]
        out_file = '%s/%s.%s' % (out_dir, descr, sim_measure)
        io_utils.create_parent_directories(out_file)

        with open(in_file) as in_stream, open(out_file, "w") as out_stream:
            for line in in_stream:
                if not line.strip() == "":
                    elems = line.strip().split()
                    word1 = elems[col0]
                    word2 = elems[col1]

                    predicted_sim = space.get_sim(word1, word2, sim, space2)
                    out_stream.write("%s %s\n" % (line.strip(), str(predicted_sim)))
def main():
    """
    Cosine similarity
    """

    # Get the arguments
    args = docopt("""Compute cosine for a list of (x, y) pairs and save their scores.

    Usage:
        cosine.py <testset_file> <dsm_prefix> <output_file>

        <testset_file> = a file containing term-pairs, labels and relations, each line in the form of x\ty\tlabel\trelation
        <dsm_prefix> = the prefix for the pkl files for the vector space
        <output_file> = where to save the results: a tab separated file with x\ty\tlabel\trelation\tscore,
                        where the score is cosine (symmetric measure).
    """)

    testset_file = args['<testset_file>']
    dsm_prefix = args['<dsm_prefix>']
    output_file = args['<output_file>']

    # Load the term-pairs
    with codecs.open(testset_file) as f_in:
        test_set = [tuple(line.strip().split('\t')) for line in f_in]

    # Load the vector space
    vector_space = load_pkl_files(dsm_prefix)
    target_index = {w: i for i, w in enumerate(vector_space.id2row)}

    # Compute the score for each term
    with codecs.open(output_file, 'w', 'utf-8') as f_out:
        for (x, y, label, relation) in test_set:
            x_index, y_index = target_index.get(x, -1), target_index.get(y, -1)
            cosine = 0.0
            if x_index > -1 and y_index > -1:
                cosine = vector_space.get_sim(x, y, CosSimilarity())
            print >> f_out, '\t'.join((x, y, label, '%.5f' % cosine))
def getThesaurus(word):
    if isinstance(word, unicode):
        word = word.encode('utf-8')
    else:
        try:
            word.decode('utf-8')
        except:
            raise

    # find synonyms in chilin
    for line in open(THES_PATH + 'chilin-zh-TW.csv'):
        synonyms = line.split()
        if word in synonyms:
            break

    # calculate word similarity
    word_sim_dict = {}
    my_space = Space.build(data=THES_PATH + 'sm',
                           rows=THES_PATH + 'words.rows',
                           cols=THES_PATH + 'cols',
                           format='sm')
    for row in open(THES_PATH + 'words.rows'):
        word1 = row.strip()
        sim = my_space.get_sim(word1, word, CosSimilarity())
        if sim > .3:
            word_sim_dict[word1] = sim

    # rank first those overlapping with chilin synonyms
    word_sim_list = []
    if word_sim_dict.get(word):
        word_sim_dict.pop(word)
    for key in word_sim_dict.keys():
        if key in synonyms:
            word_sim_dict.pop(key)
            word_sim_list += [key]

    # sort the rest of words
    d = sorted(word_sim_dict.items(), key=lambda x: x[1], reverse=True)
    word_sim_list += [word for word, sim in d]
    word_sim_list = word_sim_list[:9]

    return word_sim_list
class SentenceVectorKernel(Kernel):
    '''
    classdocs
    '''
    kernel_name = "sentence_vector_kernel"

    def __init__(self, similarity=None):
        '''
        Constructor
        '''
        if similarity is None:
            self._similarity = CosSimilarity()
        else:
            self._similarity = similarity

    def dot_product(self, tree1, tree2):
        assert_type(tree1, SemanticTree)
        assert_type(tree2, SemanticTree)
        sentence_vector1 = tree1._root._vector
        sentence_vector2 = tree2._root._vector
        if sentence_vector1.norm() == 0.0 or sentence_vector2.norm() == 0.0:
            return 0.0
        else:
            return self._similarity.get_sim(sentence_vector1, sentence_vector2)
else:
    for x in right_context_words:
        left_unison = left_unison.multiply(final_model.get_row(x))
    base_unison = left_unison
#print "Three"

# Create a vector having context words and word to replace.
if add:
    context_word_vector = base_unison + final_model.get_row(word)
else:
    context_word_vector = base_unison.multiply(final_model.get_row(word)) if base_unison is not None else final_model.get_row(word)
#print "Four"

results = {}
cos_sim = CosSimilarity()

#############################################################################
# If we simply get the nearest neighbours of the actual context word.
#############################################################################
if no_rerank:
    results = final_model.get_xneighbours(context_word_vector, 10, cos_sim)
    return (word, map(lambda x: x[0][:-2], results))

#############################################################################
# Get the list of the similar words to the given vector.
#############################################################################
antonyms = big_thesaurus.antonyms(word)
replacements = []

if thesaurus > 0.0:
    synonyms = big_thesaurus.replacements(word)
def computeAnalogy(w1, w2, w3):
    composed_space = sub.compose([(w1, w2, "step1")], space)
    composed_space2 = add.compose([("step1", w3, "step2")], (composed_space, space))
    guess = composed_space2.get_neighbours("step2", 1, CosSimilarity(), space)
    return guess
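# Hypothetical usage of computeAnalogy above, assuming `space`, `add` and `sub` are
# already set up as in the accompanying script (a loaded Space plus
# WeightedAdditive(alpha=1, beta=1) and WeightedAdditive(alpha=1, beta=-1));
# the example words are illustrative and must match the row labels of the space.
print computeAnalogy("king", "man", "woman")  # king - man + woman, expected near "queen"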
##########################################################################
from composes.utils import io_utils
from composes.composition.weighted_additive import WeightedAdditive
from composes.similarity.cos import CosSimilarity
import sys

pkl = sys.argv[1]
base = sys.argv[2]
minus = sys.argv[3]
plus = sys.argv[4]

space = io_utils.load(pkl)

# instantiate an additive and subtractive model
add = WeightedAdditive(alpha=1, beta=1)
sub = WeightedAdditive(alpha=1, beta=-1)

#print space.get_neighbours(base, 10, CosSimilarity())

print "Subtracting", minus, "from", base
composed_space = sub.compose([(base, minus, "step1")], space)
#print composed_space.get_neighbours("step1", 10, CosSimilarity(),space)

print "Adding", plus, "..."
composed_space2 = add.compose([("step1", plus, "step2")], (composed_space, space))
print composed_space2.get_neighbours("step2", 10, CosSimilarity(), space)
def main():
    """
    Compute k nearest neighbors for targets.
    """

    # Get the arguments
    args = docopt("""Compute k nearest neighbors for targets.

    Usage:
        knn.py <spacePrefix1> <k> <outPath> [<testset> <co>]

        <spacePrefix1> = path to pickled space without suffix
        <testset> = path to file with tab-separated word pairs
        <co> = column index for targets
        <k> = parameter k (k nearest neighbors)
        <outPath> = output path for result file

    Note:
        ...

    """)

    spacePrefix1 = args['<spacePrefix1>']
    testset = args['<testset>']
    # <co> is optional; only parse it when it was actually provided
    co = int(args['<co>']) if args['<co>'] is not None else None
    outPath = args['<outPath>']
    k = int(args['<k>'])

    logging.config.dictConfig({'version': 1, 'disable_existing_loggers': True})
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()

    # Load spaces
    space1 = load_pkl_files(spacePrefix1)

    if testset != None:
        with codecs.open(testset, 'r', 'utf8') as f_in:
            targets = [line.strip().split('\t')[co] for line in f_in]
    else:
        # If no test set is provided, compute values for all targets in the space
        targets = [target.decode('utf8') for target in space1.get_row2id()]

    target2neighbors = {}
    for i, t1 in enumerate(targets):
        try:
            neighbors1 = space1.get_neighbours(t1.encode('utf8'), k, CosSimilarity())
            # drop the first neighbor (the target itself)
            del neighbors1[0]
        except KeyError:
            neighbors1 = [('nan', float('nan'))]
        target2neighbors[t1] = neighbors1

    with codecs.open(outPath + '.csv', 'w', 'utf-8') as f_out:
        for t1 in targets:
            # Convert cosine similarity to cosine distance, export nearest neighbors
            print >> f_out, t1 + '\t' + ' '.join([str((n, 1 - v)) for (n, v) in target2neighbors[t1]])

    logging.info("--- %s seconds ---" % (time.time() - start_time))
def main():
    """
    Compute local neighborhood distance for target pairs from two vector spaces.
    """

    # Get the arguments
    args = docopt("""Compute local neighborhood distance for target pairs from two vector spaces.

    Usage:
        lnd.py [(-f | -s)] <spacePrefix1> <spacePrefix2> <k> <outPath> [<testset>]

        <spacePrefix1> = path to pickled space without suffix
        <spacePrefix2> = path to pickled space without suffix
        <testset> = path to file with tab-separated word pairs
        <k> = parameter k (k nearest neighbors)
        <outPath> = output path for result file

    Options:
        -f, --fst  write only first target in output file
        -s, --scd  write only second target in output file

    """)

    is_fst = args['--fst']
    is_scd = args['--scd']
    spacePrefix1 = args['<spacePrefix1>']
    spacePrefix2 = args['<spacePrefix2>']
    testset = args['<testset>']
    outPath = args['<outPath>']
    k = int(args['<k>'])

    logging.config.dictConfig({'version': 1, 'disable_existing_loggers': True})
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()

    # Load spaces
    space1 = load_pkl_files(spacePrefix1)
    space2 = load_pkl_files(spacePrefix2)

    if testset != None:
        # target vectors in first/second column are computed from space1/space2
        with codecs.open(testset, 'r', 'utf8') as f_in:
            targets = [(line.strip().split('\t')[0], line.strip().split('\t')[1]) for line in f_in]
    else:
        # If no test set is provided, compute values for all targets occurring in both spaces
        target_intersection = set([target.decode('utf8') for target in space1.get_row2id()]).intersection(
            [target.decode('utf8') for target in space2.get_row2id()])
        targets = zip(target_intersection, target_intersection)

    scores = {}
    neighborUnionSizes = {}
    for i, (t1, t2) in enumerate(targets):

        # Get nearest neighbors
        try:
            neighbors1 = space1.get_neighbours(t1.encode('utf8'), k, CosSimilarity())
            neighbors2 = space2.get_neighbours(t2.encode('utf8'), k, CosSimilarity())
        except KeyError:
            scores[(t1, t2)] = 'nan'
            neighborUnionSizes[(t1, t2)] = 'nan'
            continue

        neighborUnion = list(set([a for (a, b) in neighbors1 + neighbors2
                                  if (a in space1.row2id and a in space2.row2id
                                      and not a in [t1.encode('utf8'), t2.encode('utf8')])]))

        simVec1 = [space1.get_sim(t1.encode('utf8'), n, CosSimilarity()) for n in neighborUnion]
        simVec2 = [space2.get_sim(t2.encode('utf8'), n, CosSimilarity()) for n in neighborUnion]

        # Compute cosine distance of vectors
        distance = spatial.distance.cosine(simVec1, simVec2)
        scores[(t1, t2)] = distance
        neighborUnionSizes[(t1, t2)] = len(neighborUnion)

    with codecs.open(outPath + '.csv', 'w', 'utf-8') as f_out:
        for (t1, t2) in targets:
            if is_fst:  # output only first target string
                print >> f_out, '\t'.join((t1, str(float(scores[(t1, t2)])), str(neighborUnionSizes[(t1, t2)])))
            elif is_scd:  # output only second target string
                print >> f_out, '\t'.join((t2, str(float(scores[(t1, t2)])), str(neighborUnionSizes[(t1, t2)])))
            else:  # standard: output both target strings
                print >> f_out, '\t'.join(('%s,%s' % (t1, t2), str(float(scores[(t1, t2)])), str(neighborUnionSizes[(t1, t2)])))

    logging.info("--- %s seconds ---" % (time.time() - start_time))
#similarity.py
#USAGE: python similarity.py [space file] [word1] [word2]
#EXAMPLE: python similarity.py ~/UkWac/dissect/ANs/ANs.pkl car_n dog_n
#-------
from composes.utils import io_utils
from composes.similarity.cos import CosSimilarity
import sys

#load a space
my_space = io_utils.load(sys.argv[1])

#print my_space.cooccurrence_matrix
#print my_space.id2row

#compute similarity between two words in the space
print "The similarity of", sys.argv[2], "and", sys.argv[3], "is:", my_space.get_sim(sys.argv[2], sys.argv[3], CosSimilarity())
#ex08.py
#-------
from composes.utils import io_utils
from composes.similarity.cos import CosSimilarity

#load a space
my_space = io_utils.load("./data/out/ex01.pkl")

#get the top 2 neighbours of "car"
print my_space.get_neighbours("car", 2, CosSimilarity())
#kneighbours.py
#USAGE: python kneighbours.py [space file] [word] [k]
#EXAMPLE: python2.7 kneighbours.py ~/UkWac/dissect-data/ANs/out/CORE_SS.ans.ppmi.row.pkl car-n 30
#-------
from composes.utils import io_utils
from composes.similarity.cos import CosSimilarity
import sys

#load a space
my_space = io_utils.load(sys.argv[1])

#get the top k neighbours of the query word
print my_space.get_neighbours(sys.argv[2], int(sys.argv[3]), CosSimilarity())
def __init__(self, lambda_):
    '''
    Constructor
    '''
    self._lambda = lambda_
    self._measure = CosSimilarity()
word_pairs = io_utils.read_tuple_list(fname, fields=[0, 1])

lengths = []
for wp in word_pairs:
    # reset the flag for every pair, so a missing pair does not skip all later ones
    found = True
    try:
        v1 = my_space.get_row(wp[0])
        v2 = my_space.get_row(wp[1])
    except KeyError:
        #print wp[0],"or",wp[1],"not found"
        found = False
    if found:
        composed_space = add.compose([(wp[0], wp[1], "_composed_")], my_space)
        neighbours = composed_space.get_neighbours("_composed_", 10, CosSimilarity(), space2=my_space)
        print wp[0], wp[1]
        print neighbours
        density = 0
        for n in neighbours:
            density += n[1]
        density = density / 10
        print "Density", density
        c = composed_space.get_row("_composed_")
        print "Norm ", c.norm()
        cos = composed_space.get_sim("_composed_", wp[1], CosSimilarity(), space2=my_space)
        print "Cos ", cos
#ex07.py
#-------
from composes.utils import io_utils
from composes.similarity.cos import CosSimilarity

#load two spaces
my_space = io_utils.load("./data/out/ex01.pkl")
my_per_space = io_utils.load("./data/out/PER_SS.ex05.pkl")

print(my_space.id2row)
print(my_per_space.id2row)

#compute similarity between a word and a phrase in the two spaces
print(my_space.get_sim("car", "sports_car", CosSimilarity(), space2=my_per_space))
def __init__(self, lambda_):
    """
    Constructor
    """
    self._lambda = lambda_
    self._measure = CosSimilarity()
print "Training Lexical Function composition model..."
comp_model = LexicalFunction(learner=RidgeRegressionLearner(param=2))
comp_model.train(train_data, space, per_space)

print "Composing phrases..."
test_phrases_file = data_path + "ML08nvs_test.txt"
test_phrases = io_utils.read_tuple_list(test_phrases_file, fields=[0, 1, 2])
composed_space = comp_model.compose(test_phrases, space)

print "Reading similarity test data..."
test_similarity_file = data_path + "ML08data_new.txt"
test_pairs = io_utils.read_tuple_list(test_similarity_file, fields=[0, 1])
gold = io_utils.read_list(test_similarity_file, field=2)

print "Computing similarity with lexical function..."
pred = composed_space.get_sims(test_pairs, CosSimilarity())  #use this composed space to assign similarities

print "Scoring lexical function..."
print scoring_utils.score(gold, pred, "spearman")

print "Training Full Additive composition model..."
comp_model = FullAdditive(learner=RidgeRegressionLearner(param=2))
comp_model.train(train_data, space, per_space)
composed_space = comp_model.compose(test_phrases, space)
pred = composed_space.get_sims(test_pairs, CosSimilarity())
print scoring_utils.score(gold, pred, "spearman")

print "Training Weighted Additive composition model..."
comp_model = WeightedAdditive()
comp_model.train(train_data, space, per_space)
#ex09.py
#-------
from composes.utils import io_utils
from composes.similarity.cos import CosSimilarity

#load two spaces
my_space = io_utils.load("./data/out/ex01.pkl")
my_per_space = io_utils.load("./data/out/PER_SS.ex05.pkl")

print(my_space.id2row)
print(my_space.cooccurrence_matrix)
print(my_per_space.id2row)
print(my_per_space.cooccurrence_matrix)

#get the top two neighbours of "car" in a peripheral space
print(my_space.get_neighbours("car", 2, CosSimilarity(), space2=my_per_space))
        els_for_comp.append(element)
    return els_for_comp


typ_space = create_space(TypDmFile, TypRowsFile)
distr_space = create_space(DistrDmFile, DistrRowsFile)

#load a space from a pickle file
#my_space = io_utils.load("./sharp/lexfunc/lexfunc_Ridge_pract.pkl")

#distributional vectors processing
distr_space = distr_space.apply(PpmiWeighting())
distr_space = distr_space.apply(Svd(300))
#io_utils.save(distr_space, "./spaces/smooth_phrases_ppmi.pkl")

items = items_from_file(itemsFile)
els_for_comp = elements_for_composition(items)

my_comp = WeightedAdditive(alpha=1, beta=1)
distr_space = my_comp.compose(els_for_comp, distr_space)

# keep the pairs() function usable by not shadowing it with its result
item_pairs = pairs(items)
predicted = distr_space.get_sims(item_pairs, CosSimilarity())
gold = typ_space.get_sims(item_pairs, CosSimilarity())

#compute correlations
print "Spearman"
print scoring_utils.score(gold, predicted, "spearman")
print "Pearson"
print scoring_utils.score(gold, predicted, "pearson")
#ex20.py
#-------
from composes.utils import io_utils
from composes.utils import scoring_utils
from composes.similarity.cos import CosSimilarity

#read in a space
my_space = io_utils.load("data/out/ex01.pkl")

#compute similarities of a list of word pairs
fname = "data/in/word_sims.txt"
word_pairs = io_utils.read_tuple_list(fname, fields=[0, 1])
predicted = my_space.get_sims(word_pairs, CosSimilarity())

#compute correlations
gold = io_utils.read_list(fname, field=2)
print "Spearman"
print scoring_utils.score(gold, predicted, "spearman")
print "Pearson"
print scoring_utils.score(gold, predicted, "pearson")
#ex06.py
#-------
from composes.utils import io_utils
from composes.similarity.cos import CosSimilarity

#load a space
my_space = io_utils.load("./data/out/ex01.pkl")

print my_space.cooccurrence_matrix
print my_space.id2row

#compute similarity between two words in the space
print my_space.get_sim("car", "car", CosSimilarity())
print my_space.get_sim("car", "book", CosSimilarity())
#ex16.py
#-------
from composes.utils import io_utils
from composes.composition.lexical_function import LexicalFunction
from composes.similarity.cos import CosSimilarity

#training data
#trying to learn a "good" function
train_data = [("good_function", "car", "good_car"),
              ("good_function", "book", "good_book")]

#load argument and phrase space
arg_space = io_utils.load("./data/out/ex10.pkl")
phrase_space = io_utils.load("data/out/PHRASE_SS.ex10.pkl")

#train a lexical function model on the data
my_comp = LexicalFunction()
my_comp.train(train_data, arg_space, phrase_space)

#print its parameters
print "\nLexical function space:"
print my_comp.function_space.id2row
cooc_mat = my_comp.function_space.cooccurrence_matrix
cooc_mat.reshape(my_comp.function_space.element_shape)
print cooc_mat

#similarity within the learned functional space
print "\nSimilarity between good and good in the function space:"
print my_comp.function_space.get_sim("good_function", "good_function", CosSimilarity())
from composes.utils import io_utils
from composes.utils import scoring_utils
from composes.similarity.cos import CosSimilarity
import sys

#read in a space
my_space = io_utils.load(sys.argv[1])

#compute similarities of a list of word pairs
fname = sys.argv[2]
word_pairs = io_utils.read_tuple_list(fname, fields=[0, 1, 2])

predicted = []
gold = []
cos = 0
for wp in word_pairs:
    try:
        cos = my_space.get_sim(wp[0], wp[1], CosSimilarity())
        if cos > 0:
            #print wp[0],wp[1],cos
            predicted.append(cos)
            gold.append(wp[2])
    except:
        print "Couldn't measure cosine..."

#compute correlations
print "Spearman"
print scoring_utils.score(gold, predicted, "spearman")
print "Pearson"
print scoring_utils.score(gold, predicted, "pearson")