class WeightedAdditiveModel(AdditiveModel):

    weighted_additive = None
    new_space = None

    def __init__(self, space, alpha=None, beta=None, no_diff=False):
        AdditiveModel.__init__(self, space, no_diff=no_diff)
        self.weighted_additive = WeightedAdditive(alpha=alpha, beta=beta)

    def fit(self, train_pairs, verbose=False):
        AdditiveModel.fit(self, train_pairs, verbose=verbose)
        if verbose:
            print 'fit: Fitting a weighted additive model on %d pairs' % (len(train_pairs))
        # First, we embed the derived vector into the original space (by simply adding a row)
        vec_space = Space(self.diff_vector, ['pattern_vector'], [])
        self.new_space = Space.vstack(self.space, vec_space)
        # The class is designed to be run on a dataset with different function words (== patterns),
        # so we use a dummy function word here.
        train_pairs_ext = [(base, 'pattern_vector', derived) for (base, derived) in train_pairs]
        self.weighted_additive.train(train_pairs_ext, self.new_space, self.new_space)

    def predict(self, base, verbose=False):
        if self.weighted_additive is None:
            raise NameError('Error: Model has not yet been trained')
        composed_space = self.weighted_additive.compose([(base, 'pattern_vector', 'derived')],
                                                         self.new_space)
        return composed_space.get_row('derived')
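# A hedged usage sketch for the wrapper class above (not from the original source):
# it assumes a composes Space `space` whose rows include both base and derived word
# forms, and a list `train_pairs` of (base, derived) row labels. All identifiers in
# this sketch are illustrative only.
model = WeightedAdditiveModel(space)        # alpha/beta left to be estimated during fit
model.fit(train_pairs, verbose=True)        # trains the underlying WeightedAdditive model
vector = model.predict("base_word")         # hypothetical row label; returns the predicted derived vector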
def test_train(self):
    test_cases = [
        (self.m11, self.m21, self.ph1, 2, 3),
        (self.m12, self.m22, self.ph2, 2, 3),
        (self.m11, self.m21, DenseMatrix(np.mat([[0], [0]])), 0, 0),
        (SparseMatrix(self.m12), SparseMatrix(self.m22), SparseMatrix(self.ph2), 2, 3),
        (self.m11, DenseMatrix(np.mat([[0], [0]])), self.ph1, 3, 0),
        (DenseMatrix(np.mat([[0], [0]])), self.m11, self.ph1, 0, 3),
        (DenseMatrix(np.mat([[1, 2, 3]])), DenseMatrix(np.mat([[2, 4, 6]])),
         DenseMatrix(np.mat([[3, 6, 9]])), 0.6, 1.2),
        (DenseMatrix(np.mat([[0], [0]])), DenseMatrix(np.mat([[0], [0]])),
         DenseMatrix(np.mat([[0], [0]])), 0.0, 0.0)
    ]

    id2row_dict = {1: ["a"], 2: ["a", "b"]}
    train_dict = {1: [("a", "a", "a")], 2: [("a", "a", "a"), ("b", "b", "b")]}

    for m1, m2, ph, expected_alpha, expected_beta in test_cases:
        model = WeightedAdditive()
        arg_space1 = Space(m1, id2row_dict[m1.shape[0]], [])
        arg_space2 = Space(m2, id2row_dict[m1.shape[0]], [])
        ph_space = Space(ph, id2row_dict[m1.shape[0]], [])
        train_data = train_dict[m1.shape[0]]
        #model._train(m1, m2, ph)
        model.train(train_data, (arg_space1, arg_space2), ph_space)
        self.assertAlmostEqual(model.alpha, expected_alpha, 8)
        self.assertAlmostEqual(model.beta, expected_beta, 8)
def test_space_train(self):
    test_cases = [
        ([("a", "b", "a_b"), ("a", "a", "a_a")],
         self.space1,
         Space(DenseMatrix(np.mat([[12, 3], [6, 2]])), ["a_b", "a_a"], ["f1", "f2"]),
         1, 1),
        ([("a", "b", "a_b"), ("a", "a", "a_a")],
         self.space1,
         Space(DenseMatrix(np.mat([[0, 0], [0, 0]])), ["a_b", "a_a"], ["f1", "f2"]),
         0, 0),
        ([("a", "b", "a_b"), ("a", "a", "a_a")],
         self.space1,
         Space(DenseMatrix(np.mat([[0, 0], [0, 0]])), ["a_b", "a_a"], []),
         0, 0),
        ([("a", "b", "a_b")],
         self.space1,
         Space(DenseMatrix(np.mat([[21, 5]])), ["a_b"], []),
         1, 2),
        ([("a", "b", "a_b"), ("bla", "b", "a_b"), ("a", "bla", "a_b")],
         self.space1,
         Space(DenseMatrix(np.mat([[21, 5]])), ["a_b"], []),
         1, 2)
    ]

    for in_data, arg_space, phrase_space, alpha, beta in test_cases:
        model = WeightedAdditive()
        model.train(in_data, arg_space, phrase_space)

        self.assertAlmostEqual(model.alpha, alpha, 7)
        self.assertAlmostEqual(model.beta, beta, 7)

        comp_space = model.compose(in_data, arg_space)
        self.assertListEqual(comp_space.id2row, phrase_space.id2row)
        self.assertListEqual(comp_space.id2column, phrase_space.id2column)
        self.assertDictEqual(comp_space.row2id, phrase_space.row2id)
        self.assertDictEqual(comp_space.column2id, phrase_space.column2id)
        np.testing.assert_array_almost_equal(comp_space.cooccurrence_matrix.mat,
                                             phrase_space.cooccurrence_matrix.mat, 8)
def test_weighted_additive(self):
    self.m12 = DenseMatrix(np.mat([[3, 1], [9, 2]]))
    self.m22 = DenseMatrix(np.mat([[4, 3], [2, 1]]))
    self.ph2 = DenseMatrix(np.mat([[18, 11], [24, 7]]))
    self.row = ["a", "b"]
    self.ft = ["f1", "f2"]
    self.space1 = Space(DenseMatrix(self.m12), self.row, self.ft)
    self.space2 = Space(DenseMatrix(self.ph2), ["a_a", "a_b"], self.ft)
    m = WeightedAdditive()
    m.export(self.prefix + ".add1")
    m.train([("a", "a", "a_a")], self.space1, self.space2)
    m.export(self.prefix + ".add2")
def create_model(model, alpha, beta, lambda_):
    # TODO: IMPORTANT here: handle the case where alpha, beta or lambda_ is None
    model_dict = {
        "weighted_add": WeightedAdditive,
        "dilation": Dilation,
        "mult": Multiplicative
    }

    if model not in model_dict:
        raise ValueError("Invalid model: %s" % model)

    if model == "weighted_add":
        model_obj = WeightedAdditive(alpha, beta)
    elif model == "dilation":
        model_obj = Dilation(lambda_)
    else:
        model_obj = Multiplicative()
    return model_obj
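# Hedged example of calling create_model() above; the parameter values are
# placeholders, not taken from the original code. Note the open TODO about
# None-valued parameters.
wa_model = create_model("weighted_add", 0.4, 0.6, None)
dil_model = create_model("dilation", None, None, 1.5)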
def test_compose(self):
    model = WeightedAdditive(2, 3)
    np.testing.assert_array_equal(model._compose(self.m11, self.m21).mat,
                                  self.ph1.mat)

    model = WeightedAdditive()
    np.testing.assert_array_equal(model._compose(self.m11, self.m21).mat,
                                  np.mat([[7/2.], [11/2.]]))

    model = WeightedAdditive(0.5)
    np.testing.assert_array_equal(model._compose(self.m11, self.m21).mat,
                                  np.mat([[7/2.], [11/2.]]))
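# For reference (not part of the test above): WeightedAdditive composes two vectors
# as p = alpha * u + beta * v, with alpha = beta = 0.5 as the default, which is what
# the expected values above encode. A small standalone check with made-up vectors:
import numpy as np

u = np.array([1.0, 2.0])
v = np.array([3.0, 4.0])
print 2 * u + 3 * v        # WeightedAdditive(2, 3)        -> [ 11.  16.]
print 0.5 * u + 0.5 * v    # default alpha = beta = 0.5    -> [ 2.  3.]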
        els_for_comp.append(element)
    return els_for_comp


typ_space = create_space(TypDmFile, TypRowsFile)
distr_space = create_space(DistrDmFile, DistrRowsFile)

#load a space from a pickle file
#my_space = io_utils.load("./sharp/lexfunc/lexfunc_Ridge_pract.pkl")

#distributional vectors processing
distr_space = distr_space.apply(PpmiWeighting())
distr_space = distr_space.apply(Svd(300))
#io_utils.save(distr_space, "./spaces/smooth_phrases_ppmi.pkl")

items = items_from_file(itemsFile)
els_for_comp = elements_for_composition(items)

my_comp = WeightedAdditive(alpha=1, beta=1)
distr_space = my_comp.compose(els_for_comp, distr_space)

# note: this rebinds the name `pairs` from the helper function to the resulting list
pairs = pairs(items)

predicted = distr_space.get_sims(pairs, CosSimilarity())
gold = typ_space.get_sims(pairs, CosSimilarity())

#compute correlations
print "Spearman"
print scoring_utils.score(gold, predicted, "spearman")
print "Pearson"
print scoring_utils.score(gold, predicted, "pearson")
#testAnalogy.py
#argv[1]: space pkl file
#argv[2]: analogy test file
#EXAMPLE: python testAnalogy.py ../../spaces/wikipedia.pkl analogy_dataset.txt
#-------
from composes.utils import io_utils
from composes.utils import scoring_utils
from composes.similarity.cos import CosSimilarity
from composes.composition.weighted_additive import WeightedAdditive
import sys

add = WeightedAdditive(alpha=1, beta=1.2)
sub = WeightedAdditive(alpha=1, beta=-1)

#read in a space
space = io_utils.load(sys.argv[1])


def computeAnalogy(w1, w2, w3):
    composed_space = sub.compose([(w1, w2, "step1")], space)
    composed_space2 = add.compose([("step1", w3, "step2")], (composed_space, space))
    guess = composed_space2.get_neighbours("step2", 1, CosSimilarity(), space)
    return guess


score = 0

#read in test file
fname = sys.argv[2]
f = open(fname, 'r')
flines = f.readlines()
from __future__ import print_function
import sys
from random import randint
from itertools import count

from composes.utils import io_utils
from composes.composition.weighted_additive import WeightedAdditive
from composes.semantic_space.space import Space

stacked_space = io_utils.load("gastrovec.ppmi.svd20.pkl")

WA = WeightedAdditive(alpha=1, beta=1)

recipes = {}
max_size = 0
with open("../corpus_collection/composition_counts.txt") as f:
    for line in f:
        words = line.split()
        recipes[words[0]] = words[1:]
        if len(words) - 1 > max_size:
            max_size = len(words) - 1

last_space = None
number = count()
for size in xrange(max_size, 1, -1):
    relevant = (rec for rec in recipes if len(recipes[rec]) == size)
    print(size)
    composition = []
    for recipe in relevant:
        old = recipes[recipe]
#ex10.py
#-------
from composes.utils import io_utils
from composes.composition.weighted_additive import WeightedAdditive

#load a space
my_space = io_utils.load("./data/out/ex10.pkl")

print my_space.id2row
print my_space.cooccurrence_matrix

# instantiate a weighted additive model
my_comp = WeightedAdditive(alpha=1, beta=1)

# use the model to compose words in my_space
composed_space = my_comp.compose([("good", "book", "good_book"),
                                  ("good", "car", "good_car")], my_space)

print composed_space.id2row
print composed_space.cooccurrence_matrix

#save the composed space
io_utils.save(composed_space, "data/out/PHRASE_SS.ex10.pkl")
print "Computing similarity with lexical function..." pred = composed_space.get_sims(test_pairs, CosSimilarity()) #use this composed space to assign similarities print "Scoring lexical function..." print scoring_utils.score(gold, pred, "spearman") print "Training Full Additive composition model..." comp_model = FullAdditive(learner=RidgeRegressionLearner(param=2)) comp_model.train(train_data, space, per_space) composed_space = comp_model.compose(test_phrases, space) pred = composed_space.get_sims(test_pairs, CosSimilarity()) print scoring_utils.score(gold, pred, "spearman") print "Training Weighted Additive composition model..." comp_model = WeightedAdditive() comp_model.train(train_data, space, per_space) print "alpha, beta:", comp_model.alpha, comp_model.beta composed_space = comp_model.compose(test_phrases, space) pred = composed_space.get_sims(test_pairs, CosSimilarity()) print scoring_utils.score(gold, pred, "spearman") print "Training Dilation composition model..." comp_model = Dilation() comp_model.train(train_data, space, per_space) print "lambda:", comp_model._lambda composed_space = comp_model.compose(test_phrases, space) pred = composed_space.get_sims(test_pairs, CosSimilarity()) print scoring_utils.score(gold, pred, "spearman") print "Multiplicative composition model..."
##########################################################################
from composes.utils import io_utils
from composes.composition.weighted_additive import WeightedAdditive
from composes.similarity.cos import CosSimilarity
import sys

pkl = sys.argv[1]
base = sys.argv[2]
minus = sys.argv[3]
plus = sys.argv[4]

space = io_utils.load(pkl)

# instantiate an additive and a subtractive model
add = WeightedAdditive(alpha=1, beta=1)
sub = WeightedAdditive(alpha=1, beta=-1)

#print space.get_neighbours(base, 10, CosSimilarity())

print "Subtracting", minus, "from", base
composed_space = sub.compose([(base, minus, "step1")], space)
#print composed_space.get_neighbours("step1", 10, CosSimilarity(), space)

print "Adding", plus, "..."
composed_space2 = add.compose([("step1", plus, "step2")], (composed_space, space))
print composed_space2.get_neighbours("step2", 10, CosSimilarity(), space)
def ins(lst, el):
    if len(lst) < num:
        lst.append(el)
        lst.sort(reverse=True)
        return
    else:
        if el[0] > lst[-1][0]:
            lst.pop(-1)
            lst.append(el)
            lst.sort(reverse=True)


stacked = io_utils.load("gastrovec.ppmi.svd20.pkl")
recicomp = io_utils.load(recipe_space)

WA = WeightedAdditive(alpha=1, beta=1)
number = count()

ingredients = []
print("Enter ingredients, enter when done")
while True:
    ingredient = raw_input("> ").replace(" ", "_")
    if ingredient == "":
        break
    if ingredient not in stacked.id2row:
        print("(not found, skipping)")
        continue
    ingredients.append(ingredient)

name = ""
while True:
#ex13.py
#-------
from composes.utils import io_utils
from composes.composition.weighted_additive import WeightedAdditive

#training data
train_data = [("good", "car", "good_car"),
              ("good", "book", "good_book")]

#load an argument space
arg_space = io_utils.load("./data/out/ex10.pkl")
print arg_space.id2row
print arg_space.cooccurrence_matrix

#load a phrase space
phrase_space = io_utils.load("data/out/PHRASE_SS.ex10.pkl")
print phrase_space.id2row
print phrase_space.cooccurrence_matrix

#train a weighted additive model on the data
my_comp = WeightedAdditive()
my_comp.train(train_data, arg_space, phrase_space)

#print its parameters
print "alpha:", my_comp.alpha
print "beta:", my_comp.beta
#ex11.py
#-------
from composes.utils import io_utils
from composes.composition.weighted_additive import WeightedAdditive

# instantiate a weighted additive model
my_comp = WeightedAdditive(alpha=1, beta=1)

#save it to pickle
io_utils.save(my_comp, "./data/out/model01.pkl")

#print its parameters
my_comp.export("./data/out/model01.params")
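# A hedged follow-up sketch (not part of the original ex11.py): assuming the script
# above has been run so that ./data/out/model01.pkl exists, the pickled model can be
# loaded back with io_utils.load and reused for composition on some argument space
# `my_space` (illustrative name).
my_comp2 = io_utils.load("./data/out/model01.pkl")
#composed_space = my_comp2.compose([("good", "book", "good_book")], my_space)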
# 2) a file with short phrases (2 words, e.g. parliamentary potato)
#-------
from composes.utils import io_utils
from composes.utils import scoring_utils
from composes.similarity.cos import CosSimilarity
from composes.composition.weighted_additive import WeightedAdditive
from composes.composition.multiplicative import Multiplicative
from composes.transformation.scaling.row_normalization import RowNormalization
import numpy as np
import sys

#read in a space
my_space = io_utils.load(sys.argv[1])
my_space = my_space.apply(RowNormalization())

add = WeightedAdditive(alpha=1, beta=1)
mult = Multiplicative()

#compute multiplication/addition of a list of word pairs
fname = sys.argv[2]
word_pairs = io_utils.read_tuple_list(fname, fields=[0, 1])

lengths = []
found = True
for wp in word_pairs:
    try:
        v1 = my_space.get_row(wp[0])
        v2 = my_space.get_row(wp[1])
    except KeyError:
        #print wp[0],"or",wp[1],"not found"
        found = False
# Just exposing the possibility to learn a peripheral space if we have a corpus where phrases etc. are marked
# as one token. Then we can use the word2vec_bin_to_DISSECT_dm convertor to generate a similar dm
# print "Creating peripheral space.."
# per_space = PeripheralSpace.build(space,
#                                   format="dm",
#                                   data="SOME_PATH_FOR_A_WORD_TO_VEC_PERIPHERAL_SPACE_DATA"
#                                   )

# Debug
# print space.cooccurrence_matrix
# print space.id2row

# instantiate a weighted additive model
my_comp = WeightedAdditive(alpha=1, beta=1)

# use the model to compose words in my_space
composed_space = my_comp.compose([("good", "book", "good_book"),
                                  ("good", "car", "good_car")], space)

print composed_space.id2row
print composed_space.cooccurrence_matrix
print composed_space.get_sims([("good_car", "good_book")], CosSimilarity())  # Similarity metric

#===============================================================================================================
print "=" * 80
#===============================================================================================================

##Training Models