Beispiel #1
0
class LexfunModel(Model):
    """Model that learns one lexical function from (base, derived) pairs.

    All pairs are attached to a single dummy function word, so the wrapped
    ``LexicalFunction`` learns exactly one function from the whole train set.
    """

    # Replaced by a LexicalFunction instance in __init__.
    lexfun = None

    def __init__(self, space, learner='LeastSquares', intercept=True, param=None):
        """Create the model and select its regression learner.

        Args:
            space: passed on to ``Model.__init__``.
            learner: ``'LeastSquares'`` or ``'Ridge'``.
            intercept: forwarded to the selected regression learner.
            param: forwarded to ``RidgeRegressionLearner``; not used for
                ``'LeastSquares'``.

        Raises:
            NameError: if ``learner`` is not one of the two supported names.
        """
        Model.__init__(self, space)
        if learner == 'Ridge':
            # Author's note: with param=None, generalized CV is performed
            # within the standard param range.
            regression = RidgeRegressionLearner(intercept=intercept, param=param)
        elif learner == 'LeastSquares':
            # Forward `intercept` here as well; it used to be silently dropped
            # for this learner.
            regression = LstsqRegressionLearner(intercept=intercept)
        else:
            raise NameError("No such learner: %s" % learner)
        self.lexfun = LexicalFunction(learner=regression)

    def fit(self, train_pairs, verbose=False):
        """Train the lexical function on ``(base, derived)`` pairs.

        Args:
            train_pairs: sized iterable of 2-tuples ``(base, derived)``.
            verbose: when true, print the number of training pairs.

        Raises:
            NameError: if ``train_pairs`` is empty.
        """
        if len(train_pairs) == 0:
            raise NameError('Error: Train set is empty')
        if verbose:
            # Single parenthesised argument: same output on Python 2 and 3.
            print('fit: Fitting a lexfun model on %d pairs' % (len(train_pairs)))
        # LexicalFunction is designed for datasets with several function words
        # (== patterns); a dummy function word is used for every pair here.
        # NOTE(review): self.space is presumably set by Model.__init__ - confirm.
        train_pairs_ext = [('dummy', base, derived) for (base, derived) in train_pairs]
        self.lexfun.train(train_pairs_ext, self.space, self.space)

    def predict(self, base, verbose=False):
        """Compose and return the row predicted for ``base``.

        Args:
            base: argument element handed to the dummy function word.
            verbose: accepted for interface symmetry with ``fit``; unused.

        Raises:
            NameError: if ``self.lexfun`` is ``None``.
        """
        # NOTE(review): __init__ always assigns lexfun, so this guard does not
        # detect a model on which fit() was never called.
        if self.lexfun is None:
            raise NameError('Error: Model has not yet been trained')
        composed_space = self.lexfun.compose([('dummy', base, 'derived')], self.space)
        return composed_space.get_row('derived')
    def test_min_samples1(self):
        """Train with ``min_samples=2`` and check the learned function space.

        The expected function space holds a single row, "a1", with element
        shape (2, 3) and no column labels.
        """
        # TODO: test a1_car twice in the phrase list
        phrases = [
            ("bla3", "man", "a1_car"),
            ("a1", "car", "a1_car"),
            ("bla2", "man", "a1_car"),
            ("a1", "man", "a1_man"),
            ("bla1", "man", "a1_car"),
        ]

        # Train a model; only its function space is inspected afterwards.
        lexical_fn = LexicalFunction(
            learner=LstsqRegressionLearner(intercept=True),
            min_samples=2)
        lexical_fn.train(phrases, self.n_space, self.an_space)
        learned = lexical_fn.function_space

        expected = np.mat([[0.66666667, 0.33333333, -0.33333333,
                            0.33333333, 0.66666667, 0.33333333]])
        np.testing.assert_array_almost_equal(
            learned.cooccurrence_matrix.mat, expected, 7)

        self.assertTupleEqual(learned.element_shape, (2, 3))
        self.assertListEqual(learned.id2row, ["a1"])
        self.assertListEqual(learned.id2column, [])
    def get_vector(self, df):
        """Compose a matrix for the triple ``df`` in two lexical-function steps.

        Step one composes ``df[1]`` with ``df[2]`` using ``self.v_model``; the
        result is loaded as the function space of a fresh ``LexicalFunction``.
        Step two applies that function to ``df[0]``. Both steps take their
        arguments from ``self.n_space``.

        Returns:
            The ``cooccurrence_matrix.mat`` of the space composed in step two.
        """
        # Identifier of the intermediate (df[1], df[2]) combination; it names
        # the composed row in step one and the function word in step two.
        # Author's example of a step-one tuple:
        # ("take/V", "place/N", "take/V_place/N")
        inner_id = str(df[1:])

        # Step 1: apply the trained model to obtain the inner combination.
        # todo how do we get VO vectors? these are (100x100)+100 dimensional (intercept).
        # todo do we allow document features of different dimensionality
        inner_tuple = (str(df[1]), str(df[2]), inner_id)
        inner_space = self.v_model.compose([inner_tuple], self.n_space)

        # Step 2: reuse the composed combination as a function space of a new
        # composition model, keeping the intercept setting of the first model.
        inner_model = LexicalFunction(function_space=inner_space,
                                      intercept=self.v_model._has_intercept)

        # Step 3: compose the new function with df[0] to get the full phrase.
        outer_tuple = (inner_id, str(df[0]), str(df))
        outer_space = inner_model.compose([outer_tuple], self.n_space)

        # Author's note: the vectors coming out are 100-dimensional.
        return outer_space.cooccurrence_matrix.mat
    def test_min_samples1(self):
        """Train with ``_MIN_SAMPLES`` set to 2 and check the learned function space.

        The expected function space holds a single row, "a1", with element
        shape (2, 3) and no column labels.
        """
        # TODO: test a1_car twice in the phrase list
        phrases = [
            ("bla3", "man", "a1_car"),
            ("a1", "car", "a1_car"),
            ("bla2", "man", "a1_car"),
            ("a1", "man", "a1_man"),
            ("bla1", "man", "a1_car"),
        ]

        # Train a model; only its function space is inspected afterwards.
        lexical_fn = LexicalFunction(learner=LstsqRegressionLearner(intercept=True))
        lexical_fn._MIN_SAMPLES = 2
        lexical_fn.train(phrases, self.n_space, self.an_space)
        learned = lexical_fn.function_space

        expected = np.mat([[0.66666667, 0.33333333, -0.33333333,
                            0.33333333, 0.66666667, 0.33333333]])
        np.testing.assert_array_almost_equal(
            learned.cooccurrence_matrix.mat, expected, 7)

        self.assertTupleEqual(learned.element_shape, (2, 3))
        self.assertListEqual(learned.id2row, ["a1"])
        self.assertListEqual(learned.id2column, [])
def train_one_space(core_space, per_space, func_pos, number_of_lambdas):
    """Train a ridge-regression lexical function and return its function space.

    Args:
        core_space: argument space handed to ``LexicalFunction.train``.
        per_space: phrase space; row-normalised before training.
        func_pos: forwarded to ``get_training_list``.
        number_of_lambdas: number of log-spaced ridge parameters in [0.1, 10].

    Returns:
        The ``function_space`` of the trained ``LexicalFunction``.
    """
    lambdas = np.logspace(-1, 1, number_of_lambdas)
    # The training list is built from the space as passed in, i.e. before the
    # row normalisation below.
    pairs = get_training_list(per_space, 1, func_pos)
    normalized_space = per_space.apply(RowNormalization())
    ridge = RidgeRegressionLearner(param_range=lambdas, intercept=False)
    lexical_fn = LexicalFunction(learner=ridge)
    lexical_fn.train(pairs, core_space, normalized_space)
    return lexical_fn.function_space
    def test_simple_train_compose_intercept(self):
        """Train with intercept, then compose directly, from a loaded space, and recursively.

        Checks the learned function space, the space composed by the trained
        model, the space composed by a second model built from the learned
        function space, and a recursive application of that second model.
        """
        # TODO: test a1_car twice in the phrase list
        train_data = [("a1", "car", "a1_car"),
                      ("a1", "man", "a1_man"),
                      ]
        # model with train and then compose
        learner_ = LstsqRegressionLearner(intercept=True)
        model = LexicalFunction(learner=learner_)
        model._MIN_SAMPLES = 1

        model.train(train_data, self.n_space, self.an_space)

        new_space = model.function_space

        np.testing.assert_array_almost_equal(new_space.cooccurrence_matrix.mat,
                                             np.mat([[0.66666667, 0.33333333,
                                                      -0.33333333, 0.33333333,
                                                      0.66666667, 0.33333333]]),
                                             7)

        self.assertTupleEqual(new_space.element_shape, (2, 3))
        self.assertListEqual(new_space.id2row, ["a1"])
        self.assertListEqual(new_space.id2column, [])

        comp_space = model.compose(train_data, self.n_space)

        np.testing.assert_array_almost_equal(comp_space.cooccurrence_matrix.mat,
                                             self.an_space.cooccurrence_matrix.mat,
                                             10)

        self.assertListEqual(comp_space.id2row, ["a1_car", "a1_man"])
        self.assertListEqual(comp_space.id2column, self.ft)

        # new model, without training
        model2 = LexicalFunction(function_space=new_space, intercept=True)
        model2._MIN_SAMPLES = 1
        comp_space = model2.compose(train_data, self.n_space)

        self.assertListEqual(comp_space.id2row, ["a1_car", "a1_man"])
        self.assertListEqual(comp_space.id2column, [])
        np.testing.assert_array_almost_equal(comp_space.cooccurrence_matrix.mat,
                                             self.n_space.cooccurrence_matrix.mat,
                                             8)
        # recursive application
        comp_space2 = model2.compose([("a1", "a1_car", "a1_a1_car"),
                                      ("a1", "a1_man", "a1_a1_man")],
                                     comp_space)

        self.assertListEqual(comp_space2.id2row, ["a1_a1_car", "a1_a1_man"])
        # Assert on the recursively composed space; comp_space.id2column was
        # already verified right after it was built.
        self.assertListEqual(comp_space2.id2column, [])

        np.testing.assert_array_almost_equal(comp_space2.cooccurrence_matrix.mat,
                                             self.n_space.cooccurrence_matrix.mat,
                                             8)
        self.assertEqual(comp_space.element_shape, (2,))
        self.assertEqual(comp_space2.element_shape, (2,))
    def test_min_samples2(self):
        """Training must raise ``ValueError`` when ``_MIN_SAMPLES`` is 5 for this data."""
        phrases = [
            ("a1", "man", "bla"),
            ("a1", "car", "a1_car"),
            ("a1", "man", "bla"),
            ("a1", "man", "a1_man"),
            ("a1", "bla", "a1_man"),
            ("a1", "man", "bla"),
        ]

        lexical_fn = LexicalFunction()
        lexical_fn._MIN_SAMPLES = 5

        with self.assertRaises(ValueError):
            lexical_fn.train(phrases, self.n_space, self.an_space)
    def test_simple_train_compose_intercept(self):
        """Train with intercept, then compose directly, from a loaded space, and recursively.

        Checks the learned function space, the space composed by the trained
        model, the space composed by a second model built from the learned
        function space, and a recursive application of that second model.
        """
        # TODO: test a1_car twice in the phrase list
        train_data = [("a1", "car", "a1_car"),
                      ("a1", "man", "a1_man"),
                      ]
        # model with train and then compose
        learner_ = LstsqRegressionLearner(intercept=True)
        model = LexicalFunction(learner=learner_)

        model.train(train_data, self.n_space, self.an_space)

        new_space = model.function_space

        np.testing.assert_array_almost_equal(new_space.cooccurrence_matrix.mat,
                                             np.mat([[0.66666667, 0.33333333,
                                                      -0.33333333, 0.33333333,
                                                      0.66666667, 0.33333333]]),
                                             7)

        self.assertTupleEqual(new_space.element_shape, (2, 3))
        self.assertListEqual(new_space.id2row, ["a1"])
        self.assertListEqual(new_space.id2column, [])

        comp_space = model.compose(train_data, self.n_space)

        np.testing.assert_array_almost_equal(comp_space.cooccurrence_matrix.mat,
                                             self.an_space.cooccurrence_matrix.mat,
                                             10)

        self.assertListEqual(comp_space.id2row, ["a1_car", "a1_man"])
        self.assertListEqual(comp_space.id2column, self.ft)

        # new model, without training
        model2 = LexicalFunction(function_space=new_space, intercept=True)
        comp_space = model2.compose(train_data, self.n_space)

        self.assertListEqual(comp_space.id2row, ["a1_car", "a1_man"])
        self.assertListEqual(comp_space.id2column, [])
        np.testing.assert_array_almost_equal(comp_space.cooccurrence_matrix.mat,
                                             self.n_space.cooccurrence_matrix.mat,
                                             8)
        # recursive application
        comp_space2 = model2.compose([("a1", "a1_car", "a1_a1_car"),
                                      ("a1", "a1_man", "a1_a1_man")],
                                     comp_space)

        self.assertListEqual(comp_space2.id2row, ["a1_a1_car", "a1_a1_man"])
        # Assert on the recursively composed space; comp_space.id2column was
        # already verified right after it was built.
        self.assertListEqual(comp_space2.id2column, [])

        np.testing.assert_array_almost_equal(comp_space2.cooccurrence_matrix.mat,
                                             self.n_space.cooccurrence_matrix.mat,
                                             8)
        self.assertEqual(comp_space.element_shape, (2,))
        self.assertEqual(comp_space2.element_shape, (2,))
def predict_using_TENSOR(compound, TENSOR_matrix, unigram_space):
    """Compose ``compound`` ("<adj>_<noun>") in two steps and return the composed space.

    Step one applies the "tens_adj" function of ``TENSOR_matrix`` to the
    adjective; step two loads that result as a function space and applies it
    to the noun. Both steps take their arguments from ``unigram_space``.
    """
    tokens = compound.split('_')
    adj = tokens[0]
    noun = tokens[1]

    # e.g. ("tens_adj", "good", "predicted_ADJ_good"):
    # tens_adj -> tensor matrix, good -> unigram,
    # predicted_ADJ_good -> to compute (tens_adj * good)
    adj_function_id = "predicted_ADJ_" + adj
    adj_space = TENSOR_matrix.compose(
        [("tens_adj", adj, adj_function_id)], unigram_space)

    adj_model = LexicalFunction(function_space=adj_space,
                                intercept=TENSOR_matrix._has_intercept)

    # e.g. ("predicted_ADJ_good", "boy", "good_boy"):
    # predicted_ADJ_good -> matrix computed above, boy -> unigram,
    # good_boy -> to compute (predicted_ADJ_good * boy)
    return adj_model.compose([(adj_function_id, noun, compound)], unigram_space)
Beispiel #10
0
 def __init__(self, space, learner='LeastSquares', intercept=True, param=None):
     """Create the model and select its regression learner.

     Args:
         space: passed on to ``Model.__init__``.
         learner: ``'LeastSquares'`` or ``'Ridge'``.
         intercept: forwarded to the selected regression learner.
         param: forwarded to ``RidgeRegressionLearner``; not used for
             ``'LeastSquares'``.

     Raises:
         NameError: if ``learner`` is not one of the two supported names.
     """
     Model.__init__(self, space)
     if learner == 'Ridge':
         # Author's note: with param=None, generalized CV is performed
         # within the standard param range.
         regression = RidgeRegressionLearner(intercept=intercept, param=param)
     elif learner == 'LeastSquares':
         # Forward `intercept` here as well; it used to be silently dropped
         # for this learner.
         regression = LstsqRegressionLearner(intercept=intercept)
     else:
         raise NameError("No such learner: %s" % learner)
     self.lexfun = LexicalFunction(learner=regression)
    def test_min_samples2(self):
        """Training must raise ``ValueError`` when ``min_samples=5`` for this data."""
        phrases = [
            ("a1", "man", "bla"),
            ("a1", "car", "a1_car"),
            ("a1", "man", "bla"),
            ("a1", "man", "a1_man"),
            ("a1", "bla", "a1_man"),
            ("a1", "man", "bla"),
        ]

        lexical_fn = LexicalFunction(min_samples=5)
        with self.assertRaises(ValueError):
            lexical_fn.train(phrases, self.n_space, self.an_space)
def learn_ADJ_matrices():
    """Learn one "ADJ_<adj>" lexical function per adjective and save the model.

    Reads the bigram space from ``args.function[2]``, builds one training
    tuple per bigram whose first token is in the extracted adjective list,
    trains a ``LexicalFunction`` on them and passes it to ``save_space``.

    Relies on the names ``args`` and ``unigram_space``, which are not defined
    in this function.
    """
    bigram_space = load_space(args.function[2])

    # Build the membership set once instead of scanning the list per bigram.
    adjectives = set(extract_adj(bigram_space))

    train_data = []
    for bigram in bigram_space.id2row:
        pair = bigram.split('_')
        if pair[0] in adjectives:
            # e.g. ("ADJ_good", "boy", "good_boy"):
            # "ADJ_good" -> matrix to learn, boy -> unigram, good_boy -> bigram
            train_data.append(("ADJ_" + pair[0], pair[1], bigram))

    my_comp = LexicalFunction()
    # unigram_space -> for "boy", bigram_space -> for "good_boy"
    my_comp.train(train_data, unigram_space, bigram_space)

    save_space(my_comp, "ADJ_matrices", "matrices")
def compose_space_TENSOR():
    """Predict bigram vectors as (TENSOR * adj) * noun and save the composed space.

    Loads the bigram space from ``args.function[2]`` and the tensor model
    from ``args.function[3]``. Step one composes a "predicted_ADJ_<adj>"
    entry for every extracted adjective; step two uses those entries as
    functions applied to the noun of each eligible bigram.

    Relies on the names ``args`` and ``unigram_space``, which are not defined
    in this function.
    """
    bigram_space = load_space(args.function[2])
    TENSOR_matrix = load_space(args.function[3])

    adj_list = extract_adj(bigram_space)

    # e.g. ("tens_adj", "good", "predicted_ADJ_good"):
    # tens_adj -> tensor matrix, good -> unigram,
    # predicted_ADJ_good -> to compute (tens_adj * good)
    predicted_ADJs = [("tens_adj", adj, "predicted_ADJ_" + adj)
                      for adj in adj_list]

    # Obtain the ADJ matrices using => TENSOR * adj
    composed_space_1 = TENSOR_matrix.compose(predicted_ADJs, unigram_space)

    expanded_model = LexicalFunction(function_space=composed_space_1,
                                     intercept=TENSOR_matrix._has_intercept)

    # Membership sets built once; testing `in` against the lists for every
    # bigram made the loop below quadratic.
    known_adjs = set(adj_list)
    known_nouns = set(unigram_space.id2row)

    predicted_bigrams = []
    for bigram in bigram_space.id2row:
        tokens = bigram.split('_')
        adj = tokens[0]
        noun = tokens[1]

        if adj not in known_adjs or noun not in known_nouns:
            continue

        # e.g. ("predicted_ADJ_good", "boy", "predicted_good_boy"):
        # predicted_ADJ_good -> matrix computed above, boy -> unigram,
        # predicted_good_boy -> to compute (predicted_ADJ_good * boy)
        predicted_bigrams.append(
            ("predicted_ADJ_" + adj, noun, "predicted_" + bigram))

    # Predicted composition = predicted_ADJ * noun
    # (where predicted_ADJ = TENSOR * adj)
    composed_space_2 = expanded_model.compose(predicted_bigrams, unigram_space)

    # One formatted argument keeps the output identical to the former
    # two-item print statement while also parsing on Python 3.
    print("Number of elements in the space :  %d" % len(composed_space_2.id2row))
    save_space(composed_space_2, "composed_space_TENSOR", "composed_space")
    def test_train_intercept(self):
        """Train two lexical functions with intercept and compose from the result.

        Phrase vectors are generated from two known 2x2 matrices, a model is
        trained on four of the six phrases, and the learned function space is
        then reloaded into a second model whose compositions are compared
        with the generated phrase vectors.
        """
        adj1 = DenseMatrix(np.mat([[3, 4], [5, 6]]))
        adj2 = DenseMatrix(np.mat([[1, 2], [3, 4]]))

        phrases = [
            ("a1", "man", "a1_man"),
            ("a2", "car", "a2_car"),
            ("a1", "boy", "a1_boy"),
            ("a2", "boy", "a2_boy"),
        ]

        noun_mat = DenseMatrix(np.mat([[13, 21], [3, 4], [5, 6]]))
        noun_space = Space(noun_mat, ["man", "car", "boy"], self.ft)

        # Rows are the three nouns transformed by a1, followed by the same
        # three nouns transformed by a2.
        by_adj1 = (adj1 * noun_mat.transpose()).transpose()
        by_adj2 = (adj2 * noun_mat.transpose()).transpose()
        phrase_mat = by_adj1.vstack(by_adj2)

        phrase_ids = ["a1_man", "a1_car", "a1_boy", "a2_man", "a2_car", "a2_boy"]
        phrase_space = Space(phrase_mat, phrase_ids, self.ft)

        # test train
        trained = LexicalFunction(learner=LstsqRegressionLearner(intercept=True))
        trained.train(phrases, noun_space, phrase_space)
        learned = trained.function_space

        adj1.reshape((1, 4))
        # np.testing.assert_array_almost_equal(a1_mat.mat,
        #                                      a_space.cooccurrence_matrix.mat[0])

        adj2.reshape((1, 4))
        # np.testing.assert_array_almost_equal(a2_mat.mat,
        #                                      a_space.cooccurrence_matrix.mat[1])

        self.assertListEqual(learned.id2row, ["a1", "a2"])
        self.assertTupleEqual(learned.element_shape, (2, 3))

        # test compose
        adj1 = DenseMatrix(np.mat([[3, 4, 5, 6]]))
        adj2 = DenseMatrix(np.mat([[1, 2, 3, 4]]))
        learned_mat = learned.cooccurrence_matrix

        reloaded = Space(learned_mat, ["a1", "a2"], [], element_shape=(2, 3))
        composer = LexicalFunction(function_space=reloaded, intercept=True)
        composed = composer.compose(phrases, noun_space)

        self.assertListEqual(composed.id2row,
                             ["a1_man", "a2_car", "a1_boy", "a2_boy"])
        self.assertListEqual(composed.id2column, [])

        self.assertEqual(composed.element_shape, (2,))

        # Rows 0, 4, 2, 5 of the generated matrix are a1_man, a2_car, a1_boy
        # and a2_boy, i.e. the phrases in training order.
        np.testing.assert_array_almost_equal(composed.cooccurrence_matrix.mat,
                                             phrase_mat[[0, 4, 2, 5]].mat, 8)
Beispiel #15
0
    def test_lexical_function(self):
        """Exporting must fail before training and succeed after it."""
        # Fixtures are stored on the test instance.
        self.m12 = DenseMatrix(np.mat([[3, 1], [9, 2]]))
        self.m22 = DenseMatrix(np.mat([[4, 3], [2, 1]]))
        self.ph2 = DenseMatrix(np.mat([[18, 11], [24, 7]]))
        self.row = ["a", "b"]
        self.ft = ["f1", "f2"]
        self.space1 = Space(DenseMatrix(self.m12), self.row, self.ft)
        self.space2 = Space(DenseMatrix(self.ph2), ["a_a", "a_b"], self.ft)

        lexical_fn = LexicalFunction()
        lexical_fn._MIN_SAMPLES = 1

        # Nothing has been trained yet, so export is expected to fail.
        with self.assertRaises(IllegalStateError):
            lexical_fn.export(self.prefix + ".lf1")

        phrases = [("a", "b", "a_b"), ("a", "a", "a_a")]
        lexical_fn.train(phrases, self.space1, self.space2)
        lexical_fn.export(self.prefix + ".lf2")
 def test_lexical_function(self):
     """Exporting must fail before training and succeed after it."""
     # Fixtures are stored on the test instance.
     self.m12 = DenseMatrix(np.mat([[3, 1], [9, 2]]))
     self.m22 = DenseMatrix(np.mat([[4, 3], [2, 1]]))
     self.ph2 = DenseMatrix(np.mat([[18, 11], [24, 7]]))
     self.row = ["a", "b"]
     self.ft = ["f1", "f2"]
     self.space1 = Space(DenseMatrix(self.m12), self.row, self.ft)
     self.space2 = Space(DenseMatrix(self.ph2), ["a_a", "a_b"], self.ft)

     lexical_fn = LexicalFunction()
     lexical_fn._MIN_SAMPLES = 1

     # Nothing has been trained yet, so export is expected to fail.
     with self.assertRaises(IllegalStateError):
         lexical_fn.export(self.prefix + ".lf1")

     phrases = [("a", "b", "a_b"), ("a", "a", "a_a")]
     lexical_fn.train(phrases, self.space1, self.space2)
     lexical_fn.export(self.prefix + ".lf2")
def learn_TENSOR_matrix():
    """Learn per-adjective matrices, then a "tens_adj" function predicting them.

    Stage 1 trains one ``LexicalFunction`` per extracted adjective on the
    bigrams starting with that adjective. Stage 2 stacks the learned function
    matrices into one space and trains a second ``LexicalFunction`` that maps
    each adjective's unigram to its learned matrix. The second model is
    passed to ``save_space``.

    Relies on the names ``args`` and ``unigram_space``, which are not defined
    in this function.

    Raises:
        IndexError: if no adjective is extracted (``pop`` on an empty list).
    """
    bigram_space = load_space(args.function[2])
    my_comp_list = []
    id2row_list = []
    adj_list = extract_adj(bigram_space)

    for adj in adj_list:
        train_data = []
        for bigram in bigram_space.id2row:
            pair = bigram.split('_')
            if pair[0] != adj:
                continue
            # e.g. ("ADJ_good", "boy", "good_boy"):
            # "ADJ_good" -> matrix to learn, boy -> unigram, good_boy -> bigram
            train_data.append(("ADJ_" + adj, pair[1], bigram))

        my_comp = LexicalFunction()  # 1)

        # Learn ADJ matrix for each adjective
        my_comp.train(train_data, unigram_space, bigram_space)
        my_comp_list.append(my_comp.function_space.cooccurrence_matrix)
        id2row_list.append(my_comp.function_space.id2row)

    # Start from the last learned matrix, then append the others in order.
    my_mat_id2row = id2row_list.pop()
    my_mat_space = Space(my_comp_list.pop(), my_mat_id2row, [])

    # Create a new space using the ADJ matrices created
    for row_ids, matrix in zip(id2row_list, my_comp_list):
        my_mat_id2row.extend(row_ids)
        my_mat_space = Space(my_mat_space.cooccurrence_matrix.vstack(matrix),
                             my_mat_id2row, [])

    # Set the element shape once, after stacking, so it is also applied when
    # only one adjective was learned and the loop above never runs.
    my_mat_space._element_shape = my_comp.function_space.element_shape

    # Use the ADJ matrices space to learn the tensor matrix
    # e.g. ("tens_adj", "good", "ADJ_good"):
    # "tens_adj" -> tensor matrix to learn, good -> unigram,
    # ADJ_good -> adjective matrix learnt by 'my_comp' in 1)
    train_data = [('tens_adj', adj, "ADJ_" + adj) for adj in adj_list]

    my_tens_adj = LexicalFunction()
    # unigram_space -> for "good", my_mat_space -> for "ADJ_good"
    my_tens_adj.train(train_data, unigram_space, my_mat_space)

    save_space(my_tens_adj, "TENSOR_matrix", "matrices")
Beispiel #18
0
# Script fragment: reduce the core space, build a peripheral space on top of
# it, train a lexical-function model, compose test phrases and compute their
# similarities. `space` and `data_path` are defined before this fragment.
print "Applying SVD..."
# NOTE(review): 100 is presumably the number of dimensions kept - confirm.
space = space.apply(Svd(100))

print "Creating peripheral space.."
# The peripheral space is built relative to the SVD-reduced core space.
per_space = PeripheralSpace.build(space,
                                  data=data_path + "per.raw.SV.sm",
                                  cols=data_path + "per.raw.SV.cols",
                                  format="sm")

# Read the training tuples (fields 0, 1 and 2 of each line).
train_data_file = data_path + "ML08_SV_train.txt"
train_data = io_utils.read_tuple_list(train_data_file, fields=[0, 1, 2])

print "Training Lexical Function composition model..."
# Ridge regression with a fixed parameter of 2; arguments come from `space`,
# phrases from `per_space`.
comp_model = LexicalFunction(learner=RidgeRegressionLearner(param=2))
comp_model.train(train_data, space, per_space)

print "Composing phrases..."
test_phrases_file = data_path + "ML08nvs_test.txt"
test_phrases = io_utils.read_tuple_list(test_phrases_file, fields=[0, 1, 2])
composed_space = comp_model.compose(test_phrases, space)

print "Reading similarity test data..."
# Fields 0 and 1 hold the pair to compare; field 2 is read separately as the
# gold value.
test_similarity_file = data_path + "ML08data_new.txt"
test_pairs = io_utils.read_tuple_list(test_similarity_file, fields=[0, 1])
gold = io_utils.read_list(test_similarity_file, field=2)

print "Computing similarity with lexical function..."
# Cosine similarity of each test pair inside the composed space.
pred = composed_space.get_sims(test_pairs, CosSimilarity())
    def test_3d(self):
        """Train a two-level lexical function without intercept and compose it back.

        Level one learns a 2x2 matrix per verb-object phrase from sentence
        vectors; level two learns one (2, 2, 2) element per verb from those
        matrices. Composing with the level-two model must reproduce the
        level-one matrices.
        """

        # setting up
        # Expected level-two result: one row per verb.
        v_mat = DenseMatrix(np.mat([[0,0,1,1,2,2,3,3],#hate
                                    [0,1,2,4,5,6,8,9]])) #love


        # Known verb-object matrices used to generate the sentence vectors.
        vo11_mat = DenseMatrix(np.mat([[0,11],[22,33]])) #hate boy
        vo12_mat = DenseMatrix(np.mat([[0,7],[14,21]])) #hate man
        vo21_mat = DenseMatrix(np.mat([[6,34],[61,94]])) #love boy
        vo22_mat = DenseMatrix(np.mat([[2,10],[17,26]])) #love car

        # (verb-object function, subject noun, sentence id)
        train_vo_data = [("hate_boy", "man", "man_hate_boy"),
                      ("hate_man", "man", "man_hate_man"),
                      ("hate_boy", "boy", "boy_hate_boy"),
                      ("hate_man", "boy", "boy_hate_man"),
                      ("love_car", "boy", "boy_love_car"),
                      ("love_boy", "man", "man_love_boy"),
                      ("love_boy", "boy", "boy_love_boy"),
                      ("love_car", "man", "man_love_car")
                      ]

        # if do not find a phrase
        # what to do?
        # (verb function, object noun, verb-object id)
        train_v_data = [("love", "boy", "love_boy"),
                        ("hate", "man", "hate_man"),
                        ("hate", "boy", "hate_boy"),
                        ("love", "car", "love_car")]


        # Row ids of s_mat below: three subjects for each of the four
        # verb-object matrices, in the order the blocks are stacked.
        sentences = ["man_hate_boy", "car_hate_boy", "boy_hate_boy",
                     "man_hate_man", "car_hate_man", "boy_hate_man",
                     "man_love_boy", "car_love_boy", "boy_love_boy",
                     "man_love_car", "car_love_car", "boy_love_car" ]
        n_mat = DenseMatrix(np.mat([[3,4],[1,2],[5,6]]))


        n_space = Space(n_mat, ["man", "car", "boy"], self.ft)

        # Each block holds the three nouns transformed by one verb-object matrix.
        s1_mat = (vo11_mat * n_mat.transpose()).transpose()
        s2_mat = (vo12_mat * n_mat.transpose()).transpose()
        s3_mat = (vo21_mat * n_mat.transpose()).transpose()
        s4_mat = (vo22_mat * n_mat.transpose()).transpose()

        s_mat = vo11_mat.nary_vstack([s1_mat,s2_mat,s3_mat,s4_mat])
        s_space = Space(s_mat, sentences, self.ft)

        #test train 2d
        model = LexicalFunction(learner=LstsqRegressionLearner(intercept=False))
        model._MIN_SAMPLES = 1
        model.train(train_vo_data, n_space, s_space)
        vo_space = model.function_space

        self.assertListEqual(vo_space.id2row, ["hate_boy", "hate_man","love_boy", "love_car"])
        self.assertTupleEqual(vo_space.element_shape, (2,2))
        # NOTE(review): the return value of reshape is discarded, so it
        # presumably reshapes in place to a single row - confirm.
        vo11_mat.reshape((1,4))
        np.testing.assert_array_almost_equal(vo11_mat.mat,
                                             vo_space.cooccurrence_matrix.mat[0])
        vo12_mat.reshape((1,4))
        np.testing.assert_array_almost_equal(vo12_mat.mat,
                                             vo_space.cooccurrence_matrix.mat[1])
        vo21_mat.reshape((1,4))
        np.testing.assert_array_almost_equal(vo21_mat.mat,
                                             vo_space.cooccurrence_matrix.mat[2])
        vo22_mat.reshape((1,4))
        np.testing.assert_array_almost_equal(vo22_mat.mat,
                                             vo_space.cooccurrence_matrix.mat[3])

        # test train 3d
        # The learned verb-object space is now the phrase space.
        model2 = LexicalFunction(learner=LstsqRegressionLearner(intercept=False))
        model2._MIN_SAMPLES = 1
        model2.train(train_v_data, n_space, vo_space)
        v_space = model2.function_space
        np.testing.assert_array_almost_equal(v_mat.mat,
                                             v_space.cooccurrence_matrix.mat)
        self.assertListEqual(v_space.id2row, ["hate","love"])
        self.assertTupleEqual(v_space.element_shape, (2,2,2))

        # test compose 3d
        vo_space2 = model2.compose(train_v_data, n_space)
        # Only the composed ids are sorted before comparing; vo_space ids are
        # compared in their existing order.
        id2row1 = list(vo_space.id2row)
        id2row2 = list(vo_space2.id2row)
        id2row2.sort()
        self.assertListEqual(id2row1, id2row2)
        # Fetch rows from both spaces in the same id order before comparing.
        row_list = vo_space.id2row
        vo_rows1 = vo_space.get_rows(row_list)
        vo_rows2 = vo_space2.get_rows(row_list)
        np.testing.assert_array_almost_equal(vo_rows1.mat, vo_rows2.mat,7)
        self.assertTupleEqual(vo_space.element_shape, vo_space2.element_shape)
    def test_simple_3d_intercept(self):
        """Train and compose a two-stage lexical function with intercept.

        Stage one learns verb-object functions from sentence vectors; stage
        two learns a verb function from the stage-one function space. The
        compositions of both stages are then checked, including re-applying a
        composed verb-object entry to a subject.
        """

        # Stage one: (verb-object function, subject, sentence id)
        train_data1 = [("drive_car", "I", "I_drive_car"),
                       ("read_man", "You", "You_read_man"),
                       ("read_man", "I", "I_read_man"),
                       ("drive_car", "You", "You_drive_car"),
                       ("drive_man", "You", "You_drive_man"),
                       ("drive_man", "I", "I_drive_man")
                       ]

        # Stage two: (verb function, object, verb-object id)
        train_data2 = [("drive", "car", "drive_car"),
                       ("drive", "man", "drive_man"),
                       ]

        n_mat = DenseMatrix(np.mat([[1,2],[3,4],[5,6],[7,8]]))
        svo_mat = DenseMatrix(np.mat([[1,2],[3,4],[1,2],[3,4],[3,4],[1,2]]))

        # The noun space has no column labels; the sentence space has two.
        n_space = Space(n_mat,["I", "You", "man", "car"],[])
        svo_space = Space(svo_mat,["I_drive_car","You_read_man",
                                 "I_read_man", "You_drive_car",
                                 "You_drive_man", "I_drive_man"],["f1","f2"])

        #test first stage train
        model = LexicalFunction(learner=LstsqRegressionLearner(intercept=True))
        model._MIN_SAMPLES = 1
        model.train(train_data1, n_space, svo_space)
        vo_space = model.function_space

        # All three learned functions are expected to be identical.
        np.testing.assert_array_almost_equal(vo_space.cooccurrence_matrix.mat,
                                            np.mat([[0.6666,0.3333,-0.3333,
                                                     0.3333,0.6666,0.3333],
                                                    [0.6666,0.3333,-0.3333,
                                                     0.3333,0.6666,0.3333],
                                                    [0.6666,0.3333,-0.3333,
                                                     0.3333,0.6666,0.3333]]),
                                             4)

        self.assertTupleEqual(vo_space.element_shape, (2,3))
        self.assertListEqual(vo_space.id2row, ["drive_car","drive_man","read_man"])
        self.assertListEqual(vo_space.id2column, [])

        #test first stage compose
        comp_space = model.compose([train_data1[0]], n_space)
        np.testing.assert_array_almost_equal(comp_space.cooccurrence_matrix.mat,
                                             np.mat([[1,2]]), 8)

        self.assertTupleEqual(comp_space.element_shape, (2,))
        self.assertListEqual(comp_space.id2row, ["I_drive_car"])
        # A trained model is expected to carry over the phrase-space columns.
        self.assertListEqual(comp_space.id2column, ["f1","f2"])

        #test second stage train
        # `model` is rebound; the stage-one function space is the phrase space.
        model = LexicalFunction(learner=LstsqRegressionLearner(intercept=True))
        model._MIN_SAMPLES = 1
        model.train(train_data2, n_space, vo_space)
        v_space = model.function_space

        np.testing.assert_array_almost_equal(v_space.cooccurrence_matrix.mat,
                                             np.mat([[-0.2222,0.2222,0.4444,
                                                      -0.1111,0.1111,0.2222,
                                                       0.1111,-0.1111,-0.2222,
                                                       -0.1111,0.1111,0.2222,
                                                       -0.2222,0.2222,0.4444,
                                                       -0.1111,0.1111,0.2222]]),
                                              4)

        self.assertTupleEqual(v_space.element_shape, (2,3,3))
        self.assertListEqual(v_space.id2row, ["drive"])
        self.assertListEqual(v_space.id2column, [])

        #test compose1
        # Composing "drive" with "car" must give back the stage-one function.
        comp_space = model.compose([train_data2[0]], n_space)
        np.testing.assert_array_almost_equal(comp_space.cooccurrence_matrix.mat,
                                             np.mat([[0.6666,0.3333,-0.3333,
                                                     0.3333,0.6666,0.3333]]), 4)

        self.assertTupleEqual(comp_space.element_shape, (2,3))
        self.assertListEqual(comp_space.id2row, ["drive_car"])
        self.assertListEqual(comp_space.id2column, [])


        #test compose2
        # Load the composed "drive_car" entry as a function space of its own.
        model2 = LexicalFunction(function_space=comp_space, intercept=True)
        model2._MIN_SAMPLES = 1
        comp_space2 = model2.compose([train_data1[0]], n_space)
        np.testing.assert_array_almost_equal(comp_space2.cooccurrence_matrix.mat,
                                             np.mat([[1,2]]), 8)

        self.assertTupleEqual(comp_space2.element_shape, (2,))
        self.assertListEqual(comp_space2.id2row, ["I_drive_car"])
        # A model built from a function space is expected to yield no columns.
        self.assertListEqual(comp_space2.id2column, [])

        #recursive application, write a wrapper around it!!!
        # Same tuple as train_data1[0], spelled out literally.
        comp_space2 = model2.compose([("drive_car", "I", "I_drive_car")], n_space)
        np.testing.assert_array_almost_equal(comp_space2.cooccurrence_matrix.mat,
                                             np.mat([[1,2]]), 8)

        self.assertTupleEqual(comp_space2.element_shape, (2,))
        self.assertListEqual(comp_space2.id2row, ["I_drive_car"])
        self.assertListEqual(comp_space2.id2column, [])
    def test_simple_3d_intercept(self):
        """Train, stack and apply lexical functions learned with an intercept.

        Stage one learns one function per verb-object phrase (subject noun ->
        sentence vector).  Stage two learns a single verb function whose
        training targets are the stage-one functions.  After every step the
        learned/composed matrix, the element shape and the row/column labels
        are compared against hard-coded expectations.
        """
        def check_space(space, expected_mat, decimal, shape, rows, cols):
            # Compare matrix values first, then shape and labels.
            np.testing.assert_array_almost_equal(space.cooccurrence_matrix.mat,
                                                 expected_mat, decimal)
            self.assertTupleEqual(space.element_shape, shape)
            self.assertListEqual(space.id2row, rows)
            self.assertListEqual(space.id2column, cols)

        # (function word, argument, resulting phrase) triples
        svo_triples = [("drive_car", "I", "I_drive_car"),
                       ("read_man", "You", "You_read_man"),
                       ("read_man", "I", "I_read_man"),
                       ("drive_car", "You", "You_drive_car"),
                       ("drive_man", "You", "You_drive_man"),
                       ("drive_man", "I", "I_drive_man")]
        vo_triples = [("drive", "car", "drive_car"),
                      ("drive", "man", "drive_man")]

        noun_mat = DenseMatrix(np.mat([[1, 2], [3, 4], [5, 6], [7, 8]]))
        sentence_mat = DenseMatrix(np.mat([[1, 2], [3, 4], [1, 2],
                                           [3, 4], [3, 4], [1, 2]]))

        n_space = Space(noun_mat, ["I", "You", "man", "car"], [])
        svo_space = Space(sentence_mat,
                          ["I_drive_car", "You_read_man",
                           "I_read_man", "You_drive_car",
                           "You_drive_man", "I_drive_man"],
                          ["f1", "f2"])

        # Every learned verb-object function is expected to have this
        # flattened weight row.
        vo_row = [0.6666, 0.3333, -0.3333, 0.3333, 0.6666, 0.3333]

        # --- stage one: training ---
        stage1 = LexicalFunction(learner=LstsqRegressionLearner(intercept=True))
        stage1.train(svo_triples, n_space, svo_space)
        vo_space = stage1.function_space
        check_space(vo_space, np.mat([vo_row, vo_row, vo_row]), 4,
                    (2, 3), ["drive_car", "drive_man", "read_man"], [])

        # --- stage one: composition ---
        composed = stage1.compose([svo_triples[0]], n_space)
        check_space(composed, np.mat([[1, 2]]), 8,
                    (2,), ["I_drive_car"], ["f1", "f2"])

        # --- stage two: training on the stage-one function space ---
        stage2 = LexicalFunction(learner=LstsqRegressionLearner(intercept=True))
        stage2.train(vo_triples, n_space, vo_space)
        v_space = stage2.function_space
        expected_verb = np.mat([[-0.2222, 0.2222, 0.4444,
                                 -0.1111, 0.1111, 0.2222,
                                 0.1111, -0.1111, -0.2222,
                                 -0.1111, 0.1111, 0.2222,
                                 -0.2222, 0.2222, 0.4444,
                                 -0.1111, 0.1111, 0.2222]])
        check_space(v_space, expected_verb, 4, (2, 3, 3), ["drive"], [])

        # --- compose 1: verb + object gives back a verb-object function ---
        composed = stage2.compose([vo_triples[0]], n_space)
        check_space(composed, np.mat([vo_row]), 4, (2, 3), ["drive_car"], [])

        # --- compose 2: use the composed function space as a new model ---
        model2 = LexicalFunction(function_space=composed, intercept=True)
        sentence = model2.compose([svo_triples[0]], n_space)
        check_space(sentence, np.mat([[1, 2]]), 8, (2,), ["I_drive_car"], [])

        # Recursive application; TODO: write a wrapper around it.
        sentence = model2.compose([("drive_car", "I", "I_drive_car")], n_space)
        check_space(sentence, np.mat([[1, 2]]), 8, (2,), ["I_drive_car"], [])
    def test_3d(self):
        """Recover known verb-object matrices and verb tensors (no intercept).

        Sentence vectors are generated from four fixed 2x2 verb-object
        matrices applied to the noun vectors.  Training must recover those
        matrices, a second training stage must recover the verb tensors in
        ``v_mat``, and composing with the second model must reproduce the
        first-stage function space.
        """
        # Expected flattened verb tensors, one row per verb.
        v_mat = DenseMatrix(np.mat([[0, 0, 1, 1, 2, 2, 3, 3],   # hate
                                    [0, 1, 2, 4, 5, 6, 8, 9]]))  # love

        # Ground-truth verb-object matrices, in the order of the expected
        # function-space rows.
        vo_mats = [DenseMatrix(np.mat([[0, 11], [22, 33]])),   # hate boy
                   DenseMatrix(np.mat([[0, 7], [14, 21]])),    # hate man
                   DenseMatrix(np.mat([[6, 34], [61, 94]])),   # love boy
                   DenseMatrix(np.mat([[2, 10], [17, 26]]))]   # love car

        train_vo_data = [("hate_boy", "man", "man_hate_boy"),
                         ("hate_man", "man", "man_hate_man"),
                         ("hate_boy", "boy", "boy_hate_boy"),
                         ("hate_man", "boy", "boy_hate_man"),
                         ("love_car", "boy", "boy_love_car"),
                         ("love_boy", "man", "man_love_boy"),
                         ("love_boy", "boy", "boy_love_boy"),
                         ("love_car", "man", "man_love_car")]

        # Open question: what should happen when a phrase is not found?
        train_v_data = [("love", "boy", "love_boy"),
                        ("hate", "man", "hate_man"),
                        ("hate", "boy", "hate_boy"),
                        ("love", "car", "love_car")]

        sentences = ["man_hate_boy", "car_hate_boy", "boy_hate_boy",
                     "man_hate_man", "car_hate_man", "boy_hate_man",
                     "man_love_boy", "car_love_boy", "boy_love_boy",
                     "man_love_car", "car_love_car", "boy_love_car"]
        n_mat = DenseMatrix(np.mat([[3, 4], [1, 2], [5, 6]]))
        n_space = Space(n_mat, ["man", "car", "boy"], self.ft)

        # One block of sentence rows per verb-object matrix: each noun vector
        # is multiplied by the matrix.
        sentence_blocks = [(vo * n_mat.transpose()).transpose()
                           for vo in vo_mats]
        s_space = Space(vo_mats[0].nary_vstack(sentence_blocks),
                        sentences, self.ft)

        # --- train 2d: noun -> sentence, one function per verb-object ---
        model = LexicalFunction(learner=LstsqRegressionLearner(intercept=False))
        model.train(train_vo_data, n_space, s_space)
        vo_space = model.function_space

        self.assertListEqual(vo_space.id2row,
                             ["hate_boy", "hate_man", "love_boy", "love_car"])
        self.assertTupleEqual(vo_space.element_shape, (2, 2))
        for row_idx, truth in enumerate(vo_mats):
            # Flatten the ground truth so it lines up with a function-space row.
            truth.reshape((1, 4))
            np.testing.assert_array_almost_equal(
                truth.mat, vo_space.cooccurrence_matrix.mat[row_idx])

        # --- train 3d: noun -> verb-object function, one function per verb ---
        model2 = LexicalFunction(learner=LstsqRegressionLearner(intercept=False))
        model2.train(train_v_data, n_space, vo_space)
        v_space = model2.function_space
        np.testing.assert_array_almost_equal(v_mat.mat,
                                             v_space.cooccurrence_matrix.mat)
        self.assertListEqual(v_space.id2row, ["hate", "love"])
        self.assertTupleEqual(v_space.element_shape, (2, 2, 2))

        # --- compose 3d: must reproduce the first-stage function space ---
        vo_space2 = model2.compose(train_v_data, n_space)
        self.assertListEqual(list(vo_space.id2row), sorted(vo_space2.id2row))
        wanted_rows = vo_space.id2row
        learned_rows = vo_space.get_rows(wanted_rows)
        composed_rows = vo_space2.get_rows(wanted_rows)
        np.testing.assert_array_almost_equal(learned_rows.mat,
                                             composed_rows.mat, 7)
        self.assertTupleEqual(vo_space.element_shape, vo_space2.element_shape)
Beispiel #23
0
#ex18.py
#-------
# Trains a lexical function model on two adjective-noun examples and then
# applies the learned function twice in a row ("good car" -> "good good car").
from composes.utils import io_utils
from composes.composition.lexical_function import LexicalFunction

# Training data: (function word, argument, phrase) triples.
# Both triples share one function word, so a single "good" function is learned.
train_data = [("good_function", "car", "good_car"),
              ("good_function", "book", "good_book")]

# Load the argument space and the phrase space from pickled files.
arg_space = io_utils.load("./data/out/ex10.pkl")
phrase_space = io_utils.load("data/out/PHRASE_SS.ex10.pkl")

# Train a lexical function model on the data.
my_comp = LexicalFunction()
my_comp.train(train_data, arg_space, phrase_space)

# Apply the trained model to an argument taken from the argument space.
comp_sp1 = my_comp.compose([("good_function", "car", "good_car")], arg_space)

# Apply the trained model a second time; the argument "good_car" is now
# looked up in the space composed in the previous step.
comp_sp2 = my_comp.compose([("good_function", "good_car", "good_good_car")],
                           comp_sp1)

# Print the composed spaces.
print "\nComposed space 1:"
print comp_sp1.id2row
print comp_sp1.cooccurrence_matrix

print "\nComposed space 2:"
def train_grefenstette_multistep_composer(all_vectors_file, root_dir):
    """
    Fit the multistep regression VO/SVO model of Grefenstette et al.

    Adapted from dissect's ex19.py. Noun and SVO vectors are exported to
    temporary files under `root_dir`, two lexical-function models are trained
    with ridge regression (VO N -> SVO, then V N -> VO) and both are saved
    with `io_utils.save`.

    :param all_vectors_file: file containing N, V, VO and SVO vectors
    :param root_dir: where to write temp files and output
    """
    def _is_noun_unigram(entry):
        # Single-token entries whose only token is a noun.
        return entry.type == '1-GRAM' and entry.tokens[0].pos == 'N'

    def _is_svo(entry):
        # Subject-verb-object phrases.
        return entry.type == 'SVO'

    mkdirs_if_not_exists(root_dir)
    # NOTE(review): the step-1 model (which outputs SVO vectors) goes to
    # 'vo_comp.pkl' and the step-2 model (which outputs VO functions) goes to
    # 'svo_comp.pkl' -- confirm this naming is what downstream loaders expect.
    step1_pickle = join(root_dir, 'vo_comp.pkl')
    step2_pickle = join(root_dir, 'svo_comp.pkl')

    stem = basename(all_vectors_file)
    nouns_tmp = join(root_dir, '%s-onlyN.tmp' % stem)
    svo_tmp = join(root_dir, '%s-onlySVO.tmp' % stem)

    # The input holds unigrams as well as observed phrases; write the nouns
    # and the SVO phrases to separate files and convert each one.
    thes = Vectors.from_tsv(all_vectors_file)
    thes.to_tsv(nouns_tmp, entry_filter=_is_noun_unigram)
    _translate_byblo_to_dissect(nouns_tmp)
    thes.to_tsv(svo_tmp, entry_filter=_is_svo)
    _translate_byblo_to_dissect(svo_tmp)

    # Build (function, argument, phrase) triples for the two training steps.
    train_vo_data = []
    train_v_data = []
    for phrase in thes.keys():
        df = DocumentFeature.from_string(phrase)
        phrase_type = df.type
        if phrase_type == 'SVO':
            # the VO part acts on the subject
            train_vo_data.append((str(df[1:]), str(df[0]), str(df)))
        elif phrase_type == 'VO':
            # the verb acts on the object
            train_v_data.append((str(df[0]), str(df[1]), str(df)))

    # Load the N and SVO spaces from the converted files.
    n_space = Space.build(data=nouns_tmp + '.sm',
                          cols=nouns_tmp + '.cols',
                          format="sm")
    svo_space = Space.build(data=svo_tmp + '.sm',
                            cols=svo_tmp + '.cols',
                            format="sm")

    logging.info("Input SVO training space:")
    logging.info(svo_space.id2row)

    # Step 1: learn VO functions on train data, VO N -> SVO.
    # min_samples is 2 here; Gref et al 2013, section 5 says 3.
    logging.info("Step 1 training")
    vo_model = LexicalFunction(learner=RidgeRegressionLearner(), min_samples=2)
    vo_model.train(train_vo_data, n_space, svo_space)
    io_utils.save(vo_model, step1_pickle)

    # Step 2: learn V functions on train data, V N -> VO, where the VO space
    # is the function space learned in step 1.
    logging.info("Step 2 training")
    v_model = LexicalFunction(learner=RidgeRegressionLearner(), min_samples=2)
    v_model.train(train_v_data, n_space, vo_model.function_space)
    io_utils.save(v_model, step2_pickle)
Beispiel #25
0
# Two-step lexical function training (VO functions first, then V functions).
# NOTE: Space, LexicalFunction, LstsqRegressionLearner, train_vo_data and
# train_v_data are not defined in this fragment; they must be provided by the
# part of the script that precedes it.

# Load the N and SVO spaces from sparse-matrix ("sm") files.
n_space = Space.build(data = "./data/in/ex19-n.sm",
                      cols = "./data/in/ex19-n.cols",
                      format = "sm")

svo_space = Space.build(data = "./data/in/ex19-svo.sm",
                        cols = "./data/in/ex19-svo.cols",
                        format = "sm")

print "\nInput SVO training space:" 
print svo_space.id2row
print svo_space.cooccurrence_matrix

# 1. Train a model to learn VO functions on train data: VO N -> SVO
print "\nStep 1 training"
vo_model = LexicalFunction(learner=LstsqRegressionLearner())
vo_model.train(train_vo_data, n_space, svo_space)

# 2. Train a model to learn V functions on train data: V N -> VO,
#    where the VO space is the function space learned in step 1.
print "\nStep 2 training"
vo_space = vo_model.function_space
v_model = LexicalFunction(learner=LstsqRegressionLearner())
v_model.train(train_v_data, n_space, vo_space)

# Print the learned model.
print "\n3D Verb space"
print v_model.function_space.id2row
print v_model.function_space.cooccurrence_matrix

Beispiel #26
0
#-------
# Trains a lexical function model on two adjective-noun examples and then
# applies the learned function twice in a row ("good car" -> "good good car").
from composes.utils import io_utils
from composes.composition.lexical_function import LexicalFunction

# Training data: (function word, argument, phrase) triples.
# Both triples share one function word, so a single "good" function is learned.
train_data = [("good_function", "car", "good_car"),
              ("good_function", "book", "good_book")
              ]

# Load the argument space and the phrase space from pickled files.
arg_space = io_utils.load("./data/out/ex10.pkl")
phrase_space = io_utils.load("data/out/PHRASE_SS.ex10.pkl")

# Train a lexical function model on the data.
my_comp = LexicalFunction()
my_comp.train(train_data, arg_space, phrase_space)

# Apply the trained model to an argument taken from the argument space.
comp_sp1 = my_comp.compose([("good_function", "car",
                             "good_car")],
                           arg_space)

# Apply the trained model a second time; the argument "good_car" is now
# looked up in the space composed in the previous step.
comp_sp2 = my_comp.compose([("good_function", "good_car",
                             "good_good_car")],
                           comp_sp1)


# Print the composed spaces.
print "\nComposed space 1:"
    pair_data = pickle.load(f)

# Keep only the tuples whose second and third elements are both rows of
# my_space (pair_data and my_space come from earlier in the script, outside
# this fragment).
train_data = []
vocab = set(my_space.id2row)
for tup in pair_data:
    if tup[1] in vocab and tup[2] in vocab:
        train_data.append(tup)
# The string literal below is disabled code for re-using a previously pickled
# model; as written, the model is retrained on every run.
'''
try:
    with open('temp_func.pkl', 'rb') as file:
        print("Loading model...")
        my_comp = pickle.load(file)
except FileNotFoundError:
'''
# Train with my_space as both the argument space and the phrase space, then
# pickle the trained model to 'temp_func.pkl'.
print("Training LexicalFunction...")
my_comp = LexicalFunction()
my_comp.train(train_data, my_space, my_space)
with open('temp_func.pkl', 'wb') as file:
    pickle.dump(my_comp, file)

# Compose a phrase vector for every training tuple.
print("Building composed space...")
composed_space = my_comp.compose(train_data, my_space)
# Debug output, disabled:
# print(composed_space.id2row)
# print(composed_space.cooccurrence_matrix)

# compute similarity between two words in the space

cos_sim = {}
for pair in train_data:
    cos = my_space.get_sim(pair[1],
                           pair[2],
Beispiel #28
0
#ex16.py
#-------
# Trains a lexical function model, prints the learned function parameters and
# computes a similarity inside the learned function space.
from composes.utils import io_utils
from composes.composition.lexical_function import LexicalFunction
from composes.similarity.cos import CosSimilarity

# Training data: (function word, argument, phrase) triples.
# Trying to learn a "good" function.
train_data = [("good_function", "car", "good_car"),
              ("good_function", "book", "good_book")
              ]

# Load the argument space and the phrase space from pickled files.
arg_space = io_utils.load("./data/out/ex10.pkl")
phrase_space = io_utils.load("data/out/PHRASE_SS.ex10.pkl")

# Train a lexical function model on the data.
my_comp = LexicalFunction()
my_comp.train(train_data, arg_space, phrase_space)

# Print its parameters. reshape() is called for its effect on cooc_mat (the
# return value is discarded), so the matrix is printed in the function
# space's element shape.
print "\nLexical function space:" 
print my_comp.function_space.id2row
cooc_mat = my_comp.function_space.cooccurrence_matrix
cooc_mat.reshape(my_comp.function_space.element_shape)
print cooc_mat

# Similarity within the learned function space (here: of "good_function"
# with itself, using cosine similarity).
print "\nSimilarity between good and good in the function space:" 
print my_comp.function_space.get_sim("good_function", "good_function", 
                                     CosSimilarity())
Beispiel #29
0
# Two-step lexical function training (VO functions first, then V functions).
# NOTE: Space, LexicalFunction, LstsqRegressionLearner, train_vo_data and
# train_v_data are not defined in this fragment; they must be provided by the
# part of the script that precedes it.

# Load the N and SVO spaces from sparse-matrix ("sm") files.
n_space = Space.build(data="./data/in/ex19-n.sm",
                      cols="./data/in/ex19-n.cols",
                      format="sm")

svo_space = Space.build(data="./data/in/ex19-svo.sm",
                        cols="./data/in/ex19-svo.cols",
                        format="sm")

print "\nInput SVO training space:"
print svo_space.id2row
print svo_space.cooccurrence_matrix

# 1. Train a model to learn VO functions on train data: VO N -> SVO
print "\nStep 1 training"
vo_model = LexicalFunction(learner=LstsqRegressionLearner())
vo_model.train(train_vo_data, n_space, svo_space)

# 2. Train a model to learn V functions on train data: V N -> VO,
#    where the VO space is the function space learned in step 1.
print "\nStep 2 training"
vo_space = vo_model.function_space
v_model = LexicalFunction(learner=LstsqRegressionLearner())
v_model.train(train_v_data, n_space, vo_space)

# Print the learned model.
print "\n3D Verb space"
print v_model.function_space.id2row
print v_model.function_space.cooccurrence_matrix

# 3. Use the trained models to compose new SVO sentences
#    (the code for this step is not part of this fragment).
Beispiel #30
0
#ex16.py
#-------
# Trains a lexical function model, prints the learned function parameters and
# computes a similarity inside the learned function space.
from composes.utils import io_utils
from composes.composition.lexical_function import LexicalFunction
from composes.similarity.cos import CosSimilarity

# Training data: (function word, argument, phrase) triples.
# Trying to learn a "good" function.
train_data = [("good_function", "car", "good_car"),
              ("good_function", "book", "good_book")]

# Load the argument space and the phrase space from pickled files.
arg_space = io_utils.load("./data/out/ex10.pkl")
phrase_space = io_utils.load("data/out/PHRASE_SS.ex10.pkl")

# Train a lexical function model on the data.
my_comp = LexicalFunction()
my_comp.train(train_data, arg_space, phrase_space)

# Print its parameters. reshape() is called for its effect on cooc_mat (the
# return value is discarded), so the matrix is printed in the function
# space's element shape.
print "\nLexical function space:"
print my_comp.function_space.id2row
cooc_mat = my_comp.function_space.cooccurrence_matrix
cooc_mat.reshape(my_comp.function_space.element_shape)
print cooc_mat

# Similarity within the learned function space (here: of "good_function"
# with itself, using cosine similarity).
print "\nSimilarity between good and good in the function space:"
print my_comp.function_space.get_sim("good_function", "good_function",
                                     CosSimilarity())
Beispiel #31
0
# Evaluation fragment: reduce the space with SVD, train a lexical function
# model on it, compose test phrases and score phrase pairs by cosine.
# NOTE: space, data_path and the imported classes (Svd, PeripheralSpace,
# LexicalFunction, RidgeRegressionLearner, CosSimilarity, io_utils) are set up
# earlier in the script, outside this fragment.
print "Applying SVD..."
space = space.apply(Svd(100))

# Build the peripheral space on top of the SVD-reduced core space.
print "Creating peripheral space.."
per_space = PeripheralSpace.build(space,
                                  data = data_path + "per.raw.SV.sm",
                                  cols = data_path + "per.raw.SV.cols",
                                  format = "sm"                                
                                  )

# Read the training data: the first three fields of every line form a tuple.
train_data_file = data_path + "ML08_SV_train.txt"
train_data = io_utils.read_tuple_list(train_data_file, fields=[0,1,2])

# Ridge regression with a fixed parameter value of 2; arguments are looked up
# in `space`, phrases in `per_space`.
print "Training Lexical Function composition model..."
comp_model = LexicalFunction(learner = RidgeRegressionLearner(param=2))
comp_model.train(train_data, space, per_space)

print "Composing phrases..."
test_phrases_file = data_path + "ML08nvs_test.txt" 
test_phrases = io_utils.read_tuple_list(test_phrases_file, fields=[0,1,2])
composed_space = comp_model.compose(test_phrases, space)

# Fields 0 and 1 of the similarity file hold the pair to compare; field 2 is
# read as the gold value.
print "Reading similarity test data..."
test_similarity_file = data_path + "ML08data_new.txt"
test_pairs = io_utils.read_tuple_list(test_similarity_file, fields=[0,1])
gold = io_utils.read_list(test_similarity_file, field=2)

print "Computing similarity with lexical function..."
pred = composed_space.get_sims(test_pairs, CosSimilarity())