def eval_on_file(path_composed_emb, path_observed_emb, save_path):
    """Rank composed embeddings against observed ones and save the results.

    Both files are loaded with ``Space.build(..., format='dm')`` and have
    ``RowNormalization('length')`` applied before ranking.

    Args:
        path_composed_emb: path of the file holding the composed vectors.
        path_observed_emb: path of the file holding the observed vectors.
        save_path: prefix of the output files; ``_rankedCompounds.txt``,
            ``_ranks.txt`` and ``_quartiles.txt`` are appended to it.

    Returns:
        The tuple ``(q1, q2, q3, ranks)`` as returned by ``evaluateRank``.

    Raises:
        AssertionError: if a composed word does not occur in the observed
            space.
    """
    raw_observed_space = Space.build(data=path_observed_emb, format='dm')
    observed_space = raw_observed_space.apply(RowNormalization('length'))
    observed_words = observed_space.get_id2row()
    print("Observed words, size: " + str(len(observed_words)) + ", first:")
    print(observed_words[:10])
    observed_words_set = set(observed_words)

    raw_composed_space = Space.build(data=path_composed_emb, format='dm')
    composed_space = raw_composed_space.apply(RowNormalization('length'))
    composed_words = composed_space.get_id2row()
    print("Composed words, size: " + str(len(composed_words)) + ", first:")
    print(composed_words[:10])

    # All composed words should be in the initial space.  Raise explicitly
    # (same exception type as before) so the check also runs under
    # ``python -O``, where plain ``assert`` statements are stripped.
    for word in composed_words:
        if word not in observed_words_set:
            raise AssertionError(
                "composed word %r is missing from the observed space" % (word,))

    q1, q2, q3, ranks = evaluateRank(composed_words, composed_space, observed_space)
    print("Q1: " + str(q1) + ", Q2: " + str(q2) + ", Q3: " + str(q3))

    printDictToFile(ranks, save_path + '_rankedCompounds.txt')
    sortedRanks = sorted(ranks.values())
    printListToFile(sortedRanks, save_path + '_ranks.txt')
    logResult(q1, q2, q3, save_path + '_quartiles.txt')
    return q1, q2, q3, ranks
def test_build_data(self):
    """Check Space.build on the sparse and dense fixture files.

    Each case holds the fixture name, the expected rows and columns, the
    matrix expected from the sparse build and the matrix expected from the
    dense build (``None`` means the dense build is not exercised).
    """
    fixtures = [
        ("data1", ["red", "blue"], ["car", "man"],
         np.mat([[3, 5], [0, 10]]), np.mat([[3, 5], [0, 10]])),
        ("data2", ["red"], ["car"], np.mat([[3]]), np.mat([[3]])),
        ("data3", ["red", "blue"], ["car", "man"],
         np.mat([[15, 0], [0, 6]]), np.mat([[5, 0], [0, 6]])),
        ("data7", ["red"], ["car"], np.mat([[0]]), np.mat([[0]])),
        ("data9", ["man"], ["car"], np.mat([[4]]), None),
    ]
    for name, exp_rows, exp_cols, exp_sparse, exp_dense in fixtures:
        sparse_path = self.dir_ + name + ".sparse"
        sparse_space = Space.build(data=sparse_path,
                                   cols=self.dir_ + name + ".cols",
                                   format="sm")
        self.assertListEqual(exp_rows, sparse_space.id2row)
        self.assertListEqual(exp_cols, sparse_space.id2column)
        self.assertIsInstance(sparse_space.cooccurrence_matrix, SparseMatrix)
        np.testing.assert_array_equal(
            exp_sparse, sparse_space.cooccurrence_matrix.mat.todense())

        dense_path = self.dir_ + name + ".dense"
        if exp_dense is None:
            continue
        dense_space = Space.build(data=dense_path, format="dm")
        self.assertListEqual(exp_rows, dense_space.id2row)
        # A dense build without a column file yields no column labels.
        self.assertListEqual([], dense_space.id2column)
        self.assertIsInstance(dense_space.cooccurrence_matrix, DenseMatrix)
        np.testing.assert_array_equal(exp_dense,
                                      dense_space.cooccurrence_matrix.mat)
def train_from_core(lexical_space_file, an_dn_file, pn_file, sv_file, vo_file,
                    output_file_prefix):
    """Train on the core and phrase spaces and export the result.

    All five input files are loaded with ``Space.build(..., format="dm")``,
    passed to ``train_all_spaces`` and the returned space is exported in
    "dm" format under ``output_file_prefix``.

    Args:
        lexical_space_file: path of the core (lexical) space file.
        an_dn_file: path of the "an dn" space file.
        pn_file: path of the "pn" space file.
        sv_file: path of the "sv" space file.
        vo_file: path of the "vo" space file.
        output_file_prefix: prefix handed to ``export``.

    Returns:
        None.  When any input file is missing, the files are reported on
        stdout and nothing is trained or exported.
    """
    input_files = (lexical_space_file, an_dn_file, pn_file, sv_file, vo_file)
    if not all(exists(path) for path in input_files):
        # Single-argument print calls behave the same on Python 2 and 3.
        print("some file doesn't exist")
        print("%s %s %s %s %s" % input_files)
        # Previously execution fell through and failed later inside
        # Space.build; stop here since nothing can be trained.
        return

    print("load core")
    core_space = Space.build(data=lexical_space_file, format="dm")
    print("load an dn")
    an_dn_space = Space.build(data=an_dn_file, format="dm")
    print("load pn")
    pn_space = Space.build(data=pn_file, format="dm")
    print("load sv")
    sv_space = Space.build(data=sv_file, format="dm")
    print("load vo")
    vo_space = Space.build(data=vo_file, format="dm")

    print("start training")
    all_mat_space_normed = train_all_spaces(core_space, an_dn_space, pn_space,
                                            sv_space, vo_space)
    print("exporting trained file")
    all_mat_space_normed.export(output_file_prefix, format="dm")
    # Drop the reference to the trained space as soon as it is written.
    del all_mat_space_normed
    print("DONE")
def test_simple_dense(self):
    """Build a core space from dense input, then again from its pickle."""
    base = self.dir_
    log_args = ["build_core_space.py", "-l", base + "log1.txt"]

    # First pass: dense text input -> dense text + pickle output.
    bcs.main(log_args + ["-i", base + "mat2",
                         "-o", base,
                         "--input_format", "dm",
                         "--output_format", "dm"])
    original = Space.build(data=base + "mat2.dm", format="dm")
    exported = Space.build(data=base + "CORE_SS.mat2.dm", format="dm")
    pickled = io_utils.load(base + "CORE_SS.mat2.pkl", Space)
    self._test_equal_spaces_dense(original, exported)
    self._test_equal_spaces_dense(original, pickled)

    # Second pass: feed the pickle produced above back in.
    bcs.main(log_args + ["-i", base + "CORE_SS.mat2",
                         "-o", base,
                         "--input_format", "pkl",
                         "--output_format", "dm"])
    rebuilt = io_utils.load(base + "CORE_SS.CORE_SS.mat2.pkl", Space)
    pickled = io_utils.load(base + "CORE_SS.mat2.pkl", Space)
    self._test_equal_spaces_dense(rebuilt, pickled)
def setUp(self):
    """Create ``n_space`` and ``an_space`` over the same two features."""
    self.ft = ["f1", "f2"]
    values = [[3, 4], [5, 6]]
    # A fresh matrix is built for each space so they never share storage.
    self.n_space = Space(DenseMatrix(np.mat(values)),
                         ["car", "man"],
                         self.ft)
    self.an_space = Space(DenseMatrix(np.mat(values)),
                          ["a1_car", "a1_man"],
                          self.ft)
def test_simple_sparse_zipped(self):
    """Build a core space from gzipped sparse input and compare the outputs."""
    base = self.dir_
    bcs.main(["build_core_space.py",
              "-l", base + "log1.txt",
              "-i", base + "mat1",
              "-o", base,
              "--input_format", "sm",
              "--output_format", "sm",
              "--gz", "True"])

    zipped_input = Space.build(data=base + "mat1.sm.gz",
                               cols=base + "mat1.cols",
                               format="sm")
    exported = Space.build(data=base + "CORE_SS.mat1.sm",
                           cols=base + "CORE_SS.mat1.cols",
                           format="sm")
    pickled = io_utils.load(base + "CORE_SS.mat1.pkl", Space)
    plain_input = Space.build(data=base + "mat1.sm",
                              cols=base + "mat1.cols",
                              format="sm")

    # The zipped input is the reference every other space must match.
    for other in (exported, pickled, plain_input):
        self._test_equal_spaces_sparse(zipped_input, other)
def main():
    """
    Convert temporal referencing matrix to regular (binned) matrix.

    Rows whose label has no ``_`` suffix, or whose single suffix equals
    ``<ref>``, are kept (labelled by the part before ``_``); the resulting
    space is saved in w2v or sparse format depending on the flag given.

    Raises:
        ValueError: if no row of the loaded space matches the selection.
    """

    # Get the arguments.  docopt parses the layout of this text, so the
    # "Usage:" and "Options:" sections must stay on their own lines and each
    # option must be separated from its description by two spaces.
    args = docopt("""Convert temporal referencing matrix to regular (binned) matrix.

    Usage:
        tr2bin.py (-w | -s) <spacePrefix> <ref> <outPath>

        <spacePrefix> = path to pickled space without suffix
        <ref> = reference string
        <outPath> = output path for result file

    Options:
        -w, --w2v  save in w2v format
        -s, --sps  save in sparse matrix format

    """)

    is_w2v = args['--w2v']
    is_sps = args['--sps']
    spacePrefix = args['<spacePrefix>']
    ref = args['<ref>']
    outPath = args['<outPath>']

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()

    # Load spaces
    space = load_pkl_files(spacePrefix)
    matrix = space.get_cooccurrence_matrix().get_mat()
    id2row = space.get_id2row()
    id2column = space.get_id2column()

    # Collect (target, row index) pairs for plain rows and for rows tagged
    # with the reference string.
    selected = []
    for i, w in enumerate(id2row):
        spl = w.split('_')
        if len(spl) == 1 or (len(spl) == 2 and spl[1] == ref):
            selected.append((spl[0], i))
    if not selected:
        # zip(*[]) cannot be unpacked into two names; fail with a clear
        # message instead (same exception type as before).
        raise ValueError("no rows of the space match reference %r" % (ref,))
    targets, indices = zip(*selected)

    new_matrix = matrix[list(indices), :]

    # Save the Space objects
    if is_w2v:
        new_space = Space(DenseMatrix(new_matrix), list(targets), id2column)
        save_pkl_files(new_space, outPath, save_in_one_file=True, save_as_w2v=True)
    if is_sps:
        new_space = Space(SparseMatrix(new_matrix), list(targets), id2column)
        save_pkl_files(new_space, outPath, save_in_one_file=True, save_as_w2v=False)

    logging.info("--- %s seconds ---" % (time.time() - start_time))
def eval_on_file(path_composed_emb, path_observed_emb, save_path):
    """Evaluate composed vectors by their rank among the observed vectors.

    The composed and the observed file are both read as "dm" spaces and
    passed through ``RowNormalization('length')``.  The ranks computed by
    ``evaluateRank`` are written to ``save_path + '_rankedCompounds.txt'``
    and (sorted) ``save_path + '_ranks.txt'``; the quartiles go to
    ``save_path + '_quartiles.txt'``.

    Args:
        path_composed_emb: file with the composed vectors.
        path_observed_emb: file with the observed vectors.
        save_path: common prefix of the three result files.

    Returns:
        ``(q1, q2, q3, ranks)`` exactly as produced by ``evaluateRank``.

    Raises:
        AssertionError: when some composed word has no observed vector.
    """
    observed_space = Space.build(
        data=path_observed_emb, format='dm').apply(RowNormalization('length'))
    observed_words = observed_space.get_id2row()
    print("Observed words, size: " + str(len(observed_words)) + ", first:")
    print(observed_words[:10])
    observed_words_set = set(observed_words)

    composed_space = Space.build(
        data=path_composed_emb, format='dm').apply(RowNormalization('length'))
    composed_words = composed_space.get_id2row()
    print("Composed words, size: " + str(len(composed_words)) + ", first:")
    print(composed_words[:10])

    # all composed words should be in the initial space; an explicit raise
    # keeps the check active under ``python -O`` (plain asserts are removed)
    # while callers still see an AssertionError.
    missing = [word for word in composed_words
               if word not in observed_words_set]
    if missing:
        raise AssertionError(
            "composed word %r is missing from the observed space"
            % (missing[0],))

    q1, q2, q3, ranks = evaluateRank(composed_words, composed_space, observed_space)
    print("Q1: " + str(q1) + ", Q2: " + str(q2) + ", Q3: " + str(q3))

    printDictToFile(ranks, save_path + '_rankedCompounds.txt')
    printListToFile(sorted(ranks.values()), save_path + '_ranks.txt')
    logResult(q1, q2, q3, save_path + '_quartiles.txt')
    return q1, q2, q3, ranks
def setUp(self):
    """Create a sparse and a dense space over the same 2x3 array."""
    self.a = np.array([[1, 2, 3],
                       [4, 0, 5]])
    sparse_backing = SparseMatrix(np.mat(self.a))
    dense_backing = DenseMatrix(np.mat(self.a))
    # Separate label lists per space, as in the fixtures' original form.
    self.space_s = Space(sparse_backing, ["a", "b"], ["f1", "f2", "f3"])
    self.space_d = Space(dense_backing, ["a", "b"], ["f1", "f2", "f3"])
def setUp(self):
    """Prepare the resource directory, constructor cases and two spaces."""
    self.dir_ = data_dir + "/space_test_resources/"

    # Each case: matrix, rows, columns, row2id, column2id, operations.
    self.init_test_cases = [
        (DenseMatrix(np.array([[1, 2], [3, 4]])),
         ["car", "man"],
         ["feat1", "feat2"],
         {"man": 1, "car": 0},
         {"feat1": 0, "feat2": 1},
         [ScalingOperation(EpmiWeighting())]),
        (DenseMatrix(np.array([[1, 2], [3, 4]])),
         ["car", "man"],
         [],
         {"man": 1, "car": 0},
         {},
         [ScalingOperation(EpmiWeighting())]),
    ]

    self.row1 = ["a"]
    self.row2 = ["a", "b", "c"]
    self.ft1 = ["f1", "f2", "f3"]

    self.m1 = np.array([[1, 2, 3]])
    self.space1 = Space(DenseMatrix(self.m1), self.row1, self.ft1)

    self.x = np.mat([[1, 2, 3],
                     [2, 4, 6],
                     [4, 675, 43]])
    self.us = np.mat([[2.19272110e+00, 3.03174768e+00],
                      [4.38544220e+00, 6.06349536e+00],
                      [6.76369708e+02, -4.91431927e-02]])
    self.space2 = Space(DenseMatrix(self.x), self.row2, self.ft1)
def test_simple_dense(self):
    """Run build_core_space on dense input, then on the pickle it wrote."""

    def path(name):
        # All fixture and output files live in the test directory.
        return self.dir_ + name

    def run_build(input_prefix, input_format):
        bcs.main(["build_core_space.py",
                  "-l", path("log1.txt"),
                  "-i", path(input_prefix),
                  "-o", self.dir_,
                  "--input_format", input_format,
                  "--output_format", "dm"])

    run_build("mat2", "dm")
    source_space = Space.build(data=path("mat2.dm"), format="dm")
    dense_output = Space.build(data=path("CORE_SS.mat2.dm"), format="dm")
    pickle_output = io_utils.load(path("CORE_SS.mat2.pkl"), Space)
    self._test_equal_spaces_dense(source_space, dense_output)
    self._test_equal_spaces_dense(source_space, pickle_output)

    run_build("CORE_SS.mat2", "pkl")
    second_pickle = io_utils.load(path("CORE_SS.CORE_SS.mat2.pkl"), Space)
    first_pickle = io_utils.load(path("CORE_SS.mat2.pkl"), Space)
    self._test_equal_spaces_dense(second_pickle, first_pickle)
def test_as_conversion_tool(self):
    """Use build_core_space purely to convert between sm and dm formats."""
    base = self.dir_

    def convert(input_format, output_format):
        bcs.main(["build_core_space.py",
                  "-i", base + "mat3",
                  "-o", base,
                  "--input_format", input_format,
                  "--output_format", output_format])

    # sparse -> sparse
    convert("sm", "sm")
    source = Space.build(data=base + "mat3.sm",
                         cols=base + "mat3.cols",
                         format="sm")
    exported = Space.build(data=base + "CORE_SS.mat3.sm",
                           rows=base + "CORE_SS.mat3.rows",
                           cols=base + "CORE_SS.mat3.cols",
                           format="sm")
    pickled = io_utils.load(base + "CORE_SS.mat3.pkl", Space)
    self._test_equal_spaces_sparse(source, exported)
    self._test_equal_spaces_sparse(source, pickled)

    # sparse -> dense
    convert("sm", "dm")
    source = Space.build(data=base + "mat3.dm",
                         cols=base + "CORE_SS.mat3.cols",
                         format="dm")
    exported = Space.build(data=base + "CORE_SS.mat3.dm",
                           rows=base + "CORE_SS.mat3.rows",
                           cols=base + "CORE_SS.mat3.cols",
                           format="dm")
    pickled = io_utils.load(base + "CORE_SS.mat3.pkl", Space)
    self._test_equal_spaces_dense(source, exported)
    # The pickle is converted to dense before the dense comparison.
    pickled.to_dense()
    self._test_equal_spaces_dense(source, pickled)

    # dense -> dense
    convert("dm", "dm")
    source = Space.build(data=base + "CORE_SS.mat3.dm",
                         cols=base + "CORE_SS.mat3.cols",
                         format="dm")
    pickled = io_utils.load(base + "CORE_SS.mat3.pkl", Space)
    pickled.to_dense()
    self._test_equal_spaces_dense(source, pickled)
def test_simple_lstsq_no_inter(self):
    """Train a lexical function without intercept using lstsq, then ridge.

    Both regressions (ridge runs with lambda 0 and no cross-validation) are
    expected to learn the flattened 2x2 identity for "big", and the exported
    parameter file must match the trained function space.
    """
    base = self.dir_
    model_file = base + "TRAINED_COMP_MODEL.lexical_func.an_train_data.txt.pkl"
    params_file = base + "TRAINED_COMP_MODEL.lexical_func.an_train_data.txt.params.dm"

    leading_args = ["train_composition.py",
                    "-l", base + "log1.txt",
                    "-i", base + "an_train_data.txt",
                    "-o", base,
                    "-m", "lexical_func",
                    "-p", base + "CORE_SS.AN_mat.pkl",
                    "-a", base + "CORE_SS.N_mat.pkl"]
    trailing_args = ["--intercept", "False",
                     "--export_params", "True"]
    regressions = (
        ["-r", "lstsq"],
        ["-r", "ridge", "--lambda", "0", "--crossvalidation", "False"],
    )

    for regression_args in regressions:
        tc.main(leading_args + regression_args + trailing_args)

        function_space = io_utils.load(model_file).function_space
        np.testing.assert_array_almost_equal(
            function_space.cooccurrence_matrix.mat, np.mat([1, 0, 0, 1]), 10)
        self.assertTupleEqual(function_space.element_shape, (2, 2))
        self.assertListEqual(function_space.id2row, ["big"])
        self.assertListEqual(function_space.id2column, [])

        exported_params = Space.build(data=params_file, format="dm")
        self._test_equal_spaces_dense(exported_params, function_space)
def test_simple_ops(self):
    """Project mat3 into several core spaces and compare with the cores.

    The core spaces are built once with different selection/reduction
    settings; the peripheral build is then run for each of them from both
    dense and sparse input and must reproduce the core space.
    """
    base = self.dir_
    bcs.main(["build_core_space.py",
              "-l", base + "log1.txt",
              "-i", base + "mat3",
              "-w", "raw",
              "-s", "top_sum_3,top_length_3,top_sum_4",
              "-r", "svd_2,svd_1",
              "-o", base,
              "--input_format", "dm",
              "--output_format", "dm"])

    core_names = ["CORE_SS.mat3.raw.top_sum_3.svd_2",
                  "CORE_SS.mat3.raw.top_sum_3.svd_1",
                  "CORE_SS.mat3.raw.top_length_3.svd_2",
                  "CORE_SS.mat3.raw.top_length_3.svd_1",
                  "CORE_SS.mat3.raw.top_sum_4.svd_2",
                  "CORE_SS.mat3.raw.top_sum_4.svd_1"]
    # All core spaces are loaded up front, before any peripheral build runs.
    core_spaces = [Space.build(data=base + name + ".dm", format="dm")
                   for name in core_names]

    for core_name, core_space in zip(core_names, core_spaces):
        peripheral_file = base + "PER_SS.mat3." + core_name + ".dm"
        for input_format in ("dm", "sm"):
            bps.main(["build_peripheral_space.py",
                      "-l", base + "log1.txt",
                      "-i", base + "mat3",
                      "-o", base,
                      "-c", base + core_name + ".pkl",
                      "--input_format", input_format,
                      "--output_format", "dm"])
            peripheral_space = Space.build(data=peripheral_file, format="dm")
            self._test_equal_spaces_dense(core_space, peripheral_space)
def test_dilation(self):
    """Export a Dilation model before and after training it."""
    self.row = ["a", "b"]
    self.ft = ["f1", "f2"]

    self.m12 = DenseMatrix(np.mat([[3, 1], [9, 2]]))
    self.m22 = DenseMatrix(np.mat([[4, 3], [2, 1]]))
    self.ph2 = DenseMatrix(np.mat([[18, 11], [24, 7]]))

    self.space1 = Space(DenseMatrix(self.m12), self.row, self.ft)
    self.space2 = Space(DenseMatrix(self.ph2), ["a_a", "a_b"], self.ft)

    model = Dilation()
    untrained_prefix = self.prefix + ".dil1"
    model.export(untrained_prefix)

    training_triples = [("a", "b", "a_b")]
    model.train(training_triples, self.space1, self.space2)
    trained_prefix = self.prefix + ".dil2"
    model.export(trained_prefix)
def test_weighted_additive(self):
    """Export a WeightedAdditive model before and after training it."""
    self.row = ["a", "b"]
    self.ft = ["f1", "f2"]

    self.m12 = DenseMatrix(np.mat([[3, 1], [9, 2]]))
    self.m22 = DenseMatrix(np.mat([[4, 3], [2, 1]]))
    self.ph2 = DenseMatrix(np.mat([[18, 11], [24, 7]]))

    self.space1 = Space(DenseMatrix(self.m12), self.row, self.ft)
    self.space2 = Space(DenseMatrix(self.ph2), ["a_a", "a_b"], self.ft)

    model = WeightedAdditive()
    untrained_prefix = self.prefix + ".add1"
    model.export(untrained_prefix)

    training_triples = [("a", "a", "a_a")]
    model.train(training_triples, self.space1, self.space2)
    trained_prefix = self.prefix + ".add2"
    model.export(trained_prefix)
def test_simple_dense(self):
    """Project mat2 into its own core space and expect the same space back."""
    base = self.dir_
    cli_args = ["build_peripheral_space.py",
                "-l", base + "log1.txt",
                "-i", base + "mat2",
                "-o", base,
                "-c", base + "CORE_SS.mat2.pkl",
                "--input_format", "dm",
                "--output_format", "dm"]
    bps.main(cli_args)

    source_space = Space.build(data=base + "mat2.dm", format="dm")
    peripheral_space = Space.build(data=base + "PER_SS.mat2.CORE_SS.mat2.dm",
                                   format="dm")
    self._test_equal_spaces_dense(source_space, peripheral_space)
def test_train_intercept(self):
    """Train and compose a LexicalFunction that uses an intercept.

    The phrase space is generated from two known 2x2 matrices applied to the
    noun vectors.  After training with ``LstsqRegressionLearner(intercept=True)``
    the learned function space must have rows ["a1", "a2"] and element shape
    (2, 3); composing the training data with it must reproduce the matching
    rows of the generated phrase matrix.
    """
    a1_mat = DenseMatrix(np.mat([[3, 4], [5, 6]]))
    a2_mat = DenseMatrix(np.mat([[1, 2], [3, 4]]))

    train_data = [("a1", "man", "a1_man"),
                  ("a2", "car", "a2_car"),
                  ("a1", "boy", "a1_boy"),
                  ("a2", "boy", "a2_boy")
                  ]

    n_mat = DenseMatrix(np.mat([[13, 21], [3, 4], [5, 6]]))
    n_space = Space(n_mat, ["man", "car", "boy"], self.ft)

    # Phrase vectors: each adjective matrix applied to every noun vector.
    an1_mat = (a1_mat * n_mat.transpose()).transpose()
    an2_mat = (a2_mat * n_mat.transpose()).transpose()
    an_mat = an1_mat.vstack(an2_mat)

    an_space = Space(an_mat, ["a1_man", "a1_car", "a1_boy", "a2_man", "a2_car", "a2_boy"], self.ft)

    #test train
    model = LexicalFunction(learner=LstsqRegressionLearner(intercept=True))
    model.train(train_data, n_space, an_space)
    a_space = model.function_space

    # NOTE(review): the assertions these reshapes prepared for are commented
    # out, so the learned values themselves are not checked here.
    a1_mat.reshape((1, 4))
    #np.testing.assert_array_almost_equal(a1_mat.mat,
    #                                     a_space.cooccurrence_matrix.mat[0])

    a2_mat.reshape((1, 4))
    #np.testing.assert_array_almost_equal(a2_mat.mat,
    #                                     a_space.cooccurrence_matrix.mat[1])

    self.assertListEqual(a_space.id2row, ["a1", "a2"])
    self.assertTupleEqual(a_space.element_shape, (2, 3))

    #test compose
    # NOTE(review): a1_mat and a2_mat are rebound here but not read again.
    a1_mat = DenseMatrix(np.mat([[3, 4, 5, 6]]))
    a2_mat = DenseMatrix(np.mat([[1, 2, 3, 4]]))
    a_mat = a_space.cooccurrence_matrix

    # Rebuild the function space from the learned matrix with an explicit
    # element shape before composing.
    a_space = Space(a_mat, ["a1", "a2"], [], element_shape=(2, 3))
    model = LexicalFunction(function_space=a_space, intercept=True)
    comp_space = model.compose(train_data, n_space)

    self.assertListEqual(comp_space.id2row, ["a1_man", "a2_car", "a1_boy", "a2_boy"])
    self.assertListEqual(comp_space.id2column, [])

    self.assertEqual(comp_space.element_shape, (2,))

    # Rows 0, 4, 2, 5 of an_mat are a1_man, a2_car, a1_boy, a2_boy, i.e. the
    # phrases of train_data in order.
    np.testing.assert_array_almost_equal(comp_space.cooccurrence_matrix.mat,
                                         an_mat[[0, 4, 2, 5]].mat, 8)
def test_simple_lstsq_no_inter(self):
    """Check lexical-function training without intercept (lstsq and ridge)."""
    base = self.dir_
    trained_model_path = (
        base + "TRAINED_COMP_MODEL.lexical_func.an_train_data.txt.pkl")
    exported_params_path = (
        base + "TRAINED_COMP_MODEL.lexical_func.an_train_data.txt.params.dm")

    def train_and_verify(regression_args):
        # Run the trainer, then check the learned space and its export.
        tc.main(["train_composition.py",
                 "-l", base + "log1.txt",
                 "-i", base + "an_train_data.txt",
                 "-o", base,
                 "-m", "lexical_func",
                 "-p", base + "CORE_SS.AN_mat.pkl",
                 "-a", base + "CORE_SS.N_mat.pkl"]
                + regression_args
                + ["--intercept", "False",
                   "--export_params", "True"])

        learned = io_utils.load(trained_model_path).function_space
        np.testing.assert_array_almost_equal(
            learned.cooccurrence_matrix.mat, np.mat([1, 0, 0, 1]), 10)
        self.assertTupleEqual(learned.element_shape, (2, 2))
        self.assertListEqual(learned.id2row, ["big"])
        self.assertListEqual(learned.id2column, [])

        exported = Space.build(data=exported_params_path, format="dm")
        self._test_equal_spaces_dense(exported, learned)

    train_and_verify(["-r", "lstsq"])
    train_and_verify(["-r", "ridge",
                      "--lambda", "0",
                      "--crossvalidation", "False"])
def test_as_conversion_tool(self):
    """Convert mat3 between formats with build_core_space and compare."""

    def in_dir(name):
        return self.dir_ + name

    def run_conversion(formats):
        source_format, target_format = formats
        bcs.main(["build_core_space.py",
                  "-i", in_dir("mat3"),
                  "-o", self.dir_,
                  "--input_format", source_format,
                  "--output_format", target_format])

    core_rows = in_dir("CORE_SS.mat3.rows")
    core_cols = in_dir("CORE_SS.mat3.cols")
    core_pickle = in_dir("CORE_SS.mat3.pkl")

    run_conversion(("sm", "sm"))
    reference = Space.build(data=in_dir("mat3.sm"), cols=in_dir("mat3.cols"),
                            format="sm")
    written = Space.build(data=in_dir("CORE_SS.mat3.sm"), rows=core_rows,
                          cols=core_cols, format="sm")
    unpickled = io_utils.load(core_pickle, Space)
    self._test_equal_spaces_sparse(reference, written)
    self._test_equal_spaces_sparse(reference, unpickled)

    run_conversion(("sm", "dm"))
    reference = Space.build(data=in_dir("mat3.dm"), cols=core_cols,
                            format="dm")
    written = Space.build(data=in_dir("CORE_SS.mat3.dm"), rows=core_rows,
                          cols=core_cols, format="dm")
    unpickled = io_utils.load(core_pickle, Space)
    self._test_equal_spaces_dense(reference, written)
    # Make the unpickled space dense before the dense comparison.
    unpickled.to_dense()
    self._test_equal_spaces_dense(reference, unpickled)

    run_conversion(("dm", "dm"))
    reference = Space.build(data=in_dir("CORE_SS.mat3.dm"), cols=core_cols,
                            format="dm")
    unpickled = io_utils.load(core_pickle, Space)
    unpickled.to_dense()
    self._test_equal_spaces_dense(reference, unpickled)
def test_lexical_function(self):
    """Exporting an untrained LexicalFunction must fail; a trained one exports."""
    self.row = ["a", "b"]
    self.ft = ["f1", "f2"]

    self.m12 = DenseMatrix(np.mat([[3, 1], [9, 2]]))
    self.m22 = DenseMatrix(np.mat([[4, 3], [2, 1]]))
    self.ph2 = DenseMatrix(np.mat([[18, 11], [24, 7]]))

    self.space1 = Space(DenseMatrix(self.m12), self.row, self.ft)
    self.space2 = Space(DenseMatrix(self.ph2), ["a_a", "a_b"], self.ft)

    model = LexicalFunction()
    # Lower the sample threshold on this instance for the tiny fixture.
    model._MIN_SAMPLES = 1
    untrained_prefix = self.prefix + ".lf1"
    self.assertRaises(IllegalStateError, model.export, untrained_prefix)

    training_triples = [("a", "b", "a_b"),
                        ("a", "a", "a_a")]
    model.train(training_triples, self.space1, self.space2)
    model.export(self.prefix + ".lf2")
def test_full_additive(self):
    """Exporting an untrained FullAdditive must fail; a trained one exports."""
    self.row = ["a", "b"]
    self.ft = ["f1", "f2"]

    self.m12 = DenseMatrix(np.mat([[3, 1], [9, 2]]))
    self.m22 = DenseMatrix(np.mat([[4, 3], [2, 1]]))
    self.ph2 = DenseMatrix(np.mat([[18, 11], [24, 7]]))

    self.space1 = Space(DenseMatrix(self.m12), self.row, self.ft)
    self.space2 = Space(DenseMatrix(self.ph2), ["a_a", "a_b"], self.ft)

    model = FullAdditive()
    untrained_prefix = self.prefix + ".full1"
    self.assertRaises(IllegalStateError, model.export, untrained_prefix)

    training_triples = [("a", "b", "a_b"),
                        ("a", "a", "a_a")]
    model.train(training_triples, self.space1, self.space2)
    model.export(self.prefix + ".full2")
def test_vstack_raises(self):
    """Stacking ``space2`` with each of four other spaces must raise ValueError."""
    others = [
        # one column only
        Space(DenseMatrix(self.x[0:2, 0:1]), ["e", "f"], self.ft1[0:1]),
        # row "a" is also a row of space2
        Space(DenseMatrix(self.x[0:2, :]), ["a", "f"], self.ft1),
        # no column labels
        Space(DenseMatrix(self.x[0:2, :]), ["e", "f"], []),
        # different column labels
        Space(DenseMatrix(self.x[0:2, :]), ["e", "f"], ["f1", "f2", "f4"]),
    ]
    upper = self.space2
    for lower in others:
        self.assertRaises(ValueError, upper.vstack, upper, lower)
def test_to_dissect_sparse_files(vectors_c, tmpdir):
    """
    Write the thesaurus as DISSECT sparse files and read them back.

    :type vectors_c: Thesaurus
    :type tmpdir: py.path.local
    """
    from composes.semantic_space.space import Space

    prefix = str(tmpdir.join('output'))
    vectors_c.to_dissect_sparse_files(prefix)

    # every expected output must exist and be a regular file
    written = ['{}.{}'.format(prefix, suffix)
               for suffix in ['sm', 'rows', 'cols']]
    for outfile in written:
        assert os.path.exists(outfile)
        assert os.path.isfile(outfile)

    # reading the files back must give the matrix the thesaurus reports
    space = Space.build(data="{}.sm".format(prefix),
                        rows="{}.rows".format(prefix),
                        cols="{}.cols".format(prefix),
                        format="sm")
    matrix = space.cooccurrence_matrix.mat
    rows = space.id2row
    cols = space.id2column

    # to_sparse_matrix is unpacked as (matrix, columns, rows)
    exp_matrix, exp_cols, exp_rows = vectors_c.to_sparse_matrix()

    assert exp_cols == cols
    assert exp_rows == rows
    assert_array_equal(exp_matrix.A, matrix.A)

    _assert_matrix_of_thesaurus_c_is_as_expected(matrix.A, rows, cols)
    _assert_matrix_of_thesaurus_c_is_as_expected(exp_matrix.A, exp_rows, exp_cols)
def load_pkl_files(dsm_prefix):
    """
    Load the space from either a single pkl file or numerous files.

    If ``dsm_prefix + '.pkl'`` exists it is loaded directly.  Otherwise the
    matrix is read from ``dsm_prefix + 'cooc.npz'`` and the row/column
    mappings from the four ``dsm_prefix + '_<name>.pkl'`` files.

    :param dsm_prefix: path prefix shared by the stored files
    :return: the object stored in the single pickle, or a Space rebuilt
        from the separate files
    """
    # NOTE(review): both branches unpickle data; only use trusted files.
    single_file = dsm_prefix + '.pkl'
    if os.path.isfile(single_file):
        return io_utils.load(single_file)

    # Matrix: stored as COO triplets plus shape, converted to CSR.
    with np.load(dsm_prefix + 'cooc.npz') as loader:
        triplets = (loader['data'], (loader['row'], loader['col']))
        coo = coo_matrix(triplets, shape=loader['shape'])
        cooccurrence_matrix = SparseMatrix(csr_matrix(coo))

    # Remaining data members of Space, one pickle each.
    members = {}
    for member in ('row2id', 'id2row', 'column2id', 'id2column'):
        with open(dsm_prefix + '_' + member + '.pkl', 'rb') as f_in:
            members[member] = pickle.load(f_in)

    return Space(cooccurrence_matrix,
                 members['id2row'],
                 members['id2column'],
                 row2id=members['row2id'],
                 column2id=members['column2id'])
def main():
    """Convert a vecf file to a pickled ``Space``.

    The first input line holds the vocabulary size and the number of
    dimensions; every following line holds a word followed by its vector
    components.  The resulting space has the words as rows and no column
    labels.
    """
    parser = argparse.ArgumentParser(
        description="Converts a vecf file to dissect pkl format.")
    parser.add_argument('--input', '-i', type=argparse.FileType('r'),
                        help='Input file')
    # Pickle data is binary: on Python 3 dumping to a text-mode file fails,
    # so the output is opened in binary mode (harmless on Python 2).
    parser.add_argument('--output', '-o', type=argparse.FileType('wb'),
                        help='Output file')
    args = parser.parse_args()

    header = args.input.readline().rstrip()
    vocab_s, dims = map(int, header.split(" "))
    vocab = []

    # init matrix; ``np.float`` was only an alias of the builtin ``float``
    # and has been removed from NumPy.
    matrix = np.zeros((vocab_s, dims), dtype=float)

    for i, line in enumerate(args.input):
        data = line.split()
        vocab.append(data[0])
        # A list is used because ``map`` returns an iterator on Python 3,
        # which ``np.array`` would not expand into a vector.
        matrix[i] = [float(value) for value in data[1:]]
    args.input.close()

    dm = DenseMatrix(matrix)
    sp = Space(dm, vocab, [])
    pickle.dump(sp, args.output)
    args.output.close()
def main():
    """
    Transform EPMI matrix in npz format to SPPMI space and save as pickle file.

    The stored values are log-transformed, shifted by ``log(k)``, and all
    resulting non-positive entries are removed from the sparse matrix.

    Raises:
        ValueError: if ``<k>`` is not a positive integer.
    """

    # Get the arguments.  docopt parses the layout of this text, so the
    # "Usage:" section must stay on its own lines.
    args = docopt('''Transform EPMI matrix in npz format to SPPMI space and save as pickle file.

    Usage:
        transform_matrix_epmi2sppmi.py <spacePrefix> <outPath> <k>

        <spacePrefix> = path to npz without suffix
        <outPath> = output path for space
        <k> = shifting parameter

    ''')

    spacePrefix = args['<spacePrefix>']
    outPath = args['<outPath>']
    k = int(args['<k>'])
    if k <= 0:
        # log(k) is undefined for k <= 0 and would silently fill the matrix
        # with inf/nan values.
        raise ValueError("<k> must be a positive integer, got %d" % k)

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()

    # Get npz matrix (CSR components plus shape)
    with np.load(spacePrefix + '.npz') as loader:
        matrix = csr_matrix((loader['data'], loader['indices'], loader['indptr']), shape=loader['shape'])

    with open(spacePrefix + '.words.vocab') as f:
        id2row = [line.strip() for line in f if len(line) > 0]

    with open(spacePrefix + '.contexts.vocab') as f:
        id2column = [line.strip() for line in f if len(line) > 0]

    # Apply log weighting
    matrix.data = np.log(matrix.data)

    # Shift values
    matrix.data -= np.log(k)

    # Eliminate negative counts
    matrix.data[matrix.data <= 0] = 0.0

    # Eliminate zero counts
    matrix.eliminate_zeros()

    # Create new space
    sparseSpace = Space(SparseMatrix(matrix), id2row, id2column)

    # Save the Space object in pickle format
    save_pkl_files(sparseSpace, outPath + 'ppmi.sm', save_in_one_file=True)

    logging.info("--- %s seconds ---" % (time.time() - start_time))
def vstack(s1, s2):
    """Stack two spaces, treating a falsy operand as absent.

    Returns ``s2`` when ``s1`` is falsy, ``s1`` when only ``s2`` is falsy,
    and ``Space.vstack(s1, s2)`` otherwise.
    """
    if not s1:
        return s2
    return Space.vstack(s1, s2) if s2 else s1
def add_zero_idenity_matrix(matrix_space, vector_length):
    """Append a flattened zero matrix and identity matrix to ``matrix_space``.

    Two rows named "cg.zeromat" and "cg.identmat" are stacked under the
    given space; each row has ``vector_length * vector_length`` entries.

    Returns:
        The result of ``Space.vstack(matrix_space, <two-row space>)``.
    """
    flat_size = vector_length * vector_length
    flat_zero = np.zeros((1, flat_size))
    flat_identity = np.reshape(np.eye(vector_length), (1, flat_size))

    extra_matrix = DenseMatrix(np.vstack([flat_zero, flat_identity]))
    extra_space = Space(extra_matrix, ["cg.zeromat", "cg.identmat"], [])
    return Space.vstack(matrix_space, extra_space)
def build_raw_per_space(in_file_prefix, in_format, is_gz):
    """Load or build the raw space stored under ``in_file_prefix``.

    Args:
        in_file_prefix: path prefix; the data file is
            ``<prefix>.<in_format>`` (plus ``.gz`` when ``is_gz`` is set),
            optional row/column files are ``<prefix>.rows`` / ``<prefix>.cols``.
        in_format: one of "sm", "dm" or "pkl".
        is_gz: whether the data file is gzipped (ignored for "pkl").

    Returns:
        The space loaded from the pickle or built by ``Space.build``.

    Raises:
        ValueError: if ``in_format`` is not supported, or if the format is
            "sm" and the column file does not exist.
    """
    if in_format not in ("sm", "dm", "pkl"):
        raise ValueError("Invalid input format:%s" % in_format)

    data_file = "%s.%s" % (in_file_prefix, in_format)

    if in_format == "pkl":
        space = io_utils.load(data_file, Space)
    else:
        if is_gz:
            data_file = "%s.gz" % data_file
        row_file = "%s.rows" % (in_file_prefix)
        column_file = "%s.cols" % (in_file_prefix)

        # Row and column files are optional, except that the sparse format
        # cannot be read without a column file.
        if not os.path.exists(row_file):
            row_file = None
        if not os.path.exists(column_file):
            if in_format == "sm":
                raise ValueError("Column file: %s needs to be provided!"
                                 % column_file)
            column_file = None

        # Parenthesised single-argument print works on Python 2 and 3.
        print("Building matrix...")
        space = Space.build(data=data_file, rows=row_file, cols=column_file,
                            format=in_format)
    return space
def build_raw_per_space(in_file_prefix, in_format, is_gz):
    """Return the raw space found under ``in_file_prefix``.

    For "pkl" the pickled space is loaded.  For "sm" and "dm" the space is
    built from ``<prefix>.<format>`` (``.gz`` appended when ``is_gz``), using
    ``<prefix>.rows`` and ``<prefix>.cols`` when those files exist.

    Raises:
        ValueError: for an unsupported format, or when the format is "sm"
            and the column file is missing.
    """
    supported_formats = ("sm", "dm", "pkl")
    if in_format not in supported_formats:
        raise ValueError("Invalid input format:%s" % in_format)

    data_file = '%s.%s' % (in_file_prefix, in_format)
    if in_format == "pkl":
        return io_utils.load(data_file, Space)

    if is_gz:
        data_file = '%s.gz' % data_file

    row_file = '%s.rows' % (in_file_prefix)
    if not os.path.exists(row_file):
        row_file = None

    column_file = '%s.cols' % (in_file_prefix)
    if not os.path.exists(column_file):
        # Only the dense format can do without column labels.
        if in_format == "sm":
            raise ValueError("Column file: %s needs to be provided!" % column_file)
        column_file = None

    print("Building matrix...")
    return Space.build(data=data_file,
                       rows=row_file,
                       cols=column_file,
                       format=in_format)
def main():
    """Convert a VW topic output to a pickled ``Space``.

    Each input line holds the space-separated topic weights of one document;
    the non-empty lines of the docnames file provide the row labels.  The
    pickle is written to ``--output`` (standard output by default).
    """
    # The text is the program description; passed positionally it would be
    # taken as the program name (``prog``) instead.
    parser = argparse.ArgumentParser(
        description='Converts a VW topic output to a COMPOSES pkl file.')
    parser.add_argument('--input', '-i', type=argparse.FileType('r'),
                        help='Input file')
    parser.add_argument('--docnames', '-d', type=argparse.FileType('r'),
                        help='Docnames file')
    # Pickle data is binary, so a named output file is opened in binary mode.
    parser.add_argument('--output', '-o', type=argparse.FileType('wb'),
                        default=sys.stdout, help='Output file')
    args = parser.parse_args()

    docnames = [name for name in (raw.strip() for raw in args.docnames) if name]

    matrix = None
    for i, line in enumerate(args.input):
        # A list (not ``map``) so that ``len`` works on Python 3 as well.
        weights = [float(weight) for weight in line.strip().split(" ")]
        if matrix is None:
            # The first line fixes the number of columns.  ``np.float`` was
            # an alias of ``float`` and has been removed from NumPy.
            matrix = np.zeros((len(docnames), len(weights)), dtype=float)
        matrix[i] = weights

    dm = DenseMatrix(matrix)
    sp = Space(dm, docnames, [])

    if args.output is sys.stdout:
        # On Python 3 the binary pickle must go to the underlying byte
        # stream of stdout; Python 2's stdout has no ``buffer`` attribute.
        output = getattr(sys.stdout, 'buffer', sys.stdout)
        pickle.dump(sp, output)
        output.flush()
    else:
        pickle.dump(sp, args.output)
        args.output.close()
def read_mikolov(spacefile):
    """Read a binary word-vector file into a ``Space``.

    The header line holds the vocabulary size and the number of dimensions.
    Every record is a word, a single space, ``dims`` packed floats and one
    trailing byte that is dropped.

    Args:
        spacefile: open file object positioned at the header line.
            NOTE(review): the str-based parsing suggests this was written
            for Python 2 file objects -- confirm before using on Python 3.

    Returns:
        A ``Space`` with the words as rows and no column labels.

    Raises:
        ValueError: if a record contains no space separating word and data.
    """
    header = spacefile.readline().rstrip()
    vocab_s, dims = map(int, header.split(" "))

    vocab = []

    # init matrix; ``np.float`` was only an alias of the builtin ``float``
    # and has been removed from NumPy.
    matrix = np.zeros((vocab_s, dims), dtype=float)

    i = 0
    while True:
        line = spacefile.readline()
        if not line:
            break
        sep = line.find(" ")
        if sep == -1:
            raise ValueError(
                "Couldn't find the vocab/data separation character! Space file corruption?"
            )

        word = line[:sep]
        data = line[sep + 1:]
        # readline() may have returned before the end of the record;
        # read whatever is still missing from it.
        if len(data) < FLOAT_SIZE * dims + 1:
            data += spacefile.read(FLOAT_SIZE * dims + 1 - len(data))
        # Drop the byte that follows the packed floats.
        data = data[:-1]
        vocab.append(word)
        vector = (struct.unpack("%df" % dims, data))
        matrix[i] = vector
        i += 1

    dm = DenseMatrix(matrix)
    sp = Space(dm, vocab, [])
    return sp
def test_build_data_row_col(self):
    """Check Space.build when explicit row and column files are given.

    Each case holds the fixture name, the row file, the column file, the
    expected rows and columns, and the expected sparse and dense matrices.
    A ``None`` matrix means the corresponding build must raise ValueError.
    """
    fixtures = [
        ("data1", "row1.row", "col1.col", ["red"], ["man", "car"],
         np.mat([[5, 3]]), np.mat([[3, 5]])),
        ("data1", "row1.row", "col5.col", ["red"], ["man", "car"],
         np.mat([[5, 3]]), np.mat([[3, 5]])),
        ("data3", "row2.row", "col2.col", ["blue", "red"], ["car"],
         np.mat([[0], [15]]), None),
        ("data2", "row1.row", "col1.col", ["red"], ["man", "car"],
         np.mat([[0, 3]]), None),
        ("data3", "row3.row", "col3.col", ["blue", "red"], ["man", "car"],
         np.mat([[6, 0], [0, 15]]), np.mat([[0, 6], [5, 0]])),
        ("data7", "row2.row", "col3.col", ["blue", "red"], ["man", "car"],
         np.mat([[0, 0], [0, 0]]), None),
        ("data3", "row2.row", "col4.col", ["blue", "red"], ["airplane"],
         np.mat([[0], [0]]), None),
    ]

    for name, row_name, col_name, exp_rows, exp_cols, exp_sparse, exp_dense in fixtures:
        row_path = self.dir_ + row_name
        col_path = self.dir_ + col_name

        sparse_path = self.dir_ + name + ".sparse"
        if exp_sparse is not None:
            built = Space.build(data=sparse_path, rows=row_path,
                                cols=col_path, format="sm")
            self.assertListEqual(exp_rows, built.id2row)
            self.assertListEqual(exp_cols, built.id2column)
            self.assertIsInstance(built.cooccurrence_matrix, SparseMatrix)
            np.testing.assert_array_equal(
                exp_sparse, built.cooccurrence_matrix.mat.todense())
        else:
            self.assertRaises(ValueError, Space.build, data=sparse_path,
                              rows=row_path, cols=col_path, format="sm")

        dense_path = self.dir_ + name + ".dense"
        if exp_dense is not None:
            built = Space.build(data=dense_path, rows=row_path,
                                cols=col_path, format="dm")
            self.assertListEqual(exp_rows, built.id2row)
            self.assertListEqual(exp_cols, built.id2column)
            self.assertIsInstance(built.cooccurrence_matrix, DenseMatrix)
            np.testing.assert_array_equal(exp_dense,
                                          built.cooccurrence_matrix.mat)
        else:
            self.assertRaises(ValueError, Space.build, data=dense_path,
                              rows=row_path, cols=col_path, format="dm")
def learn_TENSOR_matrix ( ) :
    """Learn one matrix per adjective, then a function over those matrices.

    Step 1 trains a LexicalFunction for every adjective returned by
    ``extract_adj`` from the bigrams of the loaded space that start with
    that adjective.  Step 2 stacks the learned matrices into a new space and
    trains a second LexicalFunction ("tens_adj") on it, which is saved with
    ``save_space``.

    Reads the module-level names ``args`` and ``unigram_space``.
    NOTE(review): if ``extract_adj`` returns no adjectives, the ``pop()``
    calls below fail on empty lists -- confirm this cannot happen.
    """
    bigram_space = load_space(args.function[2])
    my_comp_list = []
    id2row_list = []
    adj_list = extract_adj(bigram_space)

    for adj in adj_list :
        train_data=[]

        # Collect every bigram whose first part is this adjective.
        for bigram in bigram_space.id2row :
            pair = bigram.split('_')
            if( not pair[0] == adj ) :
                continue
            train_data.append(("ADJ"+"_"+adj, pair[1], bigram)) # eg ( "ADJ_good", "boy", "good_boy"), where "ADJ_good" -> matrix to learn, boy -> unigram , good_boy -> bigram

        my_comp=LexicalFunction() # 1)
        #Learn ADJ matrix for each adjective
        my_comp.train(train_data, unigram_space, bigram_space)
        my_comp_list.append(my_comp.function_space.cooccurrence_matrix)
        id2row_list.append(my_comp.function_space.id2row)

    # Start from the last learned matrix; the remaining ones are stacked
    # below it.  my_mat_id2row is extended in place while stacking.
    my_mat_id2row=id2row_list.pop()
    my_mat_space=Space(my_comp_list.pop(),my_mat_id2row,[])

    #Create a new space using the ADJ matrices created
    for i in range(len(id2row_list)):
        my_mat_id2row.extend(id2row_list[i])
        my_mat_space=Space(my_mat_space.cooccurrence_matrix.vstack(my_comp_list[i]),my_mat_id2row,[])

    # Carry over the element shape of the last trained function space.
    my_mat_space._element_shape = my_comp.function_space.element_shape

    #Use the ADJ matrices space to learn the tensor matrix
    train_data=[('tens_adj',adj,"ADJ"+"_"+adj) for adj in adj_list] # eg ( "tens_adj", good, ADJ_good )
    #where "tens_adj" -> tensor matrix to learn, good -> unigram , ADJ_good -> adjective matrix learnt by 'my_comp' in 1)

    my_tens_adj=LexicalFunction()
    my_tens_adj.train(train_data, unigram_space, my_mat_space) # unigram_space -> for "good" , my_mat_space -> for "ADJ_good"
    save_space(my_tens_adj, "TENSOR_matrix", "matrices")
def add_one_zero_vector(core_space):
    """Stack an all-zeros row and an all-ones row onto ``core_space``.

    The two extra rows are labelled "cg.zerovec" and "cg.onevec" and have
    as many columns as the core space's co-occurrence matrix.

    Returns:
        The result of ``Space.vstack(core_space, <space with the two rows>)``.
    """
    width = core_space.cooccurrence_matrix.shape[1]
    extra_rows = np.vstack([np.zeros((1, width)), np.ones((1, width))])
    extra_space = Space(DenseMatrix(extra_rows), ["cg.zerovec", "cg.onevec"], [])
    return Space.vstack(core_space, extra_space)
def compose(self, data, arg_space):
    """
    Uses a composition model to compose elements.

    Args:
        data: data to be composed. List of tuples, each containing 3
        strings: (arg1, arg2, composed_phrase). arg1 and arg2 are the
        elements to be composed and composed_phrase is the string
        associated to their composition.

        arg_space: argument space(s). Space object or a tuple of two
        Space objects (e.g. my_space, or (my_space1, my_space2)).
        If two spaces are provided, arg1 elements of data are
        interpreted in space1, and arg2 in space2.

    Returns:
        composed space: a new object of type Space, containing the
        phrases obtained through composition.

    """
    start = time.time()

    arg1_space, arg2_space = self.extract_arg_spaces(arg_space)
    arg1_list, arg2_list, phrase_list = self.valid_data_to_lists(
        data, (arg1_space.row2id, arg2_space.row2id, None))
    total_points = len(arg1_list)

    # We try to achieve at most MAX_MEM_OVERHEAD * phrase_space memory
    # overhead.  The / 3.0 is needed because composing needs three vectors
    # per data point (arg1 vector, arg2 vector, phrase vector).
    largest = max(arg1_space.cooccurrence_matrix.shape[0],
                  arg2_space.cooccurrence_matrix.shape[0],
                  len(phrase_list))
    chunk_size = int(largest * self.MAX_MEM_OVERHEAD / 3.0) + 1

    composed_mats = []
    for beg in range(0, total_points, chunk_size):
        end = beg + chunk_size  # slicing clips at the end of the list
        arg1_mat = arg1_space.get_rows(arg1_list[beg:end])
        arg2_mat = arg2_space.get_rows(arg2_list[beg:end])

        [arg1_mat, arg2_mat] = resolve_type_conflict([arg1_mat, arg2_mat],
                                                     DenseMatrix)
        composed_mat = self._compose(arg1_mat, arg2_mat)
        composed_mats.append(composed_mat)

    # nary_vstack is reached through the last chunk's matrix object.
    composed_phrase_mat = composed_mat.nary_vstack(composed_mats)

    if self.composed_id2column is None:
        self.composed_id2column = self._build_id2column(arg1_space, arg2_space)

    log.print_name(logger, self, 1, "\nComposed with composition model:")
    # Report the number of data points over all chunks; arg1_mat only holds
    # the rows of the last chunk.
    log.print_info(logger, 3, "Composed total data points:%s" % total_points)
    log.print_matrix_info(logger, composed_phrase_mat, 4,
                          "Resulted (composed) semantic space::")
    log.print_time_info(logger, time.time(), start, 2)

    return Space(composed_phrase_mat, phrase_list, self.composed_id2column)
def test_init1(self):
    """A Space built from matrix and id lists alone.

    The matrix and the two id lists must be the very objects passed in,
    the lookup dictionaries must equal the expected ones, and the list of
    operations must be empty.
    """
    for matrix, rows, cols, row_index, col_index, _ops in self.init_test_cases:
        built = Space(matrix, rows, cols)

        self.assertIs(matrix, built.cooccurrence_matrix)
        self.assertIs(rows, built.id2row)
        self.assertIs(cols, built.id2column)

        self.assertDictEqual(row_index, built.row2id)
        self.assertDictEqual(col_index, built.column2id)

        self.assertListEqual([], built.operations)
def compose(self, data, arg_space):
    """
    Composes elements with a lexical function composition model.

    Args:
        data: list of 3-string tuples
        (function_word, arg, composed_phrase). function_word and arg are
        composed, and composed_phrase labels the result. function_word
        is looked up in self.function_space.

        arg_space: Space object in which the arg elements of data are
        looked up.

    Returns:
        composed space: a new Space object holding the composed phrases.

    """
    started_at = time.time()

    assert_is_instance(arg_space, Space)
    function_rows, argument_rows, phrase_list = self.valid_data_to_lists(
        data, (self._function_space.row2id, arg_space.row2id, None))

    composed_rows = []
    for function_word, argument in zip(function_rows, argument_rows):
        function_vec = self._function_space.get_row(function_word)
        argument_vec = arg_space.get_row(argument)

        # Bring both operands to the matrix type of the larger one.
        common_type = get_type_of_largest([function_vec, argument_vec])
        [function_vec, argument_vec] = resolve_type_conflict(
            [function_vec, argument_vec], common_type)

        composed_vec = self._compose(function_vec, argument_vec,
                                     self._function_space.element_shape)
        composed_rows.append(composed_vec)

    # The composed elements drop the last dimension of the function shape.
    result_element_shape = self._function_space.element_shape[0:-1]
    composed_ph_mat = composed_vec.nary_vstack(composed_rows)

    log.print_name(logger, self, 1, "\nComposed with composition model:")
    log.print_info(logger, 3,
                   "Composed total data points:%s" % len(function_rows))
    log.print_info(
        logger, 3,
        "Functional shape of the resulted (composed) elements:%s"
        % (result_element_shape, ))
    log.print_matrix_info(logger, composed_ph_mat, 4,
                          "Resulted (composed) semantic space:")
    log.print_time_info(logger, time.time(), started_at, 2)

    return Space(composed_ph_mat, phrase_list, self.composed_id2column,
                 element_shape=result_element_shape)
def setUp(self):
    """Create the matrices, labels and spaces shared by the tests."""
    # Row and column labels.
    self.row1 = ["a"]
    self.row2 = ["b"]
    self.row3 = ["a", "b", "c"]
    self.ft1 = ["f1", "f2", "f3"]

    # Single-row matrices.
    self.m1 = np.array([[1, 2, 3]])
    self.m2 = np.array([[4, 2, 6]])

    # Three-row matrix and the reference matrices that go with it.
    self.x = np.mat([[1, 2, 3],
                     [2, 4, 6],
                     [4, 675, 43]])
    self.us = np.mat([[2.19272110e+00, 3.03174768e+00],
                      [4.38544220e+00, 6.06349536e+00],
                      [6.76369708e+02, -4.91431927e-02]])
    self.us2 = np.mat([[2.19272110e+00],
                       [4.38544220e+00],
                       [6.76369708e+02]])

    # Spaces wrapping the matrices above.
    self.space1 = Space(DenseMatrix(self.m1), self.row1, self.ft1)
    self.space2 = Space(DenseMatrix(self.x), self.row3, self.ft1)
def test_simple_sparse(self):
    """Run the core-space builder on "mat1" with sparse input and output.

    Both the exported sparse files and the pickled space must compare
    equal to a space built directly from the input files.
    """
    base = self.dir_
    cli_args = [
        "build_core_space.py",
        "-l", base + "log1.txt",
        "-i", base + "mat1",
        "-o", base,
        "--input_format", "sm",
        "--output_format", "sm",
    ]
    bcs.main(cli_args)

    reference = Space.build(data=base + "mat1.sm",
                            cols=base + "mat1.cols",
                            format="sm")
    exported = Space.build(data=base + "CORE_SS.mat1.sm",
                           cols=base + "CORE_SS.mat1.cols",
                           format="sm")
    pickled = io_utils.load(base + "CORE_SS.mat1.pkl", Space)

    for candidate in (exported, pickled):
        self._test_equal_spaces_sparse(reference, candidate)
def test_init4(self):
    """A Space built with every argument supplied explicitly.

    Each attribute must be the very object that was passed to the
    constructor, including the lookup dictionaries and the operations.
    """
    for matrix, rows, cols, row_index, col_index, history in self.init_test_cases:
        built = Space(matrix, rows, cols, row_index, col_index,
                      operations=history)

        expected_identities = (
            (matrix, built.cooccurrence_matrix),
            (rows, built.id2row),
            (cols, built.id2column),
            (row_index, built.row2id),
            (col_index, built.column2id),
            (history, built.operations),
        )
        for given, stored in expected_identities:
            self.assertIs(given, stored)
def test_simple_sparse(self):
    """Run the peripheral-space builder on "mat1" with sparse input and output.

    The exported peripheral space must compare equal to a space built
    directly from the input files.
    """
    base = self.dir_
    cli_args = [
        "build_peripheral_space.py",
        "-l", base + "log1.txt",
        "-i", base + "mat1",
        "-o", base,
        "-c", base + "CORE_SS.mat1.pkl",
        "--input_format", "sm",
        "--output_format", "sm",
    ]
    bps.main(cli_args)

    reference = Space.build(data=base + "mat1.sm",
                            cols=base + "mat1.cols",
                            format="sm")
    exported = Space.build(data=base + "PER_SS.mat1.CORE_SS.mat1.sm",
                           cols=base + "PER_SS.mat1.CORE_SS.mat1.cols",
                           format="sm")

    self._test_equal_spaces_sparse(reference, exported)
def to_dissect_core_space(self):
    """
    Builds a composes.semantic_space.space.Space from this object

    :rtype: composes.semantic_space.space.Space
    """
    from composes.matrix.sparse_matrix import SparseMatrix
    from composes.semantic_space.space import Space

    raw_matrix, columns, row_names = self.to_sparse_matrix()
    space = Space(SparseMatrix(raw_matrix), row_names, columns)

    # Sanity check on at most the first ten rows: the vector stored in the
    # new space must match the one obtained by vectorising this object's
    # own entry for the same row name.
    checked = min(10, len(self))
    for position in range(checked):
        row_name = row_names[position]
        from_space = space.get_row(row_name).mat
        from_self = self.v.transform(dict(self[row_name]))
        # sparse matrices do not currently support equality testing, so
        # count the non-zero entries of the absolute difference instead
        difference = abs(from_space - from_self)
        assert difference.nnz == 0

    return space
def fit(self, train_pairs, verbose=False):
    """Fit the additive base model, then train the weighted additive model.

    Args:
        train_pairs: iterable of ``(base, derived)`` pairs.
        verbose: when true, print a progress message and pass the flag on
            to ``AdditiveModel.fit``.

    Side effects:
        Sets ``self.new_space`` to ``self.space`` stacked with one extra row
        named 'pattern_vector' holding ``self.diff_vector``, and trains
        ``self.weighted_additive`` on that space.
    """
    AdditiveModel.fit(self, train_pairs, verbose=verbose)
    if verbose:
        # Single-argument call form: same output on Python 2 and Python 3.
        print('fit: Fitting a weighted additive model on %d pairs' % len(train_pairs))

    # First, we embed the derived vector into the original space
    # (by simply adding a row).
    vec_space = Space(self.diff_vector, ['pattern_vector'], [])
    self.new_space = Space.vstack(self.space, vec_space)

    # The class is designed to be run on a dataset with different function
    # words (== patterns). We use a dummy function word here.
    train_pairs_ext = [(base, 'pattern_vector', derived)
                       for (base, derived) in train_pairs]
    self.weighted_additive.train(train_pairs_ext, self.new_space, self.new_space)
def build_unigram_space():
    """Build, transform and save the unigram space.

    The sparse data, row and column files are taken from
    ``args.function[3]``, ``args.function[2]`` and ``args.function[1]``.
    The raw space goes through ``ppmi``, ``norm`` and ``svd`` in that
    order; the result is saved as "unigrams_space" and returned.
    """
    raw_space = Space.build(
        data=args.function[3],
        rows=args.function[2],
        cols=args.function[1],
        format="sm",
    )
    reduced_space = svd(norm(ppmi(raw_space)))
    save_space(reduced_space, "unigrams_space")
    return reduced_space
def write_pkl(self):
    """
    Create spaces from co-occurrence counts in sparse format (.sm)
    and save them as pickle files.

    One space is built per direction (DE-EN and EN-DE); both use the same
    column file. A confirmation line per pickle is written to stderr.
    """
    # For direction DE-EN
    my_space_1 = Space.build(
        data=OUTPUT_FILE_DE_DE_EN_SM,
        rows=OUTPUT_FILE_DE_WORDS_ROW,
        cols=OUTPUT_FILE_DE_EN_WORDS_COL,
        format="sm"
    )

    # For direction EN-DE
    my_space_2 = Space.build(
        data=OUTPUT_FILE_EN_EN_DE_SM,
        rows=OUTPUT_FILE_EN_WORDS_ROW,
        cols=OUTPUT_FILE_DE_EN_WORDS_COL,
        format="sm"
    )

    # Save the space objects in pickle format
    io_utils.save(my_space_1, OUTPUT_FILE_DE_DE_EN_PKL)
    io_utils.save(my_space_2, OUTPUT_FILE_EN_EN_DE_PKL)

    # stderr.write works on Python 2 and 3 alike; the ``print >> stderr``
    # statement form only works on Python 2.
    stderr.write("Pickle file 1 written out: %s\n" % (OUTPUT_FILE_DE_DE_EN_PKL,))
    stderr.write("Pickle file 2 written out: %s\n" % (OUTPUT_FILE_EN_EN_DE_PKL,))
def test_simple_load(self):
    """Apply a previously saved lexical-function model through the CLI.

    The dense space exported by the tool must compare equal to the space
    built from "AN_mat.dm".
    """
    base = self.dir_
    cli_args = [
        "apply_composition.py",
        "-l", base + "log1.txt",
        "-i", base + "an_train_data.txt",
        "-o", base,
        "--load_model", base + "TRAINED_COMP_MODEL.lexical_func.an_train_data.txt.pkl",
        "-a", base + "CORE_SS.N_mat.pkl",
        "--output_format", "dm",
    ]
    ac.main(cli_args)

    composed = Space.build(
        data=base + "COMPOSED_SS.LexicalFunction.an_train_data.txt.dm",
        format="dm")
    expected = Space.build(data=base + "AN_mat.dm", format="dm")

    self._test_equal_spaces_dense(composed, expected)
def inspect_representations(path_composed_emb, output_path, max_words=1000,
                            n_neighbours=10):
    """Write the nearest neighbours of words in a composed space to a file.

    Args:
        path_composed_emb: path of the dense-format ("dm") space to load.
        output_path: path of the UTF-8 text file to write.
        max_words: number of words (in the iteration order of the space's
            row2id mapping) to report on. Defaults to 1000.
        n_neighbours: number of neighbours requested per word, using
            cosine similarity. Defaults to 10.
    """
    print('Inspecting representations...')
    composed_space = Space.build(data=path_composed_emb, format='dm')

    # Only the first max_words words are reported, so slice up front
    # instead of walking the whole vocabulary.
    word_list = list(composed_space.get_row2id())[:max_words]

    # The with-block closes the file even if a neighbour lookup raises.
    with codecs.open(output_path, 'w', 'utf8') as f:
        for w in word_list:
            neighbours = composed_space.get_neighbours(w, n_neighbours,
                                                       CosSimilarity())
            f.write('Neighbours for ' + w + '\n')
            f.write("\n".join('%s %.6f' % x for x in neighbours))
            f.write('\n----------------------------\n')
def test_simple_nmf(self):
    """Run the core-space builder on "mat3" with raw weighting and nmf_2.

    The exported dense space must have a 3 x 2 matrix.
    """
    base = self.dir_
    cli_args = [
        "build_core_space.py",
        "-l", base + "log_nmf.txt",
        "-i", base + "mat3",
        "-w", "raw",
        "-r", "nmf_2",
        "-o", base,
        "--input_format", "dm",
        "--output_format", "dm",
    ]
    bcs.main(cli_args)

    reduced = Space.build(data=base + "CORE_SS.mat3.raw.nmf_2.dm", format="dm")
    self.assertEqual(reduced.cooccurrence_matrix.mat.shape, (3, 2))
def _relabel_rows(space, suffix):
    """Append ``suffix`` to every row label of ``space``, in place."""
    relabelled = [row + suffix for row in space.id2row]
    # The private fields are overwritten directly, so the label list and
    # the label-to-index mapping have to be replaced together.
    space._id2row = relabelled
    space._row2id = list2dict(relabelled)


def train_all_spaces(core_space, an_dn_space, pn_space, sv_space, vo_space):
    """Train the four function spaces and stack them into a single space.

    Args:
        core_space: argument space; row-normalized before training.
        an_dn_space: phrase space used for the adjective/determiner step.
        pn_space: phrase space used for the preposition step.
        sv_space: phrase space used for the subject-verb step.
        vo_space: phrase space used for the verb-object step.

    Returns:
        The stacked space. Rows of the verb-object space get the suffix
        ".objmat" and rows of the subject-verb space get ".subjmat"
        before stacking.
    """
    core_space = core_space.apply(RowNormalization())

    # Single-argument print calls give the same output on Python 2 and 3.
    print("train adj, det")
    a_d_space = train_one_space(core_space, an_dn_space, 0, 3)
    print("train prep")
    prep_space = train_one_space(core_space, pn_space, 1, 3)
    print("train vo")
    v_obj_space = train_one_space(core_space, vo_space, 0, 4)
    print("train sv")
    v_subj_space = train_one_space(core_space, sv_space, 1, 4)

    _relabel_rows(v_obj_space, ".objmat")
    _relabel_rows(v_subj_space, ".subjmat")

    all_mat_space = Space.vstack(a_d_space, prep_space)
    all_mat_space = Space.vstack(v_obj_space, all_mat_space)
    all_mat_space = Space.vstack(v_subj_space, all_mat_space)

    return all_mat_space
def test_build_data_row(self):
    """Check ``Space.build`` when a row file is given.

    Each scenario is ``(data stem, row file, expected rows, expected
    columns, expected sparse matrix, expected dense matrix)``. The sparse
    build also receives the "<stem>.cols" column file; the dense build
    gets no column file and is expected to report no column labels.
    """
    scenarios = [
        ("data1", "row1.row", ["red"], ["car", "man"],
         np.mat([[3, 5]]), np.mat([[3, 5]])),
        ("data2", "row1.row", ["red"], ["car"],
         np.mat([[3]]), np.mat([[3]])),
        ("data3", "row2.row", ["blue", "red"], ["car", "man"],
         np.mat([[0, 6], [15, 0]]), np.mat([[0, 6], [5, 0]])),
        ("data3", "row3.row", ["blue", "red"], ["car", "man"],
         np.mat([[0, 6], [15, 0]]), np.mat([[0, 6], [5, 0]])),
        ("data7", "row2.row", ["blue", "red"], ["car"],
         np.mat([[0], [0]]), np.mat([[0], [0]])),
    ]

    for stem, row_name, want_rows, want_cols, want_sm, want_dm in scenarios:
        row_path = self.dir_ + row_name
        stem_path = self.dir_ + stem

        # Sparse input: column labels come from the .cols file.
        sparse_space = Space.build(data=stem_path + ".sparse",
                                   rows=row_path,
                                   cols=stem_path + ".cols",
                                   format="sm")
        self.assertListEqual(want_rows, sparse_space.id2row)
        self.assertListEqual(want_cols, sparse_space.id2column)
        self.assertIsInstance(sparse_space.cooccurrence_matrix, SparseMatrix)
        np.testing.assert_array_equal(
            want_sm, sparse_space.cooccurrence_matrix.mat.todense())

        # Dense input: no column file is supplied.
        dense_space = Space.build(data=stem_path + ".dense",
                                  rows=row_path,
                                  format="dm")
        self.assertListEqual(want_rows, dense_space.id2row)
        self.assertListEqual([], dense_space.id2column)
        self.assertIsInstance(dense_space.cooccurrence_matrix, DenseMatrix)
        np.testing.assert_array_equal(
            want_dm, dense_space.cooccurrence_matrix.mat)
def build_spaces(in_file_prefix, in_format, out_dir, out_format, weightings,
                 selections, reductions, normalizations, is_gz):
    """Load or build a space, then print every requested transformation of it.

    Args:
        in_file_prefix: path prefix of the input; the data file is
            "<prefix>.<in_format>" and the optional label files are
            "<prefix>.rows" and "<prefix>.cols".
        in_format: one of "sm", "dm" or "pkl".
        out_dir: directory passed on to ``print_space``.
        out_format: output format passed on to ``print_space``.
        weightings, selections, reductions, normalizations: iterables of
            transformation specifications; every combination is applied
            in that order.
        is_gz: when true (and the input is not a pickle), ".gz" is
            appended to the data file name.

    Raises:
        ValueError: if ``in_format`` is not supported, or if the input is
            in "sm" format and the column file does not exist.
    """
    in_file_descr = "CORE_SS." + in_file_prefix.split("/")[-1]
    data_file = '%s.%s' % (in_file_prefix, in_format)

    if in_format not in ("sm", "dm", "pkl"):
        raise ValueError("Invalid input format:%s" % in_format)

    if in_format == "pkl":
        space = io_utils.load(data_file, Space)
    else:
        if is_gz:
            data_file = '%s.gz' % data_file
        row_file = '%s.rows' % (in_file_prefix)
        column_file = '%s.cols' % (in_file_prefix)

        # Label files are optional, except the column file for sparse input.
        if not os.path.exists(row_file):
            row_file = None
        if not os.path.exists(column_file):
            if in_format == "sm":
                raise ValueError("Column file: %s needs to be provided!"
                                 % column_file)
            column_file = None

        # Single-argument print calls give the same output on Python 2 and 3.
        print("Building matrix...")
        space = Space.build(data=data_file, rows=row_file, cols=column_file,
                            format=in_format)

    # Loops stay nested (rather than one flat product) so that each
    # intermediate space is computed once and shared by the inner levels.
    for w in weightings:
        w_space = apply_weighting(space, w)
        for s in selections:
            s_space = apply_selection(w_space, s)
            for r in reductions:
                r_space = apply_reduction(s_space, r)
                for n in normalizations:
                    n_space = apply_normalization(r_space, n)
                    print("Printing...")
                    print_space(n_space, out_dir,
                                [in_file_descr, w, s, r, n], out_format)
from composes.semantic_space.space import Space
from composes.utils import io_utils
from composes.transformation.scaling.ppmi_weighting import PpmiWeighting
from composes.transformation.scaling.row_normalization import RowNormalization
from composes.transformation.dim_reduction.svd import Svd
import sys

# Build a space from the sparse co-occurrence files named after the first
# command-line argument, then weight (PPMI), row-normalize and reduce it
# (SVD, 1500 dimensions) in one chain.
my_space = (
    Space.build(
        data="../data/" + sys.argv[1] + ".sm",
        rows="../data/" + sys.argv[1] + ".rows",
        cols="../data/" + sys.argv[1] + ".cols",
        format="sm",
    )
    .apply(PpmiWeighting())
    .apply(RowNormalization())
    .apply(Svd(1500))
)

# Write the result out twice: as a dense-format export and as a pickle.
my_space.export("../spaces/" + sys.argv[1], format="dm")
io_utils.save(my_space, "../spaces/" + sys.argv[1] + ".pkl")
from subprocess import Popen, PIPE import os import time usage = """ Usage: python dissect.py dissect_format_file_name dissect_format_file_name: path to a file containing dissect format """ CMD_EXTRACTOR_SCRIPT = '~/Programming/terminology_extractor/extract_patterns.py' file_name = sys.argv[1] my_space = Space.build(data = file_name+".sm", rows = file_name+".rows", cols = file_name+".cols", format = "sm") my_space = my_space.apply(PpmiWeighting()) # print my_space.get_sim("spain", "netherlands", CosSimilarity()) # print my_space.get_neighbours('parenchymopbouw', 4, CosSimilarity()) # print my_space.get_neighbours('pension-n', 4, CosSimilarity()) # print my_space.id2row def prettify(elem): """ Return a pretty-printed XML string for the Element. """ rough_string = ElementTree.tostring(elem, 'utf-8') reparsed = minidom.parseString(rough_string)