def compute_neighbours(in_file, no_neighbours, out_dir, sim_measure, space_files):

    sim_dict = {"cos": CosSimilarity(),
                "lin": LinSimilarity(),
                "dot_prod": DotProdSimilarity(),
                "euclidean": EuclideanSimilarity()}

    if not sim_measure in sim_dict:
        raise ValueError("Similarity measure:%s not defined" % sim_measure)

    space = io_utils.load(space_files[0], Space)
    space2 = None
    space_descr = ".".join(space_files[0].split("/")[-1].split(".")[0:-1])
    if len(space_files) == 2:
        space2 = io_utils.load(space_files[1], Space)
        space_descr = ".".join([space_descr] + space_files[1].split("/")[-1].split(".")[0:-1])

    sim = sim_dict[sim_measure]

    descr = ".".join(["NEIGHBOURS", in_file.split("/")[-1], space_descr])
    out_file = '%s/%s.%s' % (out_dir, descr, sim_measure)
    io_utils.create_parent_directories(out_file)

    data = io_utils.read_list(in_file)

    print("Computing neighbours: %s" % sim_measure)
    with open(out_file, "w") as out_stream:
        for word in data:
            out_stream.write("%s\n" % word)
            result = space.get_neighbours(word, no_neighbours, sim, space2)
            for neighbour, neighbour_sim in result:
                out_stream.write("\t%s %s\n" % (neighbour, neighbour_sim))
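# A minimal usage sketch for the function above (hypothetical file names;
# assumes word_list.txt holds one target word per line and space.pkl is a
# pickled Space):
#   compute_neighbours("word_list.txt", 10, "./out", "cos", ["space.pkl"])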
def apply_model(in_file, out_dir, model, trained_model, arg_space_files,
                alpha, beta, lambda_, out_format):

    print("Reading in data...")
    in_descr = in_file.split("/")[-1]

    if not model is None:
        model_obj = create_model(model, alpha, beta, lambda_)
    else:
        model_obj = io_utils.load(trained_model, CompositionModel)
    model_descr = type(model_obj).__name__

    arg_space = io_utils.load(arg_space_files[0], Space)
    arg_space2 = None
    if len(arg_space_files) == 2:
        arg_space2 = io_utils.load(arg_space_files[1], Space)

    data = io_utils.read_tuple_list(in_file, fields=[0, 1, 2])

    print("Applying composition model:%s" % model_descr)
    if arg_space2 is None or type(model_obj) is LexicalFunction:
        composed_space = model_obj.compose(data, arg_space)
    else:
        composed_space = model_obj.compose(data, (arg_space, arg_space2))

    print("Printing...")
    out_file = ".".join([out_dir + "/COMPOSED_SS", model_descr, in_descr])
    io_utils.save(composed_space, "%s.pkl" % out_file)

    if not out_format is None:
        composed_space.export(out_file, format=out_format)
def train_model(in_file, out_dir, model, arg_space_files, phrase_space_file,
                regression, crossvalid, intercept, param, param_range, export_params):

    print "Reading in data..."
    in_descr = in_file.split("/")[-1]

    model_dict = {"weighted_add": WeightedAdditive,
                  "full_add": FullAdditive,
                  "lexical_func": LexicalFunction,
                  "dilation": Dilation}

    learner_dict = {"ridge": RidgeRegressionLearner,
                    "lstsq": LstsqRegressionLearner}

    arg_space = io_utils.load(arg_space_files[0], Space)
    arg_space2 = None
    if len(arg_space_files) == 2:
        arg_space2 = io_utils.load(arg_space_files[1], Space)
    phrase_space = io_utils.load(phrase_space_file, Space)

    if not model in model_dict:
        raise ValueError("Invalid model:%s for training" % model)
    model_cls = model_dict[model]

    if model_cls in (WeightedAdditive, Dilation):
        model_obj = model_cls()
    else:
        if regression == "ridge":
            regression_obj = learner_dict[regression](crossvalidation=crossvalid,
                                                      intercept=intercept,
                                                      param=param,
                                                      param_range=param_range)
            model_obj = model_cls(learner=regression_obj)
        elif regression == "lstsq":
            regression_obj = learner_dict[regression](intercept=intercept)
            model_obj = model_cls(learner=regression_obj)
        else:
            model_obj = model_cls()

    train_data = io_utils.read_tuple_list(in_file, fields=[0, 1, 2])

    print "Training %s model" % model
    if arg_space2 is None or model == "lexical_func":
        model_obj.train(train_data, arg_space, phrase_space)
    else:
        model_obj.train(train_data, (arg_space, arg_space2), phrase_space)

    print "Printing..."
    out_file = ".".join([out_dir + "/TRAINED_COMP_MODEL", model, in_descr])
    io_utils.save(model_obj, "%s.pkl" % out_file)

    if export_params:
        model_obj.export("%s.params" % out_file)
def test_simple_dense(self):
    bcs.main(["build_core_space.py",
              "-l", self.dir_ + "log1.txt",
              "-i", self.dir_ + "mat2",
              "-o", self.dir_,
              "--input_format", "dm",
              "--output_format", "dm"])

    s1 = Space.build(data=self.dir_ + "mat2.dm", format="dm")
    s2 = Space.build(data=self.dir_ + "CORE_SS.mat2.dm", format="dm")
    s3 = io_utils.load(self.dir_ + "CORE_SS.mat2.pkl", Space)

    self._test_equal_spaces_dense(s1, s2)
    self._test_equal_spaces_dense(s1, s3)

    bcs.main(["build_core_space.py",
              "-l", self.dir_ + "log1.txt",
              "-i", self.dir_ + "CORE_SS.mat2",
              "-o", self.dir_,
              "--input_format", "pkl",
              "--output_format", "dm"])

    s1 = io_utils.load(self.dir_ + "CORE_SS.CORE_SS.mat2.pkl", Space)
    s3 = io_utils.load(self.dir_ + "CORE_SS.mat2.pkl", Space)

    self._test_equal_spaces_dense(s1, s3)
def functionneighbours(words, number):
    # load a space
    if sys.argv[2] == 'full':
        my_space = io_utils.load("./data/out/thesisfull.pkl")
    if sys.argv[2] == 'nonzero':
        my_space = io_utils.load("./data/out/thesis.pkl")
    return(my_space.get_neighbours(words, number, CosSimilarity()))
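# Hypothetical usage sketch, assuming the script was started with 'full' or
# 'nonzero' as its second command-line argument and the pickled spaces above
# exist; get_neighbours returns (neighbour, similarity) pairs:
#   for neighbour, sim in functionneighbours("car-n", 5):
#       print(neighbour, sim)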
def test_as_conversion_tool(self): bcs.main(["build_core_space.py", "-i", self.dir_ + "mat3", "-o", self.dir_, "--input_format", "sm", "--output_format", "sm" ]) s1 = Space.build(data=self.dir_ + "mat3.sm", cols= self.dir_ + "mat3.cols", format = "sm") s2 = Space.build(data=self.dir_ + "CORE_SS.mat3.sm", rows=self.dir_ + "CORE_SS.mat3.rows", cols=self.dir_ + "CORE_SS.mat3.cols", format="sm") s3 = io_utils.load(self.dir_ + "CORE_SS.mat3.pkl", Space) self._test_equal_spaces_sparse(s1, s2) self._test_equal_spaces_sparse(s1, s3) bcs.main(["build_core_space.py", "-i", self.dir_ + "mat3", "-o", self.dir_, "--input_format", "sm", "--output_format", "dm" ]) s1 = Space.build(data=self.dir_ + "mat3.dm", cols=self.dir_ + "CORE_SS.mat3.cols", format = "dm") s2 = Space.build(data=self.dir_ + "CORE_SS.mat3.dm", rows=self.dir_ + "CORE_SS.mat3.rows", cols=self.dir_ + "CORE_SS.mat3.cols", format = "dm") s3 = io_utils.load(self.dir_ + "CORE_SS.mat3.pkl", Space) self._test_equal_spaces_dense(s1, s2) s3.to_dense() self._test_equal_spaces_dense(s1, s3) bcs.main(["build_core_space.py", "-i", self.dir_ + "mat3", "-o", self.dir_, "--input_format", "dm", "--output_format", "dm" ]) s1 = Space.build(data=self.dir_ + "CORE_SS.mat3.dm", cols=self.dir_ + "CORE_SS.mat3.cols", format = "dm") s3 = io_utils.load(self.dir_ + "CORE_SS.mat3.pkl", Space) s3.to_dense() self._test_equal_spaces_dense(s1, s3)
def test_simple_lstsq_no_inter(self): tc.main(["train_composition.py", "-l", self.dir_ + "log1.txt", "-i", self.dir_ + "an_train_data.txt", "-o", self.dir_, "-m", "lexical_func", "-p", self.dir_ + "CORE_SS.AN_mat.pkl", "-a", self.dir_ + "CORE_SS.N_mat.pkl", "-r", "lstsq", "--intercept", "False", "--export_params", "True" ]) trained = io_utils.load(self.dir_ + "TRAINED_COMP_MODEL.lexical_func.an_train_data.txt.pkl") new_space = trained.function_space np.testing.assert_array_almost_equal(new_space.cooccurrence_matrix.mat, np.mat([1,0,0,1]), 10) self.assertTupleEqual(new_space.element_shape, (2,2)) self.assertListEqual(new_space.id2row, ["big"]) self.assertListEqual(new_space.id2column, []) a_space = Space.build(data=self.dir_ + "TRAINED_COMP_MODEL.lexical_func.an_train_data.txt.params.dm", format="dm") self._test_equal_spaces_dense(a_space, new_space) tc.main(["train_composition.py", "-l", self.dir_ + "log1.txt", "-i", self.dir_ + "an_train_data.txt", "-o", self.dir_, "-m", "lexical_func", "-p", self.dir_ + "CORE_SS.AN_mat.pkl", "-a", self.dir_ + "CORE_SS.N_mat.pkl", "-r", "ridge", "--lambda", "0", "--crossvalidation", "False", "--intercept", "False", "--export_params", "True" ]) trained = io_utils.load(self.dir_ + "TRAINED_COMP_MODEL.lexical_func.an_train_data.txt.pkl") new_space2 = trained.function_space np.testing.assert_array_almost_equal(new_space2.cooccurrence_matrix.mat, np.mat([1,0,0,1]), 10) self.assertTupleEqual(new_space2.element_shape, (2,2)) self.assertListEqual(new_space2.id2row, ["big"]) self.assertListEqual(new_space2.id2column, []) a_space = Space.build(data=self.dir_ + "TRAINED_COMP_MODEL.lexical_func.an_train_data.txt.params.dm", format="dm") self._test_equal_spaces_dense(a_space, new_space2)
def test_simple_lstsq_no_inter(self): tc.main([ "train_composition.py", "-l", self.dir_ + "log1.txt", "-i", self.dir_ + "an_train_data.txt", "-o", self.dir_, "-m", "lexical_func", "-p", self.dir_ + "CORE_SS.AN_mat.pkl", "-a", self.dir_ + "CORE_SS.N_mat.pkl", "-r", "lstsq", "--intercept", "False", "--export_params", "True" ]) trained = io_utils.load( self.dir_ + "TRAINED_COMP_MODEL.lexical_func.an_train_data.txt.pkl") new_space = trained.function_space np.testing.assert_array_almost_equal(new_space.cooccurrence_matrix.mat, np.mat([1, 0, 0, 1]), 10) self.assertTupleEqual(new_space.element_shape, (2, 2)) self.assertListEqual(new_space.id2row, ["big"]) self.assertListEqual(new_space.id2column, []) a_space = Space.build( data=self.dir_ + "TRAINED_COMP_MODEL.lexical_func.an_train_data.txt.params.dm", format="dm") self._test_equal_spaces_dense(a_space, new_space) tc.main([ "train_composition.py", "-l", self.dir_ + "log1.txt", "-i", self.dir_ + "an_train_data.txt", "-o", self.dir_, "-m", "lexical_func", "-p", self.dir_ + "CORE_SS.AN_mat.pkl", "-a", self.dir_ + "CORE_SS.N_mat.pkl", "-r", "ridge", "--lambda", "0", "--crossvalidation", "False", "--intercept", "False", "--export_params", "True" ]) trained = io_utils.load( self.dir_ + "TRAINED_COMP_MODEL.lexical_func.an_train_data.txt.pkl") new_space2 = trained.function_space np.testing.assert_array_almost_equal( new_space2.cooccurrence_matrix.mat, np.mat([1, 0, 0, 1]), 10) self.assertTupleEqual(new_space2.element_shape, (2, 2)) self.assertListEqual(new_space2.id2row, ["big"]) self.assertListEqual(new_space2.id2column, []) a_space = Space.build( data=self.dir_ + "TRAINED_COMP_MODEL.lexical_func.an_train_data.txt.params.dm", format="dm") self._test_equal_spaces_dense(a_space, new_space2)
def compute_sim(in_file, columns, out_dir, sim_measures, space_files):

    sim_dict = {"cos": CosSimilarity(),
                "lin": LinSimilarity(),
                "dot_prod": DotProdSimilarity(),
                "euclidean": EuclideanSimilarity()}

    if not len(columns) == 2:
        raise ValueError("Column description unrecognized!")
    col0 = int(columns[0]) - 1
    col1 = int(columns[1]) - 1

    try:
        space = io_utils.load(space_files[0], Space)
    except TypeError:
        warn("Not a Space instance in file: %s" % space_files[0])
        return

    space2 = None
    space_descr = ".".join(space_files[0].split("/")[-1].split(".")[0:-1])
    if len(space_files) == 2:
        space2 = io_utils.load(space_files[1], Space)
        space_descr = ".".join([space_descr] + space_files[1].split("/")[-1].split(".")[0:-1])

    descr = ".".join(["SIMS", in_file.split("/")[-1], space_descr])

    for sim_measure in sim_measures:
        print("Computing similarities: %s" % sim_measure)
        if not sim_measure in sim_dict:
            warn("Similarity measure:%s not defined" % sim_measure)
            continue

        sim = sim_dict[sim_measure]
        out_file = '%s/%s.%s' % (out_dir, descr, sim_measure)
        io_utils.create_parent_directories(out_file)

        with open(in_file) as in_stream, open(out_file, "w") as out_stream:
            for line in in_stream:
                if not line.strip() == "":
                    elems = line.strip().split()
                    word1 = elems[col0]
                    word2 = elems[col1]

                    predicted_sim = space.get_sim(word1, word2, sim, space2)
                    out_stream.write("%s %s\n" % (line.strip(), str(predicted_sim)))
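# A minimal usage sketch (hypothetical paths; the two column values are the
# 1-based positions of the words in each line of word_pairs.txt):
#   compute_sim("word_pairs.txt", ["1", "2"], "./out", ["cos"], ["space.pkl"])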
def test_as_conversion_tool(self): bcs.main([ "build_core_space.py", "-i", self.dir_ + "mat3", "-o", self.dir_, "--input_format", "sm", "--output_format", "sm" ]) s1 = Space.build(data=self.dir_ + "mat3.sm", cols=self.dir_ + "mat3.cols", format="sm") s2 = Space.build(data=self.dir_ + "CORE_SS.mat3.sm", rows=self.dir_ + "CORE_SS.mat3.rows", cols=self.dir_ + "CORE_SS.mat3.cols", format="sm") s3 = io_utils.load(self.dir_ + "CORE_SS.mat3.pkl", Space) self._test_equal_spaces_sparse(s1, s2) self._test_equal_spaces_sparse(s1, s3) bcs.main([ "build_core_space.py", "-i", self.dir_ + "mat3", "-o", self.dir_, "--input_format", "sm", "--output_format", "dm" ]) s1 = Space.build(data=self.dir_ + "mat3.dm", cols=self.dir_ + "CORE_SS.mat3.cols", format="dm") s2 = Space.build(data=self.dir_ + "CORE_SS.mat3.dm", rows=self.dir_ + "CORE_SS.mat3.rows", cols=self.dir_ + "CORE_SS.mat3.cols", format="dm") s3 = io_utils.load(self.dir_ + "CORE_SS.mat3.pkl", Space) self._test_equal_spaces_dense(s1, s2) s3.to_dense() self._test_equal_spaces_dense(s1, s3) bcs.main([ "build_core_space.py", "-i", self.dir_ + "mat3", "-o", self.dir_, "--input_format", "dm", "--output_format", "dm" ]) s1 = Space.build(data=self.dir_ + "CORE_SS.mat3.dm", cols=self.dir_ + "CORE_SS.mat3.cols", format="dm") s3 = io_utils.load(self.dir_ + "CORE_SS.mat3.pkl", Space) s3.to_dense() self._test_equal_spaces_dense(s1, s3)
def test_simple_sparse_zipped(self):
    bcs.main(["build_core_space.py",
              "-l", self.dir_ + "log1.txt",
              "-i", self.dir_ + "mat1",
              "-o", self.dir_,
              "--input_format", "sm",
              "--output_format", "sm",
              "--gz", "True"])

    s1 = Space.build(data=self.dir_ + "mat1.sm.gz",
                     cols=self.dir_ + "mat1.cols",
                     format="sm")
    s2 = Space.build(data=self.dir_ + "CORE_SS.mat1.sm",
                     cols=self.dir_ + "CORE_SS.mat1.cols",
                     format="sm")
    s3 = io_utils.load(self.dir_ + "CORE_SS.mat1.pkl", Space)
    s4 = Space.build(data=self.dir_ + "mat1.sm",
                     cols=self.dir_ + "mat1.cols",
                     format="sm")

    self._test_equal_spaces_sparse(s1, s2)
    self._test_equal_spaces_sparse(s1, s3)
    self._test_equal_spaces_sparse(s1, s4)
def main():
    pairs_file = sys.argv[1]
    model_id = sys.argv[2]
    space_id = sys.argv[3]
    results_dir = sys.argv[4]

    pairs_df = pd.read_csv(pairs_file, sep=' ')

    space_file = {
        'cbow-w2': 'cbow/cbow_300dim_hs0/sdewac.300.cbow.hs0.w2.vsm.pkl',
        'cbow-w5': 'cbow/cbow_300dim_hs0/sdewac.300.cbow.hs0.w5.vsm.pkl',
        'cbow-w10': 'cbow/cbow_300dim_hs0/sdewac.300.cbow.hs0.w10.vsm.pkl',
        'ppmi': 'count-based/sdewac_2015-11-23/sdewac-mst.prepro.bow-c10k-w5.ppmi.matrix.pkl'
    }
    space = io_utils.load(data_path + space_file[space_id]).apply(
        RowNormalization(criterion='length'))

    models = {
        'baseline': BaselineModel(space),
        'add': AdditiveModel(space),
        'lexfun': LexfunModel(space, learner='Ridge')
    }

    split = [0.5, 0.3, 0.2]
    partitioned_pairs_df = partition_pairs(pairs_df, split, random_state=42)
    df = evaluate(partitioned_pairs_df, {model_id: models[model_id]}, verbose=False)

    df.to_pickle(results_dir + model_id + '-' + space_id + '.pkl')
    writer = pd.ExcelWriter(results_dir + model_id + '-' + space_id + '.xlsx')
    df.to_excel(writer, space_id)
    writer.save()
def load_pkl_files(dsm_prefix):
    """
    Load the space from either a single pkl file or numerous files.
    :param dsm_prefix: the prefix of the input files
    """
    # Check whether there is a single pickle file for the Space object
    if os.path.isfile(dsm_prefix + '.pkl'):
        return io_utils.load(dsm_prefix + '.pkl')

    # Load the multiple files: npz for the matrix and pkl for the other data members of Space
    with np.load(dsm_prefix + 'cooc.npz') as loader:
        coo = coo_matrix((loader['data'], (loader['row'], loader['col'])),
                         shape=loader['shape'])

    cooccurrence_matrix = SparseMatrix(csr_matrix(coo))

    with open(dsm_prefix + '_row2id.pkl', 'rb') as f_in:
        row2id = pickle.load(f_in)

    with open(dsm_prefix + '_id2row.pkl', 'rb') as f_in:
        id2row = pickle.load(f_in)

    with open(dsm_prefix + '_column2id.pkl', 'rb') as f_in:
        column2id = pickle.load(f_in)

    with open(dsm_prefix + '_id2column.pkl', 'rb') as f_in:
        id2column = pickle.load(f_in)

    return Space(cooccurrence_matrix, id2row, id2column, row2id=row2id, column2id=column2id)
def test():
    #syntactic_tree1 = SyntacticTree.read_tree("VP (VBZ play-v) (NP (N guitar-n))")
    #syntactic_tree2 = SyntacticTree.read_tree("VP (VBZ play-v) (NP (N instrument-n))")
    xml_string1 = '''
    <ccg>
      <rule type="fa" cat="S[dcl]\NP">
        <lf start="1" span="1" word="play-v" lemma="play" pos="VBZ" chunk="I-VP" entity="O" cat="(S[dcl]\NP)/NP" />
        <rule type="lex" cat="NP">
          <lf start="2" span="1" word="guitar-n" lemma="guitar" pos="NN" chunk="I-NP" entity="O" cat="N" />
        </rule>
      </rule>
    </ccg>'''
    xml_string2 = '''
    <ccg>
      <rule type="fa" cat="S[dcl]\NP">
        <lf start="1" span="1" word="play-v" lemma="play" pos="VBZ" chunk="I-VP" entity="O" cat="(S[dcl]\NP)/NP" />
        <rule type="lex" cat="NP">
          <lf start="2" span="1" word="instrument-n" lemma="instrument" pos="NN" chunk="I-NP" entity="O" cat="N" />
        </rule>
      </rule>
    </ccg>'''
    syntactic_tree1 = SyntacticTree.parse_tree_from_xml_string(xml_string1)
    syntactic_tree2 = SyntacticTree.parse_tree_from_xml_string(xml_string2)

    lexical_space = io_utils.load("/home/thenghiapham/work/project/tree_kernel/spaces/lexical_ppmi_svd300.pkl")
    kernel = SemanticTreeKernel(1.0)
    composition_model = WeightedAdditive()
    semantic_tree1 = syntactic_tree_2_semantic_tree(syntactic_tree1, lexical_space, composition_model)
    semantic_tree2 = syntactic_tree_2_semantic_tree(syntactic_tree2, lexical_space, composition_model)

    print semantic_tree1
    print semantic_tree2
    print [node._label for node in semantic_tree1.get_nodes()]
def main(): data_path = "/data/dsm/sdewac/" model = sys.argv[1] pairs = sys.argv[2] pattern_set = sys.argv[3] results_dir = sys.argv[4] pairs_df = pd.read_csv(pairs, sep=" ") model_file = { "cbow-w2": "cbow/cbow_300dim_hs0/sdewac.300.cbow.hs0.w2.vsm.pkl", "cbow-w5": "cbow/cbow_300dim_hs0/sdewac.300.cbow.hs0.w5.vsm.pkl", "cbow-w10": "cbow/cbow_300dim_hs0/sdewac.300.cbow.hs0.w10.vsm.pkl", "ppmi": "count-based/sdewac_2015-11-23/sdewac-mst.prepro.bow-c10k-w5.ppmi.matrix.pkl", } space = io_utils.load(data_path + model_file[model]) space = space.apply(RowNormalization(criterion="length")) patterns = pd.unique(pairs_df["pattern"]) writer = pd.ExcelWriter(results_dir + "/eval-" + model + "-" + pattern_set + ".xlsx") for pattern in patterns: df = eval_pattern(space, pairs_df, pattern, folds=10, random_state=42, verbose=True) df.to_excel(writer, pattern) writer.save()
def build_raw_per_space(in_file_prefix, in_format, is_gz): if not in_format in ("sm", "dm", "pkl"): raise ValueError("Invalid input format:%s" % in_format) data_file = "%s.%s" % (in_file_prefix, in_format) if in_format == "pkl": space = io_utils.load(data_file, Space) else: if is_gz: data_file = "%s.gz" % data_file row_file = "%s.rows" % (in_file_prefix) column_file = "%s.cols" % (in_file_prefix) if not os.path.exists(row_file): row_file = None if not os.path.exists(column_file): if in_format == "sm": raise ValueError("Column file: %s needs to be provided!" % column_file) column_file = None print "Building matrix..." space = Space.build(data=data_file, rows=row_file, cols=column_file, format=in_format) return space
def build_raw_per_space(in_file_prefix, in_format, is_gz): if not in_format in ("sm", "dm", "pkl"): raise ValueError("Invalid input format:%s" % in_format) data_file = '%s.%s' % (in_file_prefix, in_format) if in_format == "pkl": space = io_utils.load(data_file, Space) else: if is_gz: data_file = '%s.gz' % data_file row_file = '%s.rows' % (in_file_prefix) column_file = '%s.cols' % (in_file_prefix) if not os.path.exists(row_file): row_file = None if not os.path.exists(column_file): if in_format == "sm": raise ValueError("Column file: %s needs to be provided!" % column_file) column_file = None print("Building matrix...") space = Space.build(data=data_file, rows=row_file, cols=column_file, format=in_format) return space
def compute_sim(in_file, columns, out_dir, sim_measures, space_files): sim_dict = {"cos": CosSimilarity(), "lin": LinSimilarity(), "dot_prod": DotProdSimilarity(), "euclidean": EuclideanSimilarity()} if not len(columns) == 2: raise ValueError("Column description unrecognized!") col0 = int(columns[0]) - 1 col1 = int(columns[1]) - 1 try: space = io_utils.load(space_files[0], Space) except TypeError: warn("Not a Space instance in file: %s" % space_files[0]) return space2 = None space_descr = ".".join(space_files[0].split("/")[-1].split(".")[0:-1]) if len(space_files) == 2: space2 = io_utils.load(space_files[1], Space) space_descr = ".".join([space_descr] + space_files[1].split("/")[-1].split(".")[0:-1]) descr = ".".join(["SIMS", in_file.split("/")[-1], space_descr]) for sim_measure in sim_measures: print "Computing similarities: %s" % sim_measure if not sim_measure in sim_dict: warn("Similarity measure:%s not defined" % sim_measure) continue sim = sim_dict[sim_measure] out_file = '%s/%s.%s' % (out_dir, descr, sim_measure) io_utils.create_parent_directories(out_file) with open(in_file) as in_stream, open(out_file,"w") as out_stream: for line in in_stream: if not line.strip() == "": elems = line.strip().split() word1 = elems[col0] word2 = elems[col1] predicted_sim = space.get_sim(word1, word2, sim, space2) out_stream.write("%s %s\n" % (line.strip(), str(predicted_sim)))
def test(): print "hello" syntactic_tree1 = SyntacticTree.read_tree("VP (VBZ play-v) (NP (N guitar-n))") syntactic_tree2 = SyntacticTree.read_tree("VP (VBZ play-v) (NP (N instrument-n))") lexical_space = io_utils.load("/home/thenghiapham/work/project/tree_kernel/spaces/lexical_ppmi_svd300.pkl") kernel = SemanticSyntacticTreeKernel(1.0, lexical_space) print syntactic_tree1 print syntactic_tree2 print [node._label for node in syntactic_tree1.get_nodes()]
def main():
    partitioned_pairs_file = sys.argv[1]
    patterns_file = sys.argv[2]
    model_id = sys.argv[3]
    space_id = sys.argv[4]
    pattern_map_file = sys.argv[5]
    results_file = sys.argv[6]

    partitioned_pairs_df = pd.read_csv(partitioned_pairs_file, index_col=0)

    space_file = {
        'cbow-w2': 'cbow/cbow_300dim_hs0/sdewac.300.cbow.hs0.w2.vsm.pkl',
        'cbow-w5': 'cbow/cbow_300dim_hs0/sdewac.300.cbow.hs0.w5.vsm.pkl',
        'cbow-w10': 'cbow/cbow_300dim_hs0/sdewac.300.cbow.hs0.w10.vsm.pkl',
        'ppmi': 'count-based/sdewac_2015-11-23/sdewac-mst.prepro.bow-c10k-w5.ppmi.matrix.pkl'
    }
    space = io_utils.load(data_path + space_file[space_id]).apply(
        RowNormalization(criterion='length'))

    models = {
        'baseline': BaselineModel(space),
        'add': AdditiveModel(space),
        'lexfun': LexfunModel(space, learner='Ridge'),
        'wadd': WeightedAdditiveModel(space),
        'mul': MultiplicativeModel(space),
        'waddx': WeightedAdditiveModel(space, no_diff=True)
    }
    model = models[model_id]

    if patterns_file == 'None':
        patterns = None
    else:
        patterns = []
        with open(patterns_file) as f:
            for l in f.read().splitlines():
                patterns += l.split(' ')

    if pattern_map_file == 'None':
        pattern_map = {}
    else:
        pattern_map = {}
        with open(pattern_map_file) as f:
            for l in f.read().splitlines():
                xs = l.split(' ')
                superpattern = xs[0]
                for p in xs[1:]:
                    pattern_map[p] = superpattern

    df = prediction_features(partitioned_pairs_df, model, patterns,
                             verbose=False, pattern_map=pattern_map)

    df.to_pickle(results_file + '.pkl')
    df.to_csv(results_file + '.csv')
def transform_raw_per_space(raw_per_space, in_file_prefix, out_dir, out_format, core_space_file): in_file_descr = "PER_SS." + in_file_prefix.split("/")[-1] core_space = io_utils.load(core_space_file, Space) core_descr = ".".join(core_space_file.split("/")[-1].split(".")[0:-1]) space = PeripheralSpace(core_space, raw_per_space.cooccurrence_matrix, raw_per_space.id2row, raw_per_space.row2id) print "Printing..." out_file_prefix = "%s/%s.%s" % (out_dir, in_file_descr, core_descr) io_utils.save(space, out_file_prefix + ".pkl") if not out_format is None: space.export(out_file_prefix, format=out_format)
def transform_raw_per_space(raw_per_space, in_file_prefix, out_dir, out_format, core_space_file):

    in_file_descr = "PER_SS." + in_file_prefix.split("/")[-1]

    core_space = io_utils.load(core_space_file, Space)
    core_descr = ".".join(core_space_file.split("/")[-1].split(".")[0:-1])

    space = PeripheralSpace(core_space, raw_per_space.cooccurrence_matrix,
                            raw_per_space.id2row, raw_per_space.row2id)

    print("Printing...")
    out_file_prefix = "%s/%s.%s" % (out_dir, in_file_descr, core_descr)
    io_utils.save(space, out_file_prefix + ".pkl")
    if not out_format is None:
        space.export(out_file_prefix, format=out_format)
def Load_Semantic_Space(space_file):
    # Load the semantic space
    global my_space
    my_space = io_utils.load(space_file)

    # Normalize the space
    # my_space = my_space.apply(RowNormalization())

    global space_dim
    space_dim = my_space.element_shape[0]

    # Get the rows in the space
    keyset = set([])
    for x in my_space.get_row2id():
        keyset.add(x)

    global key_set
    key_set = keyset
def load_context_vocab(context_filename, spaces_dir):
    logging.info('Using {0} contents as context words to build a comparable'
                 ' space'.format(context_filename))
    if not os.path.isfile(context_filename):
        logging.info('{0} not found: building...'.format(context_filename))
        space_filenames = glob.glob(os.path.join(spaces_dir, '*.pkl'))
        words = []
        for space_filename in space_filenames:
            sp = io_utils.load(space_filename)
            words.append(set(sp.id2row))
        context_words = set.intersection(*words)
        with open(context_filename, 'w') as f:
            for w in context_words:
                f.write('{0}\n'.format(w))
        logging.info('File {0} created'.format(context_filename))
    return [l.strip() for l in file(context_filename)]
def build_spaces(in_file_prefix, in_format, out_dir, out_format, weightings, selections, reductions, normalizations, is_gz): in_file_descr = "CORE_SS." + in_file_prefix.split("/")[-1] data_file = '%s.%s' % (in_file_prefix, in_format) if not in_format in ("sm", "dm", "pkl"): raise ValueError("Invalid input format:%s" % in_format) if in_format == "pkl": space = io_utils.load(data_file, Space) else: if is_gz: data_file = '%s.gz' % data_file row_file = '%s.rows' % (in_file_prefix) column_file = '%s.cols' % (in_file_prefix) if not os.path.exists(row_file): row_file = None if not os.path.exists(column_file): if in_format == "sm": raise ValueError("Column file: %s needs to be provided!" % column_file) column_file = None print("Building matrix...") space = Space.build(data=data_file, rows=row_file, cols=column_file, format=in_format) for w in weightings: w_space = apply_weighting(space, w) for s in selections: s_space = apply_selection(w_space, s) for r in reductions: r_space = apply_reduction(s_space, r) for n in normalizations: n_space = apply_normalization(r_space, n) print("Printing...") print_space(n_space, out_dir, [in_file_descr, w, s, r, n], out_format)
def test_simple_sparse(self):
    bcs.main(["build_core_space.py",
              "-l", self.dir_ + "log1.txt",
              "-i", self.dir_ + "mat1",
              "-o", self.dir_,
              "--input_format", "sm",
              "--output_format", "sm"])

    s1 = Space.build(data=self.dir_ + "mat1.sm",
                     cols=self.dir_ + "mat1.cols",
                     format="sm")
    s2 = Space.build(data=self.dir_ + "CORE_SS.mat1.sm",
                     cols=self.dir_ + "CORE_SS.mat1.cols",
                     format="sm")
    s3 = io_utils.load(self.dir_ + "CORE_SS.mat1.pkl", Space)

    self._test_equal_spaces_sparse(s1, s2)
    self._test_equal_spaces_sparse(s1, s3)
def load_pkl_files(dsm_prefix): """ Load the space from either a single pkl file or numerous files. :param dsm_prefix: the prefix of the input files (.pkl, .rows, .cols) """ # Check whether there is a single pickle file for the Space object if os.path.isfile(dsm_prefix + '.pkl'): return io_utils.load(dsm_prefix + '.pkl') # Load the multiple files: npz for the matrix and pkl for the other data members of Space if os.path.isfile(dsm_prefix + '.npz'): with np.load(dsm_prefix + '.npz') as loader: coo = coo_matrix((loader['data'], (loader['row'], loader['col'])), shape=loader['shape']) cooccurrence_matrix = SparseMatrix(csr_matrix(coo)) with open(dsm_prefix + '_row2id.pkl', 'rb') as f_in: row2id = pickle.load(f_in) with open(dsm_prefix + '_id2row.pkl', 'rb') as f_in: id2row = pickle.load(f_in) with open(dsm_prefix + '_column2id.pkl', 'rb') as f_in: column2id = pickle.load(f_in) with open(dsm_prefix + '_id2column.pkl', 'rb') as f_in: id2column = pickle.load(f_in) return Space(cooccurrence_matrix, id2row, id2column, row2id=row2id, column2id=column2id) if os.path.isfile(dsm_prefix + '.tsv'): values = np.loadtxt(dsm_prefix + '.tsv', dtype=float, delimiter='\t', skiprows=0, comments=None, encoding='utf-8') targets = np.loadtxt(dsm_prefix + '.rows', dtype=str, skiprows=0, comments=None, encoding='utf-8') # Convert to space in sparse matrix format return Space(SparseMatrix(values), list(targets), []) # If everything fails try to load it as single w2v file space_array = np.loadtxt(dsm_prefix + '.w2v', dtype=object, delimiter=' ', skiprows=1, comments=None, encoding='utf-8') targets = space_array[:,0].flatten() values = space_array[:,1:].astype(np.float) # Convert to space and sparse matrix format return Space(SparseMatrix(values), list(targets), [])
def build_spaces(in_file_prefix, in_format, out_dir, out_format, weightings, selections, reductions, normalizations, is_gz): in_file_descr = "CORE_SS." + in_file_prefix.split("/")[-1] data_file = '%s.%s' % (in_file_prefix, in_format) if not in_format in ("sm", "dm", "pkl"): raise ValueError("Invalid input format:%s" % in_format) if in_format == "pkl": space = io_utils.load(data_file, Space) else: if is_gz: data_file = '%s.gz' % data_file row_file = '%s.rows' % (in_file_prefix) column_file = '%s.cols' % (in_file_prefix) if not os.path.exists(row_file): row_file = None if not os.path.exists(column_file): if in_format == "sm": raise ValueError("Column file: %s needs to be provided!" % column_file) column_file = None print "Building matrix..." space = Space.build(data=data_file, rows=row_file, cols=column_file, format=in_format) for w in weightings: w_space = apply_weighting(space, w) for s in selections: s_space = apply_selection(w_space, s) for r in reductions: r_space = apply_reduction(s_space, r) for n in normalizations: n_space = apply_normalization(r_space, n) print "Printing..." print_space(n_space, out_dir, [in_file_descr, w, s, r, n], out_format)
def main():
    ap = argparse.ArgumentParser()
    ap.add_argument('spaces_dir')
    ap.add_argument('words_list_dir')
    args = ap.parse_args()

    spaces_dir = args.spaces_dir
    words_list_dir = args.words_list_dir
    # '/mnt/8tera/shareclic/lucaNgrams/5grams/ITA_5grams/matrices/pkl_matrices/'
    #space_filename = '../spaces/cbow1_wind5_hs0_neg10_size400_smpl1e-05.pkl'

    output_dir = os.path.join('output', os.path.basename(words_list_dir))
    mkdir_p(output_dir)

    all_words = set(l.strip()
                    for words_filename in glob.glob(os.path.join(words_list_dir, '*'))
                    for l in file(words_filename))

    for words_filename in glob.glob(os.path.join(words_list_dir, '*')):
        space_filename = os.path.join(spaces_dir,
                                      os.path.splitext(os.path.basename(words_filename))[0] + '.pkl')
        if not os.path.isfile(space_filename):
            logging.error('{0} not found: ignoring'.format(space_filename))
            continue

        context_filename = hashlib.md5(spaces_dir).hexdigest() + '.txt'
        context_words = load_context_vocab(context_filename, spaces_dir)

        logging.debug('Processing {0}'.format(space_filename))
        sp = io_utils.load(space_filename)
        #words = [l.strip() for l in file(words_filename)]
        filtered_words = [w for w in all_words if w in sp.row2id]
        words_vectors = sp.get_rows(filtered_words)
        context_vectors = sp.get_rows(context_words)
        m = words_vectors * context_vectors.transpose()
        sp2 = Space(m, filtered_words, context_words)
        io_utils.save(sp2, os.path.join(output_dir, os.path.basename(space_filename)))
def main():
    ap = argparse.ArgumentParser()
    ap.add_argument('--words', nargs='*')
    ap.add_argument('--spaces', nargs='*')
    ap.add_argument('-m', '--min-occurrences', type=int, default=1)
    args = ap.parse_args()

    words = set(l.strip()
                for words_filename in args.words
                for l in file(words_filename))

    word_reps = defaultdict(lambda: 0)
    for sp_filename in args.spaces:
        logging.info('Counting words in {0}'.format(sp_filename))
        sp = io_utils.load(sp_filename)
        for w in words:
            if w in sp.row2id:
                word_reps[w] += 1

    for filename in args.words:
        file_words = [l.strip() for l in file(filename)]
        with open(filename, 'w') as f:
            for w in file_words:
                if word_reps[w] >= args.min_occurrences:
                    f.write('{0}\n'.format(w))
def run_many_clusters(algo):
    verbs_filename = 'verbs3.txt'
    filelist = ['output_1000BCto500BC_vocab10000_window1_withoutprep',
                'output_499BCto250BC_vocab10000_window1_withoutprep',
                'output_249BCto0AD_vocab10000_window1_withoutprep',
                'output_1ADto250AD_vocab10000_window1_withoutprep',
                'output_251ADto500AD_vocab10000_window1_withoutprep']

    if 'kmean' in algo:
        f_log = io.open('clusters/kmeans/generallog.csv', 'w', encoding='utf8')
    else:
        f_log = io.open('clusters/log2.csv', 'w', encoding='utf8')
    f_log.write('n_verb_clusters, n_total_clusters, n_verbs, min_cluster_size, '
                'min_samples, metric, dimensions, n_words, n_plot\n'.decode('utf8'))

    for filebase in filelist:
        pickles = [f for f in os.listdir(filebase) if f.endswith('.pkl')]
        if(len(pickles) == 0):
            print 'No pickles found in directory ' + FLAGS.filebase + '!'
            exit()
        pickle = pickles[0]
        fname = filebase + '/' + pickle
        sp = io_utils.load(fname)

        f_verbs = io.open(verbs_filename, 'r', encoding='utf8')
        verbs = list(f_verbs)
        f_verbs.close()

        verbs_bytes = []
        for verb in verbs:
            verb_byte = verb.strip().encode('utf8')
            if verb_byte in sp.id2row:
                verbs_bytes.append(verb_byte)
        print str(len(verbs_bytes)) + ' verbs in main list\n'
        verb_ids = [sp.row2id[verb] for verb in verbs_bytes]

        n_plot = 400

        if 'kmean' in algo:
            for n_words in xrange(3000, 3001, 1000):
                for dimensions in xrange(2, 3):
                    for n_clusters in xrange(8, 9):
                        start_time = time.time()
                        [clusterer, mat, clusterids, verb_clusters, ids_for_shortened_mat,
                         cluster_words_plot, cluster_words_all] = cluster_and_plot(
                            sp,
                            'clusters/kmeans/' + verbs_filename[:-4] + filebase[7:-31] + 'Rectangular25',
                            'kmean', n_clusters, dimensions, n_words, verb_ids, n_plot, 0, 0, 0)
                        n_verbs = len(verb_clusters)
                        n_verb_clusters = len(set(verb_clusters))
                        n_total_clusters = len(set(clusterer.labels_)) - 1
                        print str(n_verbs) + ' verbs specified, split into ' + str(n_verb_clusters) + ' clusters.'
                        out_list_str = ['{:4d}'.format(n_verb_clusters),
                                        '{:4d}'.format(n_total_clusters),
                                        '{:3d}'.format(n_verbs),
                                        '{:3d}'.format(dimensions),
                                        '{:5d}'.format(n_words),
                                        '{:4d}'.format(n_plot)]
                        out_str = ','.join(out_list_str)
                        f_log.write(out_str.decode('utf8') + '\n')
                        end_time = time.time()
                        print 'Clustering took {:.2f} s'.format(end_time - start_time)
        else:
            for n_words in xrange(2000, 4001, 1000):
                for dimensions in xrange(2, 8):
                    for min_cluster_size in xrange(15, 30):
                        for min_samples in xrange(1, 15):
                            for metric in xrange(0, 1):
                                start_time = time.time()
                                [clusterer, mat, clusterids, verb_clusters,
                                 ids_for_shortened_mat] = cluster_and_plot(
                                    sp, dimensions, n_words, verb_ids, n_plot,
                                    min_cluster_size, min_samples, metric)
                                n_verbs = len(verb_clusters)
                                n_verb_clusters = len(set(verb_clusters)) - 1
                                n_total_clusters = len(set(clusterer.labels_)) - 1
                                print str(n_verbs) + ' verbs specified, split into ' + str(n_verb_clusters) + ' clusters.'
                                out_list_str = ['{:4d}'.format(n_verb_clusters),
                                                '{:4d}'.format(n_total_clusters),
                                                '{:3d}'.format(n_verbs),
                                                '{:3d}'.format(min_cluster_size),
                                                '{:3d}'.format(min_samples),
                                                '{:2d}'.format(metric),
                                                '{:3d}'.format(dimensions),
                                                '{:5d}'.format(n_words),
                                                '{:4d}'.format(n_plot)]
                                out_str = ','.join(out_list_str)
                                f_log.write(out_str.decode('utf8') + '\n')
                                end_time = time.time()
                                print 'Clustering took {:.2f} s'.format(end_time - start_time)

    f_log.close()
FLAGS, unparsed = parser.parse_known_args()
FLAGS.filebase = 'output_499BCto250BC_vocab10000_window1_withoutprep'
FLAGS.verbs_filename = 'verbs_fewer_compounds.txt'

pickles = [f for f in os.listdir(FLAGS.filebase) if f.endswith('.pkl')]
if(len(pickles) == 0):
    print 'No pickles found in directory ' + FLAGS.filebase + '!'
    exit()
pickle = pickles[0]
fname = FLAGS.filebase + '/' + pickle
sp = io_utils.load(fname)

f_verbs = io.open(FLAGS.verbs_filename, 'r', encoding='utf8')
verbs = list(f_verbs)
f_verbs.close()

verbs_bytes = []
for verb in verbs:
    verb_byte = verb.strip().encode('utf8')
    if verb_byte in sp.id2row:
        verbs_bytes.append(verb_byte)
print str(len(verbs_bytes)) + ' verbs in main list\n'

verb_ids = [sp.row2id[verb] for verb in verbs_bytes]
#ex07.py
#-------
from composes.utils import io_utils
from composes.similarity.cos import CosSimilarity

#load two spaces
my_space = io_utils.load("./data/out/ex01.pkl")
my_per_space = io_utils.load("./data/out/PER_SS.ex05.pkl")

print my_space.id2row
print my_per_space.id2row

#compute similarity between a word and a phrase in the two spaces
print my_space.get_sim("car", "sports_car", CosSimilarity(), space2=my_per_space)
#
# FLAGS.filebase = 'output_2000BCto2000AD_vocab20000_window5'
# FLAGS.verbs_filename = 'verbs1.txt'
# FLAGS.min_similarity = 0.7
# FLAGS.number_neighbours = 30
# FLAGS.number_mean_neighbours = 500

pickles = [f for f in os.listdir(FLAGS.filebase) if f.endswith('.pkl')]
if (len(pickles) == 0):
    print 'No pickles found in directory ' + FLAGS.filebase + '!'
    exit()
pickle = pickles[0]
fname = FLAGS.filebase + '/' + pickle
this_space = io_utils.load(fname)

f_verbs = io.open(FLAGS.verbs_filename, 'r', encoding='utf8')
verbs = list(f_verbs)
f_verbs.close()

f_verbs_for_mean = io.open(FLAGS.verbs_for_mean_filename, 'r', encoding='utf8')
verbs_for_mean = list(f_verbs_for_mean)
f_verbs_for_mean.close()

verbs_bytes = []
for verb in verbs:
    verb_byte = verb.strip().encode('utf8')
    if verb_byte in this_space.id2row:
#ex12.py
#-------
from composes.utils import io_utils

#load a previously saved weighted additive model
my_comp = io_utils.load("./data/out/model01.pkl")

#print its parameters
print "alpha:", my_comp.alpha
print "beta:", my_comp.beta

#load two spaces
my_space = io_utils.load("./data/out/ex10.pkl")
my_per_space = io_utils.load("./data/out/PER_SS.ex05.pkl")

#apply the composition model to them
composed_space = my_comp.compose([("good", "history_book", "good_history_book")],
                                 (my_space, my_per_space))

print composed_space.id2row
print composed_space.cooccurrence_matrix
#ex13.py
#-------
from composes.utils import io_utils
from composes.composition.weighted_additive import WeightedAdditive

#training data
train_data = [("good", "car", "good_car"),
              ("good", "book", "good_book")]

#load an argument space
arg_space = io_utils.load("./data/out/ex10.pkl")
print arg_space.id2row
print arg_space.cooccurrence_matrix

#load a phrase space
phrase_space = io_utils.load("data/out/PHRASE_SS.ex10.pkl")
print phrase_space.id2row
print phrase_space.cooccurrence_matrix

#train a weighted additive model on the data
my_comp = WeightedAdditive()
my_comp.train(train_data, arg_space, phrase_space)

#print its parameters
print "alpha:", my_comp.alpha
print "beta:", my_comp.beta
#ex09.py
#-------
from composes.utils import io_utils
from composes.similarity.cos import CosSimilarity

#load two spaces
my_space = io_utils.load("./data/out/ex01.pkl")
my_per_space = io_utils.load("./data/out/PER_SS.ex05.pkl")

print(my_space.id2row)
print(my_space.cooccurrence_matrix)
print(my_per_space.id2row)
print(my_per_space.cooccurrence_matrix)

#get the top two neighbours of "car" in a peripheral space
print(my_space.get_neighbours("car", 2, CosSimilarity(), space2=my_per_space))
from __future__ import print_function
from composes.utils import io_utils

gastrovec = io_utils.load("gastrovec.ppmi.svd20.pkl")
gastrovec.export(file_prefix="fullexport", format="dm")

'''
with open("export3.csv","w") as f:
    # f.write("INGREDIENT " + " ".join(gastrovec.id2column) + "\n")
    with open("export.dm") as f_in:
        for line in f_in:
            f.write(line)
'''
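# A minimal sketch of reading the exported matrix back in, assuming the
# export above produced fullexport.dm alongside its row/column files:
#   from composes.semantic_space.space import Space
#   reloaded = Space.build(data="fullexport.dm", format="dm")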
#ex20.py
#-------
from composes.utils import io_utils
from composes.utils import scoring_utils
from composes.similarity.cos import CosSimilarity

#read in a space
my_space = io_utils.load("data/out/ex01.pkl")

#compute similarities of a list of word pairs
fname = "data/in/word_sims.txt"
word_pairs = io_utils.read_tuple_list(fname, fields=[0, 1])
predicted = my_space.get_sims(word_pairs, CosSimilarity())

#compute correlations
gold = io_utils.read_list(fname, field=2)

print "Spearman"
print scoring_utils.score(gold, predicted, "spearman")
print "Pearson"
print scoring_utils.score(gold, predicted, "pearson")
import sys
import os

folder = os.path.expandvars('/home/luka/Downloads/dissect-master/src')
if folder not in sys.path:
    sys.path.append(folder)

from composes.semantic_space.space import Space

my_space = Space.build(
    data="/home/luka/Downloads/dissect-master/src/examples/data/in/ex01.sm",
    rows="/home/luka/Downloads/dissect-master/src/examples/data/in/ex01.rows",
    cols="/home/luka/Downloads/dissect-master/src/examples/data/in/ex01.cols",
    format="sm")

from composes.utils import io_utils
from composes.transformation.scaling.ppmi_weighting import PpmiWeighting

my_space = io_utils.load(
    "/home/luka/Downloads/dissect-master/src/examples/data/out/ex01.pkl")

print my_space.cooccurrence_matrix
my_space = my_space.apply(PpmiWeighting())
print my_space.cooccurrence_matrix
def main():
    ap = argparse.ArgumentParser()
    ap.add_argument('--export-only', action='store_true', default=False)
    ap.add_argument('space_dir', help='Directory where the DISSECT spaces are '
                    'located')
    ap.add_argument('spaces_order', help='Order in time of the spaces (no '
                    'relevant effect when exporting)')
    ap.add_argument('target_word', help='This is the word that we want to '
                    'highlight in the animation (no effect when exporting)')
    args = ap.parse_args()

    center_word = args.target_word  #'cane'
    space_dir = args.space_dir

    # get the spaces filenames
    space_filenames = [os.path.join(space_dir, os.path.basename(l.strip()))
                       for l in file(args.spaces_order)]

    def guess_year(space_filename):
        try:
            basename = os.path.basename(space_filename)
            year = basename.split("_")[1]
            if len(year) == 3:
                return "1" + year
            else:
                return year
        except:
            #We don't want to take any chances with this feature: if it doesn't
            #work, tough luck
            return ""

    # guess the years
    years = map(guess_year, space_filenames)
    # load the spaces
    spaces = map(lambda f: io_utils.load(f), space_filenames)

    # put together all the spaces adding the year to each of the words
    # to avoid repetitions (the words are unique)
    stacked = None
    for sp, space_filename in zip(spaces, space_filenames):
        stacked = vstack(stacked, add_year(sp, os.path.basename(space_filename)))

    # Find a mapping to 2D (in this case we are finding the mapping to 2D by
    # actually finding the 2D coordinates of the vectors, but one could
    # find such a mapping by other means, and then apply it to get the
    # vector coordinates)
    stacked = stacked.apply(Svd(2))

    # Apply the mapping (given by the stacked space) to obtain the 2D vectors.
    # As explained below, now this is redundant, but it does not necessarily
    # need to be the case
    transformed_spaces = [PeripheralSpace(stacked, sp.cooccurrence_matrix,
                                          sp.id2row, sp.row2id)
                          for sp in spaces]

    if args.export_only:
        #print the coordinates
        print ",".join(["year,word,x,y"])
        for year, sp in zip(years, transformed_spaces):
            for w in sp.id2row:
                v = sp.get_row(w).mat
                print ",".join([year, w, str(v[0, 0]), str(v[0, 1])])
    else:
        #produce animation
        anim = AnimatedScatter(center_word, years, transformed_spaces, stacked,
                               scale_factor=40)
        mkdir_p('output')
        anim.save('output/{0}.mp4'.format(center_word))
def main():
    global input_is_tokenized, use_lemmatization, space_cols_file, \
        loaded_space_file_s, loaded_space_file_t, source_lang, \
        target_lang, input_file, output_file, tag_cutoff, \
        no_stopword_print, number_of_translations, \
        number_of_neighbours, different_pos_punishment, \
        treetagger_path

    parser = argparse.ArgumentParser(description="Word translations" + \
                                     " that fit best to the sentence")
    parser.add_argument("-k", "--tokenized", help="use pretokenized input",
                        action="store_true")
    parser.add_argument("-l", "--lemmatized", help="use lemmatization",
                        action="store_true")
    parser.add_argument("-p", "--returntag", help="return language tag",
                        action="store_true")
    parser.add_argument("-d", "--dimensions", type=str,
                        help="column file for the input matrix")
    parser.add_argument("-m", "--sourcematrix", type=str,
                        help="pickled input matrix for source language")
    parser.add_argument("-y", "--targetmatrix", type=str,
                        help="pickled input matrix for target language")
    parser.add_argument("-s", "--sourcelang", type=str, help="input language")
    parser.add_argument("-t", "--targetlang", type=str, help="output language")
    parser.add_argument("-i", "--infile", type=str, help="input file")
    parser.add_argument("-o", "--outfile", type=str, help="output file")
    parser.add_argument("-nsp", "--no-stopword-print", action="store_true",
                        help="Omit to print words without candidates -- usually " + \
                             "stop words.")
    parser.add_argument("-nt", "--number-of-translations", type=float,
                        help="The number of candidates to show for each input word.")
    parser.add_argument("-nn", "--number-of-neighbours", type=int,
                        help="The number of neighbours for each input word to " + \
                             "consider in the similarity space constructed.")
    parser.add_argument("-dpp", "--different-pos-punishment", type=float,
                        help="The score's fraction to punish a " + \
                             "candidate word which is there, but " + \
                             "has not the same POS as its input peer.")
    args = parser.parse_args()

    if args.sourcelang:
        source_lang = args.sourcelang
    if args.targetlang:
        target_lang = args.targetlang
    if args.tokenized:
        input_is_tokenized = True
    if args.lemmatized:
        use_lemmatization = True

    if args.dimensions:
        space_cols_file = args.dimensions
    elif source_lang == target_lang:
        space_cols_file = DATA_DIR_OUT + source_lang + '-words.col'
    else:
        space_cols_file = DATA_DIR_OUT \
            + '_'.join(sorted([source_lang, target_lang])) \
            + '-words.col'

    if args.sourcematrix:
        loaded_space_file_s = args.sourcematrix
    elif source_lang == target_lang:
        loaded_space_file_s = DATA_DIR_OUT + source_lang + '.pkl'
    else:
        loaded_space_file_s = DATA_DIR_OUT + source_lang \
            + '_' + source_lang + '-' + target_lang \
            + '.pkl'

    if args.targetmatrix:
        loaded_space_file_t = args.targetmatrix
    elif source_lang == target_lang and loaded_space_file_t == "":
        loaded_space_file_t = DATA_DIR_OUT + target_lang + '.pkl'
    else:
        loaded_space_file_t = DATA_DIR_OUT + target_lang \
            + '_' + target_lang + '-' + source_lang \
            + '.pkl'

    if args.infile:
        input_file = open(args.infile, "r")
    if args.outfile:
        output_file = open(args.outfile, "w")

    if args.returntag:
        tag_cutoff = 0
    else:
        if args.lemmatized:
            tag_cutoff = 5
        else:
            tag_cutoff = 3

    if args.no_stopword_print:
        no_stopword_print = args.no_stopword_print

    # vector dimension/columns for input matrix and matrix per sentence
    space_cols_fileobject = open(space_cols_file, "r")
    # space_cols = space_cols_fileobject.readlines()
    space_cols = space_cols_fileobject.read().split("\n")[:-1]
    space_cols_fileobject.close()

    # load the space
    loaded_space = {}
    loaded_space[source_lang] = io_utils.load(loaded_space_file_s)

    # only load it once for similarity queries in the same language
    if not loaded_space.get(target_lang):
        loaded_space[target_lang] = io_utils.load(loaded_space_file_t)

    # Initialize TreeTagger only once (for later use)
    treetagger = TreeTagger(TAGLANG=source_lang, TAGDIR=treetagger_path,
                            TAGINENC=ENC, TAGOUTENC=ENC)

    # work on input file
    while True:
        line = input_file.readline()

        words = []      # words in sentence
        lemmas = []     # lemmas in sentence
        pos = []        # part-of-speech tags per word in sentence
        formatted = []  # matrix for sentence
        freq = defaultdict(lambda: defaultdict(int))

        # Stop when file is entirely read
        if not line:
            break

        # For pre-treetagged text
        if input_is_tokenized:
            while not re.match(r'[.:?!]', line):
                t = line.rstrip()
                w = t.split("\t")[0]
                p = helpers.getTag(t.split("\t")[1], source_lang)
                l = t.split("\t")[2]
                words.append(w)
                lemmas.append(l)
                pos.append(p)
                formatted.append(helpers.dimensionformat(w, p, l,
                                                         source_lang, use_lemmatization))
                line = input_file.readline()
                if not line:
                    break
        # Use tree-tagger as lemmatizer and/or tokenizer
        else:
            treetagger_sentence = treetagger.TagText(line)
            for t in treetagger_sentence:
                try:
                    w = t.split("\t")[0]
                    p = helpers.getTag(t.split("\t")[1], source_lang)
                    l = t.split("\t")[2]
                except:
                    print >> sys.stderr, \
                        "Caution: TreeTagger token cannot " + \
                        "be processed:", t
                    continue  # Skip it
                words.append(w)
                lemmas.append(l)
                pos.append(p)
                formatted.append(helpers.dimensionformat(w, p, l,
                                                         source_lang, use_lemmatization))

        # fill matrix for sentence
        for i in formatted:
            for j in formatted:
                freq[i][j] += 1

        # build unique list of the words in this sentence for the rows
        uniqwords = set()
        for l in formatted:
            uniqwords.add(l)
        query_rows = list(uniqwords)  # rows for sentence matrix

        # dissect compatible matrix
        m = np.mat(np.zeros(shape=(len(query_rows), len(space_cols))))

        # convert sentence matrix to compatible matrix
        for i in range(len(query_rows)):
            for j in range(len(space_cols)):
                m[i, j] = freq[query_rows[i]][space_cols[j]]

        # build dissect matrix
        query_space = Space(DenseMatrix(m), query_rows, space_cols)

        # for every word print neighbours with similarity
        for i in range(len(words)):
            best_translations = get_best_translations(words[i], pos[i], lemmas[i],
                                                      query_space, loaded_space)
            output_file.write(format_best_translations(words[i], pos[i], lemmas[i],
                                                       best_translations))

        if input_is_tokenized:
            output_file.write(line.split("\t")[0] + "\n")

    if args.infile:
        input_file.close()
    if args.outfile:
        output_file.close()
from composes.composition.lexical_function import LexicalFunction
from composes.composition.full_additive import FullAdditive
from composes.composition.weighted_additive import WeightedAdditive
from composes.composition.multiplicative import Multiplicative
from composes.composition.dilation import Dilation
from composes.semantic_space.peripheral_space import PeripheralSpace
from composes.transformation.scaling.ppmi_weighting import PpmiWeighting
from composes.transformation.feature_selection.top_feature_selection import TopFeatureSelection
from composes.transformation.dim_reduction.svd import Svd
from composes.utils.regression_learner import RidgeRegressionLearner
import composes.utils.io_utils as io_utils
import composes.utils.scoring_utils as scoring_utils

#load a core space
print "Loading the data..."
data_path = "/mnt/cimec-storage-sata/users/thenghia.pham/shared/tutorial/"
space_file = data_path + "CORE_SS.verbnoun.core.pkl"
space = io_utils.load(space_file)

print "Applying PPMI..."
space = space.apply(PpmiWeighting())

print "Applying feature selection..."
space = space.apply(TopFeatureSelection(2000))

print "Applying SVD..."
space = space.apply(Svd(100))

print "Creating peripheral space.."
per_space = PeripheralSpace.build(space,
                                  data=data_path + "per.raw.SV.sm",
                                  cols=data_path + "per.raw.SV.cols",
                                  format="sm")
                    help='Number of dimensions used for SVD')
parser.add_argument('--normalisation', type=str, default='0',
                    help='Type of normalisation')
FLAGS, unparsed = parser.parse_known_args()

#This needs pointing to the location that DISSECT is installed to.
build_core_str = 'python C:/Users/Rachel/Documents/dissect-master/dissect-master/src/pipelines/build_core_space.py '
build_core_str += ' -i ' + FLAGS.filebase + '/sparsematrix'
build_core_str += ' --input_format sm --w ppmi -r svd_' + str(FLAGS.nSVD) + ' -o ' + FLAGS.filebase
if FLAGS.normalisation != '0':
    build_core_str += ' -n all'
print build_core_str
os.system(build_core_str)

saved_space_filename = FLAGS.filebase + "/CORE_SS.sparsematrix.ppmi.svd_" \
    + str(FLAGS.nSVD)
if FLAGS.normalisation != '0':
    saved_space_filename += ".all"
saved_space_filename += ".pkl"

this_space = io_utils.load(saved_space_filename)
plot_space(this_space, FLAGS.nplot, FLAGS.filebase + ".png")
#ex02.py
#-------
from composes.semantic_space.space import Space
from composes.utils import io_utils

#create a space from co-occurrence counts in sparse format
my_space = Space.build(data="./data/in/ex01.sm",
                       rows="./data/in/ex01.rows",
                       cols="./data/in/ex01.cols",
                       format="sm")

#print the co-occurrence matrix of the space
print my_space.cooccurrence_matrix

#save the Space object in pickle format
io_utils.save(my_space, "./data/out/ex01.pkl")

#load the saved object
my_space2 = io_utils.load("./data/out/ex01.pkl")

#print the co-occurrence matrix of the loaded space
print my_space2.cooccurrence_matrix
from composes.utils import io_utils
from composes.transformation.scaling.ppmi_weighting import PpmiWeighting
from composes.transformation.dim_reduction.svd import Svd
from composes.composition.lexical_function import LexicalFunction
from composes.similarity.cos import CosSimilarity
from composes.semantic_space.space import Space
import pickle
from composes.utils import scoring_utils
import os

path = os.getcwd()

print("Building space...")

# create a space from co-occurrence counts in sparse format,
# unless a pickled copy already exists
try:
    my_space = io_utils.load("my_space.pkl")
except FileNotFoundError:
    my_space = Space.build(data="./data/in/spacew.sm",
                           rows="./data/in/spacew.rows",
                           cols="./data/in/spacew.cols",
                           format="sm")
    print("Applying PPMI...")
    my_space = my_space.apply(PpmiWeighting())
    print("Applying SVD...")
    my_space = my_space.apply(Svd(350))
    io_utils.save(my_space, "my_space.pkl")

print("Loading pairs...")
#similarity.py
#USAGE: python similarity [space file] [word1] [word2]
#EXAMPLE: python kneighbours ~/UkWac/dissect/ANs/ANs.kpl car_n dog_n
#-------
from composes.utils import io_utils
from composes.similarity.cos import CosSimilarity
import sys

#load a space
my_space = io_utils.load(sys.argv[1])

#print my_space.cooccurrence_matrix
#print my_space.id2row

#compute similarity between two words in the space
print "The similarity of", sys.argv[2], "and", sys.argv[3], "is:", \
    my_space.get_sim(sys.argv[2], sys.argv[3], CosSimilarity())
#ex05.py
#-------
from composes.utils import io_utils
from composes.semantic_space.peripheral_space import PeripheralSpace
from composes.transformation.scaling.ppmi_weighting import PpmiWeighting

#load a space and apply ppmi on it
my_space = io_utils.load("./data/out/ex01.pkl")
my_space = my_space.apply(PpmiWeighting())

print my_space.cooccurrence_matrix
print my_space.id2row

#create a peripheral space
my_per_space = PeripheralSpace.build(my_space,
                                     data="./data/in/ex05.sm",
                                     cols="./data/in/ex05.cols",
                                     format="sm")

print my_per_space.cooccurrence_matrix
print my_per_space.id2row

#save the space
io_utils.save(my_per_space, "./data/out/PER_SS.ex05.pkl")
def test_simple_define(self):
    #trained = io_utils.load(self.dir_ + "TRAINED_COMP_MODEL.lexical_func.an_train_data.txt.pkl")
    #new_space = trained.function_space

    #compose with lexical function
    ac.main(["apply_composition.py",
             "-l", self.dir_ + "log1.txt",
             "-i", self.dir_ + "an_train_data.txt",
             "-o", self.dir_,
             "--load_model", self.dir_ + "TRAINED_COMP_MODEL.lexical_func.an_train_data.txt.pkl",
             "-a", self.dir_ + "CORE_SS.N_mat.pkl",
             "--output_format", "dm"
             ])
    sp2 = Space.build(data=self.dir_ + "COMPOSED_SS.LexicalFunction.an_train_data.txt.dm",
                      format="dm")

    #compose with weighted addition
    ac.main(["apply_composition.py",
             "-l", self.dir_ + "log1.txt",
             "-i", self.dir_ + "an_train_data.txt",
             "-o", self.dir_,
             "-m", "weighted_add",
             "--alpha", "0.5",
             "--beta", "0.5",
             "-a", self.dir_ + "CORE_SS.A_mat.pkl" + "," + self.dir_ + "CORE_SS.N_mat.pkl",
             "--output_format", "dm"
             ])
    sp1 = Space.build(data=self.dir_ + "COMPOSED_SS.WeightedAdditive.an_train_data.txt.dm",
                      format="dm")
    sp3 = io_utils.load(self.dir_ + "COMPOSED_SS.WeightedAdditive.an_train_data.txt.pkl")

    np.testing.assert_array_equal(sp1.cooccurrence_matrix.mat,
                                  np.mat([[3, 4], [4, 5]]))
    self._test_equal_spaces_structs(sp1, sp2)

    sp1.to_sparse()
    sp3.to_sparse()
    self._test_equal_spaces_sparse(sp1, sp3)

    #the two output formats have to contain identical data
    sp1.to_dense()
    sp3.to_dense()
    self._test_equal_spaces_dense(sp1, sp3)

    #compose with dilation
    ac.main(["apply_composition.py",
             "-l", self.dir_ + "log1.txt",
             "-i", self.dir_ + "an_train_data.txt",
             "-o", self.dir_,
             "-m", "dilation",
             "--lambda", "1",
             "-a", self.dir_ + "CORE_SS.A_mat.pkl" + "," + self.dir_ + "CORE_SS.N_mat.pkl",
             "--output_format", "dm"
             ])
    sp1 = Space.build(data=self.dir_ + "COMPOSED_SS.Dilation.an_train_data.txt.dm",
                      format="dm")
    n_space = io_utils.load(self.dir_ + "CORE_SS.N_mat.pkl")

    sp1.to_dense()
    n_space.to_dense()
    np.testing.assert_array_almost_equal(sp1.cooccurrence_matrix.mat,
                                         n_space.cooccurrence_matrix.mat * 25)
    self._test_equal_spaces_structs(sp1, sp2)

    #compose with dilation, change the order of the arguments
    ac.main(["apply_composition.py",
             "-l", self.dir_ + "log1.txt",
             "-i", self.dir_ + "na_train_data.txt",
             "-o", self.dir_,
             "-m", "dilation",
             "--lambda", "1",
             "-a", self.dir_ + "CORE_SS.N_mat.pkl" + "," + self.dir_ + "CORE_SS.A_mat.pkl",
             "--output_format", "dm"
             ])
    sp1 = Space.build(data=self.dir_ + "COMPOSED_SS.Dilation.na_train_data.txt.dm",
                      format="dm")
    sp1.to_dense()
    np.testing.assert_array_almost_equal(sp1.cooccurrence_matrix.mat,
                                         np.mat([[75, 100], [183, 244]]), 5)
    self._test_equal_spaces_structs(sp1, sp2)

    #compose with multiplicative
    ac.main(["apply_composition.py",
             "-l", self.dir_ + "log1.txt",
             "-i", self.dir_ + "aan_train_data.txt",
             "-o", self.dir_,
             "-m", "mult",
             "-a", self.dir_ + "CORE_SS.A_mat.pkl" + "," + self.dir_ + "COMPOSED_SS.Dilation.an_train_data.txt.pkl",
             "--output_format", "dm"
             ])
    sp1 = Space.build(data=self.dir_ + "COMPOSED_SS.Multiplicative.aan_train_data.txt.dm",
                      format="dm")
    return ss1.wup_similarity(ss2, brown_ic)

def lch_sim(ss1, ss2):
    return ss1.lch_similarity(ss2, brown_ic)

def mean(seq):
    print(sum(seq) / len(seq))
    return sum(seq) / len(seq)

def is_better(ingredients, result, other):
    return mean(map(lambda x: sim(x, result), ingredients)) > \
           mean(map(lambda x: sim(x, other), ingredients))

def vs_sim(word1, word2, space):
    return space.get_sim(word1, word2, CosSimilarity())

def limit(iterator, num):
    for _ in range(num):
        yield next(iterator)
    raise StopIteration

gastrovec = io_utils.load("../vector_processing/gastrovec.ppmi.svd20.pkl")

wn_scores, vs_scores = [], []
jcn_scores, res_scores, lin_scores, lch_scores, wup_scores = [], [], [], [], []

ingredients = []
with open("../vector_processing/ingredients_in_wordnet") as f:
    for line in limit(f, int(sys.argv[1])):
        l = line.strip()
        ingredients.append(l)

for (a, b) in combinations(ingredients, 2):
    a_, b_ = getss(a), getss(b)
    wn_scores.append(wn_sim(a_, b_))
    res_scores.append(res_sim(a_, b_))
from __future__ import print_function
import sys
from random import randint
from itertools import count
from composes.utils import io_utils
from composes.composition.weighted_additive import WeightedAdditive
from composes.semantic_space.space import Space

stacked_space = io_utils.load("gastrovec.ppmi.svd20.pkl")
WA = WeightedAdditive(alpha=1, beta=1)

recipes = {}
max_size = 0
with open("../corpus_collection/composition_counts.txt") as f:
    for line in f:
        words = line.split()
        recipes[words[0]] = words[1:]
        if len(words) - 1 > max_size:
            max_size = len(words) - 1

WA = WeightedAdditive(alpha=1, beta=1)
last_space = None
number = count()
for size in xrange(max_size, 1, -1):
    relevant = (rec for rec in recipes if len(recipes[rec]) == size)
    print(size)
    composition = []
    for recipe in relevant:
        old = recipes[recipe]
#ex12.py
#-------
from composes.utils import io_utils

#load a previously saved weighted additive model
my_comp = io_utils.load("./data/out/model01.pkl")

#print its parameters
print "alpha:", my_comp.alpha
print "beta:", my_comp.beta

#load two spaces
my_space = io_utils.load("./data/out/ex10.pkl")
my_per_space = io_utils.load("./data/out/PER_SS.ex05.pkl")

#apply the composition model to them
composed_space = my_comp.compose([("good", "history_book", "good_history_book")],
                                 (my_space, my_per_space))

print composed_space.id2row
print composed_space.cooccurrence_matrix
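#model01.pkl above is assumed to be a WeightedAdditive model trained and pickled
#beforehand; a minimal sketch of how such a file could be produced (training
#tuples and file names are illustrative, not part of the example above):
from composes.utils import io_utils
from composes.composition.weighted_additive import WeightedAdditive

#training tuples: (argument1, argument2, phrase), illustrative only
train_data = [("good", "car", "good_car"), ("good", "book", "good_book")]

#argument space and phrase space to estimate alpha and beta from
arg_space = io_utils.load("./data/out/ex10.pkl")
phrase_space = io_utils.load("./data/out/PHRASE_SS.ex10.pkl")

my_comp = WeightedAdditive()
my_comp.train(train_data, arg_space, phrase_space)
io_utils.save(my_comp, "./data/out/model01.pkl")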
#ex10.py
#-------
from composes.utils import io_utils
from composes.composition.weighted_additive import WeightedAdditive

#load a space
my_space = io_utils.load("./data/out/ex10.pkl")

print my_space.id2row
print my_space.cooccurrence_matrix

#instantiate a weighted additive model
my_comp = WeightedAdditive(alpha = 1, beta = 1)

#use the model to compose words in my_space
composed_space = my_comp.compose([("good", "book", "good_book"),
                                  ("good", "car", "good_car")], my_space)

print composed_space.id2row
print composed_space.cooccurrence_matrix

#save the composed space
io_utils.save(composed_space, "data/out/PHRASE_SS.ex10.pkl")
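#For each (word1, word2, phrase) tuple, WeightedAdditive simply produces a
#weighted sum of the two argument vectors: composed = alpha * u + beta * v.
#A minimal numpy sketch of that operation, with made-up vectors purely for
#illustration (not taken from the spaces above):
import numpy as np

alpha, beta = 1.0, 1.0
u = np.array([3.0, 4.0])          # stand-in vector for the first argument
v = np.array([4.0, 5.0])          # stand-in vector for the second argument
composed = alpha * u + beta * v   # the phrase vector the model would produce
print(composed)                   # -> [7. 9.]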
# (Paris is to France what ___ is to Germany)
##########################################################################
from composes.utils import io_utils
from composes.composition.weighted_additive import WeightedAdditive
from composes.similarity.cos import CosSimilarity
import sys

pkl = sys.argv[1]
base = sys.argv[2]
minus = sys.argv[3]
plus = sys.argv[4]

space = io_utils.load(pkl)

# instantiate an additive and a subtractive model
add = WeightedAdditive(alpha = 1, beta = 1)
sub = WeightedAdditive(alpha = 1, beta = -1)

#print space.get_neighbours(base, 10, CosSimilarity())

print "Subtracting", minus, "from", base
composed_space = sub.compose([(base, minus, "step1")], space)
#print composed_space.get_neighbours("step1", 10, CosSimilarity(), space)

print "Adding", plus, "..."
composed_space2 = add.compose([("step1", plus, "step2")], (composed_space, space))

print composed_space2.get_neighbours("step2", 10, CosSimilarity(), space)
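#A hypothetical invocation of the script above (the script name, the pickle file
#and the row labels are assumptions; the labels must match rows that actually
#exist in the loaded space):
#   python analogy.py my_space.pkl paris france germany
#which prints the 10 nearest neighbours of the vector paris - france + germany.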