def build_bigram_space():
    # build a peripheral space of bigrams over the unigram core space;
    # unigrams_space, args and save_space are defined elsewhere in this script
    bigrams_space = PeripheralSpace.build(unigrams_space,
                                          data=args.function[3],
                                          cols=args.function[1],
                                          format="sm")
    save_space(bigrams_space, "bigrams_space")
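# The helper above relies on a core unigram space, an argparse-style `args`
# object and a `save_space` utility that live elsewhere in the script. A
# minimal sketch of what that surrounding context might look like (all
# names and file paths here are hypothetical):
from composes.semantic_space.space import Space
from composes.semantic_space.peripheral_space import PeripheralSpace
from composes.utils import io_utils

unigrams_space = Space.build(data="unigrams.sm",
                             rows="unigrams.rows",
                             cols="unigrams.cols",
                             format="sm")

def save_space(space, name):
    # pickle the space under the given name
    io_utils.save(space, name + ".pkl")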
#ex05.py
#-------
from composes.utils import io_utils
from composes.semantic_space.peripheral_space import PeripheralSpace
from composes.transformation.scaling.ppmi_weighting import PpmiWeighting

#load a space and apply ppmi on it
my_space = io_utils.load("./data/out/ex01.pkl")
my_space = my_space.apply(PpmiWeighting())

print(my_space.cooccurrence_matrix)
print(my_space.id2row)

#create a peripheral space
my_per_space = PeripheralSpace.build(my_space,
                                     data="./data/in/ex05.sm",
                                     cols="./data/in/ex05.cols",
                                     format="sm")

print(my_per_space.cooccurrence_matrix)
print(my_per_space.id2row)

#save the space
io_utils.save(my_per_space, "./data/out/PER_SS.ex05.pkl")
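# Once saved, the peripheral space can be reloaded and queried like any
# other DISSECT space; "my_phrase" below is a placeholder for a row label
# that actually occurs in ex05.sm:
from composes.utils import io_utils

per_space = io_utils.load("./data/out/PER_SS.ex05.pkl")
print(per_space.id2row)                  # available row labels
# print(per_space.get_row("my_phrase"))  # vector for one row (label assumed)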
space_file = data_path + "CORE_SS.verbnoun.core.pkl"
space = io_utils.load(space_file)

print "Applying PPMI..."
space = space.apply(PpmiWeighting())

print "Applying feature selection..."
space = space.apply(TopFeatureSelection(2000))

print "Applying SVD..."
space = space.apply(Svd(100))

print "Creating peripheral space..."
per_space = PeripheralSpace.build(space,
                                  data=data_path + "per.raw.SV.sm",
                                  cols=data_path + "per.raw.SV.cols",
                                  format="sm")

#reading in train data
train_data_file = data_path + "ML08_SV_train.txt"
train_data = io_utils.read_tuple_list(train_data_file, fields=[0, 1, 2])

print "Training Lexical Function composition model..."
comp_model = LexicalFunction(learner=RidgeRegressionLearner(param=2))
comp_model.train(train_data, space, per_space)

print "Composing phrases..."
test_phrases_file = data_path + "ML08nvs_test.txt"
test_phrases = io_utils.read_tuple_list(test_phrases_file, fields=[0, 1, 2])
composed_space = comp_model.compose(test_phrases, space)
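# The composed space is a regular DISSECT space, so it can be pickled and
# queried. Its row labels come from the third column of ML08nvs_test.txt,
# so the output filename and the labels passed to get_sim below are only
# illustrative:
from composes.similarity.cos import CosSimilarity

io_utils.save(composed_space, data_path + "COMPOSED_SS.ML08.pkl")
print(composed_space.id2row)  # composed phrase labels
# print(composed_space.get_sim("phrase_1", "phrase_2", CosSimilarity()))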
def train_baroni_guevara_composers(all_vectors,
                                   ROOT_DIR,
                                   baroni_output_path, guevara_output_path,
                                   baroni_threshold=10):
    """
    :type all_vectors: str; path to vectors file containing both N and observed AN vectors
    :type ROOT_DIR: str; where to write temp files
    :type baroni_output_path: str; where to write pickled baroni composer
    :type guevara_output_path: str
    :type baroni_threshold: int
    """
    SVD_DIMS = 100
    baroni_training_phrase_types = {'AN', 'NN'}  # what kind of NPs to train Baroni composer for

    # prepare the input files to be fed into Dissect
    mkdirs_if_not_exists(ROOT_DIR)

    filename = basename(all_vectors)
    noun_events_file = join(ROOT_DIR, '%s-onlyN-SVD%d.tmp' % (filename, SVD_DIMS))
    NPs_events_file = join(ROOT_DIR, '%s-onlyPhrases-SVD%d.tmp' % (filename, SVD_DIMS))

    thes = Vectors.from_tsv(all_vectors, lowercasing=False)
    thes.to_tsv(noun_events_file,
                entry_filter=lambda x: x.type == '1-GRAM' and x.tokens[0].pos == 'N')
    _translate_byblo_to_dissect(noun_events_file)

    thes.to_tsv(NPs_events_file,
                entry_filter=lambda x: x.type in baroni_training_phrase_types,
                row_transform=lambda x: str(x).replace(' ', '_'))
    _translate_byblo_to_dissect(NPs_events_file)

    my_space = Space.build(data="{}.sm".format(noun_events_file),
                           rows="{}.rows".format(noun_events_file),
                           cols="{}.cols".format(noun_events_file),
                           format="sm")
    logging.info('Each unigram vector has dimensionality %r', my_space.element_shape)

    # create a peripheral space
    my_per_space = PeripheralSpace.build(my_space,
                                         data="{}.sm".format(NPs_events_file),
                                         rows="{}.rows".format(NPs_events_file),
                                         # The columns of the peripheral space have to be identical to those
                                         # in the core space (including their order)!
                                         cols="{}.cols".format(NPs_events_file),
                                         format="sm")
    logging.info('Each phrase vector has dimensionality %r', my_per_space.element_shape)

    # use the model to compose words in my_space
    all_data = []
    for phrase in my_per_space._row2id:
        # make sure there are only NPs here
        if DocumentFeature.from_string(phrase.replace(' ', '_')).type in baroni_training_phrase_types:
            adj, noun = phrase.split('_')
            all_data.append((adj, noun, '%s_%s' % (adj, noun)))

    # train a composition model on the data and save it
    baroni = LexicalFunction(min_samples=baroni_threshold, learner=RidgeRegressionLearner())
    guevara = FullAdditive(learner=RidgeRegressionLearner())
    for composer, out_path in zip([baroni, guevara],
                                  [baroni_output_path, guevara_output_path]):
        composer.train(all_data, my_space, my_per_space)
        io_utils.save(composer, out_path)
        logging.info('Saved trained composer to %s', out_path)
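# A minimal usage sketch for a composer pickled by the function above. The
# ('red', 'car', 'red_car') triple is hypothetical, and `my_space` stands
# for the same unigram space the composer was trained on:
from composes.utils import io_utils

baroni = io_utils.load(baroni_output_path)
composed = baroni.compose([('red', 'car', 'red_car')], my_space)
print(composed.id2row)  # -> ['red_car']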
sys.stderr.flush()
gastrovec = Space.build(data="../corpus_collection/corpus.sm",
                        rows="../corpus_collection/corpus.rows",
                        cols="../corpus_collection/corpus.cols",
                        format="sm")
print("done.", file=sys.stderr)
io_utils.save(gastrovec, "gastrovec.pkl")

print("Applying PPMI... ", end="", file=sys.stderr)
sys.stderr.flush()
gastrovec = gastrovec.apply(PpmiWeighting())

print("Applying SVD (20)... ", end="", file=sys.stderr)
sys.stderr.flush()
gastrovec = gastrovec.apply(Svd(20))
print("done.", file=sys.stderr)
io_utils.save(gastrovec, "gastrovec.ppmi.svd20.pkl")

print("Loading recipe peripheral space... ", end="", file=sys.stderr)
sys.stderr.flush()
recipes = PeripheralSpace.build(gastrovec,
                                data="../corpus_collection/recipes.sm",
                                rows="../corpus_collection/recipes.rows",
                                cols="../corpus_collection/recipes.cols",
                                format="sm")
print("done.", file=sys.stderr)
io_utils.save(recipes, "recipes.ppmi.svd20.pkl")
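# Both saved spaces support the usual DISSECT similarity queries; the
# recipe label below is a placeholder for an entry in recipes.rows:
from composes.similarity.cos import CosSimilarity

# nearest recipes to a given recipe, by cosine over the SVD space
print(recipes.get_neighbours("some_recipe", 3, CosSimilarity()))
# neighbours of the same recipe among the unigram rows of the core space
print(recipes.get_neighbours("some_recipe", 3, CosSimilarity(), space2=gastrovec))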