def build_full_composed_thesauri_with_baroni_and_svd(args):
    """Run the unigrams -> PPMI -> SVD -> composition pipeline end to end.

    Individual stages are selected via ``args.stages`` ('unigrams', 'ppmi',
    'svd', 'compose'); a skipped stage is assumed to have left its output on
    disk from a previous run.

    :param args: parsed CLI namespace with ``conf`` (Byblo conf file path),
                 ``byblo`` (Byblo install dir) and ``stages`` attributes.
    """
    # All paths are derived from the Byblo configuration file.
    opts, _ = parse_byblo_conf_file(args.conf)
    basename = os.path.basename(opts.input)

    # One output directory per pipeline stage.
    # NOTE(review): the raw-vectors dir uses abspath(opts.output) while the
    # derived dirs build on dirname(opts.output) — confirm this asymmetry
    # is intentional.
    raw_vectors_dir = os.path.abspath(opts.output)
    mkdirs_if_not_exists(raw_vectors_dir)
    dir_prefix = os.path.dirname(opts.output)
    ppmi_dir = f'{dir_prefix}-ppmi'
    mkdirs_if_not_exists(ppmi_dir)
    svd_dir = f'{dir_prefix}-ppmi-svd'
    mkdirs_if_not_exists(svd_dir)
    composed_dir = f'{dir_prefix}-ppmi-svd-composed'
    mkdirs_if_not_exists(composed_dir)

    svd_dims = 100
    composers = [AdditiveComposer, MultiplicativeComposer,
                 LeftmostWordComposer, VerbComposer, RightmostWordComposer]

    # STAGE 1: extract unigram vectors with Byblo.
    if 'unigrams' in args.stages:
        calculate_unigram_vectors(os.path.abspath(args.conf), os.path.abspath(args.byblo))
    else:
        logging.warning('Skipping unigrams stage. Assuming output is at %s', opts.output)

    # STAGE 2: PPMI reweighting of the raw co-occurrence counts.
    if 'ppmi' in args.stages:
        _do_ppmi(_find_events_file(opts.output), ppmi_dir)

    # STAGE 3: dimensionality reduction of the PPMI-weighted unigram vectors.
    ppmi_events_file = _find_events_file(ppmi_dir)
    # e.g. .../exp6-12/exp6.events.filtered.strings -> .../exp6-12/exp6
    svd_prefix = join(svd_dir, basename)
    # only keep the most frequent entries per PoS tag to speed things up
    feature_counts = [('N', 200000), ('V', 200000), ('J', 100000),
                      ('RB', 0), ('AN', 0), ('NN', 0)]
    if 'svd' in args.stages:
        # SVD is run on unigram vectors only; phrases are composed from the
        # reduced unigrams afterwards.
        do_svd(ppmi_events_file, svd_prefix,
               desired_counts_per_feature_type=feature_counts, reduce_to=[svd_dims])
    else:
        logging.warning('Skipping SVD stage. Assuming output is at %s-SVD*', svd_prefix)

    # STAGE 4: compose phrase vectors from the reduced unigram vectors.
    # File name below is what do_svd writes its output to.
    reduced_events_file = f'{svd_prefix}-SVD{svd_dims}.events.filtered.strings'
    if 'compose' in args.stages:
        compose_and_write_vectors(reduced_events_file,
                                  f'{basename}-{svd_dims}',
                                  composers,
                                  output_dir=composed_dir,
                                  dense_hd5=True)
    else:
        logging.warning('Skipping composition stage. Assuming output is at %s', composed_dir)
# Train categorical verb matrices from SVO triples and compose verb-phrase
# vectors, once per unigram-vector source.
# NOTE(review): this chunk reads module-level names defined elsewhere in the
# file (FET, VERBS_HDF_DIR, train_verb_tensors, compose_and_write_vectors,
# CopyObject, FrobeniusAdd, FrobeniusMult) — confirm they are in scope here.

# Candidate noun-vector stores: window-based counts, word2vec (gigaword and
# wikipedia at different sample sizes) and GloVe.
nouns_wins_wiki = os.path.join(FET, 'exp11-13b-ppmi/exp11-with-obs-phrases-SVD100.events.filtered.strings')
nouns_w2v_gigaw_100 = os.path.join(FET, 'word2vec_vectors/word2vec-gigaw-100perc.unigr.strings.rep0')
nouns_w2v_wiki_15 = os.path.join(FET, 'word2vec_vectors/word2vec-wiki-15perc.unigr.strings.rep0')
nouns_w2v_wiki_100 = os.path.join(FET, 'word2vec_vectors/word2vec-wiki-100perc.unigr.strings.rep0')
nouns_glove_wiki_100 = os.path.join(FET, 'glove/vectors.miro.h5')
all_nouns = [nouns_wins_wiki, nouns_w2v_gigaw_100, nouns_w2v_wiki_15,
             nouns_w2v_wiki_100, nouns_glove_wiki_100]
# short identifiers for each source, used to build output file names
names_composed_files = ['wiki-wins-100', 'gigaw-w2v-100', 'wiki-w2v-15', 'wiki-w2v-100', 'wiki-glove-100']
save_files = ['%s-vector-matrices.hdf' % x for x in names_composed_files]

for noun_path, save_file, sname in zip(all_nouns, save_files, names_composed_files):
    # where the trained verb matrices for this source will be written
    trained_verb_matrices_file = os.path.join(VERBS_HDF_DIR, save_file)
    # the list of subject/objects of a given verb is determined from the unlabelled corpus,
    # and so are the noun vectors
    if 'wiki' in sname:
        svos_path = '/lustre/scratch/inf/mmb28/DiscoUtils/wiki_NPs_in_MR_R2_TechTC_am_maas.uniq.10.txt'
    elif 'giga' in sname:
        svos_path = '/lustre/scratch/inf/mmb28/DiscoUtils/gigaw_NPs_in_MR_R2_TechTC_am_maas.uniq.10.txt'
    else:
        raise ValueError('What unlabelled corpus is this???')
    # learn per-verb matrices from the SVO triples and the noun vectors
    train_verb_tensors(svos_path, noun_path, trained_verb_matrices_file)
    compose_and_write_vectors(noun_path,  # a vector store containing noun vectors
                              sname,  # something to identify the source of unigram vectors
                              [CopyObject, FrobeniusAdd, FrobeniusMult],
                              # filename of output of training stage
                              categorical_vector_matrix_file=trained_verb_matrices_file,
                              output_dir=VERBS_HDF_DIR, dense_hd5=True)
def compute_and_write_vectors(corpus_name, stages, percent, repeat, remove_pos):
    """Train word2vec on ``percent``% of a corpus ``repeat`` times, write each
    run's unigram vectors, optionally combine the runs, then compose phrase
    vectors from every resulting vector set.

    :param corpus_name: name of the corpus directory under ``data/``
    :param stages: iterable of stage names ('reformat', 'vectors', 'average',
                   'compose'); skipped stages assume on-disk output exists
    :param percent: percentage of the corpus to sample for training
    :param repeat: how many independent training runs to perform
    :param remove_pos: whether PoS tags are stripped from the corpus
    :raises ValueError: when ``repeat > 1`` with a sample too large for the
                        repeats to be meaningfully distinct
    """
    project_root = os.path.abspath(os.path.join(__file__, '..', '..'))
    output_dir = join(project_root, 'outputs', 'word2vec')
    mkdirs_if_not_exists(output_dir)

    # input: CoNLL-formatted corpus
    conll_data_dir = join(project_root, 'data/%s-conll' % corpus_name)
    # outputs: plain-text corpus and the unigram-vector file
    suffix = '-nopos' if remove_pos else ''
    text_only_data_dir = join(project_root, 'data/%s%s' % (corpus_name, suffix))
    unigram_events_file = join(output_dir,
                               '%s%s-%dperc.unigr.strings' % (corpus_name, suffix, percent))

    if percent > 90 and repeat > 1:
        raise ValueError('Repeating with a different sample of corpus only makes sense when '
                         'the samples are sufficiently distinct. This requires that the sample'
                         ' size is fairly small to minimise overlap between samples')

    if 'reformat' in stages:
        reformat_data(conll_data_dir, text_only_data_dir, remove_pos)

    if 'vectors' in stages:
        models = [_train_model(percent, text_only_data_dir, run, remove_pos)
                  for run in range(repeat)]
        # write each run's vectors to its own file
        vectors = [write_gensim_vectors_to_tsv(model, unigram_events_file + '.rep%d' % run)
                   for run, model in enumerate(models)]
        if 'average' in stages and repeat > 1:
            # combine the runs over their shared vocabulary and append the
            # result so it is composed below as well.
            # NOTE(review): this sums the per-run vectors without dividing by
            # `repeat`; presumably acceptable for cosine-based downstream use
            # — confirm the scale does not matter.
            shared_vocab = set.intersection(*(set(m.vocab.keys()) for m in models))
            combined = {word: reduce(np.add, [m[word] for m in models])
                        for word in shared_vocab}
            vectors.append(write_gensim_vectors_to_tsv(combined,
                                                       unigram_events_file + '.avg%d' % repeat,
                                                       vocab=shared_vocab))
    else:
        # placeholders so the composition loop below still iterates once per
        # previously-written vector file
        vectors = [None] * repeat
        if 'average' in stages and repeat > 1:
            vectors.append(None)

    if 'compose' in stages:
        row_filter = default_row_filter_nopos if remove_pos else default_row_filter
        for idx, vec in enumerate(vectors):
            # When vectors were computed in this run we can compose straight
            # from memory; otherwise read the file written by a previous run.
            if 'average' in stages and idx == len(vectors) - 1 and len(vectors) > 1:
                # last list entry holds the combined (averaged) vectors
                out_name = 'word2vec-%s_%dpercent-avg%d' % (corpus_name, percent, repeat)
                source = vec if 'vectors' in stages else unigram_events_file + '.avg%d' % repeat
            else:
                out_name = 'word2vec-%s_%dpercent-rep%d' % (corpus_name, percent, idx)
                source = vec if 'vectors' in stages else unigram_events_file + '.rep%d' % idx
            compose_and_write_vectors(source, out_name, composer_algos,
                                      output_dir=output_dir, row_filter=row_filter,
                                      remove_pos=remove_pos, dense_hd5=True)
# NOTE(review): the two statements below appear to be the tail of a function
# whose `def` line is outside this chunk (`dv` is not defined here) — verify
# their enclosing scope before relying on this placement.
dv.to_tsv(turian_unigram_vectors_file)
logging.info('Done')


if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO,
                        format="%(asctime)s\t%(module)s.%(funcName)s (line %(lineno)d)\t%(levelname)s : %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument('--stages', choices=('fancy-compose', 'format', 'simple-compose'),
                        nargs='+', required=True,
                        help="""Stages are as follows:
 - fancy-compose: runs Socher's code (Turian unigrams and Socher composition)
 - format: converts output of previous stage to Byblo-compatible files
 - simple-compose: does Add/Mult... composition on Turian unigrams, as converted in previous stage
 """)
    args = parser.parse_args()

    if 'fancy-compose' in args.stages:
        run_socher_code()
    if 'format' in args.stages:
        # write just the unigram vectors for other composers to use
        write_clean_turian_unigrams()
        reformat_socher_vectors()
    if 'simple-compose' in args.stages:
        composers = [AdditiveComposer, MultiplicativeComposer,
                     LeftmostWordComposer, RightmostWordComposer, VerbComposer]
        compose_and_write_vectors(turian_unigram_vectors_file, 'turian', composers,
                                  output_dir=output_dir, gzipped=False, dense_hd5=True)
glove_script = join(args.glove_dir, 'demo2.sh') # TODO explain how to set param in that script if args.corpus == 'wiki': # todo explain what these are and why formatting is needed pos_only_data_dir = join(prefix, 'data/wiki/') unlabelled_data = join(prefix, 'data/wikipedia.oneline') else: pos_only_data_dir = join(prefix, 'data/gigaw/') unlabelled_data = join(prefix, 'data/gigaw.oneline') raw_vectors_file = join(args.glove_dir, 'vectors.txt') # what GloVe produces formatted_vectors_file = join(output_dir, 'vectors.%s.h5' % args.corpus) # unigram vectors in my format # DO THE ACTUAL WORK composer_algos = [AdditiveComposer, MultiplicativeComposer, LeftmostWordComposer, RightmostWordComposer, VerbComposer] if 'reformat' in args.stages: reformat_data() if 'vectors' in args.stages: run_glove() if 'compose' in args.stages: logging.info('Loading labelled corpora and composing phrase vectors therein') compose_and_write_vectors(formatted_vectors_file, 'glove-%s'%args.corpus, composer_algos, output_dir=output_dir, dense_hd5=True)