def build_full_composed_thesauri_with_baroni_and_svd(args):
    """
    Drive the unigram -> PPMI -> SVD -> composition pipeline.

    A step runs only when its name ('unigrams', 'ppmi', 'svd' or 'compose')
    is contained in `args.stages`. All output locations are derived from the
    `input` and `output` options of the Byblo conf file.

    :param args: object exposing `conf` (path of the Byblo conf file),
        `byblo` (passed on to `calculate_unigram_vectors` as the Byblo base
        directory) and `stages` (container of stage names to run)
    """
    svd_dims = 100  # single dimensionality requested from do_svd

    # ---- SET UP THE REQUIRED PATHS ----
    byblo_opts, _ = parse_byblo_conf_file(args.conf)
    input_file_name = os.path.basename(byblo_opts.input)

    # NOTE(review): os.path.dirname() only yields the output directory itself
    # when the configured path ends with a separator; otherwise it yields the
    # parent directory -- confirm the conf files follow that convention.
    output_stem = os.path.dirname(byblo_opts.output)
    unigram_vectors_dir = os.path.abspath(byblo_opts.output)
    ppmi_dir = '%s-ppmi' % output_stem
    ppmi_svd_dir = '%s-ppmi-svd' % output_stem
    composed_dir = '%s-ppmi-svd-composed' % output_stem
    for directory in (unigram_vectors_dir, ppmi_dir, ppmi_svd_dir, composed_dir):
        mkdirs_if_not_exists(directory)

    composer_algos = [
        AdditiveComposer,
        MultiplicativeComposer,
        LeftmostWordComposer,
        VerbComposer,
        RightmostWordComposer,
    ]

    requested = args.stages

    # ---- STAGE 'unigrams': extract unigram vectors with Byblo ----
    if 'unigrams' not in requested:
        logging.warning('Skipping unigrams stage. Assuming output is at %s', byblo_opts.output)
    else:
        calculate_unigram_vectors(os.path.abspath(args.conf), os.path.abspath(args.byblo))

    # ---- STAGE 'ppmi': feature reweighting ----
    # Runs only when requested; unlike the other stages, skipping it is silent.
    if 'ppmi' in requested:
        byblo_events_file = _find_events_file(byblo_opts.output)
        _do_ppmi(byblo_events_file, ppmi_dir)

    # ---- STAGE 'svd': reduce dimensionality ----
    # The reweighted events file is looked up even when the stage is skipped.
    unreduced_unigram_events_file = _find_events_file(ppmi_dir)
    # ...exp6-12/exp6.events.filtered.strings --> ...exp6-12/exp6
    reduced_file_prefix = join(ppmi_svd_dir, input_file_name)
    # only keep the most frequent types per PoS tag to speed things up
    counts = [
        ('N', 200000),
        ('V', 200000),
        ('J', 100000),
        ('RB', 0),
        ('AN', 0),
        ('NN', 0),
    ]
    if 'svd' not in requested:
        logging.warning('Skipping SVD stage. Assuming output is at %s-SVD*', reduced_file_prefix)
    else:
        # No observed phrase vectors are added here: SVD is done on the
        # unigram vectors alone so that they can simply be composed later.
        do_svd(unreduced_unigram_events_file,
               reduced_file_prefix,
               desired_counts_per_feature_type=counts,
               reduce_to=[svd_dims])

    # name of the file written by do_svd for the requested dimensionality
    all_reduced_vectors = '%s-SVD%d.events.filtered.strings' % (reduced_file_prefix, svd_dims)

    # ---- STAGE 'compose': build phrase vectors from the reduced unigrams ----
    if 'compose' not in requested:
        logging.warning('Skipping composition stage. Assuming output is at %s', composed_dir)
    else:
        # It is OK for the input file to contain phrase vectors (assumed to be
        # observed phrasal vectors); the original author notes that they are
        # filtered explicitly further down the line.
        composed_name = '%s-%s' % (input_file_name, svd_dims)
        compose_and_write_vectors(all_reduced_vectors,
                                  composed_name,
                                  composer_algos,
                                  output_dir=composed_dir,
                                  dense_hd5=True)
def calculate_unigram_vectors(byblo_conf_file, byblo_base_dir):
    """
    Run Byblo for the given conf file and get its vectors as strings.

    :param byblo_conf_file: path of the Byblo conf file; its stage is set to 1
        before Byblo runs and to 0 afterwards
    :param byblo_base_dir: directory handed to `temp_chdir` around the Byblo
        run (presumably the directory Byblo has to be started from -- confirm)
    """
    # the conf file says where output goes: <output>/<base name of input>
    conf_opts, _ = parse_byblo_conf_file(byblo_conf_file)
    output_dir = conf_opts.output
    input_name = basename(conf_opts.input)
    output_prefix = join(output_dir, input_name)

    # get byblo to calculate vectors for all entries
    set_stage_in_byblo_conf_file(byblo_conf_file, 1)
    with temp_chdir(byblo_base_dir):
        run_byblo(byblo_conf_file)
    set_stage_in_byblo_conf_file(byblo_conf_file, 0)

    # get vectors as strings
    unindex_all_byblo_vectors(output_prefix)