def build_full_composed_thesauri_with_baroni_and_svd(args):
    """Run the unigrams -> PPMI -> SVD -> composition pipeline end to end.

    Individual stages are selected via ``args.stages`` ('unigrams', 'ppmi',
    'svd', 'compose'); a skipped stage is assumed to have left its output on
    disk from a previous run.

    :param args: parsed CLI namespace with ``conf`` (Byblo conf file path),
                 ``byblo`` (Byblo install dir) and ``stages`` attributes.
    """
    # All paths are derived from the Byblo configuration file.
    opts, _ = parse_byblo_conf_file(args.conf)
    basename = os.path.basename(opts.input)

    # One output directory per pipeline stage.
    # NOTE(review): the raw-vectors dir uses abspath(opts.output) while the
    # derived dirs build on dirname(opts.output) — confirm this asymmetry
    # is intentional.
    raw_vectors_dir = os.path.abspath(opts.output)
    mkdirs_if_not_exists(raw_vectors_dir)
    dir_prefix = os.path.dirname(opts.output)
    ppmi_dir = f'{dir_prefix}-ppmi'
    mkdirs_if_not_exists(ppmi_dir)
    svd_dir = f'{dir_prefix}-ppmi-svd'
    mkdirs_if_not_exists(svd_dir)
    composed_dir = f'{dir_prefix}-ppmi-svd-composed'
    mkdirs_if_not_exists(composed_dir)

    svd_dims = 100
    composers = [AdditiveComposer, MultiplicativeComposer,
                 LeftmostWordComposer, VerbComposer, RightmostWordComposer]

    # STAGE 1: extract unigram vectors with Byblo.
    if 'unigrams' in args.stages:
        calculate_unigram_vectors(os.path.abspath(args.conf), os.path.abspath(args.byblo))
    else:
        logging.warning('Skipping unigrams stage. Assuming output is at %s', opts.output)

    # STAGE 2: PPMI reweighting of the raw co-occurrence counts.
    if 'ppmi' in args.stages:
        _do_ppmi(_find_events_file(opts.output), ppmi_dir)

    # STAGE 3: dimensionality reduction of the PPMI-weighted unigram vectors.
    ppmi_events_file = _find_events_file(ppmi_dir)
    # e.g. .../exp6-12/exp6.events.filtered.strings -> .../exp6-12/exp6
    svd_prefix = join(svd_dir, basename)
    # only keep the most frequent entries per PoS tag to speed things up
    feature_counts = [('N', 200000), ('V', 200000), ('J', 100000),
                      ('RB', 0), ('AN', 0), ('NN', 0)]
    if 'svd' in args.stages:
        # SVD is run on unigram vectors only; phrases are composed from the
        # reduced unigrams afterwards.
        do_svd(ppmi_events_file, svd_prefix,
               desired_counts_per_feature_type=feature_counts, reduce_to=[svd_dims])
    else:
        logging.warning('Skipping SVD stage. Assuming output is at %s-SVD*', svd_prefix)

    # STAGE 4: compose phrase vectors from the reduced unigram vectors.
    # File name below is what do_svd writes its output to.
    reduced_events_file = f'{svd_prefix}-SVD{svd_dims}.events.filtered.strings'
    if 'compose' in args.stages:
        compose_and_write_vectors(reduced_events_file,
                                  f'{basename}-{svd_dims}',
                                  composers,
                                  output_dir=composed_dir,
                                  dense_hd5=True)
    else:
        logging.warning('Skipping composition stage. Assuming output is at %s', composed_dir)
# Train categorical verb matrices from SVO triples and compose verb-phrase
# vectors, once per unigram-vector source.
# NOTE(review): this chunk reads module-level names defined elsewhere in the
# file (FET, VERBS_HDF_DIR, train_verb_tensors, compose_and_write_vectors,
# CopyObject, FrobeniusAdd, FrobeniusMult) — confirm they are in scope here.

# Candidate noun-vector stores: window-based counts, word2vec (gigaword and
# wikipedia at different sample sizes) and GloVe.
nouns_wins_wiki = os.path.join(FET, 'exp11-13b-ppmi/exp11-with-obs-phrases-SVD100.events.filtered.strings')
nouns_w2v_gigaw_100 = os.path.join(FET, 'word2vec_vectors/word2vec-gigaw-100perc.unigr.strings.rep0')
nouns_w2v_wiki_15 = os.path.join(FET, 'word2vec_vectors/word2vec-wiki-15perc.unigr.strings.rep0')
nouns_w2v_wiki_100 = os.path.join(FET, 'word2vec_vectors/word2vec-wiki-100perc.unigr.strings.rep0')
nouns_glove_wiki_100 = os.path.join(FET, 'glove/vectors.miro.h5')
all_nouns = [nouns_wins_wiki, nouns_w2v_gigaw_100, nouns_w2v_wiki_15,
             nouns_w2v_wiki_100, nouns_glove_wiki_100]
# short identifiers for each source, used to build output file names
names_composed_files = ['wiki-wins-100', 'gigaw-w2v-100', 'wiki-w2v-15', 'wiki-w2v-100', 'wiki-glove-100']
save_files = ['%s-vector-matrices.hdf' % x for x in names_composed_files]

for noun_path, save_file, sname in zip(all_nouns, save_files, names_composed_files):
    # where the trained verb matrices for this source will be written
    trained_verb_matrices_file = os.path.join(VERBS_HDF_DIR, save_file)
    # the list of subject/objects of a given verb is determined from the unlabelled corpus,
    # and so are the noun vectors
    if 'wiki' in sname:
        svos_path = '/lustre/scratch/inf/mmb28/DiscoUtils/wiki_NPs_in_MR_R2_TechTC_am_maas.uniq.10.txt'
    elif 'giga' in sname:
        svos_path = '/lustre/scratch/inf/mmb28/DiscoUtils/gigaw_NPs_in_MR_R2_TechTC_am_maas.uniq.10.txt'
    else:
        raise ValueError('What unlabelled corpus is this???')
    # learn per-verb matrices from the SVO triples and the noun vectors
    train_verb_tensors(svos_path, noun_path, trained_verb_matrices_file)
    compose_and_write_vectors(noun_path,  # a vector store containing noun vectors
                              sname,  # something to identify the source of unigram vectors
                              [CopyObject, FrobeniusAdd, FrobeniusMult],
                              # filename of output of training stage
                              categorical_vector_matrix_file=trained_verb_matrices_file,
                              output_dir=VERBS_HDF_DIR, dense_hd5=True)
def compute_and_write_vectors(corpus_name, stages, percent, repeat, remove_pos):
    """Train word2vec on ``percent``% of a corpus ``repeat`` times, write each
    run's unigram vectors, optionally combine the runs, then compose phrase
    vectors from every resulting vector set.

    :param corpus_name: name of the corpus directory under ``data/``
    :param stages: iterable of stage names ('reformat', 'vectors', 'average',
                   'compose'); skipped stages assume on-disk output exists
    :param percent: percentage of the corpus to sample for training
    :param repeat: how many independent training runs to perform
    :param remove_pos: whether PoS tags are stripped from the corpus
    :raises ValueError: when ``repeat > 1`` with a sample too large for the
                        repeats to be meaningfully distinct
    """
    project_root = os.path.abspath(os.path.join(__file__, '..', '..'))
    output_dir = join(project_root, 'outputs', 'word2vec')
    mkdirs_if_not_exists(output_dir)

    # input: CoNLL-formatted corpus
    conll_data_dir = join(project_root, 'data/%s-conll' % corpus_name)
    # outputs: plain-text corpus and the unigram-vector file
    suffix = '-nopos' if remove_pos else ''
    text_only_data_dir = join(project_root, 'data/%s%s' % (corpus_name, suffix))
    unigram_events_file = join(output_dir,
                               '%s%s-%dperc.unigr.strings' % (corpus_name, suffix, percent))

    if percent > 90 and repeat > 1:
        raise ValueError('Repeating with a different sample of corpus only makes sense when '
                         'the samples are sufficiently distinct. This requires that the sample'
                         ' size is fairly small to minimise overlap between samples')

    if 'reformat' in stages:
        reformat_data(conll_data_dir, text_only_data_dir, remove_pos)

    if 'vectors' in stages:
        models = [_train_model(percent, text_only_data_dir, run, remove_pos)
                  for run in range(repeat)]
        # write each run's vectors to its own file
        vectors = [write_gensim_vectors_to_tsv(model, unigram_events_file + '.rep%d' % run)
                   for run, model in enumerate(models)]
        if 'average' in stages and repeat > 1:
            # combine the runs over their shared vocabulary and append the
            # result so it is composed below as well.
            # NOTE(review): this sums the per-run vectors without dividing by
            # `repeat`; presumably acceptable for cosine-based downstream use
            # — confirm the scale does not matter.
            shared_vocab = set.intersection(*(set(m.vocab.keys()) for m in models))
            combined = {word: reduce(np.add, [m[word] for m in models])
                        for word in shared_vocab}
            vectors.append(write_gensim_vectors_to_tsv(combined,
                                                       unigram_events_file + '.avg%d' % repeat,
                                                       vocab=shared_vocab))
    else:
        # placeholders so the composition loop below still iterates once per
        # previously-written vector file
        vectors = [None] * repeat
        if 'average' in stages and repeat > 1:
            vectors.append(None)

    if 'compose' in stages:
        row_filter = default_row_filter_nopos if remove_pos else default_row_filter
        for idx, vec in enumerate(vectors):
            # When vectors were computed in this run we can compose straight
            # from memory; otherwise read the file written by a previous run.
            if 'average' in stages and idx == len(vectors) - 1 and len(vectors) > 1:
                # last list entry holds the combined (averaged) vectors
                out_name = 'word2vec-%s_%dpercent-avg%d' % (corpus_name, percent, repeat)
                source = vec if 'vectors' in stages else unigram_events_file + '.avg%d' % repeat
            else:
                out_name = 'word2vec-%s_%dpercent-rep%d' % (corpus_name, percent, idx)
                source = vec if 'vectors' in stages else unigram_events_file + '.rep%d' % idx
            compose_and_write_vectors(source, out_name, composer_algos,
                                      output_dir=output_dir, row_filter=row_filter,
                                      remove_pos=remove_pos, dense_hd5=True)
# NOTE(review): the two statements below appear to be the tail of a function
# whose `def` line is outside this chunk (`dv` is not defined here) — verify
# their enclosing scope before relying on this placement.
dv.to_tsv(turian_unigram_vectors_file)
logging.info('Done')


if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO,
                        format="%(asctime)s\t%(module)s.%(funcName)s (line %(lineno)d)\t%(levelname)s : %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument('--stages', choices=('fancy-compose', 'format', 'simple-compose'),
                        nargs='+', required=True,
                        help="""Stages are as follows:
 - fancy-compose: runs Socher's code (Turian unigrams and Socher composition)
 - format: converts output of previous stage to Byblo-compatible files
 - simple-compose: does Add/Mult... composition on Turian unigrams, as converted in previous stage
 """)
    args = parser.parse_args()

    if 'fancy-compose' in args.stages:
        run_socher_code()
    if 'format' in args.stages:
        # write just the unigram vectors for other composers to use
        write_clean_turian_unigrams()
        reformat_socher_vectors()
    if 'simple-compose' in args.stages:
        composers = [AdditiveComposer, MultiplicativeComposer,
                     LeftmostWordComposer, RightmostWordComposer, VerbComposer]
        compose_and_write_vectors(turian_unigram_vectors_file, 'turian', composers,
                                  output_dir=output_dir, gzipped=False, dense_hd5=True)
glove_script = join(args.glove_dir, 'demo2.sh') # TODO explain how to set param in that script if args.corpus == 'wiki': # todo explain what these are and why formatting is needed pos_only_data_dir = join(prefix, 'data/wiki/') unlabelled_data = join(prefix, 'data/wikipedia.oneline') else: pos_only_data_dir = join(prefix, 'data/gigaw/') unlabelled_data = join(prefix, 'data/gigaw.oneline') raw_vectors_file = join(args.glove_dir, 'vectors.txt') # what GloVe produces formatted_vectors_file = join(output_dir, 'vectors.%s.h5' % args.corpus) # unigram vectors in my format # DO THE ACTUAL WORK composer_algos = [AdditiveComposer, MultiplicativeComposer, LeftmostWordComposer, RightmostWordComposer, VerbComposer] if 'reformat' in args.stages: reformat_data() if 'vectors' in args.stages: run_glove() if 'compose' in args.stages: logging.info('Loading labelled corpora and composing phrase vectors therein') compose_and_write_vectors(formatted_vectors_file, 'glove-%s'%args.corpus, composer_algos, output_dir=output_dir, dense_hd5=True)