Exemple #1
0
def create_bow_tag(corpora, language, pos_tags, outdir='.', alph_suffix=''):
    """Gather word feature matrices per POS tag and write them to *outdir*.

    For every tag in *pos_tags*, ``gather_word_vectors`` is called once per
    corpus in *corpora* for the words in that tag's alphabet (looked up in
    ``get_word_alphs_by_pos(language)``), and the per-corpus results are
    summed with ``+=``.  The feature alphabet collected for the tag is then
    written to ``word_bow<infix><tag>_alph.txt`` and the summed matrix to
    ``word_bow<infix><tag>_mtx.bin``.

    Args:
        corpora: iterable of corpus names; each one is passed to ``Corpus``.
        language: passed to ``get_word_alphs_by_pos``.
        pos_tags: iterable of tags.  Each tag is used as a key into the word
            alphabets, passed to ``forward_mapping_by_pos`` and embedded in
            the output file names.
        outdir: directory that holds ``unigram<alph_suffix>_alph.txt`` and
            ``bigram<alph_suffix>_alph.txt`` and that receives the outputs.
        alph_suffix: string inserted into the unigram/bigram alphabet file
            names.

    Besides its parameters this function reads the module-level ``opts``
    (``opts.limit``, ``opts.attr_name``) and ``prefix_l``.
    """
    # Load the existing unigram/bigram alphabets and stop them from growing.
    unigram_alph = CPPUniAlphabet()
    unigram_path = os.path.join(outdir,
                                'unigram%s_alph.txt' % (alph_suffix, ))
    with open(unigram_path) as f_in:
        unigram_alph.fromfile(f_in)
    unigram_alph.growing = False

    bigram_alph = CPPUniAlphabet()
    bigram_path = os.path.join(outdir, 'bigram%s_alph.txt' % (alph_suffix, ))
    with open(bigram_path) as f_in:
        bigram_alph.fromfile(f_in)
    bigram_alph.growing = False

    # NOTE(review): infix is built *before* the limit is appended to the
    # shared prefix_l list, so the limit never shows up in the file names of
    # this call, and prefix_l grows on every call.  This looks unintended,
    # but the expected file naming cannot be determined from this function
    # alone, so the original order is kept -- please confirm.
    infix = '_'.join(prefix_l)
    if infix != '':
        infix = '_' + infix
    if opts.limit != -1:
        prefix_l.append('%d' % (opts.limit // 1000))

    word_alphs = get_word_alphs_by_pos(language)
    for word_pos in pos_tags:
        word_alph = word_alphs[word_pos]
        word_feat_alph = CPPUniAlphabet()
        # Start from scratch for each tag; otherwise the matrix written for
        # one tag would also contain the sums of all previous tags.
        word_matrix = None
        for corpus_name in corpora:
            corpus = Corpus(corpus_name)
            att = corpus.attribute(opts.attr_name, 'p')
            att_find = corpus.attribute('tb_lemma', 'p')
            att_sent = corpus.attribute('s', 's')
            print("word features for %s in %s" % (word_pos, corpus_name))
            wmat = gather_word_vectors(list(word_alph), att, att_find,
                                       att_sent, unigram_alph, bigram_alph,
                                       word_feat_alph,
                                       forward_mapping_by_pos(word_pos),
                                       opts.limit)
            if word_matrix is None:
                word_matrix = wmat
            else:
                word_matrix += wmat

        # Outputs go next to the input alphabets, i.e. into *outdir*.
        alph_path = os.path.join(
            outdir, 'word_bow%s%s_alph.txt' % (infix, word_pos))
        with open(alph_path, 'w') as f_out:
            word_feat_alph.tofile_utf8(f_out)

        mtx_path = os.path.join(
            outdir, 'word_bow%s%s_mtx.bin' % (infix, word_pos))
        # Binary mode so the matrix bytes are written untranslated.
        with open(mtx_path, 'wb') as f_out:
            word_matrix.write_binary(f_out)
Exemple #2
0
def create_bow_pair(corpora, language, pos_pairs, outdir='.', alph_suffix=''):
    """Gather word-pair feature matrices per POS pair and write them out.

    For every entry in *pos_pairs*, ``gather_pair_vectors`` is called once
    per corpus in *corpora* for the pairs in that entry's alphabet (looked up
    in ``get_pair_alphs_by_pos(language)``; each alphabet item is split once
    at ``'_'`` into its two parts), and the per-corpus results are summed
    with ``+=``.  The collected feature alphabet is written to
    ``pair_bow<infix><pos_pair>_alph.txt`` and the summed matrix to
    ``pair_bow<infix><pos_pair>_mtx.bin``.

    Args:
        corpora: iterable of corpus names; each one is passed to ``Corpus``.
        language: passed to ``get_pair_alphs_by_pos``.
        pos_pairs: iterable of POS pairs.  Each entry is used as a key into
            the pair alphabets and embedded in the output file names; its
            elements ``[0]`` and ``[1]`` are passed to
            ``forward_mapping_by_pos``.
        outdir: directory that holds ``unigram<alph_suffix>_alph.txt`` and
            ``bigram<alph_suffix>_alph.txt`` and that receives the outputs.
        alph_suffix: string inserted into the unigram/bigram alphabet file
            names.

    Besides its parameters this function reads the module-level ``opts``
    (``opts.limit``, ``opts.attr_name``), ``prefix_l`` and ``infix``.
    """
    # Load the existing unigram/bigram alphabets and stop them from growing.
    unigram_alph = CPPUniAlphabet()
    unigram_path = os.path.join(outdir,
                                'unigram%s_alph.txt' % (alph_suffix, ))
    with open(unigram_path) as f_in:
        unigram_alph.fromfile(f_in)
    unigram_alph.growing = False

    bigram_alph = CPPUniAlphabet()
    bigram_path = os.path.join(outdir, 'bigram%s_alph.txt' % (alph_suffix, ))
    with open(bigram_path) as f_in:
        bigram_alph.fromfile(f_in)
    bigram_alph.growing = False

    # NOTE(review): this appends to the shared prefix_l list on every call,
    # and `infix` (used in the output names below) is not assigned anywhere
    # in this function, so it must exist at module level.  How the two are
    # meant to relate cannot be determined from here -- please confirm.
    if opts.limit != -1:
        prefix_l.append('%d' % (opts.limit // 1000))

    pair_alphs = get_pair_alphs_by_pos(language)
    for pos_pair in pos_pairs:
        pair_alph = pair_alphs[pos_pair]
        # One feature alphabet and one matrix per POS pair, shared by all
        # corpora, so the summed matrix and the written alphabet belong
        # together.
        pair_feat_alph = CPPUniAlphabet()
        word_matrix = None
        for corpus_name in corpora:
            # Same per-corpus attribute setup as create_bow_tag uses.
            corpus = Corpus(corpus_name)
            att = corpus.attribute(opts.attr_name, 'p')
            att_find = corpus.attribute('tb_lemma', 'p')
            att_sent = corpus.attribute('s', 's')
            print("word pair features for %s" % (pos_pair, ))
            wmat = gather_pair_vectors(
                [x.split('_', 1) for x in pair_alph], att, att_find,
                att_sent, unigram_alph, bigram_alph, pair_feat_alph,
                forward_mapping_by_pos(pos_pair[0]),
                forward_mapping_by_pos(pos_pair[1]), opts.limit)
            if word_matrix is None:
                word_matrix = wmat
            else:
                word_matrix += wmat

        # With the default outdir='.' these are the same locations the
        # bare file names resolved to before.
        alph_path = os.path.join(
            outdir, 'pair_bow%s%s_alph.txt' % (infix, pos_pair))
        with open(alph_path, 'w') as f_out:
            pair_feat_alph.tofile_utf8(f_out)

        mtx_path = os.path.join(
            outdir, 'pair_bow%s%s_mtx.bin' % (infix, pos_pair))
        # Binary mode so the matrix bytes are written untranslated.
        with open(mtx_path, 'wb') as f_out:
            word_matrix.write_binary(f_out)