Exemple #1
0
def create_bow_tag(corpora, language, pos_tags, outdir='.', alph_suffix=''):
    """Build and save one bag-of-words feature matrix per POS tag.

    For every tag in ``pos_tags`` the word alphabet for that tag is looked
    up, ``gather_word_vectors`` is run once per corpus in ``corpora`` and the
    resulting matrices are summed.  The feature alphabet and the summed
    matrix are then written to ``word_bow<infix><tag>_alph.txt`` and
    ``word_bow<infix><tag>_mtx.bin``.

    Args:
        corpora: iterable of corpus names accepted by ``Corpus``.
        language: language key passed to ``get_word_alphs_by_pos``.
        pos_tags: iterable of POS tags; each one is used as a key into the
            word alphabets, as the argument of ``forward_mapping_by_pos``
            and as part of the output file names.
        outdir: directory containing ``unigram<suffix>_alph.txt`` and
            ``bigram<suffix>_alph.txt``.
        alph_suffix: suffix inserted into the two input alphabet file names.

    Raises:
        AttributeError: if ``corpora`` is empty (there is no matrix to write).

    Relies on the module-level ``opts`` (``limit``, ``attr_name``,
    ``outdir``) and ``prefix_l``.
    """
    # Step 1: load the fixed unigram/bigram alphabets and freeze them.
    unigram_path = os.path.join(outdir, 'unigram%s_alph.txt' % (alph_suffix, ))
    unigram_alph = CPPUniAlphabet()
    with open(unigram_path) as f_in:
        unigram_alph.fromfile(f_in)
    unigram_alph.growing = False

    bigram_path = os.path.join(outdir, 'bigram%s_alph.txt' % (alph_suffix, ))
    bigram_alph = CPPUniAlphabet()
    with open(bigram_path) as f_in:
        bigram_alph.fromfile(f_in)
    bigram_alph.growing = False

    infix = '_'.join(prefix_l)
    if infix != '':
        infix = '_' + infix
    # NOTE(review): the limit marker is appended only after ``infix`` has
    # been built, so it does not show up in this call's file names, and
    # ``prefix_l`` grows on every call.  Order kept as found -- confirm.
    if opts.limit != -1:
        prefix_l.append('%d' % (opts.limit // 1000))

    # Step 2: gather word features per POS tag.
    word_alphs = get_word_alphs_by_pos(language)
    for word_pos in pos_tags:
        word_alph = word_alphs[word_pos]
        word_feat_alph = CPPUniAlphabet()
        # Start a fresh matrix for each tag: the feature alphabet is reset
        # per tag too, so counts from a previous tag must not leak in.
        word_matrix = None
        for corpus_name in corpora:
            corpus = Corpus(corpus_name)
            att = corpus.attribute(opts.attr_name, 'p')
            att_find = corpus.attribute('tb_lemma', 'p')
            att_sent = corpus.attribute('s', 's')
            print("word features for %s in %s" % (word_pos, corpus_name))
            wmat = gather_word_vectors(list(word_alph), att, att_find,
                                       att_sent, unigram_alph, bigram_alph,
                                       word_feat_alph,
                                       forward_mapping_by_pos(word_pos),
                                       opts.limit)
            if word_matrix is None:
                word_matrix = wmat
            else:
                word_matrix += wmat

        # NOTE(review): results go to ``opts.outdir`` while the input
        # alphabets are read from the ``outdir`` argument -- confirm that
        # callers always pass the same directory.
        alph_path = os.path.join(
            opts.outdir, 'word_bow%s%s_alph.txt' % (infix, word_pos))
        with open(alph_path, 'w') as f_out:
            word_feat_alph.tofile_utf8(f_out)
        mtx_path = os.path.join(
            opts.outdir, 'word_bow%s%s_mtx.bin' % (infix, word_pos))
        # Binary mode keeps the matrix dump intact on platforms that
        # translate line endings in text mode.
        with open(mtx_path, 'wb') as f_out:
            word_matrix.write_binary(f_out)
Exemple #2
0
 def __missing__(self, k):
     """Load the alphabet for key ``k`` from disk, cache it and return it.

     The file name is obtained by filling ``self.pat`` with
     ``{'pos_tag': k}``.  The loaded alphabet is frozen
     (``growing = False``) and stored under ``k`` so the file is read only
     once per key.  Presumably the enclosing class is a ``dict`` subclass,
     in which case this hook runs on the first lookup of an unknown key.
     """
     fname = self.pat % {'pos_tag': k}
     alph = CPPUniAlphabet(want_utf8=self.want_utf8)
     sys.stderr.write("[FilePatternDict] load %s\n" % (fname,))
     # Close the file as soon as the alphabet has been read.
     with open(fname) as f_in:
         alph.fromfile_utf8(f_in)
     alph.growing = False
     self[k] = alph
     return alph
Exemple #3
0
def create_bow_pair(corpora, language, pos_pairs, outdir='.', alph_suffix=''):
    """Build and save one bag-of-words feature matrix per POS pair.

    For every entry of ``pos_pairs`` the pair alphabet is looked up, its
    ``word1_word2`` entries are split into two-element lists, and
    ``gather_pair_vectors`` is run once per corpus in ``corpora``; the
    resulting matrices are summed.  The feature alphabet and the summed
    matrix are written to ``pair_bow<infix><pos_pair>_alph.txt`` and
    ``pair_bow<infix><pos_pair>_mtx.bin`` in the current working directory.

    Args:
        corpora: iterable of corpus names accepted by ``Corpus``.
        language: language key passed to ``get_pair_alphs_by_pos``.
        pos_pairs: iterable of POS pairs; each one is used as a key into the
            pair alphabets, its elements ``[0]`` and ``[1]`` are passed to
            ``forward_mapping_by_pos``, and it is part of the file names.
        outdir: directory containing ``unigram<suffix>_alph.txt`` and
            ``bigram<suffix>_alph.txt``.
        alph_suffix: suffix inserted into the two input alphabet file names.

    Raises:
        AttributeError: if ``corpora`` is empty (there is no matrix to write).

    Relies on the module-level ``opts`` (``limit``, ``attr_name``) and
    ``prefix_l``.
    """
    # Load the fixed unigram/bigram alphabets and freeze them.
    unigram_path = os.path.join(outdir, 'unigram%s_alph.txt' % (alph_suffix, ))
    unigram_alph = CPPUniAlphabet()
    with open(unigram_path) as f_in:
        unigram_alph.fromfile(f_in)
    unigram_alph.growing = False

    bigram_path = os.path.join(outdir, 'bigram%s_alph.txt' % (alph_suffix, ))
    bigram_alph = CPPUniAlphabet()
    with open(bigram_path) as f_in:
        bigram_alph.fromfile(f_in)
    bigram_alph.growing = False

    # File-name infix, built the same way (and in the same order relative
    # to the limit marker) as in create_bow_tag so both outputs agree.
    # NOTE(review): the limit marker therefore does not appear in this
    # call's file names -- confirm that this is intended.
    infix = '_'.join(prefix_l)
    if infix != '':
        infix = '_' + infix
    if opts.limit != -1:
        prefix_l.append('%d' % (opts.limit // 1000))

    pair_alphs = get_pair_alphs_by_pos(language)
    for pos_pair in pos_pairs:
        pair_alph = pair_alphs[pos_pair]
        # One feature alphabet and one matrix per POS pair, shared by all
        # corpora so that their counts add up in the same feature space.
        pair_feat_alph = CPPUniAlphabet()
        word_matrix = None
        for corpus_name in corpora:
            # Corpus attributes are set up as in create_bow_tag.
            # NOTE(review): attribute names copied from there -- confirm.
            corpus = Corpus(corpus_name)
            att = corpus.attribute(opts.attr_name, 'p')
            att_find = corpus.attribute('tb_lemma', 'p')
            att_sent = corpus.attribute('s', 's')
            print("word pair features for %s" % (pos_pair, ))
            wmat = gather_pair_vectors(
                [x.split('_', 1) for x in pair_alph], att, att_find,
                att_sent, unigram_alph, bigram_alph, pair_feat_alph,
                forward_mapping_by_pos(pos_pair[0]),
                forward_mapping_by_pos(pos_pair[1]), opts.limit)
            if word_matrix is None:
                word_matrix = wmat
            else:
                word_matrix += wmat

        with open('pair_bow%s%s_alph.txt' % (infix, pos_pair), 'w') as f_out:
            pair_feat_alph.tofile_utf8(f_out)
        # Binary mode keeps the matrix dump intact on platforms that
        # translate line endings in text mode.
        with open('pair_bow%s%s_mtx.bin' % (infix, pos_pair), 'wb') as f_out:
            word_matrix.write_binary(f_out)
Exemple #4
0
def read_input_pairs(f):
    """Read word pairs from whitespace-separated lines.

    For each non-blank line the fourth field is taken as ``word1`` and the
    first field as ``word2``.

    Args:
        f: iterable of text lines (for example an open file).

    Returns:
        A tuple ``(alph, alph_w, word_pairs)``: ``alph`` was indexed with
        every ``word1_word2`` key and then frozen, ``alph_w`` was indexed
        with every individual word, and ``word_pairs`` lists the
        ``(word1, word2)`` tuples in input order, duplicates included.

    Raises:
        IndexError: if a non-blank line has fewer than four fields.
    """
    alph = CPPUniAlphabet()
    alph_w = CPPUniAlphabet()
    word_pairs = []
    for raw_line in f:
        fields = raw_line.split()
        if not fields:
            # Skip blank lines (e.g. a trailing empty line) instead of
            # failing on the field lookups below.
            continue
        word1 = fields[3]
        word2 = fields[0]
        # The lookups are done for their side effect only; presumably
        # indexing a growing alphabet registers the key -- results unused.
        alph[u'%s_%s' % (word1, word2)]
        alph_w[word1]
        alph_w[word2]
        word_pairs.append((word1, word2))
    alph.growing = False  # stick to known word pairs
    return alph, alph_w, word_pairs