def create_bow_tag(corpora, language, pos_tags, outdir='.', alph_suffix=''):
    """Build bag-of-words feature matrices for single words, one per POS tag.

    For every tag in ``pos_tags`` the matrices returned by
    ``gather_word_vectors`` are summed over all ``corpora``.  The feature
    alphabet filled along the way and the summed matrix are then written to
    ``word_bow<infix><tag>_alph.txt`` and ``word_bow<infix><tag>_mtx.bin``.

    Args:
        corpora: iterable of corpus names; each one is passed to ``Corpus``.
        language: language key handed to ``get_word_alphs_by_pos``.
        pos_tags: iterable of POS tags to process.
        outdir: directory that holds ``unigram<suffix>_alph.txt`` and
            ``bigram<suffix>_alph.txt``; the output files are written
            there as well.
        alph_suffix: suffix inserted into the unigram/bigram alphabet
            file names.

    The function still reads the module-level ``opts`` (``attr_name`` and
    ``limit``) and ``prefix_l``; ``prefix_l`` is only read, never modified.
    """
    limit = opts.limit

    # Step 1: extract unigram distributions for words
    unigram_alph = CPPUniAlphabet()
    unigram_path = os.path.join(outdir,
                                'unigram%s_alph.txt' % (alph_suffix,))
    with open(unigram_path) as f_in:
        unigram_alph.fromfile(f_in)
    unigram_alph.growing = False

    bigram_alph = CPPUniAlphabet()
    bigram_path = os.path.join(outdir,
                               'bigram%s_alph.txt' % (alph_suffix,))
    with open(bigram_path) as f_in:
        bigram_alph.fromfile(f_in)
    bigram_alph.growing = False

    # Work on a copy of the shared prefix list so that repeated calls do not
    # keep growing it, and add the limit tag at most once so the file names
    # do not depend on what ran before.
    name_parts = list(prefix_l)
    if limit != -1:
        limit_tag = '%d' % (limit // 1000)
        if limit_tag not in name_parts:
            name_parts.append(limit_tag)
    infix = '_'.join(name_parts)
    if infix != '':
        infix = '_' + infix

    word_alphs = get_word_alphs_by_pos(language)
    for word_pos in pos_tags:
        word_alph = word_alphs[word_pos]
        word_feat_alph = CPPUniAlphabet()
        # Reset per tag: the feature alphabet is fresh for every tag, so a
        # matrix carried over from the previous tag must not be added in.
        word_matrix = None
        for corpus_name in corpora:
            corpus = Corpus(corpus_name)
            att = corpus.attribute(opts.attr_name, 'p')
            att_find = corpus.attribute('tb_lemma', 'p')
            att_sent = corpus.attribute('s', 's')
            print("word features for %s in %s" % (word_pos, corpus_name))
            wmat = gather_word_vectors(
                list(word_alph), att, att_find, att_sent,
                unigram_alph, bigram_alph, word_feat_alph,
                forward_mapping_by_pos(word_pos), limit)
            if word_matrix is None:
                word_matrix = wmat
            else:
                word_matrix += wmat

        if word_matrix is None:
            # No corpus was processed, so there is nothing to write.
            continue

        alph_path = os.path.join(
            outdir, 'word_bow%s%s_alph.txt' % (infix, word_pos))
        with open(alph_path, 'w') as f_out:
            word_feat_alph.tofile_utf8(f_out)

        mtx_path = os.path.join(
            outdir, 'word_bow%s%s_mtx.bin' % (infix, word_pos))
        # Binary mode keeps the matrix bytes intact on every platform.
        with open(mtx_path, 'wb') as f_out:
            word_matrix.write_binary(f_out)
def __missing__(self, k):
    """Load, cache and return the alphabet for the missing key ``k``.

    The file name is produced by filling ``k`` into ``self.pat`` under the
    name ``pos_tag``.  The alphabet is read from that file, its ``growing``
    flag is switched off, and it is stored under ``k`` before being returned.
    """
    fname = self.pat % {'pos_tag': k}
    alph = CPPUniAlphabet(want_utf8=self.want_utf8)
    sys.stderr.write("[FilePatternDict] load %s" % (fname,) + "\n")
    # Close the file as soon as the alphabet has been read instead of
    # leaving the handle to the garbage collector.
    with open(fname) as f_in:
        alph.fromfile_utf8(f_in)
    alph.growing = False
    self[k] = alph
    return alph
def create_bow_pair(corpora, language, pos_pairs, outdir='.', alph_suffix=''):
    """Build bag-of-words feature matrices for word pairs, one per POS pair.

    For every entry in ``pos_pairs`` the matrices returned by
    ``gather_pair_vectors`` are summed over all ``corpora``.  The feature
    alphabet filled along the way and the summed matrix are then written to
    ``pair_bow<infix><pos_pair>_alph.txt`` and
    ``pair_bow<infix><pos_pair>_mtx.bin``.

    Args:
        corpora: iterable of corpus names; each one is passed to ``Corpus``.
        language: language key handed to ``get_pair_alphs_by_pos``.
        pos_pairs: iterable of POS-pair keys; elements ``[0]`` and ``[1]``
            of each key are passed to ``forward_mapping_by_pos``.
        outdir: directory that holds ``unigram<suffix>_alph.txt`` and
            ``bigram<suffix>_alph.txt``; the output files are written
            there as well (the default ``'.'`` is the working directory).
        alph_suffix: suffix inserted into the unigram/bigram alphabet
            file names.

    The function still reads the module-level ``opts`` (``attr_name`` and
    ``limit``) and ``prefix_l``; ``prefix_l`` is only read, never modified.
    """
    limit = opts.limit

    unigram_alph = CPPUniAlphabet()
    unigram_path = os.path.join(outdir,
                                'unigram%s_alph.txt' % (alph_suffix,))
    with open(unigram_path) as f_in:
        unigram_alph.fromfile(f_in)
    unigram_alph.growing = False

    bigram_alph = CPPUniAlphabet()
    bigram_path = os.path.join(outdir,
                               'bigram%s_alph.txt' % (alph_suffix,))
    with open(bigram_path) as f_in:
        bigram_alph.fromfile(f_in)
    bigram_alph.growing = False

    # Same file-name infix scheme as create_bow_tag.  A copy of the shared
    # prefix list is used so repeated calls do not keep growing it, and the
    # limit tag is added at most once.
    name_parts = list(prefix_l)
    if limit != -1:
        limit_tag = '%d' % (limit // 1000)
        if limit_tag not in name_parts:
            name_parts.append(limit_tag)
    infix = '_'.join(name_parts)
    if infix != '':
        infix = '_' + infix

    pair_alphs = get_pair_alphs_by_pos(language)
    for pos_pair in pos_pairs:
        pair_alph = pair_alphs[pos_pair]
        # One feature alphabet and one matrix per POS pair, shared by all
        # corpora so that the per-corpus matrices can be summed.
        pair_feat_alph = CPPUniAlphabet()
        word_matrix = None
        for corpus_name in corpora:
            # NOTE(review): the corpus attributes are set up the same way
            # as in create_bow_tag -- confirm this is what pairs need.
            corpus = Corpus(corpus_name)
            att = corpus.attribute(opts.attr_name, 'p')
            att_find = corpus.attribute('tb_lemma', 'p')
            att_sent = corpus.attribute('s', 's')
            print("word pair features for %s" % (pos_pair,))
            wmat = gather_pair_vectors(
                [x.split('_', 1) for x in pair_alph],
                att, att_find, att_sent,
                unigram_alph, bigram_alph, pair_feat_alph,
                forward_mapping_by_pos(pos_pair[0]),
                forward_mapping_by_pos(pos_pair[1]),
                limit)
            if word_matrix is None:
                word_matrix = wmat
            else:
                word_matrix += wmat

        if word_matrix is None:
            # No corpus was processed, so there is nothing to write.
            continue

        alph_path = os.path.join(
            outdir, 'pair_bow%s%s_alph.txt' % (infix, pos_pair))
        with open(alph_path, 'w') as f_out:
            pair_feat_alph.tofile_utf8(f_out)

        mtx_path = os.path.join(
            outdir, 'pair_bow%s%s_mtx.bin' % (infix, pos_pair))
        # Binary mode keeps the matrix bytes intact on every platform.
        with open(mtx_path, 'wb') as f_out:
            word_matrix.write_binary(f_out)
def read_input_pairs(f):
    """Collect word pairs from the lines of ``f``.

    Each line is stripped and split on whitespace; the field at index 3 is
    taken as the first word and the field at index 0 as the second.

    Returns a 3-tuple:
        * an alphabet in which every ``<word1>_<word2>`` key was looked up,
          with ``growing`` switched off afterwards,
        * an alphabet in which every individual word was looked up,
        * the list of ``(word1, word2)`` tuples in input order.
    """
    pair_alphabet = CPPUniAlphabet()
    word_alphabet = CPPUniAlphabet()
    pairs = []
    for raw_line in f:
        fields = raw_line.strip().split()
        first, second = fields[3], fields[0]
        pair_key = u'%s_%s' % (first, second)
        # The lookups are done only for their effect on the alphabets;
        # the values they return are not needed here.
        pair_alphabet[pair_key]
        word_alphabet[first]
        word_alphabet[second]
        pairs.append((first, second))
    # stick to known word pairs
    pair_alphabet.growing = False
    return pair_alphabet, word_alphabet, pairs