def __missing__(self, k):
    """Lazily load, freeze and cache the alphabet for POS tag *k*.

    Invoked by dict on a missing key: substitutes the tag into the
    configured filename pattern, reads the UTF-8 alphabet file, marks it
    non-growing and stores it so later lookups hit the cache.
    """
    path = self.pat % {'pos_tag': k}
    alphabet = CPPUniAlphabet(want_utf8=self.want_utf8)
    print >>sys.stderr, "[FilePatternDict] load %s" % (path,)
    alphabet.fromfile_utf8(file(path))
    # Freeze: looking up unknown symbols must not extend the alphabet.
    alphabet.growing = False
    self[k] = alphabet
    return alphabet
def read_input_pairs(f):
    """Read whitespace-separated lines from *f* and collect word pairs.

    Column 3 supplies the first word, column 0 the second.  Returns a
    triple (pair_alphabet, word_alphabet, pair_list); the pair alphabet
    is frozen so only pairs seen here are ever valid.
    """
    pair_alphabet = CPPUniAlphabet()
    word_alphabet = CPPUniAlphabet()
    pairs = []
    for raw_line in f:
        fields = raw_line.strip().split()
        first, second = fields[3], fields[0]
        # Bare lookups grow the (still-growing) alphabets as a side effect.
        pair_alphabet[u'%s_%s' % (first, second)]
        word_alphabet[first]
        word_alphabet[second]
        pairs.append((first, second))
    pair_alphabet.growing = False  # stick to known word pairs
    return pair_alphabet, word_alphabet, pairs
def compile_alphabets(language, suffix='', wanted_alphs=None):
    """Collect the vocabulary of every configured dataset for *language*
    and write one alphabet file per POS key.

    suffix       -- appended to each output filename (e.g. for variants)
    wanted_alphs -- optional whitelist of POS keys; others are skipped
    """
    # Filename patterns for pair vs. single-word alphabets, from config.
    pair_pat=get_config_var('dist_sim.$lang.pair_alph_pattern',{'lang':language})
    word_pat=get_config_var('dist_sim.$lang.word_alph_pattern',{'lang':language})
    wanted_words=defaultdict(set)
    conf=get_config_var('dist_sim.'+language)
    # Each dataset contributes its vocabulary, keyed by POS tag / pair key.
    for name,cf in conf['datasets'].iteritems():
        print >>sys.stderr, language, name
        dat=Dataset(name,cf,language)
        dat.add_to_vocabulary(wanted_words)
    # Trailing comma: keep the progress report on one stderr line.
    print >>sys.stderr, "Saving",
    for k,v in wanted_words.iteritems():
        if wanted_alphs is not None and k not in wanted_alphs:
            continue
        print >>sys.stderr, k,
        # Single-character keys are word POS tags; longer keys name pairs.
        # NOTE(review): presumably pair keys are e.g. 'NN' two-tag strings
        # -- confirm against Dataset.add_to_vocabulary.
        if len(k)==1:
            fname=word_pat%{'pos_tag':k}+suffix
        else:
            fname=pair_pat%{'pos_tag':k}+suffix
        alph=CPPUniAlphabet(want_utf8=True)
        for word in v:
            # Bare lookup grows the alphabet by one entry.
            alph[word]
        alph.tofile(file(fname,'w'))
    # Terminate the single-line progress report.
    print >>sys.stderr
def load_target_alph(self):
    """Read and return the target alphabet via the 'target_alph' pattern."""
    target = CPPUniAlphabet()
    target.fromfile(self.open_by_pat('target_alph'))
    return target
def load_component_alph(self, name):
    """Read the component alphabet for matrix *name*.

    Side effect: the first alphabet loaded fixes self.max_range (the
    component-index range) if it has not been set yet.
    """
    component = CPPUniAlphabet()
    component.fromfile(self.open_by_pat('component_alph', matrix_name=name))
    if self.max_range is None:
        self.max_range = len(component)
    return component
def load_component_alph(self):
    """Read and return the component alphabet for this object's own matrix."""
    component = CPPUniAlphabet()
    component.fromfile(
        self.open_by_pat('component_alph', matrix_name=self.matrix_name))
    return component
def create_bow_pair(corpora, language, pos_pairs, outdir='.', alph_suffix=''): unigram_alph = CPPUniAlphabet() unigram_alph.fromfile( file(os.path.join(outdir, 'unigram%s_alph.txt' % (alph_suffix, )))) unigram_alph.growing = False bigram_alph = CPPUniAlphabet() bigram_alph.fromfile( file(os.path.join(outdir, 'bigram%s_alph.txt' % (alph_suffix, )))) bigram_alph.growing = False if opts.limit != -1: prefix_l.append('%d' % (opts.limit / 1000)) pair_alphs = get_pair_alphs_by_pos(language) for word_pos in pos_pairs: pair_alph = pair_alphs[word_pos] pair_feat_alph = CPPUniAlphabet() word_matrix = None for corpus_name in corpora: print "word pair features for %s" % (pos_pair, ) pair_feat_alph = CPPUniAlphabet() for corpus_name in corpora: wmat = gather_pair_vectors( [x.split('_', 1) for x in pair_alph], att, att_find, att_sent, unigram_alph, bigram_alph, pair_feat_alph, forward_mapping_by_pos(pos_pair[0]), forward_mapping_by_pos(pos_pair[1]), opts.limit) if word_matrix is None: word_matrix = wmat else: word_matrix += wmat pair_feat_alph.tofile_utf8( file('pair_bow%s%s_alph.txt' % ( infix, pos_pair, ), 'w')) word_matrix.write_binary( file('pair_bow%s%s_mtx.bin' % ( infix, pos_pair, ), 'w'))
def create_bow_tag(corpora, language, pos_tags, outdir='.', alph_suffix=''): # Step 1: extract unigram distributions for words unigram_alph = CPPUniAlphabet() unigram_alph.fromfile( file(os.path.join(outdir, 'unigram%s_alph.txt' % (alph_suffix, )))) unigram_alph.growing = False bigram_alph = CPPUniAlphabet() bigram_alph.fromfile( file(os.path.join(outdir, 'bigram%s_alph.txt' % (alph_suffix, )))) bigram_alph.growing = False infix = '_'.join(prefix_l) if infix != '': infix = '_' + infix if opts.limit != -1: prefix_l.append('%d' % (opts.limit / 1000)) word_matrix = None word_alphs = get_word_alphs_by_pos(language) for word_pos in pos_tags: word_alph = word_alphs[word_pos] word_feat_alph = CPPUniAlphabet() for corpus_name in corpora: corpus = Corpus(corpus_name) att = corpus.attribute(opts.attr_name, 'p') att_find = corpus.attribute('tb_lemma', 'p') att_sent = corpus.attribute('s', 's') pair_alphs = get_pair_alphs_by_pos(opts.language) word_alphs = get_word_alphs_by_pos(opts.language) print "word features for %s in %s" % (word_pos, corpus_name) wmat = gather_word_vectors(list(word_alph), att, att_find, att_sent, unigram_alph, bigram_alph, word_feat_alph, forward_mapping_by_pos(word_pos), opts.limit) if word_matrix is None: word_matrix = wmat else: word_matrix += wmat word_feat_alph.tofile_utf8( file( os.path.join(opts.outdir, 'word_bow%s%s_alph.txt' % ( infix, word_pos, )), 'w')) word_matrix.write_binary( file( os.path.join(opts.outdir, 'word_bow%s%s_mtx.bin' % ( infix, word_pos, )), 'w'))