예제 #1
0
 def __missing__(self, k):
     """Lazily load the alphabet for POS tag *k* from disk and cache it.

     Invoked by dict on a missing key: builds the filename from
     self.pat, loads a CPPUniAlphabet from it, freezes the alphabet
     (growing=False) and stores it under k so later lookups hit the
     cache directly.
     """
     fname = self.pat % {'pos_tag': k}
     sys.stderr.write("[FilePatternDict] load %s\n" % (fname,))
     alph = CPPUniAlphabet(want_utf8=self.want_utf8)
     # BUG FIX: the original leaked the handle opened with file();
     # a context manager closes it deterministically.
     with open(fname) as f:
         alph.fromfile_utf8(f)
     alph.growing = False
     self[k] = alph
     return alph
예제 #2
0
파일: bow_learn.py 프로젝트: yv/MLTK_Qualia
def read_input_pairs(f):
    """Read whitespace-separated lines from *f* and collect word pairs.

    Each line contributes the pair (column 3, column 0): the combined
    key u'<w1>_<w2>' is interned in the pair alphabet and both words
    are interned in the word alphabet.  The pair alphabet is frozen
    before returning.

    Returns (pair_alphabet, word_alphabet, list_of_(w1, w2)_tuples).
    """
    pair_alph = CPPUniAlphabet()
    word_alph = CPPUniAlphabet()
    pairs = []
    for raw_line in f:
        fields = raw_line.strip().split()
        w1, w2 = fields[3], fields[0]
        # indexing an alphabet registers the entry as a side effect
        pair_alph[u'%s_%s' % (w1, w2)]
        word_alph[w1]
        word_alph[w2]
        pairs.append((w1, w2))
    pair_alph.growing = False  #stick to known word pairs
    return pair_alph, word_alph, pairs
예제 #3
0
def compile_alphabets(language, suffix='', wanted_alphs=None):
    """Collect the vocabulary of all datasets for *language* and save one
    alphabet file per POS tag (or tag pair).

    :param language: language key used to look up the dist_sim config
    :param suffix: string appended to every output filename
    :param wanted_alphs: if given, only tags contained in it are saved
    """
    pair_pat = get_config_var('dist_sim.$lang.pair_alph_pattern', {'lang': language})
    word_pat = get_config_var('dist_sim.$lang.word_alph_pattern', {'lang': language})
    wanted_words = defaultdict(set)
    conf = get_config_var('dist_sim.' + language)
    for name, cf in conf['datasets'].iteritems():
        sys.stderr.write("%s %s\n" % (language, name))
        dat = Dataset(name, cf, language)
        dat.add_to_vocabulary(wanted_words)
    sys.stderr.write("Saving")
    for k, v in wanted_words.iteritems():
        if wanted_alphs is not None and k not in wanted_alphs:
            continue
        sys.stderr.write(" %s" % (k,))
        # single-character keys are plain POS tags, longer ones tag pairs
        if len(k) == 1:
            fname = word_pat % {'pos_tag': k} + suffix
        else:
            fname = pair_pat % {'pos_tag': k} + suffix
        alph = CPPUniAlphabet(want_utf8=True)
        for word in v:
            alph[word]  # indexing registers the word in the alphabet
        # BUG FIX: write the file once, after all words are registered;
        # the original opened and rewrote (and leaked) the file inside
        # the per-word loop.
        f_out = open(fname, 'w')
        try:
            alph.tofile(f_out)
        finally:
            f_out.close()
    sys.stderr.write("\n")
예제 #4
0
 def load_target_alph(self):
     """Load and return the target alphabet via the 'target_alph' pattern."""
     result = CPPUniAlphabet()
     result.fromfile(self.open_by_pat('target_alph'))
     return result
예제 #5
0
 def load_component_alph(self, name):
     """Load the component alphabet for matrix *name*.

     Side effect: on the first call (while self.max_range is still
     None) initializes self.max_range to the alphabet's size.
     """
     component = CPPUniAlphabet()
     component.fromfile(self.open_by_pat('component_alph', matrix_name=name))
     if self.max_range is None:
         self.max_range = len(component)
     return component
예제 #6
0
 def load_component_alph(self):
     """Load the component alphabet for this object's matrix_name."""
     component = CPPUniAlphabet()
     component.fromfile(
         self.open_by_pat('component_alph', matrix_name=self.matrix_name))
     return component
예제 #7
0
파일: bow_learn.py 프로젝트: yv/MLTK_Qualia
def create_bow_pair(corpora, language, pos_pairs, outdir='.', alph_suffix=''):
    """Build bag-of-words feature matrices for word *pairs*.

    For each POS-tag pair in *pos_pairs*, gathers pair feature vectors
    over all *corpora*, then writes the feature alphabet and the summed
    matrix.  Mirrors create_bow_tag, which does the same for single
    words.  Reads opts / prefix_l from module globals like its sibling.
    """
    unigram_alph = CPPUniAlphabet()
    unigram_alph.fromfile(
        open(os.path.join(outdir, 'unigram%s_alph.txt' % (alph_suffix, ))))
    unigram_alph.growing = False
    bigram_alph = CPPUniAlphabet()
    bigram_alph.fromfile(
        open(os.path.join(outdir, 'bigram%s_alph.txt' % (alph_suffix, ))))
    bigram_alph.growing = False
    # BUG FIX: the original used `infix` without ever computing it.
    # Build it as create_bow_tag does, appending the corpus limit first
    # so it actually reaches the filenames.
    if opts.limit != -1:
        prefix_l.append('%d' % (opts.limit / 1000))
    infix = '_'.join(prefix_l)
    if infix != '':
        infix = '_' + infix
    pair_alphs = get_pair_alphs_by_pos(language)
    for pos_pair in pos_pairs:
        # BUG FIX: the original iterated as `word_pos` but referenced
        # the undefined name `pos_pair` throughout the body.
        pair_alph = pair_alphs[pos_pair]
        pair_feat_alph = CPPUniAlphabet()
        word_matrix = None
        # BUG FIX: the original nested two identical loops over
        # `corpora` and re-created pair_feat_alph inside them,
        # discarding features gathered from earlier corpora; it also
        # never set up corpus/att/att_find/att_sent.  One loop, with
        # the same corpus setup as create_bow_tag.
        for corpus_name in corpora:
            corpus = Corpus(corpus_name)
            att = corpus.attribute(opts.attr_name, 'p')
            att_find = corpus.attribute('tb_lemma', 'p')
            att_sent = corpus.attribute('s', 's')
            print("word pair features for %s" % (pos_pair, ))
            wmat = gather_pair_vectors(
                [x.split('_', 1) for x in pair_alph], att, att_find,
                att_sent, unigram_alph, bigram_alph, pair_feat_alph,
                forward_mapping_by_pos(pos_pair[0]),
                forward_mapping_by_pos(pos_pair[1]), opts.limit)
            if word_matrix is None:
                word_matrix = wmat
            else:
                word_matrix += wmat
        # write outputs next to the inputs (the original wrote to the
        # current directory although inputs came from outdir)
        pair_feat_alph.tofile_utf8(
            open(os.path.join(outdir, 'pair_bow%s%s_alph.txt' % (
                infix,
                pos_pair,
            )), 'w'))
        word_matrix.write_binary(
            open(os.path.join(outdir, 'pair_bow%s%s_mtx.bin' % (
                infix,
                pos_pair,
            )), 'wb'))
예제 #8
0
파일: bow_learn.py 프로젝트: yv/MLTK_Qualia
def create_bow_tag(corpora, language, pos_tags, outdir='.', alph_suffix=''):
    """Build bag-of-words feature matrices for single words.

    For each POS tag in *pos_tags*, gathers word feature vectors over
    all *corpora* and writes the feature alphabet and the summed matrix
    to opts.outdir.  Reads opts / prefix_l from module globals.
    """
    # Step 1: extract unigram distributions for words
    unigram_alph = CPPUniAlphabet()
    unigram_alph.fromfile(
        open(os.path.join(outdir, 'unigram%s_alph.txt' % (alph_suffix, ))))
    unigram_alph.growing = False
    bigram_alph = CPPUniAlphabet()
    bigram_alph.fromfile(
        open(os.path.join(outdir, 'bigram%s_alph.txt' % (alph_suffix, ))))
    bigram_alph.growing = False
    # BUG FIX: append the corpus limit to prefix_l *before* building the
    # infix; the original appended it afterwards, so the limit never
    # appeared in the output filenames.
    if opts.limit != -1:
        prefix_l.append('%d' % (opts.limit / 1000))
    infix = '_'.join(prefix_l)
    if infix != '': infix = '_' + infix
    word_alphs = get_word_alphs_by_pos(language)
    for word_pos in pos_tags:
        word_alph = word_alphs[word_pos]
        word_feat_alph = CPPUniAlphabet()
        # BUG FIX: reset the matrix for every POS tag; the original
        # hoisted `word_matrix = None` out of this loop, so matrices of
        # different tags were incorrectly summed together (compare the
        # per-pair reset in create_bow_pair).
        word_matrix = None
        for corpus_name in corpora:
            corpus = Corpus(corpus_name)
            att = corpus.attribute(opts.attr_name, 'p')
            att_find = corpus.attribute('tb_lemma', 'p')
            att_sent = corpus.attribute('s', 's')
            # (dropped the original's dead reassignments of pair_alphs /
            # word_alphs here, which shadowed the outer lookup table)
            print("word features for %s in %s" % (word_pos, corpus_name))
            wmat = gather_word_vectors(list(word_alph), att, att_find,
                                       att_sent, unigram_alph, bigram_alph,
                                       word_feat_alph,
                                       forward_mapping_by_pos(word_pos),
                                       opts.limit)
            if word_matrix is None:
                word_matrix = wmat
            else:
                word_matrix += wmat
        word_feat_alph.tofile_utf8(
            open(
                os.path.join(opts.outdir, 'word_bow%s%s_alph.txt' % (
                    infix,
                    word_pos,
                )), 'w'))
        # open the matrix file in binary mode ('wb'); write_binary emits
        # binary data and text mode would corrupt it on some platforms
        word_matrix.write_binary(
            open(
                os.path.join(opts.outdir, 'word_bow%s%s_mtx.bin' % (
                    infix,
                    word_pos,
                )), 'wb'))