Example #1
0
def build_ngram_corpus(args):
    """Build an n-gram corpus from the sampled lines under an input dir.

    args: argv-style list; args[2] is the input directory, args[3] is the
        maximum n-gram order (its string form is also the output-dir suffix).

    Side effects: creates the output directory if missing, then writes the
    processed corpus and a vocab-count file into it.
    """
    indir = args[2]
    maxngram = int(args[3])
    # NOTE(review): plain concatenation, no path separator — output dir is
    # e.g. "<indir>3". Looks intentional (dir suffix); confirm before changing.
    mydir = indir + args[3]
    sampled_lines = getfilelines(indir + '/all.sampled', upto=-1)
    # Single-argument print(...) behaves identically under Python 2 and 3,
    # replacing the py2-only "print 'x'" statement form.
    print('sampled lines are loaded')
    lines = filter_sentences(sampled_lines)
    print('filtering finished')
    if not os.path.exists(mydir):
        os.makedirs(mydir)
    ngram2freq = calc_ngram_freq(lines, maxngram)
    voc2idx = build_vocab(ngram2freq, min_freq=5)
    # NOTE(review): the comma in 'corpus,processed.txt' looks like a typo for
    # '.' or '-', but downstream readers may depend on the exact name — kept.
    process_corpus(lines, voc2idx, ngram2freq, maxngram,
                   mydir + '/corpus,processed.txt')
    write_vocab(voc2idx, ngram2freq, mydir + '/vocab-count.txt')
Example #2
0
def build_ngram_corpus(args):
    """Build an n-gram corpus from the sampled lines under an input dir.

    args: argv-style list; args[2] is the input directory, args[3] is the
        maximum n-gram order (its string form is also the output-dir suffix).

    Side effects: creates the output directory if missing, then writes the
    processed corpus and a vocab-count file into it.
    """
    indir = args[2]
    maxngram = int(args[3])
    # NOTE(review): plain concatenation, no path separator — output dir is
    # e.g. "<indir>3". Looks intentional (dir suffix); confirm before changing.
    mydir = indir + args[3]
    sampled_lines = getfilelines(indir + '/all.sampled', upto=-1)
    # Single-argument print(...) behaves identically under Python 2 and 3,
    # replacing the py2-only "print 'x'" statement form.
    print('sampled lines are loaded')
    lines = filter_sentences(sampled_lines)
    print('filtering finished')
    if not os.path.exists(mydir):
        os.makedirs(mydir)
    ngram2freq = calc_ngram_freq(lines, maxngram)
    voc2idx = build_vocab(ngram2freq, min_freq=5)
    # NOTE(review): the comma in 'corpus,processed.txt' looks like a typo for
    # '.' or '-', but downstream readers may depend on the exact name — kept.
    process_corpus(lines, voc2idx, ngram2freq, maxngram,
                   mydir + '/corpus,processed.txt')
    write_vocab(voc2idx, ngram2freq, mydir + '/vocab-count.txt')
def load_lines(lines_file, e2types, upto=-1):
    """Load tab-separated entity lines and group them three ways.

    lines_file: path to a TSV file; each row must have exactly 5 fields,
        with the entity spec in column 1 and the line text in column 4.
    e2types: mapping entity -> sequence of types; e2types[e][0] is taken
        as the entity's notable type.
    upto: line-count cutoff (-1 = no limit; see off-by-one note below).

    Returns (e2lines, t2lines, e2freq):
        e2lines: entity -> list of line texts
        t2lines: notable type -> list of (text, entity) pairs
        e2freq:  entity -> number of lines seen for that entity

    Raises AssertionError on a malformed row (field count != 5), after
    printing the offending row for debugging.
    """
    logger.info('loading lines from %s ...', lines_file)
    e2lines = defaultdict(list)
    e2freq = defaultdict(int)  # idiomatic zero default (was lambda: 0)
    t2lines = defaultdict(list)
    c = 0
    lines = getfilelines(lines_file)
    for line in lines:
        parts = line.split('\t')
        if len(parts) != 5:
            # Dump the offending row before the assert below fires.
            # Single-argument print(...) is py2/py3-compatible, replacing
            # the py2-only "print x" statement form.
            print(len(parts))
            print(line)
        assert len(parts) == 5
        mye = parseents(parts[1])[0]
        text = parts[4].strip()
        e2lines[mye].append(text)
        e2freq[mye] += 1
        # Group the line under the entity's notable (first) type.
        t2lines[e2types[mye][0]].append((text, mye))
        # NOTE(review): the break is checked after processing, so upto=N
        # keeps N+1 lines; preserved as-is in case callers rely on it.
        if c == upto:
            break
        c += 1
    logger.info('... lines loaded')

    return (e2lines, t2lines, e2freq)