import logging
import os
from collections import defaultdict

logger = logging.getLogger(__name__)


def build_ngram_corpus(args):
    indir = args[2]
    maxngram = int(args[3])
    mydir = indir + args[3]
    # Load the pre-sampled sentences and filter out unwanted ones.
    sampled_lines = getfilelines(indir + '/all.sampled', upto=-1)
    print('sampled lines are loaded')
    lines = filter_sentences(sampled_lines)
    print('filtering finished')
    if not os.path.exists(mydir):
        os.makedirs(mydir)
    # Count n-gram frequencies up to maxngram, build the vocabulary
    # (dropping n-grams seen fewer than 5 times), then write the
    # processed corpus and the vocabulary counts.
    ngram2freq = calc_ngram_freq(lines, maxngram)
    voc2idx = build_vocab(ngram2freq, min_freq=5)
    process_corpus(lines, voc2idx, ngram2freq, maxngram,
                   mydir + '/corpus,processed.txt')
    write_vocab(voc2idx, ngram2freq, mydir + '/vocab-count.txt')
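# calc_ngram_freq and build_vocab are defined elsewhere in the repo; the
# sketches below are an assumption of what they could look like for
# whitespace-tokenized sentences. The names and the min_freq threshold
# follow the call sites above, but these bodies are illustrative only,
# not the original implementation.
from collections import Counter

def calc_ngram_freq(lines, maxngram):
    # Count every n-gram of length 1..maxngram across all sentences.
    ngram2freq = Counter()
    for line in lines:
        tokens = line.split()
        for n in range(1, maxngram + 1):
            for i in range(len(tokens) - n + 1):
                ngram2freq[' '.join(tokens[i:i + n])] += 1
    return ngram2freq

def build_vocab(ngram2freq, min_freq=5):
    # Keep n-grams seen at least min_freq times; assign each an integer id.
    voc2idx = {}
    for ngram, freq in ngram2freq.items():
        if freq >= min_freq:
            voc2idx[ngram] = len(voc2idx)
    return voc2idx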
def load_lines(lines_file, e2types, upto=-1):
    logger.info('loading lines from %s ...', lines_file)
    e2lines = defaultdict(list)
    e2freq = defaultdict(int)
    t2lines = defaultdict(list)
    c = 0
    lines = getfilelines(lines_file)
    for line in lines:
        parts = line.split('\t')
        if len(parts) != 5:
            # Dump the malformed line before the assertion fires.
            print(len(parts))
            print(line)
        assert len(parts) == 5
        mye = parseents(parts[1])[0]
        text = parts[4].strip()
        e2lines[mye].append(text)
        e2freq[mye] += 1
        # Add text to the notable type of mye.
        t2lines[e2types[mye][0]].append((text, mye))
        if c == upto:
            break
        c += 1
    logger.info('... lines loaded')
    return (e2lines, t2lines, e2freq)
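# Hedged usage example: assumes the input is a 5-column tab-separated file
# whose second column holds the entity (parsed by parseents) and whose
# fifth column holds the sentence text. The path, entity id, and type
# names below are hypothetical, for illustration only.
if __name__ == '__main__':
    e2types = {'/m/02mjmr': ['/people/person', '/government/politician']}
    e2lines, t2lines, e2freq = load_lines('lines.sample.tsv', e2types, upto=1000)
    print(len(e2lines))                   # number of distinct entities seen
    print(e2freq['/m/02mjmr'])            # mention count for one entity
    print(t2lines['/people/person'][:3])  # (text, entity) pairs for a notable type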