def main(rpath, thresh, fvocab):
    """ Build vocab and save it into a pickle file

    Every entry of ``rpath`` whose name ends with 'merge' is read with a
    DocReader and passed to ``VocabGenerator.build``. Once all files have
    been processed the generator's ``filter()`` is called, the resulting
    vocab is saved through ``savevocab`` and is additionally written as
    tab-separated text to 'vocab.txt' in the current working directory.

    :param rpath: directory that is scanned for '*merge' files
    :param thresh: forwarded unchanged to ``VocabGenerator(thresh=...)``
    :param fvocab: output file name; '.pickle.gz' is appended when the
        name does not already end with it
    """
    vg = VocabGenerator(thresh=thresh)
    dr = DocReader()
    flist = [join(rpath, fname) for fname in listdir(rpath)
             if fname.endswith('merge')]
    for fname in flist:
        # Single-argument print(...) behaves identically on Python 2 and 3
        print("Reading file: {}".format(fname))
        doc = dr.read(fname)
        vg.build(doc)
    # Filtering happens once, after every document has been seen
    vg.filter()
    vocab = vg.getvocab()
    print("Vocab size = {}".format(len(vocab)))
    if not fvocab.endswith('.pickle.gz'):
        fvocab += '.pickle.gz'
    vg.savevocab(fvocab)
    # Plain-text copy of the vocab, one "feature<TAB>index" pair per line.
    # NOTE(review): assumes getvocab() returns a dict-like mapping - confirm.
    with open('vocab.txt', 'w') as fout:
        # items() instead of the Python 2-only iteritems()
        for (feat, idx) in vocab.items():
            fout.write(str(feat) + '\t' + str(idx) + '\n')
def main(rpath, fdata, fvocab):
    """ Create data and dump it into a pickle file

    The vocab is unpickled from the gzip file ``fvocab`` and handed to a
    SampleGenerator. Every entry of ``rpath`` whose name ends with 'merge'
    is read with a DocReader and passed to ``SampleGenerator.build``. The
    matrix and labels returned by ``getmat()`` are then pickled, as the
    dict ``{'data': M, 'labels': labels}``, into a gzip file.

    :param rpath: directory that is scanned for '*merge' files
    :param fdata: output file name; '.pickle.gz' is appended when the
        name does not already end with it
    :param fvocab: path of the gzip-compressed pickle holding the vocab
    """
    print('Load vocab ...')
    # Context manager so the gzip handle is closed once the vocab is read.
    # NOTE: unpickling executes arbitrary code - only load trusted files.
    with gzip.open(fvocab) as fin:
        vocab = load(fin)
    dr = DocReader()
    sg = SampleGenerator(vocab)
    flist = [join(rpath, fname) for fname in listdir(rpath)
             if fname.endswith('merge')]
    for fname in flist:
        doc = dr.read(fname)
        sg.build(doc)
    M, labels = sg.getmat()
    print('M.shape = {}, len(labels) = {}'.format(M.shape, len(labels)))
    data = {'data': M, 'labels': labels}
    if not fdata.endswith('.pickle.gz'):
        fdata += '.pickle.gz'
    with gzip.open(fdata, 'w') as fout:
        dump(data, fout)
    print('Save data into file: {}'.format(fdata))