def configure(config):
    """Initialise the module-level TF-IDF scorer and training vectors.

    Sets two module globals:

    * ``score`` -- the object returned by ``tfidf.scorer`` for
      ``config['tfidf_model']`` and ``config['tfidf_dict']``.
    * ``train_tfidf`` -- the object unpickled from the file named by
      ``config['train_tfidf']``.

    Args:
        config: Mapping that provides the keys ``'tfidf_model'``,
            ``'tfidf_dict'`` and ``'train_tfidf'``. The last one is the
            path of a pickle file, which must have been written in binary
            mode.

    Returns:
        None. Results are published through the module globals above.
    """
    global score, train_tfidf

    score = tfidf.scorer(config['tfidf_model'], config['tfidf_dict'])

    logging.info('Loading training corpus TF-IDF vectors')
    # Pickles are byte streams: open in binary mode so no newline/EOF
    # translation (Windows) or text decoding (Python 3) touches the data.
    # NOTE(review): unpickling can execute arbitrary code -- only point
    # config['train_tfidf'] at files produced by a trusted run.
    with open(config['train_tfidf'], 'rb') as fp:
        train_tfidf = cPickle.load(fp)

    # The message treats columns as sentences and rows as vocabulary
    # entries -- presumably a terms x documents sparse matrix; confirm
    # against whatever produced the pickle.
    logging.info('Read vectors for %d sentences | vocabulary size = %d',
                 train_tfidf.shape[1], train_tfidf.shape[0])
def main(model, dic, corpus, output):
    """Compute a TF-IDF vector per corpus line and pickle the result.

    The corpus file is read twice: once to count its lines (so the
    progress bar knows its maximum), and once to score every line.
    Each line is split on whitespace and passed to the scorer built by
    ``tfidf.scorer(model, dic)``. The collected vectors are converted
    with ``corpus2csc`` and pickled to ``output``.

    Args:
        model: First argument forwarded to ``tfidf.scorer``
            (presumably the TF-IDF model or its path -- confirm).
        dic: Second argument forwarded to ``tfidf.scorer``
            (presumably the dictionary or its path -- confirm).
        corpus: Path of the text file to process, one sentence per line.
        output: Path of the pickle file to write.

    Returns:
        None. The result is written to ``output``.
    """
    logging.basicConfig(level=logging.INFO)
    score = tfidf.scorer(model, dic)

    # First pass only counts lines so the progress bar can show an ETA.
    with open(corpus) as fp:
        n_sentences = sum(1 for _ in fp)

    logging.info('Computing tf-idf vectors for %d sentences', n_sentences)
    bar = pb.ProgressBar(widgets=[pb.Percentage(), pb.Bar(), pb.ETA()],
                         maxval=n_sentences)

    # Second pass: tokenise on whitespace and score each sentence.
    with open(corpus) as fp:
        transforms = [score(sentence.split()) for sentence in bar(fp)]

    logging.info('Saving tf-idf information to %s', output)
    # HIGHEST_PROTOCOL is a binary pickle format, so the file must be
    # opened in binary mode; text mode corrupts the stream on Windows
    # and is rejected outright on Python 3.
    with open(output, 'wb') as fp:
        cPickle.dump(corpus2csc(transforms), fp,
                     protocol=cPickle.HIGHEST_PROTOCOL)