Exemple #1
0
def configure(config):
    """Initialise the module-level TF-IDF scorer and training vectors.

    Sets two globals:

    * ``score`` -- whatever ``tfidf.scorer`` returns for
      ``config['tfidf_model']`` and ``config['tfidf_dict']``.
    * ``train_tfidf`` -- the object unpickled from the file at
      ``config['train_tfidf']``.

    Args:
        config: Subscriptable configuration providing the keys
            ``'tfidf_model'``, ``'tfidf_dict'`` and ``'train_tfidf'``
            (the latter is used as the path of the pickle file to read).
    """
    global score, train_tfidf
    score = tfidf.scorer(config['tfidf_model'], config['tfidf_dict'])
    logging.info('Loading training corpus TF-IDF vectors')
    # Pickle data is a byte stream: read it in binary mode so it is not
    # mangled by newline translation (Windows) or text decoding.
    # NOTE(security): unpickling can execute arbitrary code -- only point
    # config['train_tfidf'] at files from a trusted source.
    with open(config['train_tfidf'], 'rb') as fp:
        train_tfidf = cPickle.load(fp)
    # Per the log message: shape[1] is reported as the sentence count and
    # shape[0] as the vocabulary size.
    logging.info('Read vectors for %d sentences | vocabulary size = %d',
            train_tfidf.shape[1], train_tfidf.shape[0])
Exemple #2
0
def configure(config):
    """Load the TF-IDF scorer and the pickled training vectors into globals.

    After the call, the module-level name ``score`` holds the result of
    ``tfidf.scorer(config['tfidf_model'], config['tfidf_dict'])`` and
    ``train_tfidf`` holds the object unpickled from ``config['train_tfidf']``.

    Args:
        config: Subscriptable configuration with the keys ``'tfidf_model'``,
            ``'tfidf_dict'`` and ``'train_tfidf'`` (path of the pickle file).
    """
    global score, train_tfidf
    score = tfidf.scorer(config['tfidf_model'], config['tfidf_dict'])
    logging.info('Loading training corpus TF-IDF vectors')
    # Open in binary mode: a pickle is bytes, and text mode can corrupt it
    # through newline translation or fail outright on decoding.
    # NOTE(security): cPickle.load runs arbitrary code embedded in the file;
    # the path must refer to trusted data only.
    with open(config['train_tfidf'], 'rb') as fp:
        train_tfidf = cPickle.load(fp)
    # The message reports shape[1] as sentences and shape[0] as vocabulary.
    logging.info('Read vectors for %d sentences | vocabulary size = %d',
                 train_tfidf.shape[1], train_tfidf.shape[0])
Exemple #3
0
def main(model, dic, corpus, output):
    """Score every line of a corpus file and pickle the resulting vectors.

    Each line of ``corpus`` is split on whitespace and passed to the scorer
    built by ``tfidf.scorer(model, dic)``. The collected results are converted
    with ``corpus2csc`` and pickled to ``output``.

    Args:
        model: First argument forwarded to ``tfidf.scorer``.
        dic: Second argument forwarded to ``tfidf.scorer``.
        corpus: Path of the text file to read, one sentence per line.
        output: Path of the pickle file to write.
    """
    logging.basicConfig(level=logging.INFO)
    score = tfidf.scorer(model, dic)
    # First pass only counts lines so the progress bar knows its maximum.
    with open(corpus) as fp:
        n_sentences = sum(1 for _ in fp)
    logging.info('Computing tf-idf vectors for %d sentences', n_sentences)
    bar = pb.ProgressBar(widgets=[pb.Percentage(), pb.Bar(), pb.ETA()], maxval=n_sentences)
    # Second pass does the actual scoring, wrapped by the progress bar.
    with open(corpus) as fp:
        transforms = [score(sentence.split()) for sentence in bar(fp)]
    logging.info('Saving tf-idf information to %s', output)
    # HIGHEST_PROTOCOL is a binary pickle format, so the file must be opened
    # in binary mode; text mode ('w') can corrupt the stream or fail to write.
    with open(output, 'wb') as fp:
        cPickle.dump(corpus2csc(transforms), fp, protocol=cPickle.HIGHEST_PROTOCOL)