Code Example #1
import os

import cand_sentences  # project module; provides SENTENCES_DIR and file groupings


def hash_sentences_for_day(date):
    """Map the hash of each sentence seen on `date` to the sentence itself."""
    hashed_sentences = {}
    sentence_files = cand_sentences.grouped_sentence_files_by_date()[date]
    for sentence_file in sentence_files:
        # sentence_hash() is defined elsewhere in this project.
        with open(os.path.join(cand_sentences.SENTENCES_DIR, sentence_file)) as f:
            for sentence in f:
                hashed_sentences[sentence_hash(sentence)] = sentence
    return hashed_sentences
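A minimal usage sketch follows; the date strings are assumptions (the code above only requires keys of grouped_sentence_files_by_date()).

# Hypothetical usage: find sentences that appeared on both of two days.
day1 = hash_sentences_for_day('20121001')  # date format is an assumption
day2 = hash_sentences_for_day('20121002')
repeated = day1.keys() & day2.keys()
print(len(repeated), 'sentences appear on both days')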
Code Example #2
import cand_sentences
import intrade_data_parser
# FeatureIndexer, DocumentReader, the encoders, the predictors, EvaluationStats,
# and the other helpers used below are defined elsewhere in this project.


def main():
    intrade_parser = intrade_data_parser.IntradeDataParser()
    feature_indexer = FeatureIndexer()

    print('reading sentences')
    # Keep only the sentence files that mention Romney.
    grouped_sentence_files = cand_sentences.grouped_sentence_files_by_date()
    for date, files in grouped_sentence_files.items():
        grouped_sentence_files[date] = [f for f in files if 'romney' in f]
    #while len(grouped_sentence_files) > 10:
    #    grouped_sentence_files.popitem()

    print('precomputing feature sizes')
    precompute_feature_indexer_size(feature_indexer, grouped_sentence_files)

    doc_reader = DocumentReader()
    with open('english.stop') as f:
        stopwords = {line.strip() for line in f}
    stopworded_encoder = BagOfWordsDocumentEncoder(stopwords, feature_indexer,
                                                   doc_reader)
    plain_encoder = BagOfWordsDocumentEncoder(set(), feature_indexer,
                                              doc_reader)
    stopworded_trend_encoder3_10 = TrendingEncoder(stopworded_encoder, 3, 10)
    plain_trend_encoder3_10 = TrendingEncoder(plain_encoder, 3, 10)
    learners = [
        SvmPredictor(stopworded_encoder, 'stopword_svm'),
        #SvmPredictor(plain_encoder, 'plain_svm'),
        #SvmPredictor(stopworded_trend_encoder3_10, 'stop_trend_3_10'),
        SvmPredictor(plain_trend_encoder3_10, 'plain_trend_3_10'),
        #VorpalCandPricePredictor(stopworded_encoder, 'stop_vow_all')
    ]
    # Add 100 FakeLearner baselines; real learners are ranked against them below.
    learners.extend(FakeLearner() for _ in range(100))
    eval_stats = [EvaluationStats(learner, doc_reader) for learner in learners]

    training_data = []
    labels = []
    # Warm-up: skip evaluation for the first few dates, until there is
    # enough data to train on.
    dates_to_skip = 5

    for date in sorted(grouped_sentence_files):
        # Skip dates ending in 2 or 7 entirely.
        if date[-1] in ('2', '7'):
            continue
        #if dates_to_skip <= -5: break
        print(date)
        for file_name in grouped_sentence_files[date]:
            # The candidate name is the file name's stem.
            cand = file_name[:file_name.find('.')]
            cur_price, next_price = intrade_parser.get_cur_and_next_price(cand, date)
            change = next_price - cur_price
            training_data.append((cand, date))
            labels.append(change)
            if dates_to_skip > 0:
                continue  # still warming up; collect data but don't evaluate

            # Evaluate before refitting, so predictions use only past data.
            for eval_stat in eval_stats:
                eval_stat.eval_prediction(change, cand, date)

        dates_to_skip -= 1
        if dates_to_skip <= 0:
            # Refit every learner on all data seen so far.
            for learner in learners:
                learner.fit(training_data, labels)

    print('num features', feature_indexer.num_features())
    # Optional: inspect the heaviest SVM coefficients.
    # coefs = []
    # print('coef size', len(learners[0].learner.coef_[0]))
    # for ind in range(feature_indexer.num_features()):
    #     coefs.append((feature_indexer.get_ith_feature_name(ind),
    #                   learners[0].learner.coef_[0][ind]))
    # coefs.sort(key=lambda x: -abs(x[1]))
    # for feature, weight in coefs[:500]:
    #     print(feature, weight)
    
    # Rank every learner (real and fake) by profit, then by accuracy; the printed
    # rank fraction is the real learner's percentile among the fake baselines.
    eval_stats.sort(key=lambda x: -x.profit())
    for ind, e in enumerate(eval_stats):
        if e.learner.name != 'fake':
            print(e.learner.name, 'profit', ind, ind / len(eval_stats), e.profit())
    eval_stats.sort(key=lambda x: -x.accuracy())
    for ind, e in enumerate(eval_stats):
        if e.learner.name != 'fake':
            print(e.learner.name, 'accuracy', ind, ind / len(eval_stats), e.accuracy())
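For reference, a minimal sketch of the baseline pattern the 100 FakeLearner instances supply. Only the name 'fake' and the fit() call are confirmed by the code above; the predict() interface and the random guess are assumptions.

import random


class FakeLearner:
    # Sketch of a random baseline; everything beyond name and fit() is assumed.
    name = 'fake'

    def fit(self, training_data, labels):
        pass  # a random baseline learns nothing

    def predict(self, cand, date):
        # Guess the direction of the next price change at random.
        return random.choice([-1.0, 1.0])

Ranking each real learner among 100 such baselines turns its profit and accuracy into an empirical percentile, a cheap permutation-style sanity check.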