def hash_sentences_for_day(date):
    """Return a dict mapping sentence_hash(sentence) -> sentence for every
    sentence found in the sentence files recorded for `date`.

    NOTE(review): if two distinct sentences collide on sentence_hash, the
    later one silently wins -- presumably acceptable for dedup purposes,
    but confirm against how callers use the result.
    """
    hashed_sentences = {}
    # Files are grouped by date elsewhere; KeyError here means no data for `date`.
    sentence_files = cand_sentences.grouped_sentence_files_by_date()[date]
    for sentence_file in sentence_files:
        # Use a context manager so each file handle is closed deterministically.
        # The original iterated an anonymous open(...) and relied on GC to
        # release the descriptor.
        with open(cand_sentences.SENTENCES_DIR + '/' + sentence_file) as f:
            for sentence in f:
                hashed_sentences[sentence_hash(sentence)] = sentence
    return hashed_sentences
def main():
    """Experiment driver: trains SVM (and baseline) predictors on candidate
    sentence features and evaluates predicted Intrade price changes.

    NOTE(review): this source arrived with all indentation collapsed; the
    nesting below is reconstructed from the statement order and may need
    confirming against the original file (flagged inline where ambiguous).
    """
    intrade_parser = intrade_data_parser.IntradeDataParser()
    feature_indexer = FeatureIndexer()
    print 'reading sentences'
    grouped_sentence_files = cand_sentences.grouped_sentence_files_by_date()
    # Restrict the experiment to Romney files only (filter by filename).
    for k, v in grouped_sentence_files.iteritems():
        grouped_sentence_files[k] = [x for x in v if 'romney' in x]
    #while len(grouped_sentence_files) > 10:
    #  grouped_sentence_files.popitem()
    print 'precomputing feature sizes'
    precompute_feature_indexer_size(feature_indexer, grouped_sentence_files)
    doc_reader = DocumentReader()
    # One stopword per line in 'english.stop'; strip trailing newlines.
    stopwords = set(map(str.strip, open('english.stop').readlines()))
    # Two bag-of-words encoders: with stopword removal and without.
    stopworded_encoder = BagOfWordsDocumentEncoder(stopwords, feature_indexer,
                                                   doc_reader)
    plain_encoder = BagOfWordsDocumentEncoder(set(), feature_indexer,
                                              doc_reader)
    # Trend-aware wrappers; the (3, 10) arguments are window parameters --
    # exact semantics live in TrendingEncoder (not visible here).
    stopworded_trend_encoder3_10 = TrendingEncoder(stopworded_encoder, 3, 10)
    plain_trend_encoder3_10 = TrendingEncoder(plain_encoder, 3, 10)
    learners = [
        SvmPredictor(stopworded_encoder, 'stopword_svm'),
        #SvmPredictor(plain_encoder, 'plain_svm'),
        #SvmPredictor(stopworded_trend_encoder3_10, 'stop_trend_3_10'),
        SvmPredictor(plain_trend_encoder3_10, 'plain_trend_3_10'),
        #VorpalCandPricePredictor(stopworded_encoder, 'stop_vow_all')
    ]
    # 100 random baselines: the real learners are later ranked against them.
    for i in range(100):
        fake_learner = FakeLearner()
        learners.append(fake_learner)
    eval_stats = [EvaluationStats(learner, doc_reader) for learner in learners]
    training_data = []
    labels = []
    # Warm-up: withhold evaluation for the first few dates so the learners
    # have some training data before predictions start counting.
    dates_to_skip = 5
    for date in sorted(grouped_sentence_files):
        # Hold out dates whose last character is '2' or '7' entirely
        # (neither trained on nor evaluated) -- a crude 1-in-5 holdout.
        if date[-1] == '2' or date[-1] == '7':
            continue
        #if dates_to_skip <= -5: break
        print date
        for file_name in grouped_sentence_files[date]:
            # Candidate name is the filename up to the first '.'.
            cand = file_name[:file_name.find('.'):]
            # `next` shadows the builtin here; harmless in this scope.
            cur, next = intrade_parser.get_cur_and_next_price(cand, date)
            # Label = next-day price change; learners predict its direction.
            change = next - cur
            training_data.append((cand, date))
            labels.append(change)
            # Still in the warm-up window: accumulate training data only.
            if dates_to_skip > 0:
                continue
            for learner, eval_stat in zip(learners, eval_stats):
                eval_stat.eval_prediction(change, cand, date)
        # NOTE(review): reconstructed as per-date (matching the variable
        # name); confirm this decrement was not inside the file loop.
        dates_to_skip -= 1
        if dates_to_skip <= 0:
            # Refit on all data seen so far before evaluating the next date.
            for learner in learners:
                learner.fit(training_data, labels)
    coefs = []
    print 'num features', feature_indexer.num_features()
    # print 'coef size', len(learners[0].learner.coef_[0])
    # for ind in range(feature_indexer.num_features()):
    #     coefs.append((feature_indexer.get_ith_feature_name(ind),
    #                   learners[0].learner.coef_[0][ind]))
    # coefs.sort(key = lambda x: -abs(x[1]))
    # for feature, weight in coefs[:500]:
    #     print feature, weight
    # Rank every learner (real + fake baselines) by profit, then accuracy;
    # printing the real learners' rank shows whether they beat chance.
    eval_stats.sort(key=lambda x: -x.profit())
    for ind, e in enumerate(eval_stats):
        if e.learner.name != 'fake':
            print e.learner.name, 'profit', ind, float(ind) / len(eval_stats), e.profit()
    eval_stats.sort(key=lambda x: -x.accuracy())
    for ind, e in enumerate(eval_stats):
        if e.learner.name != 'fake':
            print e.learner.name, 'accuracy', ind, float(ind) / len(eval_stats), e.accuracy()