def tfIdf( articles, keywordsFilePath = settings.KEYWORDS_FILEPATH ):
    """Returns a sparse tf-idf matrix for *articles*.

    Counts keyword occurrences in each article and normalizes the
    resulting count matrix with tf-idf weighting.

    :param articles: An iterable of article strings.
        See :func:`cleaner.retrieve.getCleanArticles`
    :param keywordsFilePath: Path to *keywords.csv*
    :type keywordsFilePath: str
    :returns: The tf-idf-normalized keyword count matrix produced by
        :class:`normalize.TfIdf`.
    """
    # Stray trailing semicolon removed from the original line below.
    keywordMap = keywords.getKeywordToIndexMap( keywordsFilePath )
    counts = count.WordCounter( keywordMap )( articles )
    return normalize.TfIdf()( counts )
columns = columns, default_fill_value = 0 ) def _getCountRows( args ): timestamp, aggregator, wordCounter = args try: date = timestamp.date() except AttributeError: date = timestamp counts = wordCounter( retrieve.getDailyArticles( date ) ) try: return ( date, aggregator( counts ) ) except TypeError: return ( date, counts ) if __name__ == "__main__": begin = datetime.date( 2011, 1, 3 ) end = datetime.date( 2013, 11, 27 ) keywordsFile = join( settings.KEYWORDS_DIR, 'splist.csv' ) tickerList = keywords.getTickerList( keywordsFile ) keywordsMap = keywords.getKeywordToIndexMap( keywordsFile ) empiricalDf = getEmpiricalDataFrame( tickerList, begin, end, retrieve.adjustedClosesFilepath( filename = 'cleanSP.csv' ) ) countDf = getCountDataFrame( tickerList, count.WordCounter( keywordsMap ), empiricalDf.index ) tfidf = normalize.TfIdf()( countDf ) empiricalDf = empiricalDf.ix[ tfidf.index ] tfidf.corr().to_csv( join( settings.RESULTS_DIR, 'hft_CountCorr.csv' ) ) empiricalDf.corr().to_csv( join( settings.RESULTS_DIR, 'hft_EmpCorr.csv' ) ) # corr.to_csv( join( settings.RESULTS_DIR, 'corrtest_withSent_all.csv' ) )
# NOTE(review): the formatting of this chunk was lost (everything collapsed
# onto one physical line), and the trailing ``matrices = { ... }`` dict
# literal is cut off at the end of the visible chunk, so the code is left
# byte-identical below rather than reconstructed.
# NOTE(review): the assignment ``matrices = {`` at the end of this line
# shadows the ``matrices`` module used earlier on the same line
# (``matrices.getEmpiricalDataFrame`` / ``matrices.getCountDataFrame``);
# consider renaming the dict (e.g. ``results``) once the full file is in
# view.
# NOTE(review): ``GraphLassoCV`` was renamed ``GraphicalLassoCV`` in
# scikit-learn 0.22 -- TODO confirm the pinned sklearn version.
import sentiment from copy import deepcopy import numpy as np from sklearn.covariance import GraphLassoCV from sklearn import manifold, cluster from matplotlib.collections import LineCollection import pylab as pl ############################################################################## # Retrieve the data begin = datetime.date( 2011, 1, 3 ) end = datetime.date( 2013, 11, 27 ) tickerList = keywords.getTickerList() keywordsMap = keywords.getKeywordToIndexMap() sentCounter = count.SentimentWordCounter( keywordsMap, sentiment.classifier() ) mentionCounter = count.WordCounter( keywordsMap ) empiricalDf = matrices.getEmpiricalDataFrame( tickerList, begin, end )[ tickerList ] getTfIdf = lambda wordCounter, aggregator: normalize.TfIdf()( matrices.getCountDataFrame( tickerList, wordCounter, empiricalDf.index, aggregator = aggregator ) )[ tickerList ] tfIdfSentArticle = getTfIdf( sentCounter, None )[ tickerList ] tfIdfSentDay = getTfIdf( sentCounter, np.sum )[ tickerList ] tfIdfMentionArticle = getTfIdf( mentionCounter, None )[ tickerList ] tfIdfMentionDay = getTfIdf( mentionCounter, np.sum )[ tickerList ] matrices = { 'Empirical' : { 'By Day' : { 'Data' : empiricalDf } }, 'Signed Mentions' : { 'By Day' : { 'Data' : tfIdfSentDay },