Example #1
0
def tfIdf( articles, keywordsFilePath = settings.KEYWORDS_FILEPATH ):
    """Return a sparse tf-idf matrix built from *articles*.

    Each article is tokenized/counted against the keyword->index map loaded
    from *keywordsFilePath*, then the raw counts are tf-idf normalized.

    :param articles: An iterable of article strings. See :func:`cleaner.retrieve.getCleanArticles`
    :param keywordsFilePath: Path to *keywords.csv*
    :type keywordsFilePath: str
    """
    # Fixed: removed the stray trailing semicolon (un-Pythonic no-op).
    keywordMap = keywords.getKeywordToIndexMap( keywordsFilePath )
    counts = count.WordCounter( keywordMap )( articles )
    return normalize.TfIdf()( counts )
Example #2
0
                               columns = columns,
                               default_fill_value = 0 )

def _getCountRows( args ):
    """Produce one ``(date, counts)`` row for a single work item.

    ``args`` is a ``(timestamp, aggregator, wordCounter)`` tuple so this
    helper can be mapped over an iterable of tasks (e.g. by a pool).
    """
    timestamp, aggregator, wordCounter = args
    # A datetime exposes .date(); a plain date raises AttributeError here.
    try:
        day = timestamp.date()
    except AttributeError:
        day = timestamp
    dailyCounts = wordCounter( retrieve.getDailyArticles( day ) )
    # aggregator may be None (not callable) -> fall back to the raw counts.
    try:
        aggregated = aggregator( dailyCounts )
    except TypeError:
        return ( day, dailyCounts )
    return ( day, aggregated )
    
if __name__ == "__main__":
    # Build the empirical-returns and tf-idf word-count DataFrames over the
    # study window, then write their correlation matrices to RESULTS_DIR.
    begin = datetime.date( 2011, 1, 3 )
    end = datetime.date( 2013, 11, 27 )
    keywordsFile = join( settings.KEYWORDS_DIR, 'splist.csv' )
    tickerList = keywords.getTickerList( keywordsFile )
    keywordsMap = keywords.getKeywordToIndexMap( keywordsFile )
    empiricalDf = getEmpiricalDataFrame( tickerList, begin, end,
                                         retrieve.adjustedClosesFilepath( filename = 'cleanSP.csv' ) )
    countDf = getCountDataFrame( tickerList,
                                 count.WordCounter( keywordsMap ),
                                 empiricalDf.index )
    tfidf = normalize.TfIdf()( countDf )
    # Fixed: DataFrame.ix is deprecated (removed in pandas >= 1.0); use the
    # label-based .loc, which matches the intent of indexing by tfidf.index.
    empiricalDf = empiricalDf.loc[ tfidf.index ]
    tfidf.corr().to_csv( join( settings.RESULTS_DIR, 'hft_CountCorr.csv' ) )
    empiricalDf.corr().to_csv( join( settings.RESULTS_DIR, 'hft_EmpCorr.csv' ) )
Example #3
0
import sentiment

from copy import deepcopy

import numpy as np
from sklearn.covariance import GraphLassoCV
from sklearn import manifold, cluster
from matplotlib.collections import LineCollection
import pylab as pl

##############################################################################
# Retrieve the data
begin = datetime.date( 2011, 1, 3 )
end = datetime.date( 2013, 11, 27 )
tickerList = keywords.getTickerList()
keywordsMap = keywords.getKeywordToIndexMap()
sentCounter = count.SentimentWordCounter( keywordsMap, sentiment.classifier() )
mentionCounter = count.WordCounter( keywordsMap )

empiricalDf = matrices.getEmpiricalDataFrame( tickerList, begin, end )[ tickerList ]

# Fixed: named lambda replaced with a def (PEP 8 E731) and documented.
def getTfIdf( wordCounter, aggregator ):
    """Tf-idf of daily counts from *wordCounter*, restricted to tickerList columns."""
    countDf = matrices.getCountDataFrame( tickerList, wordCounter,
                                          empiricalDf.index, aggregator = aggregator )
    return normalize.TfIdf()( countDf )[ tickerList ]

# aggregator=None keeps per-article rows; np.sum collapses to one row per day.
# Fixed: dropped the redundant second [ tickerList ] at each call site --
# getTfIdf already selects exactly those columns, so reselecting is a no-op.
tfIdfSentArticle = getTfIdf( sentCounter, None )
tfIdfSentDay = getTfIdf( sentCounter, np.sum )
tfIdfMentionArticle = getTfIdf( mentionCounter, None )
tfIdfMentionDay = getTfIdf( mentionCounter, np.sum )

matrices = { 'Empirical' : { 
                            'By Day' : { 'Data' : empiricalDf } 
              },
             'Signed Mentions' : {
                            'By Day' : { 'Data' : tfIdfSentDay },