def get_reuters_news_headlines(function_pointer, *args):
    """Fetch Reuters headlines via *function_pointer* and keep only those
    judged related to the firm behind ticker ``args[0]``.

    Parameters
    ----------
    function_pointer : callable
        Scraper called as ``function_pointer(*args)``; must return the tuple
        ``(headlines, abstracts, timestamps, anchors)`` of equal-length lists.
    *args
        ``args[0]`` is the ticker symbol, ``args[1]`` the date string.

    Returns
    -------
    pandas.DataFrame
        Columns ``ticker``, ``date``, ``headlines``, ``abstract``,
        ``timestamp`` — one row per related headline.
    """
    from nlp import nltk_metricle
    import finviz

    # First word of the company name, e.g. 'Apple' for 'Apple Inc.'.
    firm_name = finviz.read_finviz_data(
        'http://finviz.com/export.ashx?v=111&t=' + args[0])['Company'][0].split()[0]

    headlines_list, abstract_list, time_stamp_list, anchor_list = function_pointer(*args)

    # One set of POS-filtered feature tokens per headline.
    tokens_list_set = [
        set(nltk_metricle.find_features_from_POS(
            nltk_metricle.clean_document_return_features(headline, False),
            'NN', 'NNS', 'NNP', 'JJ', 'VB'))
        for headline in headlines_list]

    # Greedy pairing: (i, j) are related when their token overlap meets the
    # running threshold `max_` (which ratchets upward) and headline i
    # mentions the firm name.
    max_ = 1
    selected = []  # indices into the original lists, in first-seen order
    for i in xrange(0, len(tokens_list_set) - 1):
        matched = False
        for j in xrange(i + 1, len(tokens_list_set)):
            overlap = len(tokens_list_set[i].intersection(tokens_list_set[j]))
            if overlap >= max_ and firm_name in tokens_list_set[i]:
                max_ = overlap
                # BUGFIX: the original appended j unconditionally, so the
                # same headline could appear several times in the output.
                if j not in selected:
                    selected.append(j)
                matched = True
        if matched and i not in selected:
            selected.append(i)

    headlines_clean_list = [headlines_list[k] for k in selected]
    abstract_clean_list = [abstract_list[k] for k in selected]
    timestamp_clean_list = [time_stamp_list[k] for k in selected]

    df = pd.DataFrame(
        columns=['ticker', 'date', 'headlines', 'abstract', 'timestamp'],
        index=list(xrange(0, len(headlines_clean_list))))
    df['ticker'] = args[0]   # ticker symbol
    df['date'] = args[1]     # date
    df['timestamp'] = timestamp_clean_list
    df['headlines'] = headlines_clean_list
    df['abstract'] = abstract_clean_list
    return df
# NOTE(review): this redefines get_reuters_news_headlines and shadows the
# earlier definition in this file — confirm the first version is obsolete.
def get_reuters_news_headlines(function_pointer, *args):
    """Fetch Reuters headlines via *function_pointer*, keep those related to
    the firm behind ticker ``args[0]``, and extract per-headline keywords.

    Parameters
    ----------
    function_pointer : callable
        Scraper called as ``function_pointer(*args)``; must return the tuple
        ``(headlines, abstracts, timestamps, anchors)`` of equal-length lists.
    *args
        ``args[0]`` is the ticker symbol, ``args[1]`` the date string.

    Returns
    -------
    pandas.DataFrame
        Columns ``ticker``, ``date``, ``headlines``, ``abstract``,
        ``timestamp``, ``keywords`` — one row per related headline.
    """
    import finviz
    # Load the nlp helper module by path (root_directory is module-level).
    nltk_metricle = imp.load_source(
        'nltk_metricle', os.path.join(root_directory, 'nlp', 'nltk_metricle.py'))

    # Full company name, tokenized so any word of it can match a headline.
    firm_name = finviz.read_finviz_data(
        'http://finviz.com/export.ashx?v=111&t=' + args[0])['Company'][0]
    tokenized_firm_name_set = set(nltk_metricle.create_tokens(firm_name))

    headlines_list, abstract_list, time_stamp_list, anchor_list = function_pointer(*args)

    # One set of POS-filtered feature tokens per headline.
    tokens_list_set = [
        set(nltk_metricle.find_features_from_POS(
            nltk_metricle.clean_document_return_features(headline, False),
            'NN', 'NNS', 'NNP', 'JJ', 'VB'))
        for headline in headlines_list]

    # Greedy pairing: (i, j) are related when their token overlap meets the
    # running threshold `max_` (which ratchets upward) and headline j shares
    # at least one token with the company name.
    max_ = 1
    selected = []  # indices into the original lists, in first-seen order
    for i in xrange(0, len(tokens_list_set) - 1):
        matched = False
        for j in xrange(i + 1, len(tokens_list_set)):
            overlap = len(tokens_list_set[i].intersection(tokens_list_set[j]))
            if overlap >= max_ and tokenized_firm_name_set.intersection(tokens_list_set[j]):
                max_ = overlap
                # BUGFIX: the original appended j unconditionally, so the
                # same headline could appear several times in the output.
                if j not in selected:
                    selected.append(j)
                matched = True
        if matched and i not in selected:
            selected.append(i)

    headlines_clean_list = [headlines_list[k] for k in selected]
    abstract_clean_list = [abstract_list[k] for k in selected]
    timestamp_clean_list = [time_stamp_list[k] for k in selected]
    # NOTE(review): find_features_from_POS is called here without the POS-tag
    # arguments used everywhere else ('NN', 'NNS', ...) — confirm this is the
    # intended signature and not a missing-arguments bug.
    keywords_list_list = [
        nltk_metricle.find_features_from_POS(list(tokens_list_set[k]))
        for k in selected]

    # BUGFIX: 'keywords' was assigned below but missing from the columns list.
    df = pd.DataFrame(
        columns=['ticker', 'date', 'headlines', 'abstract', 'timestamp', 'keywords'],
        index=list(xrange(0, len(headlines_clean_list))))
    df['ticker'] = args[0]   # ticker symbol
    df['date'] = args[1]     # date
    df['timestamp'] = timestamp_clean_list
    df['headlines'] = headlines_clean_list
    df['abstract'] = abstract_clean_list
    df['keywords'] = keywords_list_list
    return df