def traverseTree(tree):
    """Pre-order depth-first walk of an NLTK parse tree.

    Appends each internal node's label and each leaf token to the
    module-level ``dfs_list``.

    FIX: the original appended a non-Tree node and then *iterated* it as
    well — for a string leaf that appended every single character to
    ``dfs_list``. A non-Tree node is now recorded once and the walk stops.
    """
    global dfs_list
    if not isinstance(tree, nltk.tree.Tree):
        # Leaf token (plain string): record it once; nothing to descend into.
        dfs_list.append(tree)
        return
    dfs_list.append(tree.label())
    for subtree in tree:
        if isinstance(subtree, nltk.tree.Tree):
            traverseTree(subtree)
        else:
            dfs_list.append(subtree)


# ---- script body: load news data, NLP model and dictionaries ----
# NOTE(review): nltk / pd / spacy / datetime are not imported in this
# visible chunk — presumably imported at the real top of this file; confirm.
DBconn = ctbcfxSQL()

# Preprocess of news data: pull recent Zerohedge articles, lowercase content.
raw_data = DBconn.query(cols='*', table='zerohedge',
                        condition_str='time > "2017-10-20 00:00:00"')
fomc_news = pd.DataFrame(raw_data)
fomc_news['content'] = fomc_news['content'].apply(lambda x: x.lower())

now = datetime.now()

# Load spaCy English model.
nlp = spacy.load('en')

# Load entity & sentiment dictionary.
entity_cb = pd.read_csv(r'C:\Users\z00013855\Desktop\treemodel\ctbc1026.csv',
                        encoding='utf-8')
#CTBC_mod5
import sys
import os
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
import pytz

from lib.ctbcfxSQL import ctbcfxSQL

DBconn = ctbcfxSQL()  # global SQL connection


# ========= INSERT FUNCTIONS =============
def cmod(tablename, filepath):
    """Load a dictionary CSV into the entity/aspect/word tables.

    Each data row of *filepath* has the shape ``word,entity#aspect``.
    The header row is skipped; for every remaining row one record is
    inserted into each of the ``entity``, ``aspect`` and ``word`` tables
    via the module-level ``DBconn``.

    Note: *tablename* is only used in the progress message — it is kept
    for backward compatibility with existing callers.
    """
    print('start inserting file: "%s" to TABLE[%s]...' % (filepath, tablename))
    with open(filepath, 'r', encoding='utf8') as f:
        # enumerate replaces the hand-rolled `i` counter used to skip the header
        for i, line in enumerate(f):
            if i == 0:
                continue  # header row
            line = line.replace('\n', '').replace('\r', '')
            if not line:
                continue  # robustness: blank/trailing lines would raise IndexError
            data1 = line.split(',')
            word = data1[0]
            data2 = data1[1].split('#')
            entity = data2[0]
            aspect = data2[1]
            DBconn.insert('entity', ['entity'], [entity])
            DBconn.insert('aspect', ['entity', 'aspect'], [entity, aspect])
            DBconn.insert('word', ['aspect', 'word'], [aspect, word])
def _clean(df):
    """Drop mostly-empty columns (>= half NaN) then any row still holding NaN."""
    df.dropna(axis=1, how='any', inplace=True, thresh=int(len(df.index) / 2))
    df.dropna(axis=0, how='any', inplace=True)
    return df


def _insert_rows(conn, df, table, cols, time_idx):
    """Clean *df* and insert its rows into *table*.

    *time_idx* is the position of the timestamp column inside *cols*; its
    timezone suffix (everything after '+') is stripped before insertion.
    """
    dflist = _clean(df).values.tolist()
    print('Data inserting into MySQL (amount=%d)' % (len(dflist)), end="...")
    for d in dflist:
        d[time_idx] = str(d[time_idx]).split('+')[0]
        conn.insert(table, cols, d)
    print('complete!')


def _run_source(conn, label, crawler, table, cols, time_idx):
    """Run one crawler end-to-end: announce, crawl, clean, insert."""
    print('source: %s | crawler activated.' % label)
    df = crawler.execute()
    print('complete!')
    _insert_rows(conn, df, table, cols, time_idx)


def main():
    """Crawl every configured news source, load results into MySQL, dedupe.

    The six copy-pasted crawl/clean/insert stanzas of the original are
    factored into `_run_source`; output text and execution order are
    unchanged.

    NOTE(review): relies on `time`, the crawler classes and `ctbcfxSQL`
    being imported elsewhere in this file — confirm at the real file top.
    """
    # initialize utilities
    print('[start mission!]\n--------------------------------')
    startTime = time.time()
    DBconn = ctbcfxSQL()

    # start crawlers & insert databases
    _run_source(DBconn, 'CNBC', CNBC_Crawler(stoppage=25),
                'cnbc_us', ['title', 'time', 'content'], 1)
    _run_source(DBconn, 'Forex factory', Forexfactory_Crawler(),
                'forexfactory', ['time', 'title', 'content'], 0)
    _run_source(DBconn, 'Forex Live', Forexlive_Crawler(stoppage=50),
                'forexlive', ['time', 'author', 'title', 'content'], 0)
    _run_source(DBconn, 'Reuters Market', ReutersMarket_Crawler(stoppage=50),
                'reuters', ['title', 'time', 'content'], 1)
    _run_source(DBconn, 'Reuters World', ReutersWorld_Crawler(stoppage=50),
                'reuters', ['title', 'time', 'content'], 1)
    _run_source(DBconn, 'Zerohedge', Zerohedge_Crawler(stoppage=25),
                'zerohedge', ['title', 'time', 'content', 'tag'], 1)

    # delete duplicates
    print('start deleting duplicates:\nCNBC running', end="...")
    DBconn.crawler_DelDup('cnbc_us')
    print('DONE!\nForex Factory running', end="...")
    DBconn.crawler_DelDup('forexfactory')
    print('DONE!\nForex Live running', end="...")
    DBconn.crawler_DelDup('forexlive')
    print('DONE!\nReuters running', end="...")
    DBconn.crawler_DelDup('reuters')
    print('DONE!\nZerohedge running', end="...")
    DBconn.crawler_DelDup('zerohedge')
    print('DONE!')

    # process end & evaluate time consumed
    endTime = time.time()
    elapsed = endTime - startTime
    print('------------------------\ntime elapsed: %dm %.1fs'
          % (int(elapsed / 60), (elapsed % 60.0)))
    print('[endTime="%s"]\n' % (time.ctime()))