def ind_perfs():
    """Pull Alpha Vantage's 1-day sector performance and insert one new row
    into portfolio.ind_perf when the feed has refreshed since the stored row.

    Relies on module-level helpers db_connection / det_cur_perf / pg_insert
    and the module-level ``openfigi_key`` API key.
    """
    PSQL = db_connection('psql')
    # `current`: timestamp of the newest stored row; `next_id`: next primary key.
    current, next_id = det_cur_perf(PSQL)
    url = 'https://www.alphavantage.co/query?function=SECTOR&apikey=%s' % openfigi_key
    req = requests.request('GET', url)
    data = req.json()
    day_perf = data['Rank B: 1 Day Performance']
    # BUGFIX: the feed's "Last Refreshed" uses a 12-hour clock with AM/PM
    # (e.g. "04:45 PM ET 12/31/2019"). strptime's %p only takes effect
    # together with %I, so the original '%H:%M %p' silently parsed
    # "04:45 PM" as 04:45 in the morning. %I parses it correctly.
    date = datetime.strptime(
        data['Meta Data']['Last Refreshed'].replace(' ET', ''),
        '%I:%M %p %m/%d/%Y')
    if np.datetime64(date) != current:
        # Each sector figure arrives as a string like '1.23%'; strip and cast.
        communication = float(day_perf['Communication Services'].replace('%', ''))
        discretionary = float(day_perf['Consumer Discretionary'].replace('%', ''))
        staples = float(day_perf['Consumer Staples'].replace('%', ''))
        energy = float(day_perf['Energy'].replace('%', ''))
        financial = float(day_perf['Financials'].replace('%', ''))
        health = float(day_perf['Health Care'].replace('%', ''))
        industrial = float(day_perf['Industrials'].replace('%', ''))
        it = float(day_perf['Information Technology'].replace('%', ''))
        material = float(day_perf['Materials'].replace('%', ''))
        realestate = float(day_perf['Real Estate'].replace('%', ''))
        utilities = float(day_perf['Utilities'].replace('%', ''))
        # All interpolated values are locally-produced numbers/dates, so
        # %-formatting is safe here; a parameterized query would still be
        # preferable for consistency.
        script = ("INSERT INTO portfolio.ind_perf("
                  "ind_perf_id, date, communication, discretionary, staples, "
                  "energy, financial, health, industrial, it, material, "
                  "realestate, utilities) "
                  "VALUES (%i, '%s', %.2f, %.2f, %.2f, %.2f, %.2f, %.2f, "
                  "%.2f, %.2f, %.2f, %.2f, %.2f);"
                  % (next_id, date, communication, discretionary, staples,
                     energy, financial, health, industrial, it, material,
                     realestate, utilities))
        pg_insert(PSQL.client, script)
def store_meta_res(domain):
    """Build and persist the meta-model training table for one domain.

    For every tuned final model of `domain` ('winner' or 'length', per the
    branches below): load the model, its selected feature list and scaler,
    cross-validate predictions, score each prediction's absolute error, then
    join the errors with the features and one-hot weight-class dummies and
    write the result to data/meta/meta_<domain>.csv.
    """
    # domain = 'length'
    X, Y = pull_val_data(domain)
    pred_df = pd.DataFrame(Y)
    # res_df = pd.DataFrame()
    final_model_folder = os.path.join(cur_path, 'model_tuning', 'modelling', domain, 'final', 'models')
    for mod_name in os.listdir(final_model_folder):
        if mod_name == '.DS_Store':
            continue
        # Each model folder is expected to contain exactly one serialized model.
        model_path = os.listdir(os.path.join(final_model_folder, mod_name))
        model = load(os.path.join(final_model_folder, mod_name, model_path[0]))
        feats_folder = os.path.join(cur_path, 'model_tuning', 'modelling', domain, 'final', 'features')
        with open(os.path.join(feats_folder, '%s.json' % (mod_name)), 'r') as fp:
            feats = json.load(fp)
        # JSON keys are strings, so max() is lexicographic — NOTE(review):
        # presumably the keys are iteration numbers of equal width; confirm.
        feats = feats[max(feats.keys())]
        scale_folder = os.path.join(cur_path, 'model_tuning', 'modelling', domain, 'final', 'scalers', mod_name)
        scale_path = os.path.join(scale_folder, os.listdir(os.path.join(scale_folder))[0])
        scale = load(scale_path)
        mod_preds = cross_validate(X[feats], Y, model, scale)
        # One prediction column per model, keyed by model name.
        mod_preds.rename(columns={0: mod_name}, inplace=True)
        pred_df = pred_df.join(mod_preds)
    pred_cols = [i for i in list(pred_df) if i != domain]
    mod_scores = {}
    for idx in pred_df.index:
        mod_scores[idx] = {}
        row = pred_df.loc[idx]
        for mod in pred_cols:
            # NOTE(review): both branches compute the same absolute error;
            # any domain other than 'winner'/'length' would leave row_score
            # unbound (NameError) on first use.
            if domain == 'winner':
                row_score = abs(row[domain] - row[mod])
            elif domain == 'length':
                row_score = abs(row[domain] - row[mod])
            mod_scores[idx][mod] = row_score
    mod_scores = pd.DataFrame.from_dict(mod_scores).T
    meta_data = mod_scores.join(X)
    PSQL = db_connection('psql')
    bouts = pg_query(
        PSQL.client,
        "select b.bout_id, weight_desc from ufc.bouts b join ufc.bout_results br on br.bout_id = b.bout_id join ufc.fights f on f.fight_id = b.fight_id join ufc.weights w on b.weight_id = w.weight_id"
    )
    bouts.columns = ['bout_id', 'weight_id']
    # One-hot the weight class, indexed by bout_id, de-duplicated.
    weights = pd.get_dummies(bouts['weight_id'])
    weights['index'] = bouts['bout_id']
    weights.drop_duplicates(inplace=True)
    weights.set_index('index', inplace=True)
    meta_data = meta_data.join(weights)
    meta_data.to_csv(
        os.path.join(cur_path, 'data', 'meta',
                     'meta_%s.csv' % (domain)))
def scrape():
    """Crawl ResearchGate publication-search result pages, archiving each
    unseen paper via store_abstracts.

    Resumes from the URL saved in cur_path/last_error.json and rewrites that
    file before aborting, so a restart continues where it failed. Raises
    IndexError when a page yields no links (likely blocked/layout change)
    and ValueError when a single article fails.
    """
    DB = _connections.db_connection('mongo')
    with open(os.path.join(cur_path, 'last_error.json')) as f:
        prev_progress = json.load(f)
    url = prev_progress['last_page']
    # url = 'https://www.researchgate.net/search/publications?q=marine%2Bscience&page=1'
    browser = _connections.sel_scraper(headless=False)
    while True:
        try:
            browser.get(url)  #, headers=head)
            # Random 1-10s delay between page loads to look less bot-like.
            sleep(randint(10, 100) / 10)
            tree = html.fromstring(browser.page_source)
            # NOTE(review): brittle absolute XPath into the search results.
            paper_links = tree.xpath(
                '//div[@class="react-container"]/div/div[2]/div[2]/div/div[2]/div/div[1]/div/div/div/div/div/div/div/div/div/div/a/@href'
            )
            # Strip query strings from the article URLs.
            paper_links = [i.split('?')[0] for i in paper_links]
            if len(paper_links) == 0:
                # No links on the page: clean up, then let the except branch
                # record progress and convert this into the IndexError.
                # NOTE(review): disconnect/close run again in the handler.
                DB.disconnect()
                browser.close()
                raise ValueError()
        except:  # NOTE(review): bare except also swallows unrelated errors
            with open(os.path.join(cur_path, 'last_error.json'), 'w') as fp:
                json.dump({'last_page': url}, fp)
            DB.disconnect()
            browser.close()
            raise IndexError()
        for p_link in paper_links:
            # dfas
            try:
                # find_one returns None when absent, so len(None) raises —
                # the except branch is the "not yet archived" path.
                len(DB.client.find_one({'url_tag': p_link}))
                print('Article already archived, skipping')
            except:
                if 'book_review' in p_link.lower():
                    continue
                print(p_link)
                try:
                    browser, DB = store_abstracts(p_link, browser, DB)
                except:
                    # Save both the failing article and the page before dying.
                    print(p_link)
                    with open(os.path.join(cur_path, 'last_error.json'), 'w') as fp:
                        json.dump({'error_link': p_link, 'last_page': url}, fp)
                    DB.disconnect()
                    browser.close()
                    raise ValueError()
        url = next_page(url)
def add_stock_ids():
    """Walk Robinhood's paginated /instruments/ endpoint and register any
    instruments not already tracked (delegating storage to stock_loop).

    Uses module-level helpers db_connection / _det_current_stocks / stock_loop.
    """
    PSQL = db_connection('psql')
    cur_stocks = _det_current_stocks(PSQL)
    url = 'https://api.robinhood.com/instruments/'
    data_dict = requests.get(url).json()
    cur_stocks = stock_loop(PSQL, data_dict['results'], cur_stocks)
    # BUGFIX: test 'next' BEFORE following it. The original fetched
    # data_dict['next'] first and only checked for None afterwards, so a
    # response whose 'next' was already null crashed in requests.get(None)
    # instead of terminating the pagination loop.
    while data_dict['next'] is not None:
        data_dict = requests.get(data_dict['next']).json()
        cur_stocks = stock_loop(PSQL, data_dict['results'], cur_stocks)
def update():
    """One half-hourly pipeline run: sleep until the next 30-minute boundary,
    then snapshot Binance coins/markets/prices into Postgres and ingest
    Twitter data (with Watson NLU available for sentiment).

    Relies on module-level helpers (floor_dt, db_connection,
    binance_connection, the pop_* loaders, conversion_data, insert_twitter)
    and credentials from _config.
    """
    PSQL = db_connection('psql')
    # Align the run to the next half-hour boundary.
    next_update = floor_dt(datetime.now(), timedelta(minutes=30))
    dt_until_update = next_update - datetime.now()
    seconds_before_update = dt_until_update.total_seconds()
    tm.sleep(seconds_before_update)
    BINANCE = binance_connection()
    NLU = NaturalLanguageUnderstandingV1(version=_config.nlu_credentials["version"],
                                         username=_config.nlu_credentials["username"],
                                         password=_config.nlu_credentials["password"])
    # Opt out of IBM using our requests for training.
    NLU.set_default_headers({'x-watson-learning-opt-out' : "true"})
    auth = tweepy.OAuthHandler(_config.twitter_key, _config.twitter_secret)
    TWITTER = tweepy.API(auth)
    # Scrape the coin table (name + price) from Binance's info page.
    # NOTE(review): absolute XPaths — brittle against any page redesign.
    page = requests.get('https://info.binance.com/en/all')
    tree = html.fromstring(page.content)
    COINS = [{'name':i, 'price':j} for i,j in zip(tree.xpath('//*[@id="__next"]/div/main/div/div/div/div[2]/div[1]/table/tbody/tr/td[2]/div/div/span[1]/text()'), tree.xpath('//*[@id="__next"]/div/main/div/div/div/div[2]/div[1]/table/tbody/tr/td[3]/div/div/text()'))]
    # A negative delta makes floor_dt round DOWN to the previous boundary.
    LAST_TIME = floor_dt(datetime.now(), timedelta(minutes=-30))
    PRICES = BINANCE.get_all_tickers()
    pop_coins(PSQL, COINS)
    pop_markets(PSQL, PRICES)
    pop_times(PSQL, LAST_TIME)
    CURRENT_MKT_DATA, MRKT_CONV, TIME_CONV = conversion_data(PSQL)
    pop_exchanges(PSQL, LAST_TIME, PRICES, MRKT_CONV, CURRENT_MKT_DATA, TIME_CONV, BINANCE)
    tweet_data, coin_id_conv, use_coins = insert_twitter(PSQL, TWITTER, NLU)
    # nlu_insert(tweet_data[[0,1]], NLU, PSQL, coin_id_conv, use_coins)
    # pop_prices(PSQL, TIME_CONV, COINS)
    PSQL.disconnect()
    # Drop references so the large objects can be collected between runs.
    PSQL = None
    BINANCE = None
    COINS = None
    LAST_TIME = None
    PRICES = None
    CURRENT_MKT_DATA = None
    MRKT_CONV = None
    TIME_CONV = None
    TWITTER = None
    NLU = None
    tweet_data = None
    coin_id_conv = None
    use_coins = None
def dividends(full_update = False):
    """Fetch 5 years of dividend history from IEX for every tracked stock
    and insert rows newer than what portfolio.dividends already holds.

    full_update -- when False (default), stocks that already have any
    dividend rows are skipped entirely; when True every stock is re-checked
    and only genuinely new ex-dates are inserted.
    """
    print(' ~~ Dividends ~~ ')
    PSQL = db_connection('psql')
    id_sym = pg_query(PSQL.client, 'select rh_id, rh_sym from portfolio.stocks')
    # `current`: latest stored ex-date per rh_id; `next_id`: next primary key.
    current, next_id = det_cur_divs(PSQL)
    total_stocks = len(id_sym)
    for stock_num, (idx, sym) in enumerate(id_sym.values):
        progress(stock_num, total_stocks, status = sym)
        if not full_update and idx in current.keys():
            continue
        url = 'https://api.iextrading.com/1.0/stock/%s/dividends/5y' % (sym)
        req = requests.request('GET', url)
        if req.status_code == 404:
            continue
        data = req.json()
        if len(data) == 0:
            continue
        # Oldest first, so `current[idx]` advances monotonically.
        for div in reversed(data):
            ex_date = datetime.strptime(div['exDate'], '%Y-%m-%d')
            if idx in current.keys() and ex_date <= current[idx]:
                continue
            # Skip records with missing amount or payment date.
            if div['amount'] == '':
                continue
            amount = float(div['amount'])
            if div['paymentDate'] == '':
                continue
            payment_date = datetime.strptime(div['paymentDate'], '%Y-%m-%d')
            record_date = datetime.strptime(div['recordDate'], '%Y-%m-%d')
            # declared_date becomes either the SQL literal null or a quoted
            # timestamp string, matching the bare %s slot in the INSERT.
            if div['declaredDate'] == '':
                declared_date = 'null'
            else:
                declared_date = "'"+str(datetime.strptime(div['declaredDate'], '%Y-%m-%d'))+"'"
            script = "INSERT INTO portfolio.dividends(\
div_id, rh_id, ex_date, payment_date, record_date, declared_date, amount) \
VALUES (%i, '%s', '%s', '%s', '%s', %s, %.2f);" % (next_id, idx, ex_date, payment_date, record_date, declared_date, amount)
            pg_insert(PSQL.client, script)
            next_id += 1
            current[idx] = ex_date
    # Release loop locals.
    ex_date = None
    payment_date = None
    record_date = None
    declared_date = None
    amount = None
def day_prices():
    """Pull 5 years of daily OHLCV bars from Robinhood for every tracked
    stock and insert bars newer than what portfolio.day_prices already holds.
    """
    print(' ~~ Full Day Prices ~~ ')
    PSQL = db_connection('psql')
    # `current`: latest stored bar date per rh_id; `next_id`: next primary key.
    current, next_id = det_cur_day_prices(PSQL, 'day')
    id_sym = pg_query(PSQL.client, 'select rh_id, rh_sym from portfolio.stocks')
    total_stocks = len(id_sym)
    trader = rs()
    login_data = trader.login()
    for stock_num, (idx, sym) in enumerate(id_sym.values):
        # if idx == '1d4d0780-ba27-4adc-ab12-0c3062fdf365':
        #     asdfasdf
        progress(stock_num, total_stocks, status = sym)
        symbols = helper.inputs_to_set(sym)
        url = urls.historicals()
        payload = { 'symbols' : ','.join(symbols),
                    'interval' : 'day',
                    'span' : '5year',
                    'bounds' : 'regular'}
        data = trader.request_get(url,'results',payload)
        # Robinhood returns [None] for unknown/delisted symbols.
        if data == [None]:
            continue
        for day in data[0]['historicals']:
            # 'begins_at' is ISO-8601 with Z suffix, e.g. '2019-01-02T00:00:00Z'.
            beg_date = datetime.strptime(day['begins_at'].replace('Z', '').replace('T', ' '), '%Y-%m-%d %H:%M:%S')
            if idx in current.keys() and beg_date <= current[idx]:
                continue
            open_price = float(day['open_price'])
            close_price = float(day['close_price'])
            high_price = float(day['high_price'])
            low_price = float(day['low_price'])
            volume = int(day['volume'])
            script = "INSERT INTO portfolio.day_prices(day_price_id, rh_id, date, open_price, close_price, high_price, low_price, volume) VALUES ('%s', '%s', '%s', %.2f, %.2f, %.2f, %.2f, %i);" % (next_id, idx, beg_date, open_price, close_price, high_price, low_price, volume)
            pg_insert(PSQL.client, script)
            next_id += 1
            # Release bar locals.
            open_price = None
            close_price = None
            high_price = None
            low_price = None
            volume = None
    trader.logout()
def find_arb():
    """Scan Binance pairs for triangular-arbitrage opportunities until the
    next half-hour boundary.

    For every candidate pair, simulate the ETH -> a -> b -> ETH cycle in both
    directions; print each simulated margin and raise as soon as a profitable
    cycle appears. Returns quietly when the time window elapses.
    """
    deadline = floor_dt(datetime.now(), timedelta(minutes=30))
    remaining = (deadline - datetime.now()).total_seconds()
    PSQL_2 = db_connection('psql')
    BINANCE_2 = binance_connection()
    potentials = filter_potentials(PSQL_2)
    while remaining > 0:
        for coin2, coin1, _ in potentials:
            # coin2,coin1,_ = potentials[-1]
            prices, available_eth = cap_market(coin1, coin2, BINANCE_2)
            eth_free = float(available_eth['free'])
            # Forward direction first, then the reverse of the same triangle.
            for leg_a, leg_b in ((coin2, coin1), (coin1, coin2)):
                tri = sim_tri(prices, [leg_a, leg_b], eth_free)
                if tri:
                    print('ETH -> %s -> %s -> ETH arbitrage: %.3f%%' % (leg_a, leg_b, 100 * (tri[-1][-1] - tri[0][-1])))
                    if tri[-1][-1] > tri[0][-1]:
                        print(tri)
                        raise Exception('Found Profit!!!')
        # tm.sleep(15)
        remaining = (deadline - datetime.now()).total_seconds()
    # Window elapsed: release the connections and large locals.
    BINANCE_2 = None
    PSQL_2.disconnect()
    PSQL_2 = None
    prices = None
    tri = None
from joblib import load
import pandas as pd
from _connections import db_connection
from db.pop_psql import pg_query
from progress_bar import progress


def conv_to_ml_odds(prob):
    """Convert a win probability to American moneyline-style odds.

    NOTE(review): both branches return a positive magnitude; conventional
    moneyline odds are negative for favorites (prob > .5) — confirm whether
    the sign is applied downstream. Raises ZeroDivisionError at prob 0 or 1.
    """
    if prob > .5:
        odds = (prob/(1-prob))*100
    else:
        odds = ((1-prob)/prob)*100
    return(odds)


# --- module-level script: load estimator/ensemble training data for the
# 'winner' meta-model. Runs at import time (DB connection + CSV reads).
# NOTE(review): `os` and `cur_path` are used below but neither is imported
# nor defined in this chunk — presumably set earlier in the file; confirm.
PSQL = db_connection('psql')
alpha_data = pd.read_csv(os.path.join(cur_path, 'data', 'pred_data_winner_est_training.csv'))
alpha_data = alpha_data.sort_values('bout_id').set_index(['bout_id', 'fighter_id'])
# Features = every column except the label; label coerced to strict 0/1.
X0 = alpha_data[[i for i in list(alpha_data) if i != 'winner']]
Y0 = alpha_data['winner'].apply(lambda x: x if x == 1 else 0)
beta_data = pd.read_csv(os.path.join(cur_path, 'data', 'pred_data_winner_ens_training.csv'))
beta_data = beta_data.sort_values('bout_id').set_index(['bout_id', 'fighter_id'])
X2 = beta_data[[i for i in list(beta_data) if i != 'winner']]
Y2 = beta_data['winner'].apply(lambda x: x if x == 1 else 0)
domain = 'winner'
final_model_folder = os.path.join(cur_path, 'model_tuning', 'modelling', domain, 'final', 'models')
# Walk up the directory tree until the project root ('binance') is found,
# then put its vendored site-packages on the import path.
while cur_path.split('/')[-1] != 'binance':
    cur_path = os.path.abspath(os.path.join(cur_path, os.pardir))
sys.path.insert(1, os.path.join(cur_path, 'lib', 'python3.7', 'site-packages'))
import requests
from lxml import html
from itertools import permutations
from binance.enums import *
from _connections import binance_connection, db_connection
from datetime import datetime, timedelta
import cryptocompare

# --- module-level setup: runs at import time (DB + Binance connections,
# info-page scrape). NOTE(review): absolute XPaths are brittle.
psql = db_connection('psql')
page = requests.get('https://info.binance.com/en/all')
tree = html.fromstring(page.content)
# Top-50 coins (name + displayed price) from Binance's info page.
coins = [{'name':i, 'price':j} for i,j in zip(tree.xpath('//*[@id="__next"]/div/main/div/div/div/div[2]/div[1]/table/tbody/tr/td[2]/div/div/span[1]/text()')[:50], tree.xpath('//*[@id="__next"]/div/main/div/div/div/div[2]/div[1]/table/tbody/tr/td[3]/div/div/text()')[:50])]
client = binance_connection()
prices = client.get_all_tickers()
stable_coin = 'PAX'
stable_len = len(stable_coin)
# Pairs quoted in PAX (sell base for PAX) and pairs based in PAX (buy with PAX).
# NOTE(review): the sells filter tests symbol[stable_len:] == 'PAX', which only
# matches 3-character base symbols (e.g. 'BTCPAX' but not 'DOGEPAX'); an
# endswith() suffix test may be what was intended — confirm.
stable_sells = {i['symbol'][:-stable_len]:float(i['price']) for i in prices if i['symbol'][stable_len:] == stable_coin}
stable_buys = {i['symbol'][stable_len:]:float(i['price']) for i in prices if i['symbol'][:-stable_len] == stable_coin}


def floor_dt(dt, delta):
    """Round `dt` down to the nearest multiple of `delta` counted from
    datetime.min; a negative delta rounds toward the previous boundary."""
    return dt + (datetime.min - dt) % delta
def _int_or_null(value):
    """Map an IEX numeric field to its SQL value: the literal string 'null'
    when the field is missing, otherwise the value cast to int."""
    return 'null' if value is None else int(value)


def financials():
    """Fetch quarterly financial statements from IEX for every tracked stock
    and insert reports newer than what portfolio.financials already holds.

    Refactor: the original repeated the same `None -> 'null' else int(...)`
    if/else nineteen times; that is now one `_int_or_null` helper applied to
    an ordered field list. The emitted SQL is unchanged.
    """
    PSQL = db_connection('psql')
    # `current`: latest stored report date per rh_id; `next_id`: next PK.
    current, next_id = det_cur_fin(PSQL)
    id_sym = pg_query(PSQL.client, 'select rh_id, rh_sym from portfolio.stocks')
    total_stocks = len(id_sym)
    for stock_num, (idx, sym) in enumerate(id_sym.values):
        progress(stock_num, total_stocks, status = sym)
        url = 'https://api.iextrading.com/1.0/stock/%s/financials' % (sym)
        req = requests.request('GET', url)
        if req.status_code == 404:
            continue
        data = req.json()
        if len(data) == 0:
            continue
        # Oldest first, so `current[idx]` advances monotonically.
        for stock in reversed(data['financials']):
            report_date = datetime.strptime(stock['reportDate'], '%Y-%m-%d')
            if idx in current.keys() and current[idx] >= np.datetime64(report_date):
                continue
            # IEX field names, ordered to match the column list below.
            fields = ('grossProfit', 'costOfRevenue', 'operatingRevenue',
                      'totalRevenue', 'operatingIncome', 'netIncome',
                      'researchAndDevelopment', 'operatingExpense',
                      'currentAssets', 'totalAssets', 'totalLiabilities',
                      'currentCash', 'currentDebt', 'totalCash', 'totalDebt',
                      'shareholderEquity', 'cashChange', 'cashFlow',
                      'operatingGainsLosses')
            values = [_int_or_null(stock[f]) for f in fields]
            script = ("INSERT INTO portfolio.financials("
                      "financials_id, rh_id, report_date, gross_profit, "
                      "cost_revenue, operating_revenue, total_revenue, "
                      "operating_income, net_income, r_d, operating_expense, "
                      "current_assets, total_assets, total_liabilities, "
                      "current_cash, current_debt, total_cash, total_debt, "
                      "shareholder_equity, cash_change, cash_flow, operating_gl) "
                      "VALUES (%i, '%s', '%s', %s, %s, %s, %s, %s, %s, %s, %s, "
                      "%s, %s, %s, %s, %s, %s, %s, %s, %s, %s);"
                      % tuple([next_id, idx, report_date] + values))
            pg_insert(PSQL.client, script)
            next_id += 1
            current[idx] = np.datetime64(report_date)
inst = auth['institution'] if 'page' not in inst.keys(): continue inst['page'] = inst['page'].replace( 'https://www.researchgate.net/', '').lower() if inst['page'] not in cur_insts.keys(): nxt_inst, cur_insts = pop_inst(psql, nxt_inst, nxt_pap, cur_insts, inst) nxt_pap_inst = pop_pap_inst(psql, cur_insts[inst['page']], nxt_pap, nxt_pap_inst) def update_refs(mongo, psql): nxt_pap, cur_paps, psql = _det_current_(psql, 'papers') pg_create_table(psql.client, 'references') nxt_ref = 0 for entry in mongo.client.find({'abstract': {'$exists': True}}): if 'references' in entry.keys(): entry['url_tag'] = entry['url_tag'].lower() for ref in entry['references']: if ref.lower() in cur_paps.keys(): script = "insert into research_match.references (ref_id, cit_pap_id, ref_pap_id) VALUES (%i, %i, %i)" % ( nxt_ref, cur_paps[entry['url_tag']], cur_paps[ref.lower()]) pg_insert(psql.client, script) nxt_ref += 1 MONGO = _connections.db_connection('mongo') PSQL = _connections.db_connection('psql')
def add_refs():
    """For every archived article whose `references` list is empty, open its
    ResearchGate page in selenium, switch to the References tab, expand all
    "Show more" sections, scrape the reference URLs and write them back to
    the Mongo document.

    Raises ValueError on login/captcha walls and IndexError when expanding
    the reference list repeatedly fails.
    """
    DB = _connections.db_connection('mongo')
    browser = _connections.sel_scraper(headless=False)
    # total = DB.client.find({'references': {'$exists': False}, 'abstract': {'$exists': True}}).count()
    total = DB.client.find({
        'references': [],
        'abstract': {
            '$exists': True
        }
    }).count()
    print('%i Documents need updating' % (total))
    # for doc_num, (doc) in enumerate(DB.client.find({'references': {'$exists': False}, 'abstract': {'$exists': True}})):
    for doc_num, (doc) in enumerate(
            DB.client.find({
                'references': [],
                'abstract': {
                    '$exists': True
                }
            })):
        # if doc_num < 7:
        #     continue
        # sdafaf
        # print(doc['url_tag'])
        url = 'https://www.researchgate.net/' + doc['url_tag']
        # Random sub-4s delay between articles to look less bot-like.
        sleep(randint(10, 100) / 25)
        browser.get(url)
        article_tree = html.fromstring(browser.page_source)
        # Hard stop when ResearchGate has rate-limited / blocked the session.
        if len(article_tree.xpath('//div[@class="temporarily-blocked"]')) > 0:
            DB.disconnect()
            browser.close()
            DB = None
            browser = None
            raise ValueError('Login Error')
        # login(browser, url)
        if len(
                article_tree.xpath(
                    '//div[@class="captcha-container js-widgetContainer"]')
        ) > 0:
            DB.disconnect()
            browser.close()
            DB = None
            browser = None
            print('Captcha Activated')
            # return(False)
            raise ValueError('Captcha Activated')
        article_tree = html.fromstring(browser.page_source)
        if True:
            sleep(1)
            # No nav buttons at all -> article has no references tab; skip.
            if len(
                    browser.find_elements_by_xpath(
                        '//div[@class="nova-c-nav__wrapper"]/div[@class="nova-c-nav__items"]/button'
                    )) == 0:
                continue
                # NOTE(review): unreachable after `continue` — dead leftover.
                raise Exception('No References')
        article_tree = html.fromstring(browser.page_source)
        # Find the nav tab whose label mentions "references".
        ref_tabs = article_tree.xpath(
            '//div[@class="nova-c-nav__wrapper"]/div[@class="nova-c-nav__items"]/button/span/div/text()'
        )
        ref_tab = [i for i in ref_tabs if 'references' in i.lower()][0]
        # sel_refs == True means we still need to click the tab; either
        # ordering of the "is-selected" class means it is already active.
        sel_refs = True
        if len(
                article_tree.xpath(
                    '//button[@class="nova-c-nav__item is-selected references js-lite-click" and ./span/div="%s"]'
                    % (ref_tab))) > 0:
            sel_refs = False
        if len(
                article_tree.xpath(
                    '//button[@class="nova-c-nav__item references js-lite-click is-selected" and ./span/div="%s"]'
                    % (ref_tab))) > 0:
            sel_refs = False
        ref_button = browser.find_elements_by_xpath(
            '//button[./span/div="%s"]' % (ref_tab))
        # Scroll the tab into view (offset so fixed headers don't cover it).
        browser.execute_script("window.scrollTo(0, %i);" %
                               (ref_button[0].location['y'] - 100))
        if sel_refs:
            # Re-check after scrolling; click the tab only if still unselected.
            article_tree = html.fromstring(browser.page_source)
            if len(
                    article_tree.xpath(
                        '//button[@class="nova-c-nav__item is-selected references js-lite-click" and ./span/div="%s"]'
                        % (ref_tab))) > 0:
                sel_refs = False
            elif len(
                    article_tree.xpath(
                        '//button[@class="nova-c-nav__item references js-lite-click is-selected" and ./span/div="%s"]'
                        % (ref_tab))) > 0:
                sel_refs = False
            else:
                ref_button = browser.find_elements_by_xpath(
                    '//button[./span/div="%s"]' % (ref_tab))
                browser.execute_script(
                    "window.scrollTo(0, %i);" %
                    (ref_button[-1].location['y'] - 100))
                ref_button[-1].click()
                sel_refs = False
        article_tree = html.fromstring(browser.page_source)
        # Keep clicking "Show more" until the button count changes (or none
        # exist); bail after 20 consecutive failed expansion attempts.
        init_see_more = len(
            article_tree.xpath('//button[./span="Show more"]'))
        see_more = len(article_tree.xpath('//button[./span="Show more"]'))
        num_fails = 0
        while see_more == init_see_more:
            if see_more == 0:
                break
            sleep(1)
            load_fail = _load_more(browser)
            if load_fail:
                num_fails += 1
                if num_fails > 20:
                    DB.disconnect()
                    browser.close()
                    DB = None
                    browser = None
                    print('Ref Error')
                    # return(False)
                    raise IndexError('Ref Error')
            article_tree = html.fromstring(browser.page_source)
            see_more = len(
                article_tree.xpath('//button[./span="Show more"]'))
        article_tree = html.fromstring(browser.page_source)
        # article_tree = html.fromstring(browser.page_source)
        references = article_tree.xpath(
            '//div[@class="nova-v-publication-item__stack-item"]/div/a/@href')
        # Strip ResearchGate tracking query strings.
        references = [i.split('?_sg')[0] for i in references]
        # references = article_tree.xpath('//div[@itemprop="citation"]/div/div/div/div/a/@href')
        DB.client.update_one({'_id': doc['_id']},
                             {'$set': {
                                 'references': references
                             }})
        progress(doc_num + 1, total, status='%i' % (doc_num + 1))