def get_id_cache():

    TDAY = datetime(DATE.year, DATE.month, DATE.day)

    file = Path(f"{DIR}/data/id_cache.json")
    if file.exists():

        with open(file, "r") as _file:
            id_cache = json.loads(_file.read())

        # Drop buckets that are a week old or more, then open today's bucket.
        dates = list(id_cache.keys())
        for date in dates:
            dt = datetime.strptime(date, FMT)
            if (TDAY - dt).days >= 7:
                del id_cache[date]

        id_cache[SDATE] = []

    else:

        logger.warning("id cache does not exist")
        id_cache = {
            (TDAY - timedelta(days=i)).strftime(FMT): []
            for i in range(7)
        }

    ids = set([_id for id_list in id_cache.values() for _id in id_list])

    return id_cache, ids
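# A minimal, self-contained sketch of the rolling id-cache format assumed above:
# a JSON-style dict keyed by date string, each value a list of already-seen ids.
# FMT and the sample dates/ids are invented for illustration only.
from datetime import datetime

FMT = "%Y-%m-%d"
TDAY = datetime(2021, 6, 15)
SDATE = TDAY.strftime(FMT)

id_cache = {
    "2021-06-07": ["id-a"],          # 8 days old -> pruned
    "2021-06-10": ["id-b", "id-c"],  # kept
}

for date in list(id_cache):
    if (TDAY - datetime.strptime(date, FMT)).days >= 7:
        del id_cache[date]
id_cache[SDATE] = []

ids = {_id for id_list in id_cache.values() for _id in id_list}
print(id_cache, ids)  # today's empty bucket plus {"id-b", "id-c"}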
def collect_data_again(batch_id, faults):

    for i, ticker in enumerate(faults):

        try:

            retries = {
                key: key in faults[ticker]
                for key in ['analysis', 'keystats', 'ohlc', 'options'][:2]
            }

            ticker_obj = Ticker(ticker, logger, batch_id, retries, faults[ticker])
            faults[ticker] = ticker_obj.fault_dict

            time.sleep(SLEEP)
            logger.info(f"{ticker},{batch_id},Re-Ticker,Success,")

        except Exception as e:

            logger.warning(f"{ticker},{batch_id},Re-Ticker,Failure,{e}")

        pct = (i + 1) / len(faults)
        pct = np.round(100 * pct, 4)
        logger.info(f"SCRAPER,{batch_id},RE-PROGRESS,{pct}%,")

    return faults
def main():

    logger.info(f"RSS,Job,Initiated,{SDATE}")

    # Clear stale PID files and register this process.
    for file in os.listdir(f"{DIR}/pids"):
        if file == ".gitignore":
            continue
        os.remove(f"{DIR}/pids/{file}")

    os.system(f"touch {DIR}/pids/{os.getpid()}")

    # Split the feed groups across two parallel workers.
    group_keys = list(groups.keys())
    parallel_groups = [group_keys[0::2], group_keys[1::2]]

    try:
        Parallel(n_jobs=2)(
            delayed(parallel_job)(job_id, parallel_group)
            for job_id, parallel_group in enumerate(parallel_groups)
        )
    except Exception as e:
        logger.warning(e)
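# A minimal sketch of the fan-out pattern used in main() above, assuming joblib
# is installed. The `groups` dict and `_run_group` worker are stand-ins for
# illustration only, not the module's real configuration.
from joblib import Parallel, delayed

def _run_group(job_id, group_keys):
    # Placeholder worker: the real job starts Feeds threads for each group.
    return job_id, group_keys

if __name__ == "__main__":

    groups = {"wires": 30, "papers": 60, "blogs": 90, "tv": 120}
    keys = list(groups.keys())
    halves = [keys[0::2], keys[1::2]]

    results = Parallel(n_jobs=2)(
        delayed(_run_group)(job_id, half)
        for job_id, half in enumerate(halves)
    )
    print(results)  # [(0, ['wires', 'blogs']), (1, ['papers', 'tv'])]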
def splits():

    logger.info(f"SCRAPER,SPLITS,INITIATED,")

    now = datetime.now()
    report_df = pd.DataFrame()
    dt = datetime(now.year, now.month, 1).strftime("%m/%d/%Y")

    try:

        df = process(dt)
        store(df)

        _connector.execute(f"DELETE FROM stocksplitstmp{MODIFIER};")
        _connector.write(f"stocksplitstmp{MODIFIER}", df)
        _connector.execute("""
            INSERT IGNORE INTO stocksplits{modifier}
            SELECT * FROM stocksplitstmp{modifier};
        """.format(modifier=MODIFIER))

        df = df[df.ex_date == DATE]
        if len(df) != 0:
            logger.info(f"SCRAPER,SPLITS,ADJUSTING,{len(df)}")
            _connector.register_splits(P_COLUMNS, MODIFIER)
            _connector.adjust_splits(MODIFIER)

        metric = 1
        title_modifier = "SUCCESS"
        logger.info(f"SCRAPER,SPLITS,TERMINATED,{len(df)}")

    except Exception as e:

        metric = 0
        title_modifier = "FAILURE"
        logger.warning(f"SCRAPER,SPLITS,FAILURE,{e}")

    ###############################################################################################

    report = _connector.read("""
        SELECT * FROM stocksplitstatus{modifier} WHERE ex_date = "{date}"
    """.format(modifier=MODIFIER, date=DATE))

    send_gcp_metric(CONFIG, "splits_success_indicator", "int64_value", metric)
    send_email(CONFIG, f"{title_modifier} - Stock Splits", report.to_html(), [], logger)
def parallel_job(job_id, parallel_group):

    logger.info(f"RSS,Job,PID,{os.getpid()}")

    def on_close():
        for group in parallel_group:
            feed_threads[group].on_close()
            logger.info(f"RSS,Thread,Closed,{job_id} - {group}")

    def sigterm_handler(signal_number, frame):
        logger.info(f"RSS,Job,SIGTERM,{os.getpid()}")
        on_close()

    signal.signal(signal.SIGTERM, sigterm_handler)
    os.system(f"touch {DIR}/pids/{os.getpid()}")

    ###############################################################################################

    try:

        feed_threads = {}
        for i, group in enumerate(parallel_group):

            group, sleep = group, groups[group]
            group_coords = feeds[feeds.source.isin(group)]

            feed_threads[group] = Feeds(sources=group_coords.source.values,
                                        feeds=group_coords.feed.values,
                                        sleep=sleep,
                                        logger=logger)

            feed_threads[group].start()
            logger.info(f"RSS,Thread,Initiated,{job_id} - {group}")

    except Exception as e:

        logger.warning(f"RSS,Thread,Error,{job_id} - {e}")
        on_close()
        raise Exception(f"RSS,Job,Terminated,{job_id} - {e}")
def collect_data(batch_id, tickers):

    for i, ticker in enumerate(tickers):

        try:

            Ticker(ticker, logger, batch_id)
            time.sleep(SLEEP)
            logger.info(f"{ticker},{batch_id},Ticker,Success,")

        except Exception as e:

            logger.warning(f"{ticker},{batch_id},Ticker,Failure,{e}")

        pct = (i + 1) / len(tickers)
        pct = np.round(100 * pct, 4)
        logger.info(f"SCRAPER,{batch_id},PROGRESS,{pct}%,")
def process(dt):

    tries, max_tries = 0, 5
    while tries < max_tries:

        try:

            df = pd.read_html(BASE.format(date=dt),
                              attrs={"class": "datatable-component"})
            if len(df) != 1:
                raise Exception("Too Many Tables.")

            df = df[0].iloc[1:, 1:]
            df.columns = COLUMNS

            # "a:b" split-factor string -> multiplicative price adjustment b / a.
            sf = df.split_factor.str
            sf = sf.split(":", expand=True).astype(float)

            df = df[~df.ticker.str.contains(":CA")]
            df['split_factor'] = sf[1] / sf[0]
            df['processed_timestamp'] = None

            for col in COLUMNS[-3:]:
                df[col] = pd.to_datetime(df[col]).astype(str)

            def multiply(group):
                group['split_factor'] = group.split_factor.product()
                return group.iloc[-1, :]

            # Collapse multiple splits per (ticker, ex_date) into one compound factor.
            df = df.groupby(["ticker", "ex_date"]).apply(multiply)
            df = df.reset_index(drop=True)

            return df

        except Exception as e:

            logger.warning(e)
            tries += 1

    if tries >= max_tries:
        raise Exception("Too Many Tries.")
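# A toy illustration of the split-factor math in process(), assuming the site
# reports factors as "new:old" strings (e.g. "2:1" for a 2-for-1 split). The
# tickers, dates, and factors below are invented.
import pandas as pd

df = pd.DataFrame({
    "ticker": ["AAA", "AAA", "BBB"],
    "ex_date": ["2021-06-15", "2021-06-15", "2021-06-15"],
    "split_factor": ["2:1", "3:1", "10:1"],
})

sf = df.split_factor.str.split(":", expand=True).astype(float)
df["split_factor"] = sf[1] / sf[0]   # price multiplier: "2:1" -> 0.5

# Compound multiple splits sharing a (ticker, ex_date) into one factor.
def multiply(group):
    group["split_factor"] = group.split_factor.product()
    return group.iloc[-1, :]

df = df.groupby(["ticker", "ex_date"]).apply(multiply).reset_index(drop=True)
print(df)  # AAA -> 0.5 * (1/3) ~= 0.1667, BBB -> 0.1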
def fetch(query, id_cache, ids):

    url = URL.format(query=query.replace(' ', '+'))

    try:
        feed_entries = feedparser.parse(url)
    except Exception as e:
        logger.warning(f"collection error on {query}. {e}")
        return

    items = []
    for item in feed_entries['entries']:

        article_source = item.get('source', {})
        article_source = article_source.get('title')

        if not article_source:
            continue

        if article_source not in news_sources:
            continue

        # Skip anything already seen in the rolling id cache.
        _id = item['id']
        if _id in ids:
            continue

        ids.add(_id)
        id_cache[SDATE].append(_id)

        item['acquisition_datetime'] = datetime.utcnow().isoformat()[:19]
        item['search_query'] = query
        item['_source'] = "google"
        item['_id'] = _id

        items.append(item)

    if len(items) == 0:
        return

    fname = str(uuid.uuid4())
    with open(PATH / f"{fname}.json", "w") as file:
        file.write(json.dumps(items))
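# A minimal sketch of pulling one Google News query with feedparser. The URL
# template here is an assumption of this example (the module's real URL constant
# is not shown), and network access is required to actually run it.
import feedparser

url = "https://news.google.com/rss/search?q={query}".format(query="AAPL+stock")
feed = feedparser.parse(url)

for entry in feed.entries[:3]:
    source = entry.get("source", {}).get("title")   # publisher name, if present
    print(entry.get("id"), source, entry.get("title"))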
def main():

    logger.info(f"SCRAPER,STORE,INITIATED,,")

    try:

        aggregate()
        compress()
        send_to_bucket(BUCKET_PREFIX,
                       BUCKET_NAME,
                       f"{DATE}.tar.xz",
                       f"{DIR}/financial_data",
                       logger=logger)
        remove()

        logger.info(f"SCRAPER,STORE,SUCCESS,,")

    except Exception as e:

        logger.warning(f"SCRAPER,STORE,FAILURE,{e},")

    logger.info(f"SCRAPER,STORE,TERMINATED,,")
def index_data(batch_id, tickers):

    try:

        # options, ohlc = [], []
        analysis, keystats = [], []

        # for file in (DATA / "options").iterdir():
        #     ticker = file.name.split('_')[0]
        #     if ticker not in tickers:
        #         continue
        #     options.append(pd.read_csv(file))

        # for file in (DATA / "ohlc").iterdir():
        #     ticker = file.name.split('_')[0]
        #     if ticker not in tickers:
        #         continue
        #     ohlc.append(pd.read_csv(file).iloc[:1, :])

        for file in (DATA / "analysis").iterdir():
            ticker = file.name.split('_')[0]
            if ticker not in tickers:
                continue
            analysis.append(pd.read_csv(file))

        for file in (DATA / "keystats").iterdir():
            ticker = file.name.split('_')[0]
            if ticker not in tickers:
                continue
            keystats.append(pd.read_csv(file))

        pre = _connector.get_equities_table_count().row_count

        # if len(options) > 0:
        #     options = pd.concat(options)
        #     _connector.write("options", options)

        # if len(ohlc) > 0:
        #     ohlc = pd.concat(ohlc)
        #     _connector.write("ohlc", ohlc)

        if len(analysis) > 0:
            _connector.write("analysis", pd.concat(analysis))

        if len(keystats) > 0:
            _connector.write("keystats", pd.concat(keystats))

        # if len(options) > 0 and len(ohlc) > 0:
        #     cols = ["date_current", "ticker", "adjclose_price"]
        #     options = options.merge(ohlc[cols], on=cols[:2], how="inner")
        #     options = options.rename({"adjclose_price": "stock_price"}, axis=1)
        #     options = options.merge(CONFIG['ratemap'], on="days_to_expiry", how="inner")
        #     zsurface, surface = calculate_surface(options, CONFIG['reg_expirations'])
        #     zsurface['date_current'], surface['date_current'] = DATE, DATE
        #     info = f"{zsurface.ticker.nunique()}/{options.ticker.nunique()}"
        #     logger.info(f"SCRAPER,{batch_id},zSURFACE ({len(zsurface)}),{info}")
        #     info = f"{surface.ticker.nunique()}/{options.ticker.nunique()}"
        #     logger.info(f"SCRAPER,{batch_id},SURFACE ({len(surface)}),{info}")
        #     _connector.write("zsurface", zsurface)
        #     _connector.write("surface", surface)

        post = _connector.get_equities_table_count().row_count
        db_stats = (pre.tolist(), post.tolist())
        db_flag = 1

        logger.info(f"SCRAPER,{batch_id},INDEXING,SUCCESS,")

    except Exception as e:

        logger.warning(f"SCRAPER,{batch_id},INDEXING,FAILURE,{e}")
        print_exc()

        db_stats = ([0] * 4, [0] * 4)
        db_flag = 0

    return db_flag, db_stats
try:

    raw_path = Path(f"{DIR}/news_data")
    files = list(raw_path.iterdir())
    files.remove(raw_path / ".gitignore")

    # Remove raw files that have already aged out.
    now = datetime.now()
    for file in files:
        if check_file(file, now):
            file.unlink()

    n_items, n_unique = save_items(path, SDATE)

    send_metric(CONFIG, "clean_count", "int64_value", n_items)
    send_metric(CONFIG, "unique_clean_count", "int64_value", n_unique)

    send_to_bucket(
        CONFIG['GCP']['CLEAN_BUCKET'],
        'news',
        xz_file,
        logger=logger
    )

    logger.info("RSS save successful.")
    send_metric(CONFIG, "clean_save_success_indicator", "int64_value", 1)

except Exception as e:

    logger.warning(f"RSS save failed. {e}, {format_exc()}")
    send_metric(CONFIG, "clean_save_success_indicator", "int64_value", 0)
if __name__ == '__main__':

    init_folders()

    for ticker in TICKERS:
        try:
            get_news(ticker)
            logger.info('%s:Completed', ticker)
            ticker_list.append(ticker)
            current_complete = (len(ticker_list) / len(TICKERS)) * 100
            logger.info('Current Percentage: %f %s', current_complete, percent)
        except Exception as e:
            logger.warning('Error Message: %s:%s', ticker, e)
            continue

    percent_successful = (len(ticker_list) / len(TICKERS)) * 100
    logger.info('Percentage of successful tickers: %f %s', percent_successful, percent)

# logging information
# log_path = "/home/zqretrace/scripts/merge_logs/CNBC_Merged_logs/merge_logs_CNBC.log"
logging.basicConfig(
    filename=log_path,
    format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
    datefmt='%m/%d/%Y %H:%M:%S',
    level=logging.DEBUG)

source = 'CNBC'
    removed_curnames = get_diff(ocurnames, curnames)

    comnames.to_csv(f"{DIR}/data/company_names.csv", index=False)
    curnames.to_csv(f"{DIR}/data/curated_company_names.csv", index=False)

    body = '\n'.join([
        "New Company Names",
        new_comnames.to_html(index=False),
        "\nRemoved Company Names",
        removed_comnames.to_html(index=False),
        "\nNew Curated Names",
        new_curnames.to_html(index=False),
        "\nRemoved Curated Names",
        removed_curnames.to_html(index=False),
    ])

    n = new_comnames.shape[0] + removed_comnames.shape[0]
    n += new_curnames.shape[0] + removed_curnames.shape[0]

    if n > 0:
        send_email(CONFIG, "Company Name Summary", body, [], logger)

    logger.info("company name downloader & curator successful")
    send_metric(CONFIG, metric, "int64_value", 1)

except Exception as e:

    logger.warning(f"company name downloader & curator failed, {e}, {format_exc()}")
    send_metric(CONFIG, metric, "int64_value", 0)

logger.info("company name downloader & curator terminated")
    else:
        raise Exception("TarFile Corrupted. File Size 0.")

    return raw_tar, cleaned_tar

if __name__ == '__main__':

    try:

        raw_tar, cleaned_tar = compress_files()

        send_to_bucket(CONFIG['gcp_bucket_prefix'],
                       CONFIG['gcp_bucket_name'],
                       os.path.basename(raw_tar),
                       os.path.dirname(raw_tar),
                       logger=logger)

        send_to_bucket(f"cleaned_{CONFIG['gcp_bucket_prefix']}",
                       CONFIG['gcp_bucket_name'],
                       os.path.basename(cleaned_tar),
                       os.path.dirname(cleaned_tar),
                       logger=logger)

        os.remove(cleaned_tar)

        logger.info(f"RSS,Storage,Success,")

    except Exception as e:

        logger.warning(f"RSS,Storage,Failure,{e}")
    send_to_bucket(CONFIG['GCP']['RAW_BUCKET'], 'news', xz_file, logger=logger)
    send_to_bucket(CONFIG['GCP']['RAW_VAULT'], 'news', xz_file, logger=logger)

    logger.info("sending metrics")
    send_metric(CONFIG, "news_count", "int64_value", n_items)
    send_metric(CONFIG, "unique_news_count", "int64_value", n_unique)

if __name__ == '__main__':

    logger.info("news job, initializing")

    try:
        main()
        send_metric(CONFIG, "news_success_indicator", "int64_value", 1)
    except Exception as e:
        exc = traceback.format_exc()
        logger.warning(f"news job error, {e}, {exc}")
        send_metric(CONFIG, "news_success_indicator", "int64_value", 0)

    logger.info("news job, terminating")
    # Interpolate the latest curve onto a daily grid out to ten years.
    r_map = df.iloc[-1, 1:].values
    r_map = np.array([0] + r_map.tolist())
    chs = CubicHermiteSpline(t_map, r_map, [0] * len(t_map))

    rm_df = pd.DataFrame()
    rm_df['days_to_expiry'] = np.arange(0, 365 * 10 + 1).astype(int)
    rm_df['rate'] = chs(rm_df.days_to_expiry.values)
    rm_df['date_current'] = DATE

    _connector.write("treasuryratemap", rm_df)

    return df

if __name__ == '__main__':

    try:

        df = collect()
        store()

        send_email(CONFIG, "Interest Rate Summary", df.to_html(), [], logger)
        metric = 1

    except Exception as e:

        logger.warning(e)
        body = f"<p>Process Failed. {e}</p>"
        send_email(CONFIG, "Interest Rate Summary - FAILED", body, [], logger)
        metric = 0

    send_gcp_metric(CONFIG, "rates_success_indicator", "int64_value", metric)
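# A small sketch of the rate-map construction above, using invented tenor points:
# pin the curve at the quoted maturities (in days), set the slope to zero at each
# knot, and sample one rate per day out to ten years.
import numpy as np
import pandas as pd
from scipy.interpolate import CubicHermiteSpline

t_map = np.array([0, 30, 90, 180, 365, 730, 1825, 3650])            # days to maturity
r_map = np.array([0.0, 0.05, 0.06, 0.08, 0.12, 0.25, 0.80, 1.45])   # percent (made up)

chs = CubicHermiteSpline(t_map, r_map, [0] * len(t_map))

rm_df = pd.DataFrame({"days_to_expiry": np.arange(0, 365 * 10 + 1)})
rm_df["rate"] = chs(rm_df.days_to_expiry.values)
print(rm_df.iloc[[30, 365, 3650]])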
if __name__ == '__main__':

    try:
        main()
    except Exception as e:
        logger.warning(f"RSS process failed. {e}, {format_exc()}")
        db_flags.append(b_db_flag)
        db_stats.append(b_db_stats)

        success, failure = get_job_success_rates(tickers[:BATCH_SIZE * (1 + batch_id)])
        send_metrics(success, failure)

        # if batch_id % checkpoint == 0 and batch_id != 0:
        #     report("Partial", success, failure, faults_summary, db_flags, db_stats)

    ###############################################################################################

    success, failure = get_job_success_rates(tickers)
    report("Full", success, failure, faults_summary, db_flags, db_stats)

    store()

    logger.info(f"SCRAPER,JOB,TERMINATED,{DATE},")

if __name__ == '__main__':

    try:
        send_gcp_metric(CONFIG, "oscrap_job_status", "int64_value", 1)
        main()
    except Exception as e:
        send_gcp_metric(CONFIG, "oscrap_job_status", "int64_value", 0)
        logger.warning(f"SCRAPER,JOB,MAIN ERROR,{e},")
def cleaning_loop():

    ctr = 0
    files = {NEWS_DIR / ".gitignore"}
    n_clean = len(list(CLEAN_DIR.iterdir()))

    while True:

        new_files = get_files(files)

        n_clean_new = len(list(CLEAN_DIR.iterdir()))
        if n_clean_new < n_clean:
            files = {NEWS_DIR / ".gitignore"}
            reload(sys.modules['clean_item'])
            reload(sys.modules['find_company_names'])
            logger.info("reloading the company names")

        items = []
        for new_file in new_files:
            with open(new_file, "r") as file:
                try:
                    items.extend(json.loads(file.read()))
                    files.add(new_file)
                except Exception as e:
                    logger.warning(f"File read error. {e}")

        new_items = []
        for item in items:

            if not item.get("title"):
                continue

            item = clean_item(item)

            dummy_item = {
                'title': item['title'],
                'article_source': item['article_source'],
                'published_datetime': item['published_datetime'][:10]
            }
            if 'summary' in item:
                dummy_item['summary'] = item['summary']

            _id = md5(json.dumps(dummy_item).encode()).hexdigest()
            new_items.append({
                "_index": "news",
                "_id": _id,
                "_op_type": "create",
                "_source": item
            })

            if len(new_items) > 50:

                new_items = filter(ES_CLIENT, new_items)

                if len(new_items) != 0:

                    titles = [item['_source']['title'] for item in new_items]
                    print(f"{datetime.now().isoformat()} - Scoring {len(new_items)} Files.")

                    scores = get_scores(titles)
                    for item, score in zip(new_items, scores):
                        item['_source']['sentiment'] = score['prediction']
                        item['_source']['sentiment_score'] = score['sentiment_score']
                        item['_source']['abs_sentiment_score'] = abs(score['sentiment_score'])

                    successes, failures = helpers.bulk(ES_CLIENT,
                                                       new_items,
                                                       stats_only=True,
                                                       raise_on_error=False)
                    print(successes, failures)

                    with open(CLEAN_DIR / f"{str(uuid.uuid4())}.json", "w") as file:
                        file.write(json.dumps(new_items))

                new_items = []

        ###########################################################################################

        if ctr % 10 == 0:
            try:
                send_metric(CONFIG, "rss_counter", "int64_value",
                            len(list(NEWS_DIRS[0].iterdir())) - 1)
                ctr = 0
            except Exception as e:
                logger.warning(e)

        ###########################################################################################

        ctr += 1
        time.sleep(2)
        n_clean = n_clean_new
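# A standalone sketch of the content-hash dedup used in cleaning_loop(): the
# Elasticsearch _id is an md5 over a reduced view of the article (title, source,
# publish day, optional summary), so re-collected copies of the same story map to
# one document. The sample item below is invented.
import json
from hashlib import md5

item = {
    "title": "Example Co. beats earnings estimates",
    "article_source": "Example Wire",
    "published_datetime": "2021-06-15T13:45:00",
    "summary": "Example Co. reported quarterly results...",
}

dummy_item = {
    "title": item["title"],
    "article_source": item["article_source"],
    "published_datetime": item["published_datetime"][:10],
}
if "summary" in item:
    dummy_item["summary"] = item["summary"]

_id = md5(json.dumps(dummy_item).encode()).hexdigest()
doc = {"_index": "news", "_id": _id, "_op_type": "create", "_source": item}
print(_id)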