Example #1
def get_id_cache():

    TDAY = datetime(DATE.year, DATE.month, DATE.day)
    file = Path(f"{DIR}/data/id_cache.json")

    if file.exists():

        with open(file, "r") as _file:
            id_cache = json.loads(_file.read())

        # drop cached ids that are a week old or older
        dates = list(id_cache.keys())
        for date in dates:

            dt = datetime.strptime(date, FMT)

            if (TDAY - dt).days >= 7:
                del id_cache[date]

        # make sure today's bucket exists without overwriting it
        id_cache.setdefault(SDATE, [])

    else:

        logger.warning("id cache does not exist")
        id_cache = {(TDAY - timedelta(days=i)).strftime(FMT): []
                    for i in range(7)}

    ids = {_id for id_list in id_cache.values() for _id in id_list}

    return id_cache, ids
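The snippet above only loads and prunes the cache; the write-back step is implied but not shown here. A minimal sketch of that counterpart, assuming the same module-level DIR constant and a hypothetical save_id_cache helper name:

import json
from pathlib import Path

def save_id_cache(id_cache: dict) -> None:
    # Persist the pruned cache so the next run can deduplicate against it.
    # DIR is the module-level directory constant used in the snippet above.
    file = Path(f"{DIR}/data/id_cache.json")
    with open(file, "w") as _file:
        _file.write(json.dumps(id_cache))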
Example #2
def collect_data_again(batch_id, faults):

    for i, ticker in enumerate(faults):

        try:

            retries = {
                key: key in faults[ticker]
                for key in ['analysis', 'keystats', 'ohlc', 'options'][:2]
            }

            ticker_obj = Ticker(ticker, logger, batch_id, retries,
                                faults[ticker])
            faults[ticker] = ticker_obj.fault_dict
            time.sleep(SLEEP)

            logger.info(f"{ticker},{batch_id},Re-Ticker,Success,")

        except Exception as e:

            logger.warning(f"{ticker},{batch_id},Re-Ticker,Failure,{e}")

        pct = (i + 1) / len(faults)
        pct = np.round(100 * pct, 4)
        logger.info(f"SCRAPER,{batch_id},RE-PROGRESS,{pct}%,")

    return faults
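For clarity, the retries mapping built at the top of the loop only flags which of the first two data types ('analysis' and 'keystats', because of the [:2] slice) previously failed for a ticker. A small illustration with a made-up fault entry:

# Hypothetical fault entry: only 'analysis' failed on the previous pass.
faults = {"ABC": {"analysis": "timeout"}}

retries = {
    key: key in faults["ABC"]
    for key in ['analysis', 'keystats', 'ohlc', 'options'][:2]
}
print(retries)  # {'analysis': True, 'keystats': False}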
Example #3
def main():

    logger.info(f"RSS,Job,Initated,{SDATE}")

    for file in os.listdir(f"{DIR}/pids"):

        if file == ".gitignore":
            continue

        os.remove(f"{DIR}/pids/{file}")

    os.system(f"touch {DIR}/pids/{os.getpid()}")

    group_keys = list(groups.keys())
    parallel_groups = [group_keys[0::2], group_keys[1::2]]

    try:

        Parallel(n_jobs=2)(
            delayed(parallel_job)(job_id, parallel_group)
            for job_id, parallel_group in enumerate(parallel_groups))

    except Exception as e:

        logger.warning(e)
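The even/odd slicing splits the feed groups into two interleaved halves, one per joblib worker. A quick illustration of that split with placeholder keys:

group_keys = ["a", "b", "c", "d", "e"]
parallel_groups = [group_keys[0::2], group_keys[1::2]]
print(parallel_groups)  # [['a', 'c', 'e'], ['b', 'd']]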
Example #4
def splits():

	logger.info(f"SCRAPER,SPLITS,INITIATED,")

	now = datetime.now()
	report_df = pd.DataFrame()
	dt = datetime(now.year, now.month, 1).strftime("%m/%d/%Y")

	try:

		df = process(dt)
		store(df)

		_connector.execute(f"DELETE FROM stocksplitstmp{MODIFIER};")
		_connector.write(f"stocksplitstmp{MODIFIER}", df)
		_connector.execute("""
				INSERT IGNORE INTO
					stocksplits{modifier}
				SELECT
					*
				FROM
					stocksplitstmp{modifier};
			""".format(modifier=MODIFIER))

		df = df[df.ex_date == DATE]
		if len(df) != 0:

			logger.info(f"SCRAPER,SPLITS,ADJUSTING,{len(df)}")
			_connector.register_splits(P_COLUMNS, MODIFIER)
			_connector.adjust_splits(MODIFIER)
		
		metric = 1
		title_modifier = "SUCCESS"
		logger.info(f"SCRAPER,SPLITS,TERMINATED,{len(df)}")

	except Exception as e:

		metric = 0
		title_modifier = "FAILURE"
		logger.warning(f"SCRAPER,SPLITS,FAILURE,{e}")

	###############################################################################################

	report = _connector.read("""
			SELECT
				*
			FROM
				stocksplitstatus{modifier}
			WHERE
				ex_date = "{date}"
		""".format(modifier=MODIFIER, date=DATE))

	send_gcp_metric(CONFIG, "splits_success_indicator", "int64_value", metric)
	send_email(CONFIG, f"{title_modifier} - Stock Splits", report.to_html(), [], logger)
Example #5
def parallel_job(job_id, parallel_group):

    logger.info(f"RSS,Job,PID,{os.getpid()}")

    def on_close():

        for group in parallel_group:

            feed_threads[group].on_close()
            logger.info(f"RSS,Thread,Closed,{job_id} - {group}")

    def sigterm_handler(signal_number, frame):

        logger.info(f"RSS,Job,SIGTERM,{os.getpid()}")
        on_close()

    signal.signal(signal.SIGTERM, sigterm_handler)
    os.system(f"touch {DIR}/pids/{os.getpid()}")

    ###############################################################################################

    try:

        feed_threads = {}

        for group in parallel_group:

            sleep = groups[group]
            group_coords = feeds[feeds.source.isin(group)]

            feed_threads[group] = Feeds(sources=group_coords.source.values,
                                        feeds=group_coords.feed.values,
                                        sleep=sleep,
                                        logger=logger)

            feed_threads[group].start()

            logger.info(f"RSS,Thread,Initiated,{job_id} - {group}")

    except Exception as e:

        logger.warning(f"RSS,Thread,Error,{job_id} - {e}")

        on_close()

        raise Exception(f"RSS,Job,Terminated,{job_id} - {e}")
Example #6
def collect_data(batch_id, tickers):

    for i, ticker in enumerate(tickers):

        try:

            Ticker(ticker, logger, batch_id)
            time.sleep(SLEEP)

            logger.info(f"{ticker},{batch_id},Ticker,Success,")

        except Exception as e:

            logger.warning(f"{ticker},{batch_id},Ticker,Failure,{e}")

        pct = (i + 1) / len(tickers)
        pct = np.round(100 * pct, 4)
        logger.info(f"SCRAPER,{batch_id},PROGRESS,{pct}%,")
Example #7
def process(dt):

	tries, max_tries = 0, 5
	while tries < max_tries:

		try:

			df = pd.read_html(BASE.format(date=dt), attrs = {"class" : "datatable-component"})

			if len(df) != 1:
				raise Exception("Too Many Tables.")

			df = df[0].iloc[1:, 1:]
			df.columns = COLUMNS

			sf = df.split_factor.str
			sf = sf.split(":", expand=True).astype(float)

			df = df[~df.ticker.str.contains(":CA")]
			df['split_factor'] = sf[1] / sf[0]
			df['processed_timestamp'] = None

			for col in COLUMNS[-3:]:
				df[col] = pd.to_datetime(df[col]).astype(str)

			def multiply(group):
				group['split_factor'] = group.split_factor.product()
				return group.iloc[-1, :]

			df = df.groupby(["ticker", "ex_date"]).apply(multiply)
			df = df.reset_index(drop=True)

			return df

		except Exception as e:

			logger.warning(e)
			tries += 1

	# the loop can only fall through once tries == max_tries, so fail loudly
	if tries >= max_tries:
		raise Exception("Too Many Tries.")
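The split-factor arithmetic may be easier to see in isolation: the scraped "2:1"-style ratio becomes the multiplier applied to historical prices. A standalone sketch with toy data:

import pandas as pd

# Toy rows mimicking the scraped table; "2:1" means a 2-for-1 split.
toy = pd.DataFrame({"ticker": ["ABC", "XYZ"], "split_factor": ["2:1", "3:2"]})

sf = toy.split_factor.str.split(":", expand=True).astype(float)
toy["split_factor"] = sf[1] / sf[0]

print(toy)  # ABC -> 0.5, XYZ -> 0.666...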
Example #8
def fetch(query, id_cache, ids):

    url = URL.format(query=query.replace(' ', '+'))
    try:
        feed_entries = feedparser.parse(url)
    except Exception as e:
        logger.warning(f"collection error on {query}.")
        return

    items = []
    for item in feed_entries['entries']:

        article_source = item.get('source', {})
        article_source = article_source.get('title')

        if not article_source:
            continue

        if article_source not in news_sources:
            continue

        _id = item['id']
        if _id in ids:
            continue

        ids.add(_id)
        id_cache[SDATE].append(_id)

        item['acquisition_datetime'] = datetime.utcnow().isoformat()[:19]
        item['search_query'] = query
        item['_source'] = "google"
        item['_id'] = _id

        items.append(item)

    if len(items) == 0:
        return

    fname = str(uuid.uuid4())
    with open(PATH / f"{fname}.json", "w") as file:
        file.write(json.dumps(items))
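fetch is meant to be driven with the cache and ID set returned by get_id_cache in Example #1. A minimal driver sketch, assuming a hypothetical queries iterable of search terms:

id_cache, ids = get_id_cache()

for query in queries:  # queries is a hypothetical list of search terms
    fetch(query, id_cache, ids)

# afterwards id_cache[SDATE] holds every new article id seen today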
Example #9
def main():

    logger.info(f"SCRAPER,STORE,INITIATED,,")

    try:

        aggregate()
        compress()

        send_to_bucket(BUCKET_PREFIX,
                       BUCKET_NAME,
                       f"{DATE}.tar.xz",
                       f"{DIR}/financial_data",
                       logger=logger)

        remove()

        logger.info(f"SCRAPER,STORE,SUCCESS,,")

    except Exception as e:

        logger.warning(f"SCRAPER,STORE,FAILURE,{e},")

    logger.info(f"SCRAPER,STORE,TERMINATED,,")
Example #10
def index_data(batch_id, tickers):

    try:

        # options, ohlc = [], []
        analysis, keystats = [], []

        # for file in (DATA/"options").iterdir():

        # 	ticker = file.name.split('_')[0]
        # 	if ticker not in tickers:
        # 		continue

        # 	options.append(pd.read_csv(file))

        # for file in (DATA/"ohlc").iterdir():

        # 	ticker = file.name.split('_')[0]
        # 	if ticker not in tickers:
        # 		continue

        # 	ohlc.append(pd.read_csv(file).iloc[:1, :])

        for file in (DATA / "analysis").iterdir():

            ticker = file.name.split('_')[0]
            if ticker not in tickers:
                continue

            analysis.append(pd.read_csv(file))

        for file in (DATA / "keystats").iterdir():

            ticker = file.name.split('_')[0]
            if ticker not in tickers:
                continue

            keystats.append(pd.read_csv(file))

        pre = _connector.get_equities_table_count().row_count

        # if len(options) > 0:
        # 	options = pd.concat(options)
        # 	_connector.write("options", options)

        # if len(ohlc) > 0:
        # 	ohlc = pd.concat(ohlc)
        # 	_connector.write("ohlc", ohlc)

        if len(analysis) > 0:
            _connector.write("analysis", pd.concat(analysis))

        if len(keystats) > 0:
            _connector.write("keystats", pd.concat(keystats))

        # if len(options) > 0 and len(ohlc) > 0:

        # 	cols = ["date_current", "ticker", "adjclose_price"]
        # 	options = options.merge(ohlc[cols], on=cols[:2], how="inner")
        # 	options = options.rename({"adjclose_price" : "stock_price"}, axis=1)
        # 	options = options.merge(CONFIG['ratemap'], on="days_to_expiry", how="inner")

        # 	zsurface, surface = calculate_surface(options, CONFIG['reg_expirations'])
        # 	zsurface['date_current'], surface['date_current'] = DATE, DATE

        # 	info = f"{zsurface.ticker.nunique()}/{options.ticker.nunique()}"
        # 	logger.info(f"SCRAPER,{batch_id},zSURFACE ({len(zsurface)}),{info}")

        # 	info = f"{surface.ticker.nunique()}/{options.ticker.nunique()}"
        # 	logger.info(f"SCRAPER,{batch_id},SURFACE ({len(surface)}),{info}")

        # 	_connector.write("zsurface", zsurface)
        # 	_connector.write("surface", surface)

        post = _connector.get_equities_table_count().row_count

        db_stats = (pre.tolist(), post.tolist())
        db_flag = 1

        logger.info(f"SCRAPER,{batch_id},INDEXING,SUCCESS,")

    except Exception as e:

        logger.warning(f"SCRAPER,{batch_id},INDEXING,FAILURE,{e}")
        print_exc()

        db_stats = ([0] * 4, [0] * 4)
        db_flag = 0

    return db_flag, db_stats
Example #11
		raw_path = Path(f"{DIR}/news_data")
		files = list(raw_path.iterdir())
		files.remove(raw_path / ".gitignore")

		now = datetime.now()
		for file in files:
			if check_file(file, now):
				file.unlink()

		n_items, n_unique = save_items(path, SDATE)
		send_metric(CONFIG, "clean_count", "int64_value", n_items)
		send_metric(CONFIG, "unique_clean_count", "int64_value", n_unique)

		send_to_bucket(
			CONFIG['GCP']['CLEAN_BUCKET'],
			'news',
			xz_file,
			logger=logger
		)

		logger.info(f"RSS save successeful.")
		send_metric(CONFIG, "clean_save_success_indicator", "int64_value", 1)

	except Exception as e:

		logger.warning(f"RSS save failed. {e}, {format_exc()}")
		send_metric(CONFIG, "clean_save_success_indicator", "int64_value", 0)
Example #12
if __name__ == '__main__':

    init_folders()

    for ticker in TICKERS:

        try:
            get_news(ticker)
            logger.info('%s:Completed', ticker)
            ticker_list.append(ticker)
            current_complete = (len(ticker_list) / len(TICKERS)) * 100
            logger.info('Current Percentage: %f %s', current_complete, percent)

        except Exception as e:
            logger.warning('Error Message: %s:%s', ticker, e)
            continue

    percent_successful = (len(ticker_list) / len(TICKERS)) * 100
    logger.info('Percentage of successful tickers: %f  %s', percent_successful,
                percent)

# logging information #
log_path = "/home/zqretrace/scripts/merge_logs/CNBC_Merged_logs/merge_logs_CNBC.log"
logging.basicConfig(
    filename=log_path,
    format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
    datefmt='%m/%d/%Y %H:%M:%S',
    level=logging.DEBUG)

source = 'CNBC'
Example #13
        removed_curnames = get_diff(ocurnames, curnames)

        comnames.to_csv(f"{DIR}/data/company_names.csv", index=False)
        curnames.to_csv(f"{DIR}/data/curated_company_names.csv", index=False)

        body = '\n'.join([
            "New Company Names",
            new_comnames.to_html(index=False),
            "\nRemoved Company Names",
            removed_comnames.to_html(index=False),
            "\nNew Curated Names",
            new_curnames.to_html(index=False),
            "\nRemoved Curated Names",
            removed_curnames.to_html(index=False),
        ])

        n = new_comnames.shape[0] + removed_comnames.shape[0]
        n += new_curnames.shape[0] + removed_curnames.shape[0]
        if n > 0:
            send_email(CONFIG, "Company Name Summary", body, [], logger)

        logger.info("company name downloader & curator succesful")
        send_metric(CONFIG, metric, "int64_value", 1)

    except Exception as e:

        logger.warning(
            f"company name downloader & curator failed, {e}, {format_exc()}")
        send_metric(CONFIG, metric, "int64_value", 0)

    logger.info("company name downloader & curator terminated")
Example #14
    else:
        raise Exception("TarFile Corrupted. File Size 0.")

    return raw_tar, cleaned_tar


if __name__ == '__main__':

    try:

        raw_tar, cleaned_tar = compress_files()

        send_to_bucket(CONFIG['gcp_bucket_prefix'],
                       CONFIG['gcp_bucket_name'],
                       os.path.basename(raw_tar),
                       os.path.dirname(raw_tar),
                       logger=logger)

        send_to_bucket(f"cleaned_{CONFIG['gcp_bucket_prefix']}",
                       CONFIG['gcp_bucket_name'],
                       os.path.basename(cleaned_tar),
                       os.path.dirname(cleaned_tar),
                       logger=logger)
        os.remove(cleaned_tar)

        logger.info(f"RSS,Storage,Success,")

    except Exception as e:

        logger.warning(f"RSS,Storage,Failure,{e}")
Example #15
        send_to_bucket(CONFIG['GCP']['RAW_BUCKET'],
                       'news',
                       xz_file,
                       logger=logger)

        send_to_bucket(CONFIG['GCP']['RAW_VAULT'],
                       'news',
                       xz_file,
                       logger=logger)

    logger.info("sending metrics")
    send_metric(CONFIG, "news_count", "int64_value", n_items)
    send_metric(CONFIG, "unique_news_count", "int64_value", n_unique)


if __name__ == '__main__':

    logger.info("news job, initializing")

    try:

        main()
        send_metric(CONFIG, "news_success_indicator", "int64_value", 1)

    except Exception as e:

        exc = traceback.format_exc()
        logger.warning(f"news job error, {e}, {exc}")
        send_metric(CONFIG, "news_success_indicator", "int64_value", 0)

    logger.info("news job, terminating")
Example #16
	r_map = df.iloc[-1, 1:].values
	r_map = np.array([0] + r_map.tolist())
	chs = CubicHermiteSpline(t_map, r_map, [0]*len(t_map))

	rm_df = pd.DataFrame()
	rm_df['days_to_expiry'] = np.arange(0, 365 * 10 + 1).astype(int)
	rm_df['rate'] = chs(rm_df.days_to_expiry.values)
	rm_df['date_current'] = DATE

	_connector.write("treasuryratemap", rm_df)

	return df

if __name__ == '__main__':

	try:

		df = collect()
		store()
		send_email(CONFIG, "Interest Rate Summary", df.to_html(), [], logger)
		metric = 1

	except Exception as e:

		logger.warning(e)
		body = f"<p>Process Failed. {e}</p>"
		send_email(CONFIG, "Interest Rate Summary - FAILED", body, [], logger)
		metric = 0

	send_gcp_metric(CONFIG, "rates_success_indicator", "int64_value", metric)
Example #17
    for file in os.listdir(f"{DIR}/pids"):

        if file == ".gitignore":
            continue

        os.remove(f"{DIR}/pids/{file}")

    os.system(f"touch {DIR}/pids/{os.getpid()}")

    group_keys = list(groups.keys())
    parallel_groups = [group_keys[0::2], group_keys[1::2]]

    try:

        Parallel(n_jobs=2)(
            delayed(parallel_job)(job_id, parallel_group)
            for job_id, parallel_group in enumerate(parallel_groups))

    except Exception as e:

        logger.warning(e)


if __name__ == '__main__':

    try:
        main()
    except Exception as e:
        logger.warning(f"RSS process failed. {e}, {format_exc()}")
Example #18
		db_flags.append(b_db_flag)
		db_stats.append(b_db_stats)

		success, failure = get_job_success_rates(tickers[ : BATCH_SIZE * (1 + batch_id)])
		send_metrics(success, failure)

		# if batch_id % checkpoint == 0 and batch_id != 0:
		# 	report("Partial", success, failure, faults_summary, db_flags, db_stats)

	###############################################################################################

	success, failure = get_job_success_rates(tickers)
	report("Full", success, failure, faults_summary, db_flags, db_stats)

	store()

	logger.info(f"SCRAPER,JOB,TERMINATED,{DATE},")

if __name__ == '__main__':

	try:
	
		send_gcp_metric(CONFIG, "oscrap_job_status", "int64_value", 1)
		main()
	
	except Exception as e:

		send_gcp_metric(CONFIG, "oscrap_job_status", "int64_value", 0)
		logger.warning(f"SCRAPER,JOB,MAIN ERROR,{e},")
Example #19
def cleaning_loop():

    ctr = 0
    files = {NEWS_DIR / ".gitignore"}
    n_clean = len(list(CLEAN_DIR.iterdir()))

    while True:

        new_files = get_files(files)
        n_clean_new = len(list(CLEAN_DIR.iterdir()))

        if n_clean_new < n_clean:
            files = {NEWS_DIR / ".gitignore"}
            reload(sys.modules['clean_item'])
            reload(sys.modules['find_company_names'])
            logger.info("reloading the company names")

        items = []
        for new_file in new_files:
            with open(new_file, "r") as file:
                try:
                    items.extend(json.loads(file.read()))
                    files.add(new_file)
                except Exception as e:
                    logger.warning(f"File read error. {e}")

        new_items = []
        for item in items:
            if not item.get("title"):
                continue

            item = clean_item(item)

            dummy_item = {
                'title': item['title'],
                'article_source': item['article_source'],
                'published_datetime': item['published_datetime'][:10]
            }
            if 'summary' in item:
                dummy_item['summary'] = item['summary']

            _id = md5(json.dumps(dummy_item).encode()).hexdigest()
            new_items.append({
                "_index": "news",
                "_id": _id,
                "_op_type": "create",
                "_source": item
            })

        if len(new_items) > 50:
            new_items = filter(ES_CLIENT, new_items)

        if len(new_items) != 0:

            titles = [item['_source']['title'] for item in new_items]
            print(
                f"{datetime.now().isoformat()} - Scoring {len(new_items)} Files."
            )
            scores = get_scores(titles)

            for item, score in zip(new_items, scores):
                item['_source']['sentiment'] = score['prediction']
                item['_source']['sentiment_score'] = score['sentiment_score']
                item['_source']['abs_sentiment_score'] = abs(
                    score['sentiment_score'])

            successes, failures = helpers.bulk(ES_CLIENT,
                                               new_items,
                                               stats_only=True,
                                               raise_on_error=False)

            print(successes, failures)
            with open(CLEAN_DIR / f"{str(uuid.uuid4())}.json", "w") as file:
                file.write(json.dumps(new_items))

            new_items = []

        ###########################################################################################

        if ctr % 10 == 0:

            try:

                send_metric(CONFIG, "rss_counter", "int64_value",
                            len(list(NEWS_DIRS[0].iterdir())) - 1)
                ctr = 0

            except Exception as e:

                logger.warning(e)

        ###########################################################################################

        ctr += 1
        time.sleep(2)
        n_clean = n_clean_new
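The deterministic _id built in the loop above is what makes the Elasticsearch "create" op effectively idempotent: hashing the stable fields of an article yields the same document id on every re-ingest, so duplicates are rejected rather than re-indexed. The idea in isolation, with a hypothetical cleaned item:

import json
from hashlib import md5

# Stable fields of a hypothetical cleaned item.
dummy_item = {
    "title": "Example headline",
    "article_source": "Example Wire",
    "published_datetime": "2021-01-01",
}

_id = md5(json.dumps(dummy_item).encode()).hexdigest()
print(_id)  # the same input always hashes to the same document id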