Code Example #1
    def action(self, response):
        try:
            scraped_cluster = ClusterScraper(response)
        except Exception as e:
            utils.Report.critical(
                'Could not obtain cluster name! {}'.format(e))
            return

        # Save the metrics to Datadog
        for host_name, host_metrics in scraped_cluster.metrics.items():
            for resource_name, resource_metrics in host_metrics.items():
                for name, value in resource_metrics.items():
                    metric_name = 'compose.cluster.host.{}.{}'.format(
                        resource_name, name)
                    metric_unit = ClusterScraper.UNITS_FROM_NAME[name]
                    utils.send_metric(
                        metric_name=metric_name,
                        metric_description=metric_name + ' description',
                        metric_unit=metric_unit,
                        metric_value=value,
                        cluster_name=scraped_cluster.name,
                        host_name=host_name)

        yield dict(
            cluster_name=scraped_cluster.name,
            cluster_metrics=scraped_cluster.metrics)
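
The example above forwards each value to Datadog (per the comment in the code) through a project helper, utils.send_metric, whose body is not shown. Below is a minimal sketch of what such a wrapper might look like with the official datadog Python client; the signature mirrors the call site, but the implementation is an assumption, not the repository's code.

# Hypothetical sketch of a Datadog-backed send_metric wrapper (assumption, not the repo's code)
from datadog import initialize, api

initialize(api_key="<DD_API_KEY>", app_key="<DD_APP_KEY>")  # placeholder credentials


def send_metric(metric_name, metric_description, metric_unit,
                metric_value, cluster_name, host_name):
    # Submit a single gauge point; cluster and host ride along as tags.
    # metric_description is not used in this sketch, and the unit is
    # simply folded into a tag for brevity.
    api.Metric.send(
        metric=metric_name,
        points=metric_value,  # a bare value is timestamped with "now"
        type="gauge",
        host=host_name,
        tags=[f"cluster:{cluster_name}", f"unit:{metric_unit}"],
    )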
Code Example #2
def main(date):

    logger.info("News Stats Initiated")
    datestr = (date - timedelta(days=1)).isoformat()[:10]
    file = Path(f"{DIR}/data/{datestr}.csv")
    xz_file = file.with_suffix(".tar.xz")

    try:

        logger.info(f"Processing stats for {date}")

        pre_n, df = process_data(get_data(date))
        df = create_sector_tickers(df, get_ticker_info())
        pre_n += len(sector_to_ticker)

        df['date'] = datestr
        df['volume'] = df.volume.astype(int)
        df = df[['date', 'ticker', 'volume', 'sentiment']]

        logger.info(
            f"Processed stats for {len(df)} tickers. Collected {pre_n} items.")
        send_metric(CONFIG, "news_stats_ticker_pre_count", "int64_value",
                    pre_n)
        send_metric(CONFIG, "news_stats_ticker_post_count", "int64_value",
                    len(df))

        if len(df) == 0:
            raise Exception("Zero tickers after filtering")

        df.to_csv(file, index=False)
        # Compress the day's CSV into a .tar.xz archive before upload
        with tar.open(xz_file, "x:xz") as tar_file:
            tar_file.add(file, arcname=file.name)

        send_to_bucket("daily_news_stats", "", xz_file, logger=logger)

        os.unlink(file)
        os.unlink(xz_file)

        send_metric(CONFIG, "news_stats_success_indicator", "int64_value", 1)

    except Exception as e:

        logger.info(f"News Stats Error - {e}")
        send_metric(CONFIG, "news_stats_success_indicator", "int64_value", 0)

    logger.info("News Stats Terminated")
Code Example #3
File: save.py  Project: zQuantz/NewsLab
		path = Path(f"{DIR}/clean_data")
		xz_file = Path(f"{DIR}/clean_data_backup/{SDATE}.tar.xz")

		raw_path = Path(f"{DIR}/news_data")
		files = list(raw_path.iterdir())
		files.remove(raw_path / ".gitignore")

		now = datetime.now()
		[
			file.unlink()
			for file in files
			if check_file(file, now)
		]

		n_items, n_unique = save_items(path, SDATE)
		send_metric(CONFIG, "clean_count", "int64_value", n_items)
		send_metric(CONFIG, "unique_clean_count", "int64_value", n_unique)

		send_to_bucket(
			CONFIG['GCP']['CLEAN_BUCKET'],
			'news',
			xz_file,
			logger=logger
		)

		logger.info(f"RSS save successeful.")
		send_metric(CONFIG, "clean_save_success_indicator", "int64_value", 1)

	except Exception as e:

		logger.warning(f"RSS save failed. {e}, {format_exc()}")
Code Example #4
File: company_names.py  Project: zQuantz/NewsLab
        removed_curnames = get_diff(ocurnames, curnames)

        comnames.to_csv(f"{DIR}/data/company_names.csv", index=False)
        curnames.to_csv(f"{DIR}/data/curated_company_names.csv", index=False)

        body = '\n'.join([
            "New Company Names",
            new_comnames.to_html(index=False),
            "\nRemoved Company Names",
            removed_comnames.to_html(index=False),
            "\nNew Curated Names",
            new_curnames.to_html(index=False),
            "\nRemoved Curated Names",
            removed_curnames.to_html(index=False),
        ])

        n = new_comnames.shape[0] + removed_comnames.shape[0]
        n += new_curnames.shape[0] + removed_curnames.shape[0]
        if n > 0:
            send_email(CONFIG, "Company Name Summary", body, [], logger)

        logger.info("company name downloader & curator succesful")
        send_metric(CONFIG, metric, "int64_value", 1)

    except Exception as e:

        logger.warning(
            f"company name downloader & curator failed, {e}, {format_exc()}")
        send_metric(CONFIG, metric, "int64_value", 0)

    logger.info("company name downloader & curator terminated")
Code Example #5
def main():

    company_names = pd.read_csv(f"{DIR}/../clean/data/company_names.csv")
    company_names = company_names[['ticker', 'name']]

    company_names = pd.concat([company_names, buzzwords])
    company_names = company_names.reset_index(drop=True)

    # Split the name list into five chunks, one per worker process
    chunks = np.array_split(company_names, 5)
    id_cache, ids = get_id_cache()

    # Workers report failures back through this queue
    errors = mp.Queue()

    processes = [
        mp.Process(target=collect_news,
                   args=(job_id, chunk, id_cache, ids, errors))
        for job_id, chunk in enumerate(chunks)
    ]

    for process in processes:
        process.start()

    for process in processes:
        process.join()

    if not errors.empty():
        error = errors.get()
        raise Exception(error)

    ###############################################################################################

    for file in IDSDIR.iterdir():

        if file.name == '.gitignore':
            continue

        with open(file, "r") as _file:
            id_cache[SDATE].extend(json.loads(_file.read()))

    id_cache[SDATE] = list(set(id_cache[SDATE]))
    n_items = len(id_cache[SDATE])
    n_unique = n_items

    ids = set([_id for idlist in id_cache.values() for _id in idlist])

    with open(f"{DIR}/data/id_cache.json", "w") as file:
        file.write(json.dumps(id_cache))

    ###############################################################################################

    backups = os.listdir(f"{DIR}/news_data_backup")
    xz_file = Path(f"{DIR}/news_data_backup/{SDATE}.tar.xz")

    if datetime.now().hour >= 10 and not xz_file.exists():

        logger.info("news job, daily save")
        n_items, n_unique = save_items(PATH, SDATE)

        if gethostname() != CONFIG['MACHINE']['HOSTNAME']:
            CONFIG['GCP']['RAW_BUCKET'] = "tmp_items"
            CONFIG['GCP']['RAW_VAULT'] = "tmp_items_vault"

        send_to_bucket(CONFIG['GCP']['RAW_BUCKET'],
                       'news',
                       xz_file,
                       logger=logger)

        send_to_bucket(CONFIG['GCP']['RAW_VAULT'],
                       'news',
                       xz_file,
                       logger=logger)

    logger.info("sending metrics")
    send_metric(CONFIG, "news_count", "int64_value", n_items)
    send_metric(CONFIG, "unique_news_count", "int64_value", n_unique)
Code Example #6
        send_to_bucket(CONFIG['GCP']['RAW_BUCKET'],
                       'news',
                       xz_file,
                       logger=logger)

        send_to_bucket(CONFIG['GCP']['RAW_VAULT'],
                       'news',
                       xz_file,
                       logger=logger)

    logger.info("sending metrics")
    send_metric(CONFIG, "news_count", "int64_value", n_items)
    send_metric(CONFIG, "unique_news_count", "int64_value", n_unique)


if __name__ == '__main__':

    logger.info("news job, initializing")

    try:

        main()
        send_metric(CONFIG, "news_success_indicator", "int64_value", 1)

    except Exception as e:

        exc = traceback.format_exc()
        logger.warning(f"news job error, {e}, {exc}")
        send_metric(CONFIG, "news_success_indicator", "int64_value", 0)

    logger.info("news job, terminating")
Code Example #7
File: clean.py  Project: zQuantz/NewsLab
def cleaning_loop():

    ctr = 0
    files = {NEWS_DIR / ".gitignore"}
    n_clean = len(list(CLEAN_DIR.iterdir()))

    while True:

        new_files = get_files(files)
        n_clean_new = len(list(CLEAN_DIR.iterdir()))

        # Fewer files in CLEAN_DIR than before: reset the seen-file set and
        # reload the cleaning / name-matching modules
        if n_clean_new < n_clean:
            files = {NEWS_DIR / ".gitignore"}
            reload(sys.modules['clean_item'])
            reload(sys.modules['find_company_names'])
            logger.info("reloading the company names")

        items = []
        for new_file in new_files:
            with open(new_file, "r") as file:
                try:
                    items.extend(json.loads(file.read()))
                    files.add(new_file)
                except Exception as e:
                    logger.warning(f"File read error. {e}")

        new_items = []
        for item in items:
            if not item.get("title"):
                continue

            item = clean_item(item)

            dummy_item = {
                'title': item['title'],
                'article_source': item['article_source'],
                'published_datetime': item['published_datetime'][:10]
            }
            if 'summary' in item:
                dummy_item['summary'] = item['summary']

            _id = md5(json.dumps(dummy_item).encode()).hexdigest()
            new_items.append({
                "_index": "news",
                "_id": _id,
                "_op_type": "create",
                "_source": item
            })

        if len(new_items) > 50:
            new_items = filter(ES_CLIENT, new_items)

        if len(new_items) != 0:

            titles = [item['_source']['title'] for item in new_items]
            print(
                f"{datetime.now().isoformat()} - Scoring {len(new_items)} Files."
            )
            scores = get_scores(titles)

            for item, score in zip(new_items, scores):
                item['_source']['sentiment'] = score['prediction']
                item['_source']['sentiment_score'] = score['sentiment_score']
                item['_source']['abs_sentiment_score'] = abs(
                    score['sentiment_score'])

            successes, failures = helpers.bulk(ES_CLIENT,
                                               new_items,
                                               stats_only=True,
                                               raise_on_error=False)

            print(successes, failures)
            with open(CLEAN_DIR / f"{str(uuid.uuid4())}.json", "w") as file:
                file.write(json.dumps(new_items))

            new_items = []

        ###########################################################################################

        if ctr % 10 == 0:

            try:

                send_metric(CONFIG, "rss_counter", "int64_value",
                            len(list(NEWS_DIRS[0].iterdir())) - 1)
                ctr = 0

            except Exception as e:

                logger.warning(e)

        ###########################################################################################

        ctr += 1
        time.sleep(2)
        n_clean = n_clean_new
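
The loop above decides what to ingest by diffing the news directory against the set of files it has already processed (get_files(files)), and it deduplicates documents by deriving the Elasticsearch _id from an md5 of the title, source, publication date, and (when present) summary, then indexing with _op_type "create" so re-scraped items fail as conflicts instead of being written twice; raise_on_error=False keeps the bulk call from aborting on those conflicts. get_files itself is not shown; below is a short sketch consistent with how it is used here, labeled hypothetical.

# Hypothetical sketch of get_files: paths in NEWS_DIR that have not been processed yet
def get_files(seen):
    return [path for path in NEWS_DIR.iterdir() if path not in seen]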