def main(args):
    # CLI entry point: dump producers, or one producer's publications for a
    # single published day or a processed-at range, as JSON lines.
    queries.connect(os.getenv("DB_URL"))
    try:
        if args.command == "producer":
            run_one_shot(
                data_getter=QueryGetter(queries.get_producers),
                processor=process_producer_item,
                data_saver=JsonItemSaver(),
            )
        elif args.command == "publication":
            if args.published_at:
                day_start = args.published_at.timestamp()
                logger.info("day start timestamp: %s", day_start)
                day_end = day_start + 86400
                data_getter = QueryGetter(
                    queries.get_publications_by_producer_ranged_by_published_at,
                    producer_id=of_uuid(args.producer),
                    start=day_start,
                    end=day_end,
                )
            elif args.processed_at:
                logger.debug(
                    "publications by %s processed between %s and %s",
                    args.producer,
                    args.processed_at[0],
                    args.processed_at[1],
                )
                data_getter = QueryGetter(
                    queries.get_publications_by_producer_ranged_by_processed_at,
                    producer_id=of_uuid(args.producer),
                    start=args.processed_at[0].timestamp(),
                    end=args.processed_at[1].timestamp() - 1,
                )
            else:
                raise RuntimeError("no --published-at or --processed-at specified")
            run_batch(
                data_getter=data_getter,
                processor=partial(process_publication_item, full_text=args.full_text),
                data_saver=JsonItemSaver(),
            )
        else:
            raise RuntimeError(f"Unknown command '{args.command}'")
        return 0
    except Exception:
        logger.error(traceback.format_exc())
        return -1
    finally:
        queries.disconnect()
def publish_null_published_at(producer, output_dir, full_text: bool = False):
    output = Path(output_dir) / "no_date.jsonl"
    data_getter = QueryGetter(
        queries.get_publications_by_producer_with_null_published_at,
        producer_id=of_uuid(producer["producer_id"]),
    )
    run_batch(
        data_getter=data_getter,
        processor=partial(process_publication_item, full_text=full_text),
        data_saver=JsonItemSaver(filename=output),
    )
def publish_one_day(published_at: date, producer, output_dir, full_text=False):
    output = output_dir / published_at.strftime("%Y-%m-%d.jsonl")
    data_getter = QueryGetter(
        queries.get_publications_by_producer_ranged_by_published_at,
        producer_id=of_uuid(producer["producer_id"]),
        start=of_datetime(day_start(published_at)),
        end=of_datetime(day_end(published_at)),
    )
    run_batch(
        data_getter=data_getter,
        processor=partial(process_publication_item, full_text=full_text),
        data_saver=JsonItemSaver(filename=output),
    )
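# --- Illustrative sketch (not part of the original source) -----------------
# A hypothetical driver showing how the two helpers above could be combined to
# dump one month of a producer's publications: one JSONL file per day, plus
# no_date.jsonl for rows whose published_at is NULL. The month/day iteration
# and the name publish_month_sketch are assumptions for illustration; only
# publish_one_day and publish_null_published_at come from the code above.
def publish_month_sketch(producer, year: int, month: int, output_dir,
                         full_text: bool = False):
    from calendar import monthrange
    from datetime import date
    from pathlib import Path

    out = Path(output_dir)
    out.mkdir(parents=True, exist_ok=True)
    # monthrange()[1] is the number of days in the given month.
    for day in range(1, monthrange(year, month)[1] + 1):
        publish_one_day(date(year, month, day), producer, out, full_text=full_text)
    # Finally collect publications that have no published_at at all.
    publish_null_published_at(producer, out, full_text=full_text)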
def main(args):
    # CLI entry point: list all producers, or show stats for selected
    # producers, as plain-text tables.
    queries.connect(os.getenv("DB_URL"))
    try:
        if args.command == "list":
            print(
                tabulate(
                    [[
                        p["producer_id"],
                        p["site_id"],
                        p["name"],
                        p["url"],
                        p["first_seen_at"],
                        p["last_updated_at"],
                        p["data"]["identifiers"],
                    ] for p in db.to_producers(queries.get_producers())],
                    headers=[
                        "id",
                        "site id",
                        "name",
                        "url",
                        "first seen at",
                        "last updated at",
                        "identifiers",
                    ],
                ))
        elif args.command == "show":
            for producer_id in args.id:
                data = queries.get_producer_with_stats(
                    producer_id=db.of_uuid(producer_id))
                print(tabulate(db.to_producer(data).items()))
        else:
            raise RuntimeError(f"Unknown command '{args.command}'")
        return 0
    except Exception:
        logger.error(traceback.format_exc())
        return -1
    finally:
        queries.disconnect()
def main(args):
    # CLI entry point: publish producers or one producer's publications,
    # either locally as JSON lines or into a monthly archive on Google Drive.
    queries.connect(os.getenv("DB_URL"))
    try:
        if args.command == "producer":
            run_one_shot(
                data_getter=QueryGetter(queries.get_producers),
                processor=process_producer_item,
                data_saver=JsonItemSaver(),
            )
        elif args.command == "publication":
            producer = to_producer(
                queries.get_producer(producer_id=of_uuid(args.producer)))
            if args.drive and args.drive != "local":
                # Publish to Google Drive; only monthly archives are supported.
                row = queries.get_drive_by_name(name=args.drive)
                if row is None:
                    raise RuntimeError(f"non-existent drive '{args.drive}'")
                gdrive = GoogleDrive(**row, service_account=args.service_account)
                if args.published_at:
                    if not isinstance(args.published_at, Month):
                        raise RuntimeError("Google drive only stores monthly archive")
                    publish_to_drive(
                        drive=gdrive,
                        producer=producer,
                        published_at=args.published_at,
                        full_text=args.full_text,
                    )
                elif args.processed_at:
                    for row in queries.get_published_month_by_producer_ranged_by_processed_at(
                            producer_id=of_uuid(args.producer),
                            start=of_datetime(args.processed_at.start_datetime()),
                            end=of_datetime(args.processed_at.end_datetime()),
                    ):
                        logger.debug("publishing %s", row["published_month"])
                        publish_to_drive(
                            drive=gdrive,
                            producer=producer,
                            published_at=(Month.fromisoformat(row["published_month"])
                                          if row["published_month"] is not None else None),
                            full_text=args.full_text,
                        )
                else:
                    raise RuntimeError("no --published-at or --processed-at specified")
            else:
                # Publish locally as JSON lines.
                if args.published_at:
                    data_getter = QueryGetter(
                        queries.get_publications_by_producer_ranged_by_published_at,
                        producer_id=of_uuid(args.producer),
                        start=of_datetime(args.published_at.start_datetime()),
                        end=of_datetime(args.published_at.end_datetime()),
                    )
                elif args.processed_at:
                    logger.debug(
                        "publications by %s processed between %s and %s",
                        args.producer,
                        args.processed_at.start,
                        args.processed_at.end,
                    )
                    data_getter = QueryGetter(
                        queries.get_publications_by_producer_ranged_by_processed_at,
                        producer_id=of_uuid(args.producer),
                        start=of_datetime(args.processed_at.start_datetime()),
                        end=of_datetime(args.processed_at.end_datetime()),
                    )
                else:
                    raise RuntimeError("no --published-at or --processed-at specified")
                run_batch(
                    data_getter=data_getter,
                    processor=partial(process_publication_item, full_text=args.full_text),
                    data_saver=JsonItemSaver(),
                )
        else:
            raise RuntimeError(f"unknown command '{args.command}'")
        return 0
    except Exception:
        logger.error(traceback.format_exc())
        return -1
    finally:
        queries.disconnect()
def main(args):
    # CLI entry point: read raw sites/snapshots from the scraper DB and store
    # parsed producers and publications in the parser DB (or dump them as JSON).
    parser_db = db.module("queries")
    parser_db.connect(os.getenv("DB_URL"))
    try:
        sc = parser_db.get_scraper_by_name(scraper_name=args.scraper_name)
        sc["data"] = db.to_json(sc["data"])
        scraper_db = scraper.ScraperDb(sc["scraper_name"],
                                       os.getenv(sc["data"]["db_url_var"]),
                                       sc["data"])
        if args.command == "producer":
            if args.id is not None:
                p = db.to_producer(
                    parser_db.get_producer(producer_id=db.of_uuid(args.id)))
                data_getter = DbGetter(scraper_db, scraper.get_site, site_id=p["site_id"])
            elif args.site_id is not None:
                data_getter = DbGetter(scraper_db, scraper.get_site, site_id=args.site_id)
            else:
                data_getter = DbGetter(scraper_db, scraper.get_sites)
            data_saver = (DbSaver(parser_db, producer.saver, scraper=sc)
                          if not args.dump else JsonSaver())
            run_one_shot(
                data_getter=data_getter,
                data_saver=data_saver,
                processor=producer.process_item,
            )
        elif args.command == "publication":
            if args.id is not None:
                raise RuntimeError("Unimplemented")
            elif args.article_id is not None:
                data_getter = DbGetter(
                    scraper_db,
                    scraper.get_snapshots,
                    article_id=args.article_id,
                    first=args.first,
                )
            elif args.url is not None:
                data_getter = DbGetter(scraper_db,
                                       scraper.get_snapshots,
                                       url=args.url,
                                       first=args.first)
            elif args.site_id is not None:
                data_getter = DbGetter(
                    scraper_db,
                    scraper.get_snapshots,
                    site_id=args.site_id,
                    first=args.first,
                )
            elif args.update:
                raise RuntimeError("Unimplemented")
            else:
                data_getter = get_all_unprocessed_articles(scraper_db, parser_db, args=args)
            run_batch(
                data_getter=data_getter,
                data_saver=(DbSaver(parser_db, publication.saver, scraper=sc)
                            if not args.dump else JsonSaver()),
                processor=partial(publication.process_item, parser=args.parser),
                batch_size=1000,
                limit=args.limit,
            )
        else:
            raise RuntimeError(f"Unknown command '{args.command}'")
        return 0
    except Exception:
        logger.error(traceback.format_exc())
        return -1
    finally:
        parser_db.disconnect()
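# --- Illustrative sketch (not part of the original source) -----------------
# A hypothetical argparse wiring for the main(args) above. The option names
# mirror the attributes the function reads (command, scraper_name, id,
# site_id, article_id, url, update, first, dump, parser, limit); the types,
# defaults, and flag semantics are assumptions for illustration only.
def _build_arg_parser_sketch():
    import argparse

    p = argparse.ArgumentParser(description="parse scraped snapshots")
    p.add_argument("command", choices=["producer", "publication"])
    p.add_argument("--scraper-name", required=True)
    p.add_argument("--id")  # producer id (UUID)
    p.add_argument("--site-id", type=int)
    p.add_argument("--article-id", type=int)
    p.add_argument("--url")
    p.add_argument("--update", action="store_true")
    p.add_argument("--first", action="store_true")
    p.add_argument("--dump", action="store_true")  # dump JSON instead of saving to DB
    p.add_argument("--parser", default=None)
    p.add_argument("--limit", type=int, default=None)
    return p


if __name__ == "__main__":
    import sys
    sys.exit(main(_build_arg_parser_sketch().parse_args()))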