Example 1
def main(args):
    queries.connect(os.getenv("DB_URL"))
    try:
        if args.command == "producer":
            run_one_shot(
                data_getter=QueryGetter(queries.get_producers),
                processor=process_producer_item,
                data_saver=JsonItemSaver(),
            )
        elif args.command == "publication":
            if args.published_at:
                day_start = args.published_at.timestamp()
                logger.info("day starts at timestamp %s", day_start)
                day_end = day_start + 86400  # 86400 seconds = one day
                data_getter = QueryGetter(
                    queries.get_publications_by_producer_ranged_by_published_at,
                    producer_id=of_uuid(args.producer),
                    start=day_start,
                    end=day_end,
                )
            elif args.processed_at:
                logger.debug(
                    "publications by %s processed between %s and %s",
                    args.producer,
                    args.processed_at[0],
                    args.processed_at[1],
                )
                data_getter = QueryGetter(
                    queries.get_publications_by_producer_ranged_by_processed_at,
                    producer_id=of_uuid(args.producer),
                    start=args.processed_at[0].timestamp(),
                    end=args.processed_at[1].timestamp() - 1,
                )
            else:
                # without this guard, data_getter would be undefined below
                raise RuntimeError(
                    "no --published-at or --processed-at specified")
            run_batch(
                data_getter=data_getter,
                processor=partial(process_publication_item,
                                  full_text=args.full_text),
                data_saver=JsonItemSaver(),
            )
        else:
            raise RuntimeError(f"Unknown command '{args.command}'")
        return 0
    except Exception:
        logger.error(traceback.format_exc())
        return -1
    finally:
        queries.disconnect()
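
All six examples share one pipeline shape: a data getter yields database rows, a processor turns each row into an output item, and a saver serializes the results; run_one_shot appears to be the unbatched variant of run_batch. None of these helpers appear in this listing, so the following is only a minimal sketch of how they plausibly fit together. The names mirror the examples, but every body below is an assumption, not the project's actual code:

import json
from typing import Any, Callable, Iterable


class QueryGetter:
    """Assumed: lazily run a query function and iterate over its rows."""

    def __init__(self, query: Callable[..., Iterable[dict]], **params: Any):
        self.query = query
        self.params = params

    def __iter__(self):
        return iter(self.query(**self.params))


class JsonItemSaver:
    """Assumed: write each item as one JSON object per line (JSON Lines)."""

    def __init__(self, filename=None):
        self.filename = filename  # None means stdout

    def save(self, item: dict) -> None:
        line = json.dumps(item, ensure_ascii=False, default=str)
        if self.filename is None:
            print(line)
        else:
            with open(self.filename, "a", encoding="utf-8") as fh:
                fh.write(line + "\n")


def run_batch(data_getter, processor, data_saver, batch_size=1000, limit=None):
    """Assumed: stream every row through processor and save each result.

    batch_size presumably controls how rows are fetched in chunks; this
    sketch ignores it and only honors the optional limit.
    """
    for count, row in enumerate(data_getter, start=1):
        data_saver.save(processor(row))
        if limit is not None and count >= limit:
            break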
Example 2
def publish_null_published_at(producer, output_dir, full_text: bool = False):
    output = Path(output_dir) / "no_date.jsonl"
    data_getter = QueryGetter(
        queries.get_publications_by_producer_with_null_published_at,
        producer_id=of_uuid(producer["producer_id"]),
    )
    run_batch(
        data_getter=data_getter,
        processor=partial(process_publication_item, full_text=full_text),
        data_saver=JsonItemSaver(filename=output),
    )
Example 3
def publish_one_day(published_at: date, producer, output_dir, full_text=False):
    output = output_dir / published_at.strftime("%Y-%m-%d.jsonl")
    data_getter = QueryGetter(
        queries.get_publications_by_producer_ranged_by_published_at,
        producer_id=of_uuid(producer["producer_id"]),
        start=of_datetime(day_start(published_at)),
        end=of_datetime(day_end(published_at)),
    )
    run_batch(
        data_getter=data_getter,
        processor=partial(process_publication_item, full_text=full_text),
        data_saver=JsonItemSaver(filename=output),
    )
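
Examples 2 and 3 rely on helpers the listing never defines: day_start and day_end widen a date to the bounds of that calendar day, while of_uuid and of_datetime presumably adapt Python values for the query layer. A rough sketch of the two date helpers under that assumption (the real code may differ, for instance in timezone handling):

from datetime import date, datetime, time, timedelta


def day_start(d: date) -> datetime:
    # midnight at the start of the day
    return datetime.combine(d, time.min)


def day_end(d: date) -> datetime:
    # the last representable instant before the next midnight
    return datetime.combine(d + timedelta(days=1), time.min) - timedelta(microseconds=1)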
Example 4
def main(args):
    queries.connect(os.getenv("DB_URL"))
    try:
        if args.command == "list":
            print(
                tabulate(
                    [[
                        p["producer_id"],
                        p["site_id"],
                        p["name"],
                        p["url"],
                        p["first_seen_at"],
                        p["last_updated_at"],
                        p["data"]["identifiers"],
                    ] for p in db.to_producers(queries.get_producers())],
                    headers=[
                        "id",
                        "site id",
                        "name",
                        "url",
                        "first seen at",
                        "last updated at",
                        "identifiers",
                    ],
                ))
        elif args.command == "show":
            for producer_id in args.id:
                data = queries.get_producer_with_stats(
                    producer_id=db.of_uuid(producer_id))
                print(tabulate(db.to_producer(data).items()))
        else:
            raise RuntimeError(f"Unknown command '{args.command}'")
        return 0
    except Exception:
        logger.error(traceback.format_exc())
        return -1
    finally:
        queries.disconnect()
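
tabulate in Example 4 is the tabulate package from PyPI: given a sequence of rows and optional headers it returns an aligned plain-text table, which is also why the "show" branch can pass a producer dict's (key, value) pairs straight in. A self-contained illustration with made-up data:

from tabulate import tabulate

rows = [["9b2d…", "site-1", "Example News", "https://example.org"]]
print(tabulate(rows, headers=["id", "site id", "name", "url"]))
# prints an aligned table along the lines of:
# id     site id    name          url
# -----  ---------  ------------  -------------------
# 9b2d…  site-1     Example News  https://example.org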
Example 5
def main(args):
    queries.connect(os.getenv("DB_URL"))
    try:
        if args.command == "producer":
            run_one_shot(
                data_getter=QueryGetter(queries.get_producers),
                processor=process_producer_item,
                data_saver=JsonItemSaver(),
            )
        elif args.command == "publication":
            producer = to_producer(
                queries.get_producer(producer_id=of_uuid(args.producer)))
            if args.drive and args.drive != "local":
                row = queries.get_drive_by_name(name=args.drive)
                if row is None:
                    raise RuntimeError(f"non-existent drive '{args.drive}'")
                gdrive = GoogleDrive(**row,
                                     service_account=args.service_account)
                if args.published_at:
                    if not isinstance(args.published_at, Month):
                        raise RuntimeError(
                            "Google drive only stores monthly archive")
                    publish_to_drive(
                        drive=gdrive,
                        producer=producer,
                        published_at=args.published_at,
                        full_text=args.full_text,
                    )
                elif args.processed_at:
                    months = queries.get_published_month_by_producer_ranged_by_processed_at(
                        producer_id=of_uuid(args.producer),
                        start=of_datetime(args.processed_at.start_datetime()),
                        end=of_datetime(args.processed_at.end_datetime()),
                    )
                    for row in months:
                        logger.debug("publishing %s", row["published_month"])
                        publish_to_drive(
                            drive=gdrive,
                            producer=producer,
                            published_at=(
                                Month.fromisoformat(row["published_month"])
                                if row["published_month"] is not None else None),
                            full_text=args.full_text,
                        )
                else:
                    raise RuntimeError(
                        "no --published-at or --processed-at specified")
            else:
                if args.published_at:
                    data_getter = QueryGetter(
                        queries.get_publications_by_producer_ranged_by_published_at,
                        producer_id=of_uuid(args.producer),
                        start=of_datetime(args.published_at.start_datetime()),
                        end=of_datetime(args.published_at.end_datetime()),
                    )
                elif args.processed_at:
                    logger.debug(
                        "publications by %s processed between %s and %s",
                        args.producer,
                        args.processed_at.start,
                        args.processed_at.end,
                    )
                    data_getter = QueryGetter(
                        queries.get_publications_by_producer_ranged_by_processed_at,
                        producer_id=of_uuid(args.producer),
                        start=of_datetime(args.processed_at.start_datetime()),
                        end=of_datetime(args.processed_at.end_datetime()),
                    )
                else:
                    raise RuntimeError(
                        "no --published-at or --processed-at specified")
                run_batch(
                    data_getter=data_getter,
                    processor=partial(process_publication_item,
                                      full_text=args.full_text),
                    data_saver=JsonItemSaver(),
                )
        else:
            raise RuntimeError(f"unknown command '{args.command}'")
        return 0
    except Exception:
        logger.error(traceback.format_exc())
        return -1
    finally:
        queries.disconnect()
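
In Example 5, args.published_at may be a Month, an object with fromisoformat, start_datetime and end_datetime. The class itself is not shown anywhere in the listing; below is a minimal version consistent with those call sites, a guess rather than the project's implementation:

from dataclasses import dataclass
from datetime import datetime


@dataclass(frozen=True)
class Month:
    year: int
    month: int

    @classmethod
    def fromisoformat(cls, text: str) -> "Month":
        # accepts "YYYY-MM" (or the prefix of a full ISO date)
        year, month = text.split("-")[:2]
        return cls(int(year), int(month))

    def start_datetime(self) -> datetime:
        return datetime(self.year, self.month, 1)

    def end_datetime(self) -> datetime:
        # first instant of the following month
        if self.month == 12:
            return datetime(self.year + 1, 1, 1)
        return datetime(self.year, self.month + 1, 1)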
Example 6
def main(args):
    parser_db = db.module("queries")
    parser_db.connect(os.getenv("DB_URL"))
    try:
        sc = parser_db.get_scraper_by_name(scraper_name=args.scraper_name)
        sc["data"] = db.to_json(sc["data"])
        scraper_db = scraper.ScraperDb(sc["scraper_name"],
                                       os.getenv(sc["data"]["db_url_var"]),
                                       sc["data"])

        if args.command == "producer":
            if args.id is not None:
                p = db.to_producer(
                    parser_db.get_producer(producer_id=db.of_uuid(args.id)))
                data_getter = DbGetter(scraper_db,
                                       scraper.get_site,
                                       site_id=p["site_id"])
            elif args.site_id is not None:
                data_getter = DbGetter(scraper_db,
                                       scraper.get_site,
                                       site_id=args.site_id)
            else:
                data_getter = DbGetter(scraper_db, scraper.get_sites)

            data_saver = (DbSaver(parser_db, producer.saver, scraper=sc)
                          if not args.dump else JsonSaver())
            run_one_shot(
                data_getter=data_getter,
                data_saver=data_saver,
                processor=producer.process_item,
            )

        elif args.command == "publication":
            if args.id is not None:
                raise RuntimeError("Unimplemented")
            elif args.article_id is not None:
                data_getter = DbGetter(
                    scraper_db,
                    scraper.get_snapshots,
                    article_id=args.article_id,
                    first=args.first,
                )
            elif args.url is not None:
                data_getter = DbGetter(scraper_db,
                                       scraper.get_snapshots,
                                       url=args.url,
                                       first=args.first)
            elif args.site_id is not None:
                data_getter = DbGetter(
                    scraper_db,
                    scraper.get_snapshots,
                    site_id=args.site_id,
                    first=args.first,
                )
            elif args.update:
                raise RuntimeError("Unimplemented")
            else:
                data_getter = get_all_unprocessed_articles(scraper_db,
                                                           parser_db,
                                                           args=args)
            run_batch(
                data_getter=data_getter,
                data_saver=(DbSaver(parser_db, publication.saver, scraper=sc)
                            if not args.dump else JsonSaver()),
                processor=partial(publication.process_item,
                                  parser=args.parser),
                batch_size=1000,
                limit=args.limit,
            )
        else:
            raise RuntimeError(f"Unknown command '{args.command}'")
        return 0
    except Exception:
        logger.error(traceback.format_exc())
        return -1
    finally:
        parser_db.disconnect()
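
Example 6 introduces DbGetter and DbSaver, database-backed counterparts of QueryGetter and JsonItemSaver above. Judging only from the call sites, each binds a connection plus a query or saver function and forwards keyword context; a speculative sketch:

class DbGetter:
    """Assumed: iterate the rows produced by query(db, **params)."""

    def __init__(self, db, query, **params):
        self.db = db
        self.query = query
        self.params = params

    def __iter__(self):
        return iter(self.query(self.db, **self.params))


class DbSaver:
    """Assumed: hand each processed item to saver(db, item, **context)."""

    def __init__(self, db, saver, **context):
        self.db = db
        self.saver = saver
        self.context = context

    def save(self, item):
        self.saver(self.db, item, **self.context)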