def cli():
    parameters = next(
        get_pipeline_param_rows('arc-wikicommons-upload', 'arc-parameters.csv'))
    consts = next(get_pipeline_param_rows('constants', 'arc-parameters.csv'))
    if len(sys.argv) > 2:
        if sys.argv[2] == 'upload':
            # 'all' means no title filter and no upload limit; any other value
            # is treated as a single title to upload (limit 1)
            cli_upload(
                consts, parameters,
                sys.argv[3] if len(sys.argv) > 3 and sys.argv[3] != 'all' else None,
                0 if len(sys.argv) > 3 and sys.argv[3] == 'all' else 1)
        if sys.argv[2] == 'upload-after':
            cli_upload(
                consts, parameters, sys.argv[3],
                upload_limit=int(sys.argv[4]) if len(sys.argv) > 4 else 1)
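A minimal sketch of how this entry point might be invoked; the `__main__` guard and the command forms in the comments are assumptions inferred from the argv indices above, not shown in the source.

if __name__ == '__main__':
    # assumed invocation forms (argv[1] is presumably consumed by a wrapper,
    # since the command is read from argv[2]):
    #   ... upload                        no title filter, upload limit 1
    #   ... upload all                    no title filter, no upload limit
    #   ... upload <title>                upload one specific title
    #   ... upload-after <title> [n]      resume after <title>, up to n items
    cli()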
def get_resources():
    for resource in resources:
        yield get_resource(resource)
        stats["num_resources"] += 1
    log_successful_pipeline_execution(
        parameters["pipeline-id"], parameters["pipeline-log"], stats,
        next(
            get_pipeline_param_rows(parameters["pipeline-id"],
                                    parameters["pipeline-parameters"])))
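In datapackage_pipelines processors a generator like this is normally handed to spew together with the datapackage and the collected stats. A minimal sketch, assuming this processor follows that pattern and that spew and datapackage are in scope in the full module (the call is not shown in the excerpt above):

spew(datapackage, get_resources(), stats)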
def cli():
    parameters = next(
        get_pipeline_param_rows('kkl-wikicommons-upload', 'kkl-parameters.csv'))
    consts = next(get_pipeline_param_rows('constants', 'kkl-parameters.csv'))
    if len(sys.argv) > 2:
        if sys.argv[2] == 'upload':
            if len(sys.argv) > 3:
                if sys.argv[3] == 'all':
                    cli_upload(consts, parameters, None, 0)
                else:
                    cli_upload(consts, parameters, sys.argv[3])
            else:
                cli_upload(consts, parameters)
        if sys.argv[2] == 'upload-after':
            cli_upload(
                consts, parameters, sys.argv[3],
                upload_limit=int(sys.argv[4]) if len(sys.argv) > 4 else 1)
def temp_file(*args, **kwargs):
    # yields a scratch file path inside a temporary directory; on error the
    # file is removed before the exception is re-raised
    with temp_dir(*args, **kwargs) as dir:
        file = os.path.join(dir, "temp")
        try:
            yield file
        except Exception:
            if os.path.exists(file):
                os.unlink(file)
            raise


parameters, datapackage, resources = ingest()
stats = {}
aggregations = {"stats": stats}
# parameters = next(get_pipeline_param_rows(parameters["pipeline-id"], parameters["pipeline-parameters"]))
consts = next(get_pipeline_param_rows('constants', 'arc-parameters.csv'))


def filter_resource(resource):
    logging.info("syncing to google storage bucket {}".format(consts["gcs_bucket"]))
    gcs = storage.Client.from_service_account_json(consts["gcs_secret_file"])
    gcs_bucket = gcs.get_bucket(consts["gcs_bucket"])
    for row_num, row in enumerate(resource, start=1):
        if gcs_bucket:
            # strip the bucket prefix to get the object path inside the bucket
            input_file_path = row["gcs_url"].replace("gs://{}/".format(consts["gcs_bucket"]), "")
            _, ext = os.path.splitext(input_file_path)
            ext_stat = "total {} files".format(ext)
            stats.setdefault(ext_stat, 0)
            stats[ext_stat] += 1
            logging.info("{}: {}".format(ext_stat, stats[ext_stat]))
            if ext == '.pdf':
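A minimal usage sketch for temp_file. The excerpt above breaks off at the `.pdf` branch and does not show a contextmanager decorator, so the sketch wraps the generator explicitly; the payload written here is purely illustrative.

from contextlib import contextmanager

temp_file_cm = contextmanager(temp_file)

with temp_file_cm() as path:
    with open(path, "wb") as f:
        f.write(b"scratch data")      # illustrative payload
    logging.info("wrote {} bytes to {}".format(os.path.getsize(path), path))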
from datapackage_pipelines.wrapper import ingest, spew
from datapackage_pipelines.utilities.resources import PROP_STREAMING
from pyquery import PyQuery as pq
from pipeline_params import get_pipeline_param_rows
import logging, datetime, os
from google.cloud import storage


parameters, datapackage, __ = ingest()
aggregations = {"stats": {}}
parameters = next(
    get_pipeline_param_rows(parameters["pipeline-id"],
                            parameters["pipeline-parameters"]))
consts = next(get_pipeline_param_rows('constants', 'kkl-parameters.csv'))


def get_resource():
    logging.info("parsing pages {} to {}".format(parameters["first_page_num"],
                                                 parameters["last_page_num"]))
    if consts.get("gcs_bucket"):
        # initialize google
        logging.info("syncing to google storage bucket {}".format(
            consts["gcs_bucket"]))
        gcs = storage.Client.from_service_account_json(
            consts["gcs_secret_file"])
        gcs_bucket = gcs.get_bucket(consts["gcs_bucket"])
    else:
        # initialize filesystem
        gcs, gcs_bucket = None, None
    for i in range(int(parameters["first_page_num"]),
                   int(parameters["last_page_num"]) + 1):
        if i == 1:
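A minimal sketch of how a downloaded page could be synced to the bucket initialized above, using the standard google-cloud-storage blob API; the helper, its arguments, and the local-filesystem fallback are illustrative assumptions, not part of the excerpt (which breaks off at the per-page loop).

def sync_page_to_bucket(gcs_bucket, local_path, object_name):
    # hypothetical helper: upload when a bucket is configured, otherwise keep
    # the file on the local filesystem
    if gcs_bucket:
        gcs_bucket.blob(object_name).upload_from_filename(local_path)
        logging.info("uploaded {} as {}".format(local_path, object_name))
    else:
        logging.info("no gcs_bucket configured, keeping {} locally".format(local_path))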
from datapackage_pipelines.wrapper import ingest, spew
from datapackage_pipelines.utilities.resources import PROP_STREAMING
from pyquery import PyQuery as pq
import datetime, logging, requests, json, time, os
from pipeline_params import get_pipeline_param_rows
from bs4 import BeautifulSoup


source_parameters, datapackage, resources = ingest()
stats = {}
aggregations = {"stats": stats}
parameters = next(
    get_pipeline_param_rows(source_parameters["pipeline-id"],
                            source_parameters["pipeline-parameters"]))
constants = next(
    get_pipeline_param_rows('constants',
                            source_parameters['pipeline-parameters']))
session = requests.session()
failed_items = []


def download(url, retry_num=0):
    # throttle every attempt (including retries), then fetch; retry on any
    # exception up to max-retries before giving up
    time.sleep(float(constants["sleep-time"]))
    try:
        return session.get(url).content.decode()
    except Exception:
        if retry_num >= int(constants["max-retries"]):
            raise
        else:
            return download(url, retry_num + 1)
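A minimal usage sketch for download(); the driver function and the idea of pulling link texts are hypothetical placeholders for illustration, not taken from the source.

def scrape_item_titles(page_url):
    # hypothetical driver: fetch one listing page through the throttled,
    # retried download() above and collect the link texts it contains
    html = download(page_url)
    soup = BeautifulSoup(html, "html.parser")
    return [a.get_text(strip=True) for a in soup.find_all("a")]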
from selenium import webdriver
from pipeline_params import get_pipeline_param_rows
from pipeline_logs import log_successful_pipeline_execution
import os, logging
from google.cloud import storage


logging.basicConfig(level=logging.INFO)

params = next(get_pipeline_param_rows('kkl.py', 'kkl-parameters.csv'))
consts = next(get_pipeline_param_rows('constants', 'kkl-parameters.csv'))
stats = {"num_wrote_pages": 0}

# initialize webdriver
chrome_options = webdriver.ChromeOptions()
prefs = {'download.default_directory': params['out_path']}
chrome_options.add_experimental_option("prefs", prefs)
chrome_options.add_argument('--headless')
browser = webdriver.Chrome(chrome_options=chrome_options)  # newer Selenium releases take options= instead of chrome_options=

# get the first page
browser.get(params['first_page_url'])
first_page_source = browser.page_source

if consts.get("gcs_bucket"):
    # initialize google
    logging.info("syncing to google storage bucket {}".format(
        consts["gcs_bucket"]))
    gcs = storage.Client.from_service_account_json(consts["gcs_secret_file"])
    gcs_bucket = gcs.get_bucket(consts["gcs_bucket"])
else: