Example #1
import sys

from pipeline_params import get_pipeline_param_rows


def cli():
    # dispatch the 'upload' / 'upload-after' subcommands from sys.argv
    parameters = next(
        get_pipeline_param_rows('arc-wikicommons-upload',
                                'arc-parameters.csv'))
    consts = next(get_pipeline_param_rows('constants', 'arc-parameters.csv'))
    if len(sys.argv) > 2:
        if sys.argv[2] == 'upload':
            # 'upload <id>' uploads a single item; 'upload all' lifts the limit
            arg = sys.argv[3] if len(sys.argv) > 3 else None
            cli_upload(consts, parameters,
                       None if arg == 'all' else arg,
                       0 if arg == 'all' else 1)
        if sys.argv[2] == 'upload-after':
            cli_upload(
                consts,
                parameters,
                sys.argv[3],
                upload_limit=int(sys.argv[4]) if len(sys.argv) > 4 else 1)
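The cli_upload helper is not shown in this listing. A hypothetical stub of the signature the call sites above imply (parameter names are illustrative, not the repo's):

def cli_upload(consts, parameters, item_id=None, upload_limit=1):
    # consts / parameters: rows produced by get_pipeline_param_rows
    # item_id: a single item to upload ('upload <id>') or the id to resume
    #          after ('upload-after <id>'); None means start from the beginning
    # upload_limit: 0 disables the cap ('upload all'), otherwise limits uploads
    raise NotImplementedError  # stub only; the real implementation is elsewhere in the repo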
Example #2
import sys

from pipeline_logs import log_successful_pipeline_execution
from pipeline_params import get_pipeline_param_rows


def get_resources():
    # wrap each resource with get_resource, counting resources as they stream;
    # log a successful run once all of them have been consumed
    for resource in resources:
        yield get_resource(resource)
        stats["num_resources"] += 1
    log_successful_pipeline_execution(
        parameters["pipeline-id"], parameters["pipeline-log"], stats,
        next(
            get_pipeline_param_rows(parameters["pipeline-id"],
                                    parameters["pipeline-parameters"])))


def cli():
    parameters = next(
        get_pipeline_param_rows('kkl-wikicommons-upload',
                                'kkl-parameters.csv'))
    consts = next(get_pipeline_param_rows('constants', 'kkl-parameters.csv'))
    if len(sys.argv) > 2:
        if sys.argv[2] == 'upload':
            if len(sys.argv) > 3:
                if sys.argv[3] == 'all':
                    cli_upload(consts, parameters, None, 0)
                else:
                    cli_upload(consts, parameters, sys.argv[3])
            else:
                cli_upload(consts, parameters)
        if sys.argv[2] == 'upload-after':
            cli_upload(
                consts,
                parameters,
                sys.argv[3],
                upload_limit=int(sys.argv[4]) if len(sys.argv) > 4 else 1)
import logging
import os
from contextlib import contextmanager

from datapackage_pipelines.wrapper import ingest
from google.cloud import storage
from pipeline_params import get_pipeline_param_rows


@contextmanager  # required so temp_file can be used in a 'with' block; the listing omits it
def temp_file(*args, **kwargs):
    with temp_dir(*args, **kwargs) as dirname:
        path = os.path.join(dirname, "temp")
        try:
            yield path
        except Exception:
            # remove the partially written file before re-raising
            if os.path.exists(path):
                os.unlink(path)
            raise
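temp_file builds on a temp_dir context manager that the listing does not include. A plausible sketch, assuming it wraps the standard tempfile module (not the repo's actual code):

import shutil
import tempfile
from contextlib import contextmanager


@contextmanager
def temp_dir(*args, **kwargs):
    # hand out a throwaway directory and remove it when the caller is done
    dirname = tempfile.mkdtemp(*args, **kwargs)
    try:
        yield dirname
    finally:
        shutil.rmtree(dirname, ignore_errors=True)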


parameters, datapackage, resources = ingest()
stats = {}
aggregations = {"stats": stats}
# parameters = next(get_pipeline_param_rows(parameters["pipeline-id"], parameters["pipeline-parameters"]))
consts = next(get_pipeline_param_rows('constants', 'arc-parameters.csv'))


def filter_resource(resource):
    # tally a per-extension counter for every row synced against the GCS bucket
    logging.info("syncing to google storage bucket {}".format(consts["gcs_bucket"]))
    gcs = storage.Client.from_service_account_json(consts["gcs_secret_file"])
    gcs_bucket = gcs.get_bucket(consts["gcs_bucket"])
    for row_num, row in enumerate(resource, start=1):
        if gcs_bucket:
            input_file_path = row["gcs_url"].replace("gs://{}/".format(consts["gcs_bucket"]), "")
            _, ext = os.path.splitext(input_file_path)
            ext_stat = "total {} files".format(ext)
            stats.setdefault(ext_stat, 0)
            stats[ext_stat] += 1
            logging.info("{}: {}".format(ext_stat, stats[ext_stat]))
            if ext == '.pdf':
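The listing truncates inside the .pdf branch. For orientation, these are the google-cloud-storage calls a sync step like this typically builds on (real client API; the bucket name and paths are illustrative):

from google.cloud import storage

gcs = storage.Client.from_service_account_json("gcs-secret.json")
bucket = gcs.get_bucket("example-bucket")
blob = bucket.blob("scans/page-0001.pdf")
blob.download_to_filename("/tmp/page-0001.pdf")  # pull an object from the bucket
blob.upload_from_filename("/tmp/page-0001.pdf")  # push an object back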
Example #5
from datapackage_pipelines.wrapper import ingest, spew
from datapackage_pipelines.utilities.resources import PROP_STREAMING
from pyquery import PyQuery as pq
from pipeline_params import get_pipeline_param_rows
import logging, datetime, os
from google.cloud import storage

parameters, datapackage, __ = ingest()
aggregations = {"stats": {}}
parameters = next(
    get_pipeline_param_rows(parameters["pipeline-id"],
                            parameters["pipeline-parameters"]))
consts = next(get_pipeline_param_rows('constants', 'kkl-parameters.csv'))


def get_resource():
    logging.info("parsing pages {} to {}".format(parameters["first_page_num"],
                                                 parameters["last_page_num"]))
    if consts.get("gcs_bucket"):
        # initialize google
        logging.info("syncing to google storage bucket {}".format(
            consts["gcs_bucket"]))
        gcs = storage.Client.from_service_account_json(
            consts["gcs_secret_file"])
        gcs_bucket = gcs.get_bucket(consts["gcs_bucket"])
    else:
        # initialize filesystem
        gcs, gcs_bucket = None, None
    for i in range(int(parameters["first_page_num"]),
                   int(parameters["last_page_num"]) + 1):
        if i == 1:
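The listing truncates the page loop, but processors in this family end by handing the wrapped resource iterator back to the framework. A minimal sketch of the closing call, assuming the names defined above (the actual tail of this file is not shown):

spew(datapackage, [get_resource()], aggregations["stats"])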
Example #6
from datapackage_pipelines.wrapper import ingest, spew
from datapackage_pipelines.utilities.resources import PROP_STREAMING
from pyquery import PyQuery as pq
import datetime, logging, requests, json, time, os
from pipeline_params import get_pipeline_param_rows
from bs4 import BeautifulSoup

source_parameters, datapackage, resources = ingest()
stats = {}
aggregations = {"stats": stats}
parameters = next(
    get_pipeline_param_rows(source_parameters["pipeline-id"],
                            source_parameters["pipeline-parameters"]))
constants = next(
    get_pipeline_param_rows('constants',
                            source_parameters['pipeline-parameters']))

session = requests.Session()
failed_items = []


def download(url, retry_num=0):
    # throttled GET with up to max-retries recursive retries on failure
    time.sleep(float(constants["sleep-time"]))
    try:
        return session.get(url).content.decode()
    except Exception:
        if retry_num >= int(constants["max-retries"]):
            raise
        else:
            return download(url, retry_num + 1)
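A hypothetical use of download together with the imported BeautifulSoup (the URL is illustrative):

html = download("https://example.com/catalog?page=1")
soup = BeautifulSoup(html, "html.parser")
for link in soup.find_all("a"):
    logging.info(link.get("href"))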
Example #7
from selenium import webdriver
from pipeline_params import get_pipeline_param_rows
from pipeline_logs import log_successful_pipeline_execution
import os, logging
from google.cloud import storage

logging.basicConfig(level=logging.INFO)

params = next(get_pipeline_param_rows('kkl.py', 'kkl-parameters.csv'))
consts = next(get_pipeline_param_rows('constants', 'kkl-parameters.csv'))

stats = {"num_wrote_pages": 0}

# initialize webdriver
chrome_options = webdriver.ChromeOptions()
prefs = {'download.default_directory': params['out_path']}
chrome_options.add_experimental_option("prefs", prefs)
chrome_options.add_argument('--headless')
browser = webdriver.Chrome(chrome_options=chrome_options)

# get the first page
browser.get(params['first_page_url'])
first_page_source = browser.page_source

if consts.get("gcs_bucket"):
    # initialize google
    logging.info("syncing to google storage bucket {}".format(
        consts["gcs_bucket"]))
    gcs = storage.Client.from_service_account_json(consts["gcs_secret_file"])
    gcs_bucket = gcs.get_bucket(consts["gcs_bucket"])
else:
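The listing cuts off inside this else branch. By analogy with the identical pattern in Example #5, the fallback presumably skips Google Cloud Storage and keeps pages on the local filesystem (an assumption; the file's actual tail is not shown):

    gcs, gcs_bucket = None, None  # no bucket configured: write pages locally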