def index():
    """Render the scrape result for ``?landingPage=...`` as HTML.

    Known scraper failures (NotImplementedError, MetadataError) get their own
    friendly pages; anything else falls through to a generic error page.
    """
    landing_page = request.args.get('landingPage')
    try:
        # Scraper construction and rendering both happen inside the try so
        # that any failure is turned into an error page rather than a 500.
        return markdown(Scraper(landing_page)._repr_markdown_())
    except NotImplementedError as err_not_implemented:
        return markdown(
            f"# Not Implemented\n\n{err_not_implemented}{getPartialMatches(landing_page)}"
        )
    except MetadataError as err_metadata:
        return markdown(f"# Metadata Error\n\n{err_metadata}")
    except Exception as err:
        # Deliberately broad: this is the top-level boundary for the view.
        return markdown(f"# Error\n\n{err}\n\n{GENERIC_EXCEPTION_HELP}")
def step_impl(context, error_message):
    """Assert that building a Scraper from the seed raises the given ValueError."""
    observed_message = ""
    try:
        # NOTE: the attribute is spelled "scrapper" elsewhere in the suite,
        # so the (misspelled) name is kept as-is.
        context.scrapper = Scraper(seed=context.seed_path)
    except ValueError as exc:
        observed_message = str(exc)
    assert_equal(error_message, observed_message)
def step_impl(context, uri):
    """Scrape *uri* with HTTP traffic replayed/recorded via a per-URI cassette."""
    record_mode = context.config.userdata.get("record_mode", DEFAULT_RECORD_MODE)
    with vcr.use_cassette(cassette(uri), record_mode=record_mode):
        context.scraper = Scraper(uri, requests.Session())
def get_current_and_next_release_date() -> Tuple[datetime, datetime]:
    """Return (current release datetime, next expected update datetime).

    The current release is the *older* of the exports/imports issue dates
    (both datasets must be out for the combined data to be current); the
    next update is the *later* of the two ``updateDueOn`` dates.
    """
    exports_scraper = Scraper(EXPORTS_DATASET_URL, session=requests.Session())
    imports_scraper = Scraper(IMPORTS_DATASET_URL, session=requests.Session())
    oldest_current_release_date: date = min(
        exports_scraper.dataset.issued, imports_scraper.dataset.issued
    )
    # BUG FIX: this previously took max(exports..., exports...) — comparing
    # the exports date with itself and ignoring the imports dataset entirely.
    latest_update_date = max(
        exports_scraper.dataset.updateDueOn, imports_scraper.dataset.updateDueOn
    )
    return (
        # Normalise the release date to midnight of that day.
        datetime(
            year=oldest_current_release_date.year,
            month=oldest_current_release_date.month,
            day=oldest_current_release_date.day,
            hour=0,
            minute=0,
            second=0,
        ),
        latest_update_date,
    )
def get_scrape(seed_path):
    """Build a Scraper from *seed_path*; wrapped so backoff can retry the HTTP get."""
    printlog(f'Attempting scrape for seed {seed_path}')
    scraper = Scraper(seed=seed_path)
    return scraper
def step_impl(context):
    """Scrape the URL previously stashed on the behave context."""
    target = context.url
    context.scrape = Scraper(target)
def step_impl(context, file_name):
    """Build a Scraper from a seed fixture that lives next to the features dir."""
    features_dir = os.path.dirname(os.path.abspath(__file__))
    # Fixtures sit one level above this steps module, under "fixtures/".
    seed = os.path.join(features_dir, "..", "fixtures", file_name)
    context.scraper = Scraper(seed=seed)
def step_impl(context, uri):
    """Scrape *uri* with HTTP traffic replayed/recorded from the shared cassette."""
    mode = context.config.userdata.get('record_mode', DEFAULT_RECORD_MODE)
    with vcr.use_cassette('features/fixtures/scrape.yml', record_mode=mode):
        context.scraper = Scraper(uri, requests.Session())
def process_data(flow_type, dataset_url):
    """Download the zipped monthly trade Excel workbook behind *dataset_url*,
    melt it into tidy long form and return a DataFrame with columns
    ons_iso_alpha_2_code, ons_region_name, period, period_type, direction,
    product_code, product_name, seasonal_adjustment, measure_type, value,
    unit, marker.

    flow_type is stored verbatim in the 'direction' column (presumably
    'imports'/'exports' — set by the caller; confirm there).
    """
    # Matches e.g. "2018JAN" — monthly periods only in this dataset.
    YEAR_MONTH_RE = re.compile(
        r'([0-9]{4})(JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)')

    # Cascading-regex helper that remembers the last match, from
    # https://stackoverflow.com/questions/597476/how-to-concisely-cascade-through-multiple-regex-statements-in-python
    class Re(object):
        def __init__(self):
            self.last_match = None

        def fullmatch(self, pattern, text):
            self.last_match = re.fullmatch(pattern, text)
            return self.last_match

    def time2period(t):
        # Normalise "2018JAN" -> "2018-01"; returns None (and logs) on no match.
        gre = Re()
        if gre.fullmatch(YEAR_MONTH_RE, t):
            year, month = gre.last_match.groups()
            month_num = {
                'JAN': '01',
                'FEB': '02',
                'MAR': '03',
                'APR': '04',
                'MAY': '05',
                'JUN': '06',
                'JUL': '07',
                'AUG': '08',
                'SEP': '09',
                'OCT': '10',
                'NOV': '11',
                'DEC': '12',
            }.get(month)
            return f"{year}-{month_num}"
        else:
            print(f"no match for {t}")

    print(datetime.now(), f'process_data({flow_type}, {dataset_url}) start')
    scraper = Scraper(dataset_url, session=requests.Session())
    # Pick the latest distribution whose media type mentions 'zip'.
    distribution = scraper.distribution(mediaType=lambda x: 'zip' in x,
                                        latest=True)
    # NOTE(review): the whole zip is held in memory; `zip` shadows the builtin.
    with ZipFile(BytesIO(
            scraper.session.get(distribution.downloadURL).content)) as zip:
        # The archive is expected to contain exactly one workbook.
        assert len(zip.namelist()) == 1
        with zip.open(zip.namelist()[0]) as excelFile:
            table = pd.read_excel(
                excelFile,
                sheet_name=1,
                dtype={
                    'COMMODITY': 'category',
                    'COUNTRY': 'category',
                    'DIRECTION': 'category',
                },
                # Only '' and 'N/A' count as missing; keep other NA-like strings.
                na_values=['', 'N/A'],
                keep_default_na=False,
            )
    print(datetime.now(), flow_type, "loaded dataframe")
    # DIRECTION is redundant — flow_type already tells us the direction.
    table.drop(columns='DIRECTION', inplace=True)
    table.rename(columns={
        'COMMODITY': 'Product',
        'COUNTRY': 'Geography'
    }, inplace=True)
    # Wide (one column per period) -> long (Period/Value rows).
    table = pd.melt(
        table,
        id_vars=['Product', 'Geography'],
        var_name='Period',
        value_name='Value',
    )
    print(datetime.now(), flow_type, "melted")
    table['Period'] = table['Period'].astype('category')
    # 'Product' cells look like "<code> <name>"; split on the first space.
    product = table['Product'].str.split(' ', n=1, expand=True)
    table['Product Code'], table['Product Name'] = (
        product[0].astype('category'),
        product[1].astype('category'),
    )
    # Same "<code> <name>" convention for geographies.
    geography = table['Geography'].str.split(' ', n=1, expand=True)
    table['Geography Code'], table['Geography Name'] = (
        geography[0].astype('category'),
        geography[1].astype('category'),
    )
    table.drop(columns=['Product', 'Geography'], inplace=True)
    print(datetime.now(), flow_type, "dropped product+geography")
    # Rewrite the period labels in place on the categorical's categories —
    # one map call per distinct period instead of per row.
    # NOTE(review): assigning to .cat.categories is deprecated in newer pandas
    # (rename_categories is the replacement) — confirm the pinned version.
    table['Period'].cat.categories = table['Period'].cat.categories.map(
        time2period)
    table['Period Type'] = 'month'
    table['Period Type'] = table['Period Type'].astype('category')
    table['Seasonal Adjustment'] = pd.Series('NSA',
                                             index=table.index,
                                             dtype='category')
    table['Measure Type'] = pd.Series('gbp-total',
                                      index=table.index,
                                      dtype='category')
    table['Unit'] = pd.Series('gbp', index=table.index, dtype='category')
    table['Flow'] = pd.Series(flow_type, index=table.index, dtype='category')
    print(datetime.now(), flow_type, "starting apply")
    # NOTE(review): table['Value'].dtype == 'str' is a single scalar comparison
    # of the column's dtype (not an element-wise test), so the second condition
    # is constant for the whole column. It looks like this was meant to flag
    # string-valued cells as markers — confirm intent before changing.
    table['Marker'] = numpy.select(
        condlist=[table['Value'].isna(), table['Value'].dtype == 'str'],
        choicelist=['not-available', table['Value']],
        default='',
    )
    print(datetime.now(), flow_type, "finished apply")
    table['Marker'] = table['Marker'].astype('category')
    # Any non-numeric leftovers in Value become NaN.
    table['Value'] = pd.to_numeric(table['Value'], errors='coerce')
    # Fix the column order before renaming to the output schema.
    table = table[[
        'Geography Code',
        'Geography Name',
        'Period',
        'Period Type',
        'Flow',
        'Product Code',
        'Product Name',
        'Seasonal Adjustment',
        'Measure Type',
        'Value',
        'Unit',
        'Marker',
    ]]
    table.rename(
        columns={
            'Geography Code': 'ons_iso_alpha_2_code',
            'Geography Name': 'ons_region_name',
            'Period': "period",
            'Period Type': "period_type",
            'Flow': "direction",
            'Product Code': "product_code",
            'Product Name': "product_name",
            'Seasonal Adjustment': "seasonal_adjustment",
            'Measure Type': "measure_type",
            'Value': "value",
            'Unit': "unit",
            'Marker': "marker",
        },
        inplace=True,
    )
    print(datetime.now(), f'process_data({flow_type}, {dataset_url}) finished',
          len(table))
    return table
def process_data():
    """Scrape the total-trade workbook, flatten every product sheet with
    databaker and return a one-element list holding the combined tidy
    DataFrame (columns: ons_iso_alpha_2_code, ons_region_name, period,
    period_type, direction, product_name, measure_type, value, unit, marker).
    """
    print(datetime.now(), 'process_data start')

    # Period labels come in three shapes: "2018", "2018 JAN", "2018Q1".
    YEAR_RE = re.compile(r'[0-9]{4}')
    YEAR_MONTH_RE = re.compile(
        r'([0-9]{4})\s+(JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)')
    YEAR_QUARTER_RE = re.compile(r'([0-9]{4})(Q[1-4])')

    def product(name):
        # Map a sheet name onto the product classification it contains.
        if 'Total Trade' in name:
            return 'goods-and-services'
        elif 'TiG' in name:
            return 'goods'
        elif 'TiS' in name:
            return 'services'
        # BUG FIX: the message previously used JS-style "${name}" inside a
        # Python f-string, which rendered a stray literal "$".
        raise ValueError(f'Unknown product type {name}')

    # Cascading-regex helper that remembers the last match, from
    # https://stackoverflow.com/questions/597476/how-to-concisely-cascade-through-multiple-regex-statements-in-python
    class Re(object):
        def __init__(self):
            self.last_match = None

        def fullmatch(self, pattern, text):
            self.last_match = re.fullmatch(pattern, text)
            return self.last_match

    def time2periodtype(t):
        # Classify a raw period label; returns None (and logs) on no match.
        gre = Re()
        if gre.fullmatch(YEAR_RE, t):
            return "year"
        elif gre.fullmatch(YEAR_MONTH_RE, t):
            return "month"
        elif gre.fullmatch(YEAR_QUARTER_RE, t):
            return "quarter"
        else:
            print(f"no match for {t}")

    def time2period(t):
        # Normalise: "2018" -> "2018", "2018 JAN" -> "2018-01",
        # "2018Q1" -> "2018-Q1"; returns None (and logs) on no match.
        gre = Re()
        if gre.fullmatch(YEAR_RE, t):
            return t
        elif gre.fullmatch(YEAR_MONTH_RE, t):
            year, month = gre.last_match.groups()
            month_num = {
                'JAN': '01',
                'FEB': '02',
                'MAR': '03',
                'APR': '04',
                'MAY': '05',
                'JUN': '06',
                'JUL': '07',
                'AUG': '08',
                'SEP': '09',
                'OCT': '10',
                'NOV': '11',
                'DEC': '12',
            }.get(month)
            return f"{year}-{month_num}"
        elif gre.fullmatch(YEAR_QUARTER_RE, t):
            year, quarter = gre.last_match.groups()
            return f"{year}-{quarter}"
        else:
            print(f"no match for {t}")

    def process_sheet(sheetname, tab) -> pandas.DataFrame:
        # Turn one databaker tab into a tidy frame; index/contact sheets
        # contribute an empty frame.
        print(datetime.now(), f'spreadsheet scrape - {sheetname} - start')
        if 'Index' in sheetname or 'Contact Sheet' in sheetname:
            print(datetime.now(), f"skipping {sheetname}")
            return pandas.DataFrame()
        # Observations start at C7 and extend down/right.
        observations = (tab.excel_ref('C7').expand(DOWN).expand(
            RIGHT).is_not_blank().is_not_whitespace())
        # Period labels run along row 4; flow markers appear in-column.
        Year = tab.excel_ref('C4').expand(
            RIGHT).is_not_blank().is_not_whitespace()
        Flow = tab.fill(DOWN).one_of(['Exports', 'Imports'])
        # Geography code (col A) and name (col B) run down the left edge.
        geo = tab.excel_ref('A7').expand(
            DOWN).is_not_blank().is_not_whitespace()
        geo_name = tab.excel_ref('B7').expand(
            DOWN).is_not_blank().is_not_whitespace()
        Dimensions = [
            HDim(Year, 'Period', DIRECTLY, ABOVE),
            HDim(geo, 'Geography Code', DIRECTLY, LEFT),
            HDim(geo_name, 'Geography Name', DIRECTLY, LEFT),
            HDim(Flow, 'Flow', CLOSEST, ABOVE),
            HDimConst('Measure Type', 'gbp-total'),
            HDimConst('Unit', 'gbp-million'),
        ]
        c1 = ConversionSegment(observations, Dimensions, processTIMEUNIT=True)
        new_table = c1.topandas()
        new_table.rename(columns={
            'OBS': 'Value',
            'DATAMARKER': 'Marker'
        }, inplace=True)
        new_table['Flow'] = new_table['Flow'].map(lambda s: s.lower().strip())
        new_table['Product'] = product(sheetname)
        new_table['Period'] = new_table['Period'].astype(str)
        # DATAMARKER only exists when at least one cell had a marker.
        new_table['Marker'] = (new_table['Marker'].fillna('')
                               if 'Marker' in new_table else '')
        new_table = new_table[[
            'Geography Code',
            'Geography Name',
            'Period',
            'Flow',
            'Product',
            'Measure Type',
            'Value',
            'Unit',
            'Marker',
        ]]
        print(datetime.now(), f'scrape {sheetname} - complete')
        return new_table

    print(datetime.now(), "gathering info")
    scraper = Scraper(DATASET_URL)
    tabs = {
        tab.name: tab
        for tab in scraper.distribution(latest=True).as_databaker()
    }
    print(datetime.now(), 'spreadsheet scrape - start')
    table = pandas.concat([process_sheet(*args) for args in tabs.items()])
    print(datetime.now(), 'spreadsheet scrape - complete')
    # Year cells read as floats stringify as e.g. "2018.0" — strip the ".0".
    # BUG FIX: the pattern is a regex, so use a raw string and pin regex=True
    # (pandas >= 2.0 defaults str.replace to literal matching).
    table['Period'] = table.Period.str.replace(r'\.0', '', regex=True)
    table['Period Type'] = table['Period'].apply(time2periodtype)
    table['Period Type'] = table['Period Type'].astype('category')
    table['Period'] = table['Period'].apply(time2period)
    table['Period'] = table['Period'].astype('category')
    # (removed a duplicated no-op astype('category') on 'Period Type' here)
    # (pandas) "Int64" type allows null values, unlike (numpy) "int64" -
    # yes, the case matters.
    table['Value'] = pandas.to_numeric(table['Value'],
                                       errors='coerce').astype('Int64')
    print(datetime.now(), "dropping duplicates")
    table = table.drop_duplicates()
    print(datetime.now(), "renaming columns")
    table.rename(
        columns={
            'Geography Code': 'ons_iso_alpha_2_code',
            'Geography Name': 'ons_region_name',
            'Period': "period",
            'Period Type': "period_type",
            'Flow': "direction",
            'Product': "product_name",
            'Measure Type': "measure_type",
            'Value': "value",
            'Unit': "unit",
            'Marker': "marker",
        },
        inplace=True,
    )
    print(datetime.now(), 'process_data finished', len(table))
    return [table]