Example #1
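These snippets read like extracts from a larger ONS trade-data pipeline module, so they rely on names defined elsewhere. A minimal sketch of the module-level context they appear to assume (DATASET_URL is a placeholder, and the Excel MIME-type constant is assumed to be exported by gssutils):

# Sketch of the assumed module-level imports and constants (not part of the
# original examples).
from datetime import datetime
from typing import List, Optional, Tuple
import re

import pandas
import requests
import gssutils
from gssutils import Excel  # assumption: gssutils exposes an Excel MIME-type constant (used in Example #4)

DATASET_URL = 'https://www.ons.gov.uk/...'  # placeholder for the real dataset landing page
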
def get_current_and_next_release_date() -> Tuple[datetime, datetime]:
    scraper = gssutils.Scraper(DATASET_URL, session=requests.Session())

    return (
        datetime(
            year=scraper.dataset.issued.year,
            month=scraper.dataset.issued.month,
            day=scraper.dataset.issued.day,
            hour=0,
            minute=0,
            second=0,
        ),
        scraper.dataset.updateDueOn,
    )
Example #2
def get_current_and_next_release_date() -> Tuple[datetime, Optional[datetime]]:
    scraper = gssutils.Scraper(DATASET_URL, session=requests.Session())

    try:
        # The date of the next release may not be known.
        next_release_date_utc = scraper.dataset.updateDueOn
    except AttributeError:
        next_release_date_utc = None

    return (
        datetime(
            year=scraper.dataset.issued.year,
            month=scraper.dataset.issued.month,
            day=scraper.dataset.issued.day,
            hour=0,
            minute=0,
            second=0,
        ),
        next_release_date_utc,
    )
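
A small usage sketch for this second variant, showing one way a caller might handle the Optional next-release date (the check_for_new_release helper name is hypothetical):

# Hypothetical caller: decide whether a new release is expected yet.
# Assumes both dates are naive datetimes, as in the examples above.
def check_for_new_release() -> bool:
    issued, next_due = get_current_and_next_release_date()
    if next_due is None:
        # The next release date is not published; assume nothing new is due.
        return False
    return datetime.now() >= next_due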
Example #3
def process_data():
    print(datetime.now(), 'process_data start')

    YEAR_RE = re.compile(r'[0-9]{4}')
    YEAR_MONTH_RE = re.compile(
        r'([0-9]{4})(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)')

    class Re(object):
        def __init__(self):
            self.last_match = None

        def fullmatch(self, pattern, text):
            self.last_match = re.fullmatch(pattern, text)
            return self.last_match

    def time2periodtype(t):
        gre = Re()
        if gre.fullmatch(YEAR_RE, t):
            return "year"
        elif gre.fullmatch(YEAR_MONTH_RE, t):
            return "month"
        else:
            print(f"no match for {t}")

    def time2period(t):
        gre = Re()
        if gre.fullmatch(YEAR_RE, t):
            return t
        elif gre.fullmatch(YEAR_MONTH_RE, t):
            year, month = gre.last_match.groups()
            month_num = {
                'Jan': '01',
                'Feb': '02',
                'Mar': '03',
                'Apr': '04',
                'May': '05',
                'Jun': '06',
                'Jul': '07',
                'Aug': '08',
                'Sep': '09',
                'Oct': '10',
                'Nov': '11',
                'Dec': '12',
            }.get(month)
            return f"{year}-{month_num}"
        else:
            print(f"no match for {t}")

    def process_sheet(sheetname, tab):
        print(datetime.now(), f'spreadsheet scrape - {sheetname} - start')
        observations = (tab.excel_ref('C7').expand(gssutils.DOWN).expand(
            gssutils.RIGHT).is_not_blank().is_not_whitespace())
        Year = (tab.excel_ref('C5').expand(
            gssutils.RIGHT).is_not_blank().is_not_whitespace())
        geo_codes = (tab.excel_ref('A7').expand(
            gssutils.DOWN).is_not_blank().is_not_whitespace())
        geo_names = (tab.excel_ref('B7').expand(
            gssutils.DOWN).is_not_blank().is_not_whitespace())
        Dimensions = [
            gssutils.HDim(Year, 'period', gssutils.DIRECTLY, gssutils.ABOVE),
            gssutils.HDim(
                geo_codes,
                'ons_iso_alpha_2_code',
                gssutils.DIRECTLY,
                gssutils.LEFT,
            ),
            gssutils.HDim(
                geo_names,
                'ons_region_name',
                gssutils.DIRECTLY,
                gssutils.LEFT,
            ),
            gssutils.HDimConst('measure_type', 'gbp-total'),
            gssutils.HDimConst('unit', 'gbp-million'),
        ]
        c1 = gssutils.ConversionSegment(observations,
                                        Dimensions,
                                        processTIMEUNIT=True)
        new_table = c1.topandas()
        new_table.rename(columns={
            "OBS": "value",
            "DATAMARKER": "marker"
        },
                         inplace=True)

        if 'Imports' in sheetname:
            new_table['direction'] = 'imports'
        elif 'Exports' in sheetname:
            new_table['direction'] = 'exports'
        else:
            new_table['direction'] = 'other'

        new_table['period'] = new_table['period'].astype(str)
        new_table = new_table[[
            'ons_iso_alpha_2_code',
            'ons_region_name',
            'period',
            'direction',
            'measure_type',
            'value',
            'unit',
            'marker',
        ]]
        print(datetime.now(), f'scrape {sheetname} - complete')
        return new_table

    scraper = gssutils.Scraper(DATASET_URL, session=requests.Session())
    tabs = {tab.name: tab for tab in scraper.distributions[0].as_databaker()}

    print(datetime.now(), 'spreadsheet scrape - start')
    table = pandas.concat([process_sheet(*args) for args in tabs.items()])
    print(datetime.now(), 'spreadsheet scrape - complete')

    table['period'] = table['period'].str.replace(r'\.0', '', regex=True)

    table['period_type'] = table['period'].apply(time2periodtype)
    table['period_type'] = table['period_type'].astype('category')

    table['period'] = table['period'].apply(time2period)
    table['period'] = table['period'].astype('category')

    # (pandas) "Int64" type allows null values, unlike (numpy) "int64" - yes, the case matters.
    table['value'] = pandas.to_numeric(table['value'],
                                       errors='coerce').astype('Int64')
    table['marker'].replace('N/A', 'not-applicable', inplace=True)

    table = table[[
        'ons_iso_alpha_2_code',
        'ons_region_name',
        'period',
        'period_type',
        'direction',
        'measure_type',
        'value',
        'unit',
        'marker',
    ]]

    # Fix some broken data
    table.loc[table['ons_region_name'] == 'Total EU ',
              'ons_region_name'] = 'Total EU'
    table.loc[table['ons_region_name'] == 'Total Extra EU  (Rest of World)',
              'ons_region_name'] = 'Total Extra EU (Rest of World)'

    print(datetime.now(), 'transformed and normalised data')

    return [table.drop_duplicates()]
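
Both of the longer examples wrap re.fullmatch in a small Re helper so the match object can be tested in an if/elif chain and its groups read back afterwards. On Python 3.8+ the same flow can be written with assignment expressions instead; a rough sketch of an equivalent time2period, assuming the YEAR_RE and YEAR_MONTH_RE patterns from Example #3:

# Sketch only: the walrus operator removes the need for the Re wrapper class.
def time2period_walrus(t):
    if YEAR_RE.fullmatch(t):
        return t
    elif (m := YEAR_MONTH_RE.fullmatch(t)):
        year, month = m.groups()
        months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                  'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
        return f"{year}-{months.index(month) + 1:02d}"
    else:
        print(f"no match for {t}")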
Example #4
def process_data() -> List[pandas.DataFrame]:
    print(datetime.now(), 'process_data start')

    YEAR_RE = re.compile(r'[0-9]{4}')
    YEAR_MONTH_RE = re.compile(
        r'([0-9]{4})\s+(JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)')
    YEAR_QUARTER_RE = re.compile(r'([0-9]{4})(Q[1-4])')

    class Re(object):
        def __init__(self):
            self.last_match = None

        def fullmatch(self, pattern, text):
            self.last_match = re.fullmatch(pattern, text)
            return self.last_match

    def time2periodtype(t):
        gre = Re()
        if gre.fullmatch(YEAR_RE, t):
            return "year"
        elif gre.fullmatch(YEAR_MONTH_RE, t):
            return "month"
        elif gre.fullmatch(YEAR_QUARTER_RE, t):
            return "quarter"
        else:
            print(f"no match for {t}")

    def time2period(t):
        gre = Re()
        if gre.fullmatch(YEAR_RE, t):
            return f"year/{t}"
        elif gre.fullmatch(YEAR_MONTH_RE, t):
            year, month = gre.last_match.groups()
            month_num = {
                'JAN': '01',
                'FEB': '02',
                'MAR': '03',
                'APR': '04',
                'MAY': '05',
                'JUN': '06',
                'JUL': '07',
                'AUG': '08',
                'SEP': '09',
                'OCT': '10',
                'NOV': '11',
                'DEC': '12',
            }.get(month)
            return f"month/{year}-{month_num}"
        elif gre.fullmatch(YEAR_QUARTER_RE, t):
            year, quarter = gre.last_match.groups()
            return f"quarter/{year}-{quarter}"
        else:
            print(f"no match for {t}")

    def user_perc(x):
        if str(x) == '-':
            return 'itis-nil'
        elif str(x) == '..':
            return 'disclosive'
        else:
            return None

    print(datetime.now(), "loading dataframe")

    scraper = gssutils.Scraper(DATASET_URL, session=requests.Session())
    dist = scraper.distribution(latest=True, mediaType=Excel)
    tab = dist.as_pandas(sheet_name='Time Series')

    tab = tab.iloc[1:, :]

    tab.columns.values[0] = 'Flow'
    tab.columns.values[1] = 'Trade Services Code'
    tab.columns.values[2] = 'Trade Services Name'
    tab.columns.values[3] = 'Geography Code'
    tab.columns.values[4] = 'Geography Name'

    print(datetime.now(), "melting dataframe")

    new_table = pandas.melt(
        tab,
        id_vars=[
            'Flow',
            'Trade Services Code',
            'Trade Services Name',
            'Geography Code',
            'Geography Name',
        ],
        var_name='Period',
        value_name='Value',
    )

    print(datetime.now(), "cleaning dataframe")

    new_table['Trade Services Code'] = new_table['Trade Services Code'].astype(
        str)

    new_table['Period'] = new_table['Period'].astype(str)
    new_table['Period'] = new_table['Period'].astype('category')

    new_table['Period Type'] = new_table['Period'].apply(time2periodtype)
    new_table['Period Type'] = new_table['Period Type'].astype('category')

    new_table['Period'] = new_table['Period'].apply(time2period)
    new_table['Period'] = new_table['Period'].astype('category')

    new_table['Flow'] = new_table['Flow'].map(lambda s: s.lower().strip())

    new_table['Seasonal Adjustment'] = 'NSA'
    new_table['Measure Type'] = 'GBP Total'
    new_table['Unit'] = 'gbp-million'

    new_table['Marker'] = new_table['Value'].apply(user_perc)

    new_table['Value'] = pandas.to_numeric(new_table['Value'], errors='coerce')

    indexNames = new_table[new_table['Trade Services Code'].str.contains(
        'nan', na=True)].index
    new_table.drop(indexNames, inplace=True)

    print(datetime.now(), "preparing final dataframe")
    new_table = new_table[[
        'Geography Code',
        'Geography Name',
        'Period',
        'Period Type',
        'Flow',
        'Trade Services Code',
        'Trade Services Name',
        'Measure Type',
        'Value',
        'Unit',
        'Marker',
    ]]
    new_table.rename(
        columns={
            'Geography Code': 'ons_iso_alpha_2_code',
            'Geography Name': 'ons_region_name',
            'Period': "period",
            'Period Type': "period_type",
            'Flow': "direction",
            'Trade Services Code': "product_code",
            'Trade Services Name': "product_name",
            'Measure Type': "measure_type",
            'Value': "value",
            'Unit': "unit",
            'Marker': "marker",
        },
        inplace=True,
    )

    print(datetime.now(), 'process_data finished', len(new_table))

    return [new_table]
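
Both process_data variants return a list of DataFrames rather than a single frame; a minimal sketch of how a caller might persist the result (the output filenames are illustrative):

# Hypothetical driver: run the transform and write each frame to CSV.
if __name__ == '__main__':
    frames = process_data()
    for i, frame in enumerate(frames):
        frame.to_csv(f'observations_{i}.csv', index=False)
        print(datetime.now(), f'wrote observations_{i}.csv', len(frame))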