Example no. 1
    def __init__(self, lst=[]):

        self.table = None

        lst_type = type(lst)

        if lst_type in [list, tuple]:

            # Check for empty list
            if not len(lst):
                self.table = petl.fromdicts([])
            else:
                row_type = type(lst[0])
                # Check for list of dicts
                if row_type == dict:
                    self.table = petl.fromdicts(lst)
                # Check for list of lists
                elif row_type in [list, tuple]:
                    self.table = petl.wrap(lst)

        else:
            # Create from a petl table
            self.table = lst

        if not self.is_valid_table():
            raise ValueError("Could not create Table")

        # Count how many times someone is indexing directly into this table, so we can warn
        # against inefficient usage.
        self._index_count = 0
def create_csv_based_on_peoples(peoples):
    pathlib.Path(settings.CSV_FILES_DIRECTORY).mkdir(parents=True,
                                                     exist_ok=True)
    file_path = os.path.join(settings.CSV_FILES_DIRECTORY, get_file_name())
    petl.fromdicts(peoples).convert(
        'edited', convert_iso_datetime_string_to_string_date).rename(
            'edited', 'date').cut(HEADERS).tocsv(file_path)

    return Dataset.objects.create(file=file_path)
def transform(characters: Iterator[dict], planets: Iterator[dict]) -> Table:
    planets_table = (etl.fromdicts(planets).addfield(
        "id", lambda row: row.url.split("/")[-2]).rename(
            "name", "planet_name").cut("id", "planet_name"))
    characters_table = (etl.fromdicts(characters).addfield(
        "date", lambda row: row.edited.split("T")[0]).convert(
            "homeworld", lambda hw: hw.split("/")[-2]).join(
                planets_table,
                lkey="homeworld", rkey="id").cutout("homeworld").rename(
                    "planet_name",
                    "homeworld").cutout("films", "species", "vehicles",
                                        "starships", "created", "edited",
                                        "url"))

    return characters_table
Example no. 4
def test_fromdicts_1():

    data = [{"foo": "a", "bar": 1}, {"foo": "b", "bar": 2}, {"foo": "c", "bar": 2}]
    actual = fromdicts(data)
    expect = (("foo", "bar"), ("a", 1), ("b", 2), ("c", 2))
    ieq(expect, actual)
    ieq(expect, actual)  # verify can iterate twice
def _write_to_log(parameter_view, sample_rows, resource_index):
    parameter_view = json.dumps(parameter_view, ensure_ascii=False, indent=4)
    table_view = petl.look(petl.fromdicts(sample_rows))

    logging.info('Processed resource %s', resource_index)
    logging.info('Parameters = %s', parameter_view)
    logging.info('Sample output: \n%s', table_view)
Example no. 6
def extract(profile, workdir):
    dicts = []
    outer = parse(profile)
    station_ids = outer.findall('./stationID')
    updatetime_str = outer.getroot().attrib.get('updatetime')
    modify_time = parser.parse(updatetime_str, ignoretz=True) if updatetime_str else datetime.datetime.now()
    for station_id_tree in station_ids:
        status = station_id_tree.find('./status')
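        # Skip stations whose status is marked 無觀測 ("no observation")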
        if status is not None and status.find('./station') is not None:
            if status.find('./station').text.strip().startswith(u'無觀測'):
                continue
        profile = station_id_tree.find('./profile')
        station_id = station_id_tree.attrib['id'].strip()
        # filename = station_id + '.xml'
        outer_data = {
            'station_id': station_id,
            'seas_chName': profile.find('./seas_chName').text.strip(),
            'latitude': float(profile.find('./latitude').text.strip()),
            'longitude': float(profile.find('./longitude').text.strip()),
            'chName': getattr(profile.find('./chName'), 'text', None) and profile.find('./chName').text.strip(),
            'chCity': getattr(profile.find('./chCity'), 'text', None) and profile.find('./chCity').text.strip(),
            'kind_chName': getattr(profile.find('./kind_chName'), 'text', None) and profile.find('./kind_chName').text.strip(),
            'chTown':  getattr(profile.find('./chTown'), 'text', None) and profile.find('./chTown').text.strip(),
            'chLocation': getattr(profile.find('./chLocation'), 'text', None) and profile.find('./chLocation').text.strip(),
            'dataItem': getattr(profile.find('./dataItem'), 'text', None) and profile.find('./dataItem').text.strip(),
          #  'file_path': (workdir + filename) if workdir.endswith('/') else ('%s/%s' % (workdir, filename)),
            'modifytime': modify_time,
            'updatetime': datetime.datetime.now(),
        }
        dicts.append(outer_data)
    return petl.wrap([row for row in petl.fromdicts(dicts)])
def test_append_to_file_for_csv(user_export_file, tmpdir, media_root):
    # given
    export_data = [
        {"id": "123", "name": "test1", "collections": "coll1"},
        {"id": "345", "name": "test2"},
    ]
    headers = ["id", "name", "collections"]
    delimiter = ";"

    file_name = "test.csv"

    table = etl.fromdicts([{"id": "1", "name": "A"}], header=headers, missing=" ")

    with NamedTemporaryFile() as temp_file:
        etl.tocsv(table, temp_file.name, delimiter=delimiter)
        user_export_file.content_file.save(file_name, temp_file)

    # when
    append_to_file(export_data, headers, user_export_file, FileTypes.CSV, delimiter)

    # then
    user_export_file.refresh_from_db()

    csv_file = user_export_file.content_file
    file_content = csv_file.read().decode().split("\r\n")
    assert ";".join(headers) in file_content
    assert ";".join(export_data[0].values()) in file_content
    assert (";".join(export_data[1].values()) + "; ") in file_content

    shutil.rmtree(tmpdir)
Example no. 8
def shcopy_files(data, source, source_field, destination_field, results,
                 quiet):
    """Copy files from the source to the destination."""
    source = data.get(source)
    results_data = []
    for i in source.dicts():
        r = {
            source_field: i[source_field],
            destination_field: i[destination_field]
        }
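        # Attempt the copy and record success or failure for this row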
        try:
            shutil.copy(i[source_field], i[destination_field])
            r['success'] = True
            r['message'] = "File copied successfully."
            if not quiet:
                unsync.secho('Successfully copied {} to {}'.format(
                    i[source_field], i[destination_field]),
                             fg='green')
        except (shutil.Error, IOError) as e:
            r['success'] = False
            r['message'] = str(e)
            if not quiet:
                unsync.secho('Failed copying {} to {}. Reason was: {}'.format(
                    i[source_field], i[destination_field], str(e)),
                             fg='red')

        results_data.append(r)

    if results:
        results_data = petl.fromdicts(results_data)
        data.set(results, results_data)
Example no. 9
def transform(data, data_set):
    data = data['observations']
    data = etl.fromdicts(data, header=['value', 'realtime_start', 'realtime_end', 'date'])
    data = etl.cut(data, 'date', 'value')
    data = etl.rename(data, {'date': 'date', 'value': data_set.lower()})
    data = etl.convert(data, data_set.lower(), lambda val: 0 if val == '.' else val)
    return data
Example no. 10
def fetch_all_characters_data():
    for characters_page in fetch_all_data_by_pages(
            f'{settings.STAR_WARS_API_URL}/people/?format=json', ):
        yield etl.fromdicts(
            characters_page,
            header=settings.STAR_WARS_CHARACTERS_INPUT_API_FIELDS,
        )
Example no. 11
def list_account_users(data, url, api_key, account_id, search_term, destination):
    if not url.startswith('http'):
        url = 'https://' + url

    client = UsersAPI(url, api_key)
    r = client.list_users_in_account(account_id, search_term)
    data.cat(destination, petl.fromdicts(r))
Example no. 12
 def people_view(self):
     """Object pretending to be ``PeopleCollectionView`` instance."""
     return petl.fromdicts(
         [
             {
                 'name': 'Person 1',
                 'homeworld': 'planets/1',
                 'edited': '2020-08-30T20:58:18.420000Z',
             },
             {
                 'name': 'Person 2',
                 'homeworld': 'planets/4',
                 'edited': '2020-08-31T13:47:18.480000Z',
             },
             {
                 'name': 'Person 3',
                 'homeworld': 'planets/1',
                 'edited': '2020-08-30T20:58:18.420000Z',
             },
             {
                 'name': 'Person 4',
                 'homeworld': 'planets/2',
                 'edited': '2020-08-31T06:31:18.370000Z',
             },
             {
                 'name': 'Person 5',
                 'homeworld': 'planets/1',
                 'edited': '2020-09-01T06:31:18.370000Z',
             },
         ],
         header=('name', 'homeworld', 'edited'),
     )
Example no. 13
def test_fromdicts_2():

    data = [{"foo": "a", "bar": 1}, {"foo": "b"}, {"foo": "c", "bar": 2, "baz": True}]
    actual = fromdicts(data)
    expect = (("foo", "bar", "baz"), ("a", 1, None), ("b", None, None), ("c", 2, True))
    ieq(expect, actual)
    ieq(expect, actual)  # verify can iterate twice
Example no. 14
def list_imports(data, url, api_key, account_id, created_since, destination,
                 include_errors, include_warnings, flatten_data):
    if not url.startswith('http'):
        url = 'https://' + url

    client = SisImportsAPI(url, api_key)
    r = client.get_sis_import_list(account_id, created_since=created_since)
    t = []
    for i in r:
        if not include_errors:
            i['processing_errors'] = "{} Errors Occurred".format(
                len(i.get('processing_errors', [])))
        if not include_warnings:
            i['processing_warnings'] = "{} Warnings Occurred".format(
                len(i.get('processing_warnings', [])))
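        # Optionally flatten the nested 'data' dict into top-level columns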
        if flatten_data:
            d = i['data']
            i['diffed_against_sis_batch_id'] = d.get(
                'diffed_against_sis_batch_id', None)
            i['import_type'] = d.get('import_type', None)
            c = d.get('counts', {})
            i['counts'] = ';'.join(
                ["{}: {}".format(k, c[k]) for k in c.keys()])
            i['supplied_batches'] = ';'.join(d.get('supplied_batches', []))
            del (i['data'])
        t.append(i)

    import_data = petl.fromdicts(t)

    data.set(destination, import_data)
Example no. 15
def test_fromdicts_3():

    data = [{"foo": "a", "bar": 1}, {"foo": "b"}, {"foo": "c", "bar": 2, "baz": True}]
    actual = fromdicts(data, header=["foo", "bar"])
    expect = (("foo", "bar"), ("a", 1), ("b", None), ("c", 2))
    ieq(expect, actual)
    ieq(expect, actual)  # verify can iterate twice
Example no. 16
    def download_new_collection(cls) -> None:
        # store a small url -> name planet dictionary for the later transformation
        planets_arr = {}
        for planets in SWAPI.fetch_data(settings.SW_PLANETS_URL):
            planets_arr.update({i['url']: i['name'] for i in planets})

        create = True
        file_name = '{}.csv'.format(time())
        csv_path = Path(CSV_PATH, file_name)

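        # Stream people page by page into one CSV: write the header for the first
        # page, then append the remaining pages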
        for people in SWAPI.fetch_data(settings.SW_PEOPLE_URL):
            table = etl.fromdicts(
                people,
                header=[
                    'name', 'height', 'mass', 'hair_color', 'skin_color',
                    'eye_color', 'birth_year', 'gender', 'homeworld', 'edited'
                ]).convert('edited', lambda v: v[0:10]).convert(
                    'homeworld',
                    lambda v: planets_arr.get(v, '')).rename('edited', 'date')

            if create:
                etl.tocsv(table, source=csv_path, write_header=True)
                create = False
            else:
                etl.appendcsv(table, source=csv_path)

        c = SWPeopleCollection()
        c.file.name = file_name
        c.save()
Example no. 17
def get_user_logins(data, url, api_key, user_data, user_id_field, destination):

    if not url.startswith('http'):
        url = 'https://' + url

    user_data = data.get(user_data)
    user_data = user_data.dicts()

    login_data = []

    debug = data.config.debug

    client = LoginsAPI(url, api_key)
    for user in user_data:
        try:
            r = client.list_user_logins_users(user[user_id_field])
            if debug:
                unsync.secho(
                    'Retrieved {} Canvas Logins for Canvas User ID: {}'.format(
                        len(r), user[user_id_field]),
                    fg='green')
        except CanvasAPIError:
            unsync.secho(
                'Unable to retrieve Canvas Login information for Canvas User ID: {}'
                .format(user[user_id_field]),
                fg='red')
            continue  # skip this user rather than re-using results from a previous one
        for login in r:
            login_data.append(login)

    login_data = petl.fromdicts(login_data)
    data.set(destination, login_data)
Example no. 18
def update_user_logins(data, url, api_key, source, account_id_field,
                       login_id_field, unique_id_field, password_field,
                       sis_user_id_field, integration_id_field, results_table):
    if not url.startswith('http'):
        url = 'https://' + url

    client = LoginsAPI(url, api_key)

    source = data.get(source)

    debug = data.config.debug

    results = []

    for row in petl.dicts(source):
        account_id = row[account_id_field]
        login_id = row[login_id_field]

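        # Build kwargs only from fields that were supplied and are non-empty in this row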
        kwargs = {}
        if unique_id_field is not None and row[unique_id_field] is not None:
            kwargs['login_unique_id'] = row[unique_id_field]
        if password_field is not None and row[password_field] is not None:
            kwargs['login_password'] = row[password_field]
        if sis_user_id_field is not None and row[sis_user_id_field] is not None:
            kwargs['login_sis_user_id'] = row[sis_user_id_field]
        if integration_id_field is not None and row[
                integration_id_field] is not None:
            kwargs['login_integration_id'] = row[integration_id_field]

        try:
            r = client.edit_user_login(login_id, account_id, **kwargs)
            unsync.secho('Successfully updated login: {} with data: {}'.format(
                login_id, str(kwargs)),
                         fg='green')

            if results_table:
                row['_data'] = str(kwargs)
                row['_response_status'] = r
                row['_response_content'] = r
                results.append(row)

            if debug:
                unsync.secho(str(r), fg='yellow')
        except (CanvasAPIError) as e:
            unsync.secho('Failed updating login: {} with data: {}'.format(
                login_id, str(kwargs)),
                         fg='red')
            unsync.secho('Response Status: {} Response Reason: {}'.format(
                e.response.status_code, e.response.content),
                         fg='red')

            if results_table:
                row['_data'] = str(kwargs)
                row['_response_status'] = e.response.status_code
                row['_response_content'] = e.response.content
                results.append(row)

    results = petl.fromdicts(results)
    data.cat(results_table, results)
Example no. 19
def get_account_logins(data, url, api_key, account_id, destination):
    if not url.startswith('http'):
        url = 'https://' + url

    client = LoginsAPI(url, api_key)
    r = client.list_user_logins_accounts(account_id)
    d = petl.fromdicts(r)
    data.set(destination, d)
Example no. 20
def test_append_to_file_for_xlsx(user_export_file, tmpdir, media_root):
    # given
    export_data = [
        {
            "id": "123",
            "name": "test1",
            "collections": "coll1"
        },
        {
            "id": "345",
            "name": "test2"
        },
    ]
    expected_headers = ["id", "name", "collections"]
    delimiter = ";"

    table = etl.fromdicts([{
        "id": "1",
        "name": "A"
    }],
                          header=expected_headers,
                          missing=" ")

    temp_file = NamedTemporaryFile(suffix=".xlsx")
    etl.io.xlsx.toxlsx(table, temp_file.name)

    # when
    append_to_file(export_data, expected_headers, temp_file, FileTypes.XLSX,
                   delimiter)

    # then
    user_export_file.refresh_from_db()

    wb_obj = openpyxl.load_workbook(temp_file)

    sheet_obj = wb_obj.active
    max_col = sheet_obj.max_column
    max_row = sheet_obj.max_row
    headers = [
        sheet_obj.cell(row=1, column=i).value for i in range(1, max_col + 1)
    ]
    data = []
    for i in range(2, max_row + 1):
        row = []
        for j in range(1, max_col + 1):
            row.append(sheet_obj.cell(row=i, column=j).value)
        data.append(row)

    assert headers == expected_headers
    assert list(export_data[0].values()) in data
    row2 = list(export_data[1].values())
    # add string with space for collections column
    row2.append(" ")
    assert row2 in data

    temp_file.close()
    shutil.rmtree(tmpdir)
Example no. 21
def format_data_sample(stream):
    """Return a table representation of a sample of the streamed data."""

    keyed_rows = []
    for row in stream.sample:
        keyed_rows.append(dict(zip(stream.headers, row)))

    petl_table = fromdicts(keyed_rows)
    return repr(look(petl_table, limit=None))
Example no. 23
 def planets_view(self):
     """Object pretending to be ``PlanetsCollectionView`` instance."""
     return petl.fromdicts(
         [{
             'name': 'Planet {0}'.format(index),
             'url': 'planets/{0}'.format(index),
         } for index in range(1, 5)],
         header=('url', 'name'),
     )
    def _show(stream):
        """Return a table of sample data."""

        keyed_rows = []
        for row in stream.sample:
            keyed_rows.append(dict(zip(stream.headers, row)))
        petl_table = fromdicts(keyed_rows)

        return repr(look(petl_table, limit=None))
Example no. 26
def test_fromdicts_1():
    
    data = [{"foo": "a", "bar": 1}, {"foo": "b", "bar": 2}, {"foo": "c", "bar": 2}]
    actual = fromdicts(data)
    expect = (('foo', 'bar'),
              ('a', 1),
              ('b', 2),
              ('c', 2))
    ieq(expect, actual)
    ieq(expect, actual) # verify can iterate twice
Example no. 27
def test_fromdicts_3():
    
    data = [{"foo": "a", "bar": 1}, {"foo": "b"}, {"foo": "c", "bar": 2, "baz": True}]
    actual = fromdicts(data, header=['foo', 'bar'])
    expect = (('foo', 'bar'),
              ('a', 1),
              ('b', None),
              ('c', 2))
    ieq(expect, actual)
    ieq(expect, actual) # verify can iterate twice
Example no. 28
def geojson_fc_to_csv(fc):
    rows = []
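    # Flatten each feature into a dict of its properties plus lon/lat taken from the geometry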
    for f in fc['features']:
        feature = {}
        feature.update(f['properties'])
        geom = f['geometry']['coordinates']
        feature.update({'lon': geom[0], 'lat': geom[1]})
        rows.append(feature)
    table = etl.fromdicts(rows)
    return etl.lookall(table)
Example no. 29
def test_fromdicts_2():
    
    data = [{"foo": "a", "bar": 1}, {"foo": "b"}, {"foo": "c", "bar": 2, "baz": True}]
    actual = fromdicts(data)
    expect = (('foo', 'bar', 'baz'),
              ('a', 1, None),
              ('b', None, None),
              ('c', 2, True))
    ieq(expect, actual)
    ieq(expect, actual) # verify can iterate twice
Example no. 30
def transform_data(data):
    tbl_data = petl.fromdicts(data)
    tbl_data = petl.convert(
        tbl_data, {k: v['value']
                   for k, v in fields_to_transform.items()})
    tbl_data = petl.rename(
        tbl_data, {k: v['key']
                   for k, v in fields_to_transform.items()})
    tbl_data_allowed = petl.cut(tbl_data, *allowed_fields)

    return tbl_data_allowed
Example no. 31
def format_results(results, f, geodata_model):
    """handle parsing the format argument to convert 
    the results 'table' into one of the desired formats
    
    :param results: [description]
    :type results: [type]
    :param f: [description]
    :type f: [type]
    :param geodata_model: [description]
    :type geodata_model: [type]
    :return: [description]
    :rtype: [type]
    """
    
    # make submitted value lowercase, to simplify comparison
    f = f.lower()
    # fall back to JSON if no format provided
    if f not in F_ALL:
        f = F_JSON[0]

    # JSON format 
    if f in F_JSON:

        if f == 'time':
            # grouped by timestamp
            return _groupby(results, key='ts', sortby='id')
        elif f == 'sensor':
            # grouped by id
            return _groupby(results, key='id', sortby='ts')
        else:
            # (list of dicts)
            return results

    # GEOJSON format (GeoJSON Feature collection; results under 'data' key within properties)
    # elif f in F_GEOJSON:
    #     results = _groupby(results, key='id', sortby='ts')
    #     return _format_as_geojson(results, geodata_model)

    # ARRAYS format (2D table)
    elif f in F_ARRAYS:
        # nested arrays
        t = etl.fromdicts(results)
        #h = list(etl.header(t))
        #return list(etl.data(t)).insert(0,h)
        return list(etl.data(t))

    elif f in F_CSV:
        return _format_teragon(results)

    # elif f in F_MD:
    #     return results

    else:
        return results
Example no. 32
def test_fromdicts_ordered():
    from collections import OrderedDict
    data = [
        OrderedDict([('foo', 'a'), ('bar', 1)]),
        OrderedDict([('foo', 'b')]),
        OrderedDict([('foo', 'c'), ('bar', 2), ('baz', True)])
    ]
    actual = fromdicts(data)
    # N.B., fields come out in original order
    expect = (('foo', 'bar', 'baz'), ('a', 1, None), ('b', None, None),
              ('c', 2, True))
    ieq(expect, actual)
Example no. 33
def test_fromdicts_3():

    data = [{'foo': 'a', 'bar': 1},
            {'foo': 'b'},
            {'foo': 'c', 'bar': 2, 'baz': True}]
    actual = fromdicts(data, header=['foo', 'bar'])
    expect = (('foo', 'bar'),
              ('a', 1),
              ('b', None),
              ('c', 2))
    ieq(expect, actual)
    ieq(expect, actual)  # verify can iterate twice
Example no. 34
def test_fromdicts_ordered():
    from collections import OrderedDict
    data = [OrderedDict([('foo', 'a'), ('bar', 1)]),
            OrderedDict([('foo', 'b')]),
            OrderedDict([('foo', 'c'), ('bar', 2), ('baz', True)])]
    actual = fromdicts(data)
    # N.B., fields come out in original order
    expect = (('foo', 'bar', 'baz'),
              ('a', 1, None),
              ('b', None, None),
              ('c', 2, True))
    ieq(expect, actual)
Example no. 35
def run(argv=None):
    parser = argparse.ArgumentParser()

    parser.add_argument("--filename", type=str, help='Input filename', required=True)
    # parser.add_argument("--max_iterations", type=int, help='Max number of requests', default=1000)

    known_args, _ = parser.parse_known_args()

    file_content = [line for line in read_file(known_args.filename)]
    table = petl.fromdicts(file_content)
    tokenized_table = tokenize(table)
    petl.tocsv(tokenized_table, 'words.csv')
Example no. 36
def test_fromdicts_onepass():

    # check that fromdicts() only makes a single pass through the data
    data = iter([{'foo': 'a', 'bar': 1},
                 {'foo': 'b', 'bar': 2},
                 {'foo': 'c', 'bar': 2}])
    actual = fromdicts(data, header=['foo', 'bar'])
    expect = (('foo', 'bar'),
              ('a', 1),
              ('b', 2),
              ('c', 2))
    ieq(expect, actual)
Example no. 37
def test_fromdicts_2():

    data = [{'foo': 'a', 'bar': 1},
            {'foo': 'b'},
            {'foo': 'c', 'bar': 2, 'baz': True}]
    actual = fromdicts(data, header=['bar', 'baz', 'foo'])
    expect = (('bar', 'baz', 'foo'),
              (1, None, 'a'),
              (None, None, 'b'),
              (2, True, 'c'))
    ieq(expect, actual)
    ieq(expect, actual)  # verify can iterate twice
Example no. 38
def get_enrollment_terms(data, url, api_key, account_id, state, destination):
    if not url.startswith('http'):
        url = 'https://' + url

    term_data = []

    client = EnrollmentTermsAPI(url, api_key)
    r = client.list_enrollment_terms(account_id, state)
    for term in r:
        term_data.append(term)

    term_data = petl.fromdicts(term_data)
    data.cat(destination, term_data)
Example no. 39
def write_data(filepath: PosixPath, people_data: dict, planets: Dict[str, str]):
    settings.DATA_PATH.mkdir(parents=True, exist_ok=True)
    for data in people_data:
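        # Map homeworld URLs to planet names and reduce 'edited' to a YYYY-MM-DD 'date' field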
        table: DictsView = etl.fromdicts(data['results']).convert(
            'homeworld', planets).addfield(
                'date',
                lambda row: parse(row['edited']).strftime('%Y-%m-%d')).cutout(
                    *CUTOUT_FIELDS)
        if filepath.is_file():
            table.appendcsv(filepath)
        else:
            table.tocsv(filepath)
Example no. 40
def append_to_file(
    export_data: List[Dict[str, Union[str, bool]]],
    headers: List[str],
    temporary_file: Any,
    file_type: str,
    delimiter: str,
):
    table = etl.fromdicts(export_data, header=headers, missing=" ")

    if file_type == FileTypes.CSV:
        etl.io.csv.appendcsv(table, temporary_file.name, delimiter=delimiter)
    else:
        etl.io.xlsx.appendxlsx(table, temporary_file.name)
Example no. 41
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('lat', type=float)
    parser.add_argument('lon', type=float)
    parser.add_argument(
        'name',
        nargs='+',
        action='append',
        help='Peak name to search for - multiple words will form one search term'
    )
    args = parser.parse_args()
    peak_name = ' '.join(*args.name) or 'Mount Stuart'

    peak = Peak(peak_name, args.lat, args.lon)
    print('Finding trip reports for', peak)

    all_trip_reports = get_all_trip_reports(peak)

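    # tocsv() is called without a destination, so petl writes the CSV to stdout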
    petl.fromdicts(
        [report._asdict() for report in all_trip_reports],
        header=TripReportSummary._fields
    ).tocsv()
Example no. 42
def test_fromdicts_header_list():
    from collections import OrderedDict
    data = [
        OrderedDict([('foo', 'a'), ('bar', 1)]),
        OrderedDict([('foo', 'b'), ('bar', 2)]),
        OrderedDict([('foo', 'c'), ('bar', 2)])
    ]
    actual = fromdicts(data)
    header = actual.header()
    assert header == ('foo', 'bar')
    expect = (('foo', 'bar'), ('a', 1), ('b', 2), ('c', 2))
    ieq(expect, actual)
    ieq(expect, actual)
Example no. 43
def print_table(ctx):
    """Output a list of pipelines as table."""

    rows = [dict(source.state) for source in ctx.obj['sources']]
    message = '\nNumber of pipelines = {}\n'
    secho(message.format(len(rows)), **SUCCESS)

    subset = [
        'id', 'pipeline_status', 'validation_status', 'nb_validation_errors',
        'scraper_required', 'resource_type', 'extension'
    ]
    sorted_rows = sort(cut(fromdicts(rows), *subset), key='id')
    echo(look(sorted_rows, limit=None))
Example no. 44
        def process_rows(resource_):
            for row_index, row in enumerate(resource_):
                if pass_row_index:
                    parameters.update(row_index=row_index)

                new_row = row_processor(row, **parameters)
                yield new_row

                if verbose and row_index < LOG_SAMPLE_SIZE:
                    sample_rows.append(new_row)

            if verbose:
                table = look(fromdicts(sample_rows), limit=LOG_SAMPLE_SIZE)
                message = 'Output of processor %s for resource %s is...\n%s'
                args = row_processor.__name__, resource_index, table
                logging.info(message, *args)
Example no. 45
def print_table(ctx):
    """Output a list of pipelines as table."""

    rows = [dict(source.state) for source in ctx.obj['sources']]
    message = '\nNumber of pipelines = {}\n'
    secho(message.format(len(rows)), **SUCCESS)

    subset = [
        'id',
        'pipeline_status',
        'validation_status',
        'nb_validation_errors',
        'scraper_required',
        'resource_type',
        'extension'
    ]
    sorted_rows = sort(cut(fromdicts(rows), *subset), key='id')
    echo(look(sorted_rows, limit=None))
Example no. 46
def scrape():
    res = requests.get(BASE_URL)
    doc = html.fromstring(res.content)

    query = {}
    for inp in doc.findall('.//form[@name="toetuse_saajad"]//input'):
        if inp.get('type') == 'submit':
            continue
        query[inp.get('name')] = inp.get('value')

    rows = []
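    # Scrape every support measure offered in the form's drop-down list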
    for option in doc.findall('.//select[@id="meede"]/option'):
        measure = option.get('value')
        if len(measure):
            rows.extend(list(scrape_measure(query, measure)))

    tojson(fromdicts(rows), 'Estonia_scraper_dump.json',
           sort_keys=True, **JSON_FORMAT)
Example no. 47
def season_game_bucket_combo_probs(sbs, gbs):

    # join bucketed season performance to bucketed game performance
    sg = sbs.join(gbs, key=('year', 'name'), lprefix='s_', rprefix='g_')

    # get all possible season/game bucket combos as our master list
    all_bucket_combos = petl.fromdicts([
        {'s_bucket': s, 'g_bucket': g}
        for s in set(sg['s_bucket'])
        for g in set(sg['g_bucket'])
    ])

    # get counts of all existing season/game bucket combos
    game_bucket_counts = sg.aggregate(('s_bucket', 'g_bucket'), len)

    # generate counts for all season/game bucket combos and fill in
    # any missing values with 0
    all_game_bucket_counts = (
        all_bucket_combos
        .leftjoin(
            game_bucket_counts,
            key=('s_bucket', 'g_bucket'),
            missing=0,
        )
        .rename({'value': 'g_count'})
    )

    # calculate season bucket counts
    s_bucket_game_counts = (
        all_game_bucket_counts
        .aggregate('s_bucket', sum, 'g_count')
        .rename({'value': 's_count'})
    )

    # get probs for each season-game turnout
    g_bucket_probs = (
        all_game_bucket_counts
        .join(s_bucket_game_counts, key=('s_bucket'))
        .addfield('prob', lambda r: r['g_count'] / r['s_count'])
        .cut(('s_bucket', 'g_bucket', 'g_count', 's_count', 'prob'))
    )

    return g_bucket_probs
Example no. 48
def attendance_file2table(filename, output_csv_filebase, add_extra_fields):
    global full_name2sk_indiv_id

    print "*** Parsing file: " + filename
    print

    attendance_dicts = []

    # CCB's Worship Service event IDs...
    event_ids = {}
    event_ids["8"] = 6
    event_ids["9"] = 7
    event_ids["10"] = 8
    event_ids["11:15"] = 9
    event_ids["Christmas"] = 13

    # The following are used to create CSV output filenames and to emit human-readable event name if add_extra_fields
    # flag is on
    event_names = {}
    event_names[6] = "08am"
    event_names[7] = "09am"
    event_names[8] = "10am"
    event_names[9] = "11_15am"
    event_names[13] = "Christmas Eve"

    # Time of event in Excel-parseable format
    event_times = {}
    event_times[6] = "08:00 AM"
    event_times[7] = "09:00 AM"
    event_times[8] = "10:00 AM"
    event_times[9] = "11:15 AM"
    event_times[13] = "04:00 PM"

    # Starting state...
    prior_line = None
    matched_month_year = None
    matched_service_time = None
    month = None
    year = None
    service_time = None
    line_number = 1
    total_row_dict = None
    event_id = None
    accumulated_row_totals_dict = {"week1": 0, "week2": 0, "week3": 0, "week4": 0, "week5": 0, "week6": 0, "total": 0}
    full_name = None
    phone = None
    num_processed_lines = 0

    for line in open(filename):

        # First pick off line at front of file indicating month and year that this attendance file is for...
        if not matched_month_year:
            matched_month_year = re.search("For the month of ([A-Z][a-z]+), ([0-9]{4})", line)
            if matched_month_year:
                month = string2monthnum(matched_month_year.group(1))
                year = string2yearnum(matched_month_year.group(2))
                if not (month and year):
                    print >> sys.stderr, "*** Filename: " + filename + ", line number: " + str(line_number)
                    print >> sys.stderr, "*** ERROR! Invalid month or year found"
                    print >> sys.stderr, line
                    print >> sys.stderr
                    sys.exit(1)
                first_day_in_month, num_days_in_month = calendar.monthrange(year, month)

                # Create list of 6 date objects, month_sundays, representing week1, week2, ... week6 Sunday dates
                # If a week has no Sunday, it is None
                day_countup = 1
                day_countup += 6 - first_day_in_month
                month_sundays = []
                if first_day_in_month != 6:
                    month_sundays.append(None)
                while day_countup <= num_days_in_month:
                    month_sundays.append(datetime.date(year, month, day_countup))
                    day_countup += 7
                while len(month_sundays) < 6:
                    month_sundays.append(None)
                christmas_eve_date = datetime.date(year, month, 24)

        # Second pick off line at front of file indicating worship service time that this attendance file is for...
        elif not matched_service_time:
            matched_service_time = re.search("Worship Service - (Sunday |Summer )?([^ ]*)", line)
            if matched_service_time:
                service_time = matched_service_time.group(2)
                if service_time in event_ids:
                    event_id = event_ids[service_time]
                    event_name = event_names[event_id]
                else:
                    print >> sys.stderr, "*** Filename: " + filename + ", line number: " + str(line_number)
                    print >> sys.stderr, '*** ERROR! Unrecognized service_time: "' + service_time + '"'
                    print >> sys.stderr
                    sys.exit(1)

        # ...then match attendance (row per person with weeks they attended) and total (summary at bottom) rows
        else:

            # Once we found row with totals...we're done, that's last line in attendance file we need to parse
            matched_total_line = re.search("^ {18}Total: {13}(?P<attendance>( +[0-9]+)+)\r?$", line)
            if matched_total_line:
                totals_attendance_dict = attendance_str2dict(
                    matched_total_line.group("attendance"), [-3, -9, -15, -20, -24, -29, -35], 3
                )
                break

            matched_attendance_line = re.search(
                "^ {6}"
                + "(?P<full_name>(?P<last_name>[A-Za-z]+([ \-'][A-Za-z]+)*), "
                + "(?P<first_name>([A-Za-z]+\.?)+([\-' ][A-Za-z]+)*)( \((?P<nick_name>[A-Za-z]+)\))?\.?)?\r?"
                + "(?P<phone>( +)?([0-9]{3}-[0-9]{3}-[0-9]{4}|Unlisted))?"
                + "(?P<attendance> +(1 +)+[1-6])?\r?$",
                line,
            )
            if matched_attendance_line:
                if matched_attendance_line.group("full_name"):
                    full_name = matched_attendance_line.group("full_name").strip()
                if matched_attendance_line.group("phone"):
                    phone = matched_attendance_line.group("phone").strip()
                if matched_attendance_line.group("attendance"):
                    if full_name:
                        attendance = matched_attendance_line.group("attendance").strip()
                        row_dict = attendance_str2dict(attendance, [-1, -7, -13, -18, -22, -27, -33], 1)
                        row_dict["full_name"] = full_name
                        if phone:
                            row_dict["phone"] = phone
                        else:
                            row_dict["phone"] = ""
                        num_processed_lines += 1
                        full_name = None
                        phone = None
                        if row_dict["total"] != (
                            row_dict["week1"]
                            + row_dict["week2"]
                            + row_dict["week3"]
                            + row_dict["week4"]
                            + row_dict["week5"]
                            + row_dict["week6"]
                        ):
                            print >> sys.stderr, "*** Filename: " + filename + ", line number: " + str(line_number)
                            print >> sys.stderr, "*** ERROR! Bad row total, doesn't match sum of weeks 1-6"
                            print >> sys.stderr, row_dict
                            print >> sys.stderr
                            break

                        for key in accumulated_row_totals_dict:
                            accumulated_row_totals_dict[key] += row_dict[key]
                        attendance_dicts.append(row_dict)

            # Buffer the current line for line folding if needed (see 'line folding' above)
            prior_line = line
            line_number += 1

    print "*** Number of attendance lines processed: " + str(num_processed_lines)
    print "*** Number of attendees: " + str(accumulated_row_totals_dict["total"])
    print

    if output_csv_filebase and event_id:
        output_csv_filename = (
            output_csv_filebase + "/" + str(year) + format(month, "02d") + "_" + str(event_names[event_id]) + ".csv"
        )
        all_columns_table = petl.fromdicts(attendance_dicts)
        petl.tocsv(all_columns_table, output_csv_filename)

    # Build 2nd list of dicts, where each list item is dict of individual date/event attendance.  I.e. a row per
    # worship service date vs original attendance dicts format of a row per attendee across all weeks in month.
    # This is the actual one returned and eventually emitted into output file
    attendance_dicts2 = []
    for attendance_dict in attendance_dicts:
        for key in attendance_dict:
            if key[:4] == "week" and attendance_dict[key] != 0:
                week_index = int(key[4:5]) - 1
                if month_sundays[week_index] is not None:
                    attendance_dict2 = {}
                    full_name = attendance_dict["full_name"]
                    if full_name in full_name2sk_indiv_id:
                        attendance_dict2["Individual ID"] = full_name2sk_indiv_id[full_name]
                        if event_name == "Christmas Eve":
                            attendance_dict2["Date"] = christmas_eve_date
                        else:
                            attendance_dict2["Date"] = month_sundays[week_index]
                        attendance_dict2["Event ID"] = event_id
                        if add_extra_fields:
                            attendance_dict2["Time"] = event_times[event_id]
                            attendance_dict2["Full Name"] = full_name
                            attendance_dict2["Event Name"] = event_name
                            attendance_dict2["Week Num"] = week_index + 1
                        attendance_dicts2.append(attendance_dict2)
                    else:
                        print >> sys.stderr, '*** WARNING! Cannot find "' + full_name + '" in map'
                        print >> sys.stderr
                else:
                    print >> sys.stderr, '*** WARNING! Cannot find Sunday date for week index "' + str(week_index) + '"'
                    print >> sys.stderr

    # Check if numbers on Servant Keeper's reported Total: line match the totals we've been accumulating
    # per attendance row entry.  If they don't match, show WARNING (not ERROR, since via manual checks, it appears
    # that Servant Keeper totals are buggy)
    if totals_attendance_dict:
        for key in accumulated_row_totals_dict:
            if accumulated_row_totals_dict[key] != totals_attendance_dict[key]:
                pp = pprint.PrettyPrinter(stream=sys.stderr)
                print >> sys.stderr, "*** WARNING! Servant Keeper reported totals do not match data totals"
                print >> sys.stderr, "Servant Keeper Totals:"
                pp.pprint(totals_attendance_dict)
                print >> sys.stderr, "Data Totals:"
                pp.pprint(accumulated_row_totals_dict)
                print >> sys.stderr
                break

    return_table = petl.fromdicts(attendance_dicts2)
    header = petl.header(return_table)
    if "Event Name" in header:
        return_table = petl.cut(
            return_table, "Full Name", "Event Name", "Time", "Week Num", "Date", "Event ID", "Individual ID"
        )
    else:
        return_table = petl.cut(return_table, "Date", "Event ID", "Individual ID")

    return return_table
def log_sample_table(stream):
    """Record a tabular representation of the stream sample to the log."""

    samples = list(map(lambda x: dict(zip(stream.headers, x)), stream.sample))
    table = look(fromdicts(samples), limit=len(stream.sample))
    info('Data sample =\n%s', table)
    def _get_field_sample(resource_sample, field):
        """Return a subset of the relevant data column."""

        sample_table = petl.fromdicts(resource_sample)
        sample_column = list(petl.values(sample_table, field['name']))
        return sample_column
Example no. 51
def getTable(path):
    return etl.fromdicts(loadData(path))
Example no. 52
File: json.py Project: DeanWay/petl
'''
with open('example.json', 'w') as f:
    f.write(data)

table1 = etl.fromjson('example.json')
table1


# fromdicts()
#############

import petl as etl
dicts = [{"foo": "a", "bar": 1},
         {"foo": "b", "bar": 2},
         {"foo": "c", "bar": 2}]
table1 = etl.fromdicts(dicts)
table1


# tojson()
##########

import petl as etl
table1 = [['foo', 'bar'],
          ['a', 1],
          ['b', 2],
          ['c', 2]]
etl.tojson(table1, 'example.json', sort_keys=True)
# check what it did
print(open('example.json').read())