Example #1
    def etl_from_dir(self, data_dir="data"):
        """Extract, translate, load reinstatements (and not exclusions) from
        directory DATA_DIR.
        """

        # Get YYYYMM date of most recent reinstatement action
        most_recent = self.conn.get_latest_reinstatement_date().replace(
            '-', '')[:6] or "000000000"

        # Get the data from REIN CSV files.  Gather reinstatement actions
        # since most_recent
        total_indiv = []
        total_bus = []
        for fname in sorted(glob.glob(os.path.join(data_dir, "*REIN.csv"))):
            if int(os.path.basename(fname)[:4]) <= int(most_recent[2:]):
                continue
            debug("Processing " + fname)
            reinstated = etl.fromcsv(fname)
            individual, business = clean_and_separate(reinstated)
            total_indiv.append(individual)
            total_bus.append(business)

        # Save to db, APPENDING TO existing data tables.  Assumes tables
        # exist.
        if total_indiv:
            etl.appenddb(etl.cat(*total_indiv), self.conn.conn,
                         'individual_reinstatement')
        if total_bus:
            etl.appenddb(etl.cat(*total_bus), self.conn.conn,
                         'business_reinstatement')

        # It is possible to end up with duplicate rows if, say, an ETL
        # process is interrupted midway through.  So we should find and
        # remove dupes.
        self.conn.dedupe_reinstatements()
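The routine above leans on project-specific helpers (self.conn, clean_and_separate), but the collect-then-cat-then-appenddb pattern at its core can be sketched standalone against an in-memory SQLite database; the table name is reused from the example and the row contents are invented for illustration:

import sqlite3

import petl as etl

conn = sqlite3.connect(':memory:')
conn.execute('CREATE TABLE individual_reinstatement (npi TEXT, date TEXT)')

# stand-ins for the per-file tables produced by clean_and_separate()
chunks = [
    [['npi', 'date'], ['111', '2020-01']],
    [['npi', 'date'], ['222', '2020-02']],
]

# concatenate all chunks once, then append to the existing table in one call
etl.appenddb(etl.cat(*chunks), conn, 'individual_reinstatement')
print(etl.nrows(etl.fromdb(conn, 'SELECT * FROM individual_reinstatement')))  # 2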
Example #2
def cat(data, source, destination, header, missing):
    """Return the first n rows of the data table and store them in the destination data table."""
    sources = [data.get(s) for s in source]
    if len(header) == 0:
        n = petl.cat(*sources, missing=missing)
    else:
        n = petl.cat(*sources, header=header, missing=missing)
    data.set(destination, n)
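This wrapper just forwards to petl.cat, so its two branches can be sketched directly; the table contents below are made up, and the pipeline's data container with get/set is not needed for the illustration:

import petl

t1 = [['foo', 'bar'], [1, 'A'], [2, 'B']]
t2 = [['bar', 'baz'], ['C', True]]

# empty header: fields are unioned and gaps filled with the missing value
print(petl.look(petl.cat(t1, t2, missing='n/a')))

# explicit header: the output is forced to exactly these fields
print(petl.look(petl.cat(t1, t2, header=['foo', 'bar', 'baz'], missing='n/a')))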
Example #3
    def transform_resource(self, source, target):
        if isinstance(self.__resource, str):
            self.__resource = source.package.get_resource(self.__resource)
        self.__resource.infer(only_sample=True)
        view1 = source.to_petl()
        view2 = self.__resource.to_petl()

        # Ignore fields
        if self.__ignore_fields:
            target.data = petl.stack(view1, view2)
            for field in self.__resource.schema.fields[len(target.schema.fields
                                                           ):]:
                target.schema.add_field(field)

        # Default
        else:
            if self.__sort:
                target.data = petl.mergesort(view1,
                                             view2,
                                             key=self.__sort,
                                             header=self.__field_names)
            else:
                target.data = petl.cat(view1, view2, header=self.__field_names)
            for field in self.__resource.schema.fields:
                if field.name not in target.schema.field_names:
                    target.schema.add_field(field)
            if self.__field_names:
                for field in list(target.schema.fields):
                    if field.name not in self.__field_names:
                        target.schema.remove_field(field.name)
Example #4
    def transform_resource(self, resource):
        target = resource
        source = self.get("resource")
        field_names = self.get("fieldNames")
        ignore_fields = self.get("ignoreFields")
        sort_by_field = self.get("sortByField")
        if isinstance(source, str):
            source = target.package.get_resource(source)
        source.infer()
        view1 = target.to_petl()
        view2 = source.to_petl()

        # Ignore fields
        if ignore_fields:
            for field in source.schema.fields[len(target.schema.fields):]:
                target.schema.add_field(field)
            resource.data = petl.stack(view1, view2)

        # Default
        else:
            for field in source.schema.fields:
                if field.name not in target.schema.field_names:
                    target.schema.add_field(field)
            if field_names:
                for field in list(target.schema.fields):
                    if field.name not in field_names:
                        target.schema.remove_field(field.name)
            if sort_by_field:
                key = sort_by_field
                resource.data = petl.mergesort(view1,
                                               view2,
                                               key=key,
                                               header=field_names)
            else:
                resource.data = petl.cat(view1, view2, header=field_names)
Example #5
    def _get_actors(self):
        student_actors = (
            etl
            .fromcsv(f'{self._dirc}/student.csv', delimiter=';')
            .cut('id')
            .addcolumn('role', [], missing='Student')
        )

        teacher_actors = (
            etl
            .fromcsv(f'{self._dirc}/teacher.csv', delimiter=';')
            .cut('id')
            .addcolumn('role', [], missing='Teacher')
        )

        # team_user_actors = (
        #     etl
        #     .fromcsv(f'{self._dirc}/team_allocation.csv', delimiter=';')
        #     .cut('team_id')
        #     .rename('team_id', 'id')
        #     .convert('id', str)
        #     .addcolumn('role', [], missing='Team')
        #     .convert('role', str)
        # )

        return etl.cat(student_actors, teacher_actors)
Example #6
    def _get_participants(self):
        student_actors = (etl.fromcsv(f'{self._dirc}/student.csv',
                                      delimiter=';').cut('id'))

        teacher_actors = (etl.fromcsv(f'{self._dirc}/teacher.csv',
                                      delimiter=';').cut('id'))
        return etl.cat(student_actors, teacher_actors)
Example #7
    def _get_actor_pariticipants(self):
        student_actors = (
            etl
            .fromcsv(f'{self._dirc}/student.csv', delimiter=';')
            .cut('id', 'id')
            .rename(0, 'actor_id')
            .rename(1, 'participant_id')
        )

        teacher_actors = (
            etl
            .fromcsv(f'{self._dirc}/teacher.csv', delimiter=';')
            .cut('id', 'id')
            .rename(0, 'actor_id')
            .rename(1, 'participant_id')
        )

        # team_user_actors = (
        #     etl
        #     .fromcsv(f'{self._dirc}/team_allocation.csv', delimiter=';')
        #     .cut('team_id', 'student_id')
        #     .rename('team_id', 'actor_id')
        #     .convert('actor_id', str)
        #     .rename('student_id', 'participant_id')
        #     .convert('participant_id', str)
        # )
        return etl.cat(student_actors, teacher_actors)
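The cut('id', 'id') followed by rename-by-position trick above duplicates a single column under two names; a minimal standalone sketch (rows invented):

import petl as etl

users = etl.wrap([['id', 'name'],
                  ['u1', 'Alice'],
                  ['u2', 'Bob']])

pairs = (
    users
    .cut('id', 'id')               # select the same field twice
    .rename(0, 'actor_id')         # rename by position, since both are named 'id'
    .rename(1, 'participant_id')
)
print(etl.look(pairs))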
Example #8
    def test_concat(self):
        tbl1 = self.tbl
        tbl2 = Table([{'first': 'Mary', 'last': 'Nichols'}])
        tbl3 = Table([{'first': 'Lucy', 'last': 'Peterson'}])
        tbl1.concat(tbl2, tbl3)

        expected_tbl = Table(petl.cat(tbl1.table, tbl2.table, tbl3.table))
        assert_matching_tables(expected_tbl, tbl1)
Example #9
    def _get_actor_pariticipants(self):
        # wait to clarify issue in get_actors
        student_actors = (
            etl
            .fromcsv(f'{self._dirc}/student.csv', delimiter=';')
            .cut('id', 'id')
            .rename(0, 'actor_id')
            .rename(1, 'participant_id')
        )

        teacher_actors = (
            etl
            .fromcsv(f'{self._dirc}/teacher.csv', delimiter=';')
            .cut('id', 'id')
            .rename(0, 'actor_id')
            .rename(1, 'participant_id')
        )

        return etl.cat(student_actors, teacher_actors)
Example #10
    def _get_actors(self):
        # TODO: Deal with both
        student_actors = (
            etl
            .fromcsv(f'{self._dirc}/all_users.csv', delimiter=';')
            .select(lambda row: row.role == 'STUDENT')
            .cut('id')
            .addcolumn('role', [], missing='Student')
        )

        teacher_actors = (
            etl
            .fromcsv(f'{self._dirc}/all_users.csv', delimiter=';')
            .select(lambda row: row.role == 'TEACHER')
            .cut('id')
            .addcolumn('role', [], missing='Teacher')
        )

        return etl.cat(student_actors, teacher_actors)
Example #11
    def from_s3_csv(cls,
                    bucket,
                    key,
                    from_manifest=False,
                    aws_access_key_id=None,
                    aws_secret_access_key=None,
                    **csvargs):
        """
        Create a ``parsons table`` from a key in an S3 bucket.

        `Args:`
            bucket: str
                The S3 bucket.
            key: str
                The S3 key
            from_manifest: bool
                If True, treats `key` as a manifest file and loads all urls into a `parsons.Table`.
                Defaults to False.
            aws_access_key_id: str
                Required if not included as environmental variable.
            aws_secret_access_key: str
                Required if not included as environmental variable.
            \**csvargs: kwargs
                ``csv_reader`` optional arguments
        `Returns:`
            `parsons.Table` object
        """  # noqa: W605

        from parsons.aws import S3
        s3 = S3(aws_access_key_id, aws_secret_access_key)

        if from_manifest:
            with open(s3.get_file(bucket, key)) as fd:
                manifest = json.load(fd)

            s3_keys = [x["url"] for x in manifest["entries"]]

        else:
            s3_keys = [f"s3://{bucket}/{key}"]

        tbls = []
        for key in s3_keys:
            # TODO handle urls that end with '/', i.e. urls that point to "folders"
            _, _, bucket_, key_ = key.split("/", 3)
            file_ = s3.get_file(bucket_, key_)
            if files.compression_type_for_path(key_) == 'zip':
                file_ = files.zip_archive.unzip_archive(file_)

            tbls.append(petl.fromcsv(file_, **csvargs))

        return cls(petl.cat(*tbls))
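A hedged usage sketch: the bucket and key below are placeholders, credentials are assumed to come from the environment, and extra keyword arguments are passed through to petl's CSV reader:

import petl
from parsons import Table

tbl = Table.from_s3_csv('example-bucket', 'exports/contacts.csv', delimiter=',')
print(petl.nrows(tbl.table))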
Example #12
    def _get_actors(self):
        student_actors = (
            etl
            .fromcsv(f'{self._dirc}/all_users.csv', delimiter=',')
            .select(lambda row: row.role != 'TEACHER')
            .rename('user_id', 'id')
            .cut('id')
            .addcolumn('role', [], missing='student')
        )

        teacher_actors = (
            etl
            .fromcsv(f'{self._dirc}/all_users.csv', delimiter=',')
            .select(lambda row: row.role == 'TEACHER')
            .rename('user_id', 'id')
            .cut('id')
            .addcolumn('role', [], missing='instructor')
        )

        return etl.cat(student_actors, teacher_actors)
Example #13
def test_toxlsx_appendxlsx(xlsx_test_table):

    # setup
    f = NamedTemporaryFile(delete=True, suffix='.xlsx')
    f.close()

    # test toxlsx
    toxlsx(xlsx_test_table, f.name, 'Sheet1')
    actual = fromxlsx(f.name, 'Sheet1')
    ieq(xlsx_test_table, actual)

    # test appendxlsx
    appendxlsx(xlsx_test_table, f.name, 'Sheet1')
    expect = etl.cat(xlsx_test_table, xlsx_test_table)
    ieq(expect, actual)
Example #14
def test_appendxlsx_with_non_str_header(xlsx_table_with_non_str_header,
                                        xlsx_test_table):

    f = NamedTemporaryFile(delete=True, suffix='.xlsx')
    f.close()

    # write first table
    toxlsx(xlsx_test_table, f.name, 'Sheet1')
    actual = fromxlsx(f.name, 'Sheet1')
    ieq(xlsx_test_table, actual)

    # test appendxlsx
    appendxlsx(xlsx_table_with_non_str_header, f.name, 'Sheet1')
    expect = etl.cat(xlsx_test_table, xlsx_table_with_non_str_header)
    ieq(expect, actual)
Example #15
    def __init__(self, fasta_path, gff3_path, seqid=None):
        """
        An annotated reference genome.

        Parameters
        ----------

        fasta_path : string
            Path to reference genome FASTA file.
        gff3_path : string or sequence of strings
            Path to genome annotations GFF3 file, or a list/tuple of paths
            whose tables are concatenated.
        seqid : string, optional
            If given, limit the annotation table to this chromosome.

        """

        # store initialisation parameters
        self._fasta_path = fasta_path
        self._gff3_path = gff3_path
        self._seqid = seqid

        # setup access to reference sequence
        self._fasta = pyfasta.Fasta(fasta_path)

        # setup access to GFF3 as a table
        if isinstance(gff3_path, (list, tuple)):
            tbl_features = etl.cat(*[etl.fromgff3(p) for p in gff3_path])
        else:
            tbl_features = etl.fromgff3(gff3_path)
        tbl_features = (
            tbl_features
            .unpackdict('attributes', ['ID', 'Parent'])
            .rename({'ID': 'feature_id', 'Parent': 'parent_id', 'end': 'stop'})
            .select(lambda row: (row.stop - row.start) > 0)
        )

        # limit data to a single chromosome
        if seqid is not None:
            tbl_features = tbl_features.eq('seqid', seqid)
        self._tbl_features = tbl_features.cache()

        # index features by ID
        self._idx_feature_id = self._tbl_features.recordlookupone('feature_id')

        # index features by parent ID
        self._idx_parent_id = self._tbl_features.recordlookup('parent_id')

        # index features by genomic location
        self._idx_location = self._tbl_features.facetintervalrecordlookup(
            'seqid', 'start', 'stop', include_stop=True)
Example #16
def sync(ctx: typer.Context,
         project: str = typer.Argument(
             ..., help='The name for the project, specified in config file'),
         since: datetime = typer.Option(..., formats=['%Y-%m-%d']),
         until: datetime = typer.Option(..., formats=['%Y-%m-%d']),
         dry: bool = typer.Option(
             False,
             help='Use log entries instead of uploading them to redmine'),
         drain: bool = typer.Option(
             False,
             help='Use drain issues for entries without specified dest')):
    config = setup_config(ctx, ctx.meta['config_path'])
    setup_http(ctx)

    ctx.meta['rdm_user'] = extract.get_redmine_user(config["redmine"]["url"])

    time_entries = get_toggl_enteries(config, project, since, until)

    issues = get_redmine_issues(config, project, since)

    issue_ids = petl.columns(issues)['id']
    entries_to_load, unset_entries = petl.biselect(
        time_entries, lambda row: row['issue_id'] in issue_ids)

    if drain and petl.nrows(unset_entries):
        log.info('Using drain')

        drained, unset_entries = drained_entries(ctx, issues, unset_entries,
                                                 project)

        log.info(f'Drained {petl.nrows(drained)} issues')

        entries_to_load = petl.cat(entries_to_load, drained)

    if petl.nrows(unset_entries):
        log.warning(f'There\'re {petl.nrows(unset_entries)} unset entries')

    if get_proj_attr(config, project, 'group_entries'):
        log.info('Using group by day and description')

        entries_to_load = transform.group_entries_by_day(entries_to_load)

    load.to_redmine_time(config["redmine"]["url"],
                         entries_to_load,
                         activity_id=get_proj_attr(config, project,
                                                   'rdm_activity_id'),
                         user_id=ctx.meta['rdm_user'].get('id'),
                         dry=dry)
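The drain step above hinges on petl.biselect splitting entries by a predicate and petl.cat folding the re-assigned rows back in; a toy sketch of that flow (the issue ids and the drain target 555 are invented):

import petl

entries = [['issue_id', 'hours'],
           [101, 2.0],
           [999, 1.5]]
known_ids = {101, 102}

matched, unmatched = petl.biselect(entries, lambda row: row['issue_id'] in known_ids)

# pretend the unmatched entries get re-assigned to a drain issue, then recombine
drained = petl.convert(unmatched, 'issue_id', lambda _: 555)
to_load = petl.cat(matched, drained)
print(petl.look(to_load))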
Example #17
    def test_toxlsx_appendxlsx():

        # setup
        tbl = (('foo', 'bar'), ('A', 1), ('B', 2), ('C', 2),
               (u'é', datetime(2012, 1, 1)))
        f = NamedTemporaryFile(delete=True, suffix='.xlsx')
        f.close()

        # test toxlsx
        toxlsx(tbl, f.name, 'Sheet1')
        actual = fromxlsx(f.name, 'Sheet1')
        ieq(tbl, actual)

        # test appendxlsx
        appendxlsx(tbl, f.name, 'Sheet1')
        expect = etl.cat(tbl, tbl)
        ieq(expect, actual)
Example #18
def join_tables(filename_pattern_list, output_csv_filebase, add_extra_fields):
    curr_table = None
    filenames_list = []
    for filename_pattern in filename_pattern_list:
        for filename in glob.glob(filename_pattern):
            filenames_list.append(filename)

    for filename in sorted(set(filenames_list)):
        if not os.path.isfile(filename):
            print("*** Error! Cannot open file '" + filename + "'", file=sys.stderr)
            print(file=sys.stderr)
        else:
            next_table = attendance_file2table(filename, output_csv_filebase, add_extra_fields)
            if curr_table is not None:
                curr_table = petl.cat(curr_table, next_table)
            else:
                curr_table = next_table

    return curr_table
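Chained pairwise petl.cat calls build a nested pipeline one level deeper per file; a variant sketch that collects the tables and concatenates once (attendance_file2table is the same project-specific helper assumed by the original):

import glob
import os
import sys

import petl


def join_tables(filename_pattern_list, output_csv_filebase, add_extra_fields):
    filenames = sorted({name for pattern in filename_pattern_list
                        for name in glob.glob(pattern)})
    tables = []
    for filename in filenames:
        if not os.path.isfile(filename):
            print("*** Error! Cannot open file '" + filename + "'", file=sys.stderr)
        else:
            tables.append(attendance_file2table(filename, output_csv_filebase,
                                                add_extra_fields))
    return petl.cat(*tables) if tables else None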
Example #20
    def concat(self, *tables, missing=None):
        """
        Concatenates one or more tables onto this one.

        Note that the tables do not need to share exactly the same fields.
        Any missing fields will be padded with None, or whatever is provided via the
        ``missing`` keyword argument.

        `Args:`
            tables: Parsons Table or list
                A single table, or a list of tables
            missing: optional
                The value to use when padding missing values
        `Returns:`
            ``None``
        """

        if type(tables) not in [list, tuple]:
            tables = [tables]
        petl_tables = [tbl.table for tbl in tables]

        self.table = petl.cat(self.table, *petl_tables, missing=missing)
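A small usage sketch, relying only on the parsons.Table constructor already used in the earlier test; the missing keyword pads fields present in only one table (values invented):

import petl
from parsons import Table

people = Table([{'first': 'Mary', 'last': 'Nichols'}])
extras = Table([{'first': 'Lucy', 'city': 'Portland'}])

people.concat(extras, missing='unknown')   # mutates people in place, returns None
print(petl.look(people.table))             # 'last'/'city' gaps padded with 'unknown'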
Example #21
print("B only rows: {}".format(b_only))

# Export missing locations to csv
if a_only > 0:
    locs_only_in_a.tocsv('missing_locations_a.csv')
else:
    locs_only_in_b.tocsv('missing_locations_b.csv')

# find conflicts between A/B on Chr and Pos columns
ab_merge = etl.merge(a_conv, b_conv, key=('Chr', 'Pos'))
# magic command for IPython display
# ab_merge.display(caption='ab_merge',
#                  td_styles=lambda v: highlight if isinstance(v, etl.Conflict) else '')

# Create a new list of all conflicting values
ab = etl.cat(a_conv.addfield('source', 'a', index=0),
             b_conv.addfield('source', 'b', index=0))
ab_conflicts = ab.conflicts(key=('Chr', 'Pos'), exclude='source')

# magic command for IPython display
# ab_conflicts.display(10)

# Highlight specific conflicts
ab_conflicts_mut = ab.conflicts(key=('Chr', 'Pos'), include='Mut')

# magic command for IPython display
# ab_conflicts_mut.display(10, caption='Mut conflicts',
# td_styles={'Mut': highlight})
ab_conflict_num = ab_conflicts_mut.nrows()

if _DEBUG:
    print("Total number of A/B conflicts: {}".format(ab_conflict_num))
Example #22
########## JSON extraction and mapping
tableJ = etl.fromjson('cust_data.json', header=['id','gender','first_name','last_name', 'email','ville'])
tableJ = etl.movefield(tableJ, 'gender', 4)

########## CSV extraction and conversion
tableCSV = etl.fromcsv('week_cust.csv')
tableCSV = etl.convert(tableCSV, 'id', int)

########### SQL Server connection and extraction
connectionSqlServer=pyodbc.connect("Driver={SQL Server Native Client 11.0};" "Server=81_64_msdn;" "Database=BD4client;" "Trusted_Connection=yes;" "convert_unicode =True;")
cursor = connectionSqlServer.cursor()
cursor.execute('SELECT id, first_name, last_name, email, gender, ville FROM client_DATA')
tableSqlServer = cursor.fetchall()
tableSqlServer =[('id','first_name','last_name', 'email','gender','ville')]+tableSqlServer
cursor.close()
connectionSqlServer.close()

######### Staging area transforming and concatenation
StagingArea = etl.cat(tableCSV, tableJ, tableSqlServer)
StagingArea = etl.convert(StagingArea, 'gender', {'Male': 'M', 'Female': 'F', 'male': 'M', 'female': 'F', None: 'N'})
StagingArea = etl.rename(StagingArea, 'ville', 'city')

######## mysql
connection = mysql.connect(host="localhost", user="******", passwd="", db="customerdatabase")
curseur = connection.cursor()
curseur.execute('SET SQL_MODE=ANSI_QUOTES')
#### load data, assuming the "customerdata" table already exists in the database
etl.appenddb(StagingArea, connection, 'customerdata', schema='customerdatabase', commit='commit')
curseur.close()
connection.close()
Example #23
          ['D', 'xyz', 9.0],
          ['E', None]]
table5 = [['bar', 'foo'],
          ['A', 1],
          ['B', 2]]
table7 = [['bar', 'foo'],
          ['A', 1],
          ['B', 2]]
table8 = [['bar', 'baz'],
          ['C', True],
          ['D', False]]

from petl import look, cat
look(table1)
look(table2)
table3 = cat(table1, table2)
look(table3)
# can also be used to square up a single table with uneven rows
look(table4)
look(cat(table4))
# use the header keyword argument to specify a fixed set of fields 
look(table5)
table6 = cat(table5, header=['A', 'foo', 'B', 'bar', 'C'])
look(table6)
# using the header keyword argument with two input tables
look(table7)
look(table8)
table9 = cat(table7, table8, header=['A', 'foo', 'B', 'bar', 'C'])
look(table9)

Example #25
print("unjoined rowcount: ", etl.nrows(unjoined))
dor_condos_unjoined_unmatched = etl.antijoin(unjoined,
                                             non_unique_parcel_id_rows,
                                             key='parcel_id').addfield(
                                                 'reason',
                                                 'non-active/remainder mapreg')
print("non-active/remainder mapreg error rowcount: ",
      etl.nrows(dor_condos_unjoined_unmatched))
if DEV:
    print(etl.look(dor_condos_unjoined_unmatched))
dor_condos_unjoined_duplicates = etl.antijoin(
    unjoined, dor_condos_unjoined_unmatched,
    key='source_object_id').addfield('reason',
                                     'non-unique active/remainder mapreg')
print("non-unique active/remainder mapreg error rowcount: ",
      etl.nrows(dor_condos_unjoined_duplicates))
if DEV:
    print(etl.look(dor_condos_unjoined_duplicates))
error_table = etl.cat(dor_condos_unjoined_unmatched,
                      dor_condos_unjoined_duplicates)
if DEV:
    print(etl.look(error_table))

# Write to engine db
if not DEV:
    print('Writing condos...')
    joined.todb(pg_db, 'dor_condominium')
    print('Writing errors...')
    error_table.todb(pg_db, 'dor_condominium_error')

print("Completed in ", datetime.now() - start, " minutes.")
Example #26
]

table1 = etl.addfield(
    etl.convertnumbers(
        etl.setheader(etl.fromcsv('winequality-red.csv'), table_header)),
    "Type", "Red")
table2 = etl.addfield(
    etl.convertnumbers(
        etl.setheader(etl.fromcsv('winequality-white.csv'), table_header)),
    "Type", "White")

#print(etl.head(table1))
#print(etl.head(table2))

table1_filtered = etl.select(table1, "Quality", lambda v: v > 6)
table2_filtered = etl.select(table2, "Quality", lambda v: v > 4)

good_wines = etl.cat(table1_filtered, table2_filtered)

good_wines_enhanced = etl.addfields(
    good_wines,
    [("Max Acidity",
      lambda rec: rec["Fixed Acidity"] + rec["Volatile Acidity"]),
     ("Locked SO2", lambda rec: rec["Total SO2"] - rec["Free SO2"])])
#print(etl.head(good_wines_enhanced))
#print(etl.tail(good_wines_enhanced))

gwe_sorted = etl.sort(good_wines_enhanced, key=["Quality", "Sugar"])

#print(etl.head(gwe_sorted))
print(etl.lookall(etl.tail(gwe_sorted, 500)))
Example #27
    def download(self, urls=[]):

        # timeout setting for requests
        # timeout = urllib3.Timeout(connect=2.0, read=7.0)
        # http = urllib3.PoolManager(timeout=timeout)
        http = urllib3.PoolManager()

        report_data = []

        for url in urls:

            # print(url)

            report_filename = self.get_report_filename(
                hashlib.md5(url).hexdigest())

            if cache:
                # print('use cache')
                cache_key = url
                cache_timeout = CONNECTOR_INFO.get('report_cache_timeout',
                                                   60 * 60)

                z_report = cache.get(cache_key)
                if z_report is not None:

                    new_report_data = petl.io.fromcsv(
                        petl.MemorySource(zlib.decompress(z_report)))

                    # print(len(new_report_data))

                    if not report_data:
                        # print('NEw cat')
                        report_data = new_report_data
                    else:
                        report_data = petl.cat(report_data, new_report_data)

                    continue

                logging.info('Download Report from {}'.format(url))

                r = http.request('GET',
                                 url,
                                 retries=urllib3.Retry(
                                     redirect=2,
                                     backoff_factor=2,
                                 ))
                if r.status == 200:
                    report = r.data
                    r.release_conn()

                    z_report = zlib.compress(report)
                    cache.set(cache_key, z_report, timeout=cache_timeout)

                    # return petl.io.fromcsv(petl.MemorySource(report))

                    new_report_data = petl.io.fromcsv(
                        petl.MemorySource(report))
                    # print(len(new_report_data))
                    if not report_data:
                        report_data = new_report_data
                    else:
                        report_data = petl.cat(report_data, new_report_data)
                elif r.status == 403:
                    raise Exception(r.data)
                else:
                    logging.info(r.data)
                    logging.info(r.status)
                    logging.info(r.headers)

            else:
                # move to init

                # print('Not cache')
                if not os.path.exists(self.report_folder):
                    os.makedirs(self.report_folder)

                if not os.path.exists(report_filename):
                    logging.info('Download Report from {}'.format(url))

                    r = http.request('GET',
                                     url,
                                     retries=urllib3.Retry(
                                         redirect=2,
                                         backoff_factor=2,
                                     ))
                    if r.status == 200:
                        with open(report_filename, 'wb') as f:
                            f.write(r.data)
                        r.release_conn()

                        logging.info('Read from {}'.format(report_filename))

                        new_report_data = petl.io.fromcsv(report_filename)

                        if not report_data:
                            report_data = new_report_data
                        else:
                            report_data = petl.cat(report_data,
                                                   new_report_data)
        return report_data
Example #28
def add_rows(table, list_rows_to_add):
    add_table = petl.fromdicts(list_rows_to_add)
    return petl.cat(table, add_table)
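A quick sketch of exercising add_rows with an in-memory table (rows invented); petl.fromdicts aligns the new rows to the existing header by field name:

import petl

existing = [['id', 'name'],
            [1, 'Ada'],
            [2, 'Grace']]

combined = add_rows(existing, [{'id': 3, 'name': 'Edsger'}])
print(petl.look(combined))   # three data rows, fields matched by name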
Example #29
def prepare_calls(calls_file_path, output_dir, food_needs_user,
                  complex_needs_user, simple_needs_user, call_log_review_user):
  """Prepares call log records for import"""

  # Expected file is in 'windows-1252' file encoding
  spreadsheet = (
    etl.fromcsv(calls_file_path, encoding='windows-1252')
    .rename(rename_map)
    .select(lambda row: row['latest_attempt_date'])
    .addfield('import_data', partial(serialize_row, keys=header_map.keys()))
    .convert('latest_attempt_date', parse_date)
    .addfield('created_at', lambda row: row['latest_attempt_date'])
    .addfield('updated_at', lambda row: row['latest_attempt_date'])
  )

  needs_fields = ['nhs_number', 'category', 'name', 'created_at', 'updated_at']
  notes_fields = ['nhs_number', 'category', 'body', 'created_at', 'updated_at']

  original_triage_needs = (
    spreadsheet
    .addfield('category', 'phone triage')
    .addfield('name', MSG_ORIGINAL_TRIAGE_NEED)
    .addfield('completed_on', determine_triage_completion)
    .cut(*needs_fields, 'completed_on')
  )

  generated_header = ['nhs_number', 'created_at', 'updated_at', 'category']
  original_triage_call_notes = (
    spreadsheet
    .selectnotnone('was_contact_made')
    .rowmapmany(generate_call_notes, header=generated_header)
    .addfield('body', MSG_CALL_LOG_NOTE)
    .cut(*notes_fields)
  )

  original_triage_import_notes = (
    spreadsheet
    .addfield('category', 'phone_import')
    .addfield('body', partial(compose_body, fields=header_map))
    .cut(*notes_fields, 'import_data')
  )

  food_needs = (
    spreadsheet
    .select(needs_food)
    .addfield('category', 'groceries and cooked meals')
    .convert('food_priority', parse_food_priority)
    .addfield('supplemental_data', construct_supplemental_data)
    .addfield('completed_on', determine_food_completion)
    .addfield('user_id', food_needs_user)
    .addfield('name', partial(compose_food_need_desc, fields=header_map))
    .cut(*needs_fields, 'completed_on', 'supplemental_data', 'user_id')
  )

  callback_needs = (
    spreadsheet
    .convert('callback_date', parse_callback_date)
    .select(needs_callback)
    .addfield('category', 'phone triage')
    .addfield('name', partial(compose_callback_need_desc, fields=header_map))
    .addfield('start_on', determine_callback_start_date)
    .cut(*needs_fields, 'start_on')
  )

  prescription_needs = (
    spreadsheet
    .select(lambda row: row['addl_medication_prescriptions'])
    .addfield('category', 'prescription pickups')
    .addfield('name', partial(compose_other_need_desc, fields=header_map))
    .addfield('user_id', simple_needs_user)
    .cut(*needs_fields, 'user_id')
  )

  mental_wellbeing_needs = (
    spreadsheet
    .select(lambda row: row['addl_mental_wellbeing'])
    .addfield('category', 'physical and mental wellbeing')
    .addfield('name', partial(compose_other_need_desc, fields=header_map))
    .addfield('user_id', complex_needs_user)
    .cut(*needs_fields, 'user_id')
  )

  financial_needs = (
    spreadsheet
    .select(lambda row: row['addl_financial'])
    .addfield('category', 'financial support')
    .addfield('name', partial(compose_other_need_desc, fields=header_map))
    .addfield('user_id', complex_needs_user)
    .cut(*needs_fields, 'user_id')
  )

  other_needs = (
    spreadsheet
    .select(needs_other_support)
    .addfield('category', 'other')
    .addfield('name', partial(compose_other_need_desc, fields=header_map))
    .addfield('user_id', partial(determine_other_need_user,
                                 complex_needs_user=complex_needs_user,
                                 simple_needs_user=simple_needs_user,
                                 call_log_review_user=call_log_review_user))
    .cut(*needs_fields, 'user_id')
  )

  # TODO: prefix with [Import]
  contact_profile_updates = (
    spreadsheet
    .addfield('additional_info', partial(compose_additional_info, fields=header_map))
    .addfield('delivery_details', partial(compose_delivery_details, fields=header_map))
    .addfield('dietary_details', compose_dietary_details)
    .convert('has_covid_symptoms', parse_covid_symptoms)
    .cut('nhs_number',
         'additional_info',
         'delivery_details',
         'dietary_details',
         'has_covid_symptoms')
  )

  # TODO: Improve implementation and readability of QA bits (currently loads all tables into memory)
  # TODO: Consider adding stats to output somewhere for additional QA
  lookups = {
    'original_triage_needs': original_triage_needs.dictlookupone('nhs_number'),
    'original_triage_call_notes': original_triage_call_notes.dictlookup('nhs_number'), # returns list
    'food_needs': food_needs.dictlookupone('nhs_number'),
    'callback_needs': callback_needs.dictlookupone('nhs_number'),
    'remaining_needs': etl.cat(prescription_needs,
                               mental_wellbeing_needs,
                               financial_needs,
                               other_needs).dictlookup('nhs_number') # returns list
  }
  quality_assurance = (
    spreadsheet
    .addfield('call_log', partial(compose_body, fields=header_map))
    .addfield('original_triage_status', partial(qa_original_triage_status, lookups['original_triage_needs']))
    .addfield('original_triage_call_notes', partial(qa_original_triage_call_notes, lookups['original_triage_call_notes']))
    .addfield('food_need', partial(qa_food_need, lookups['food_needs']))
    .addfield('callback_need', partial(qa_callback_need, lookups['callback_needs']))
    .addfield('remaining_needs', partial(qa_remaining_needs, lookups['remaining_needs']))
    .cut('nhs_number',
         'latest_attempt_date',
         'original_triage_status',
         'original_triage_call_notes',
         'food_need',
         'callback_need',
         'remaining_needs',
         'call_log')
  )

  # Write files
  quality_assurance.tocsv(join(output_dir, 'quality_assurance.csv'))
  contact_profile_updates.tocsv(join(output_dir, 'contact_profile_updates.csv'))
  original_triage_needs.tocsv(join(output_dir, 'original_triage_needs.csv'))

  etl.cat(original_triage_import_notes, original_triage_call_notes) \
     .tocsv(join(output_dir, 'original_triage_notes.csv'))

  # psql copy meta command hangs when importing fully combined needs file
  food_needs.tocsv(join(output_dir, 'food_needs.csv'))
  callback_needs.tocsv(join(output_dir, 'callback_needs.csv'))

  etl.cat(prescription_needs,
          mental_wellbeing_needs,
          financial_needs,
          other_needs) \
     .tocsv(join(output_dir, 'remaining_needs.csv'))
Example #30
# etl script for marvel character data
# input is a csv file
# output is a json file (to be loaded into a db)
# by ian macfarlane, 2017

import petl as etl
from dateutil.parser import parse
from Character import Character

# load csv data into petl tables
hero_table = etl.fromcsv('marvel_heroes.csv')
villain_table = etl.fromcsv('marvel_villains.csv')

# combine hero and villain tables and join with stats
character_table = etl.cat(hero_table, villain_table)

new_header_names = { 
	'IDENTITY': 'identity',
	'ALIGN': 'alignment',
	'EYE': 'eye',
	'HAIR': 'hair',
	'SEX': 'sex',
	'ALIVE': 'is-alive',
	'FIRST APPEARANCE': 'first-appearance',
	'AT LARGE': 'is-at-large',
	'Year': 'year' }

def first_word(s):
	return s.split(' ', 1)[0].lower()

# standard format is "%Y-%m-%dT%H:%M:%S.%fZ"
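The snippet is cut off before the output step; purely as an illustration (not the author's code), the JSON output mentioned in the header comment could be written with petl, applying the rename map defined above (the output filename is a placeholder):

# illustrative only: rename columns using the map above, then write JSON
characters = etl.rename(character_table, new_header_names)
etl.tojson(characters, 'marvel_characters.json')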
Example #31
          ['E', 12]]
table2 = etl.cutout(table1, 'bar')
table2


# cat()
#######

import petl as etl
table1 = [['foo', 'bar'],
          [1, 'A'],
          [2, 'B']]
table2 = [['bar', 'baz'],
          ['C', True],
          ['D', False]]
table3 = etl.cat(table1, table2)
table3
# can also be used to square up a single table with uneven rows
table4 = [['foo', 'bar', 'baz'],
          ['A', 1, 2],
          ['B', '2', '3.4'],
          [u'B', u'3', u'7.8', True],
          ['D', 'xyz', 9.0],
          ['E', None]]
table5 = etl.cat(table4)
table5
# use the header keyword argument to specify a fixed set of fields
table6 = [['bar', 'foo'],
          ['A', 1],
          ['B', 2]]
table7 = etl.cat(table6, header=['A', 'foo', 'B', 'bar', 'C'])
print("Relating condos to parcels...")
joined = etl.join(source_dor_condo_rows, unique_parcel_id_rows, key='parcel_id') \
    .convert('street_address', lambda a, row: row.street_address + ' # ' + row.unit_num, pass_row=True)
print("joined rowcount: ", etl.nrows(joined))
if DEV:
    print(etl.look(joined))

# Calculate errors
print("Calculating errors...")
unjoined = etl.antijoin(source_dor_condo_rows, joined, key='source_object_id')
print("unjoined rowcount: ", etl.nrows(unjoined))
dor_condos_unjoined_unmatched = etl.antijoin(unjoined, non_unique_parcel_id_rows, key='parcel_id').addfield('reason', 'non-active/remainder mapreg')
print("non-active/remainder mapreg error rowcount: ", etl.nrows(dor_condos_unjoined_unmatched))
if DEV:
    print(etl.look(dor_condos_unjoined_unmatched))
dor_condos_unjoined_duplicates = etl.antijoin(unjoined, dor_condos_unjoined_unmatched, key='source_object_id').addfield('reason', 'non-unique active/remainder mapreg')
print("non-unique active/remainder mapreg error rowcount: ", etl.nrows(dor_condos_unjoined_duplicates))
if DEV:
    print(etl.look(dor_condos_unjoined_duplicates))
error_table = etl.cat(dor_condos_unjoined_unmatched, dor_condos_unjoined_duplicates)
if DEV:
    print(etl.look(error_table))

# Write to engine db
if not DEV:
    print('Writing condos...')
    joined.todb(pg_db, 'dor_condominium')
    print('Writing errors...')
    error_table.todb(pg_db, 'dor_condominium_error')

print("Completed in ", datetime.now() - start, " minutes.")