Example #1
    def _open_group_file(self, grpval, out_delimiter):
        basefname = '{}_{}.csv'.format(self._dataname, grpval)
        grp_fname = os.path.join(self._basepath, basefname)
        writer, outf = get_csv_writer(grp_fname, out_delimiter, ENCODING)
        writer.writerow(self.header)
        self._files[grp_fname] = outf
        return writer
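All of these examples pass around a (writer, outfile) pair, but the helper itself never appears. Here is a minimal sketch of what get_csv_writer plausibly wraps, assuming it is a thin layer over the standard csv module; the fmode default and the newline='' argument are assumptions inferred from the call sites:

import csv

def get_csv_writer(fname, delimiter, encoding, fmode='w'):
    # Assumed signature, inferred from the call sites above.
    # Return both the csv writer and the open file handle so the
    # caller decides when to close it (the examples stash the handle
    # in self._files for exactly that purpose).
    outf = open(fname, fmode, encoding=encoding, newline='')
    writer = csv.writer(outf, delimiter=delimiter)
    return writer, outf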
Example #2
    def write_lookup(self, fname, header, delimiter):
        # Write scientific names and taxonKeys found with them in raw data
        fmode = 'w'
        if os.path.exists(fname):
            fmode = 'a'
        # Initialize so the finally-block close cannot hit an unbound name
        outf = None
        try:
            if self.valtype == VAL_TYPE.DICT:
                # Write all vals in dict, assumes each dictionary-value has the same keys
                if header is None:
                    header = self._get_field_names()
                    writer, outf = get_csv_dict_writer(
                        fname, delimiter, self.encoding, header, fmode=fmode)
                    if fmode == 'w':
                        writer.writeheader()
                    for key, ddict in self.lut.items():
                        writer.writerow(ddict)
                # Write values from dict for header fields, insert '' when missing
                else:
                    writer, outf = get_csv_writer(
                        fname, delimiter, self.encoding, fmode=fmode)
                    # Only write the header on a fresh file, not on append
                    if fmode == 'w':
                        writer.writerow(header)
                    for key, rec in self.lut.items():
                        row = makerow(rec, header)
                        writer.writerow(row)

            # Non-dictionary lookup
            else:
                writer, outf = get_csv_writer(
                    fname, delimiter, self.encoding, fmode=fmode)
                if fmode == 'w' and header is not None:
                    writer.writerow(header)
                if self.valtype in (VAL_TYPE.SET, VAL_TYPE.TUPLE):
                    for key, val in self.lut.items():
                        row = [k for k in val]
                        row.insert(0, key)
                        writer.writerow(row)
        except Exception as e:
            print('Failed to write data to {}, ({})'.format(fname, e))
        finally:
            if outf is not None:
                outf.close()
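Example #2 also leans on two helpers not shown here. Sketches under the same assumptions: get_csv_dict_writer as a thin csv.DictWriter wrapper, and makerow matching the "insert '' when missing" comment above; both signatures are inferred, not confirmed:

import csv

def get_csv_dict_writer(fname, delimiter, encoding, header, fmode='w'):
    # Assumed DictWriter counterpart of get_csv_writer, keyed on the
    # supplied header fields.
    outf = open(fname, fmode, encoding=encoding, newline='')
    writer = csv.DictWriter(outf, fieldnames=header, delimiter=delimiter)
    return writer, outf

def makerow(rec, header):
    # Pull the header fields out of a record dict, inserting '' for
    # any field the record lacks.
    return [rec.get(fld, '') for fld in header]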
Example #3
    def _switchOutput(self, currname, basename, idx):
        # close this chunk and start new
        self.closeOne(currname)
        idx += 1
        newname = '{}_{}.csv'.format(basename, idx)
        # Get writer and save open file for later closing
        writer, outf = get_csv_writer(newname,
                                      self.delimiter,
                                      ENCODING,
                                      doAppend=True)
        self._files[newname] = outf

        return writer, newname, idx
Example #4
    def merge(self):
        """
        @summary: Merge sorted files into a single larger sorted file.
        """
        rdrRecs = self._getSplitReadersFirstRecs()
        writer, outf = get_csv_writer(self.tidyfile, self.delimiter)
        self._files[self.tidyfile] = outf

        rec = self._getHeader()
        while rec is not None:
            writer.writerow(rec)
            rec = self._getSmallestRec(rdrRecs)
        self.closeOne(self.tidyfile)
Example #5
    def gather_groupvals(self, fname):
        """
        @summary: Count the records in each contiguous group of sorted data,
                  then write a groupvalue/count summary to fname.
        """
        groups = {}
        inf = None
        try:
            reader, inf = get_csv_reader(self.messyfile, self.delimiter,
                                         ENCODING)
            # Skip the header row
            next(reader)

            grpval = None
            grpcount = 0
            for row in reader:
                try:
                    currval = row[self.sort_idx]
                except Exception:
                    self._log.warning(
                        'Failed to get column {} from record {}'.format(
                            self.sort_idx, reader.line_num))
                else:
                    if grpval is None:
                        grpval = currval
                    if currval != grpval:
                        self._log.info(
                            'Start new group {} on record {}'.format(
                                currval, reader.line_num))
                        try:
                            groups[grpval] += grpcount
                        except KeyError:
                            groups[grpval] = grpcount
                        grpcount = 1
                        grpval = currval
                    else:
                        grpcount += 1
            # Record the count for the final group, which the loop never flushes
            if grpval is not None:
                try:
                    groups[grpval] += grpcount
                except KeyError:
                    groups[grpval] = grpcount
        except Exception as e:
            self._log.error('Failed to read {}, ({})'.format(
                self.messyfile, e))
        finally:
            if inf is not None:
                inf.close()

        outf = None
        try:
            writer, outf = get_csv_writer(fname, self.delimiter, ENCODING)
            writer.writerow(['groupvalue', 'count'])
            for grpval, grpcount in groups.items():
                writer.writerow([grpval, grpcount])
        except Exception as e:
            self._log.error('Failed to write {}, ({})'.format(fname, e))
        finally:
            if outf is not None:
                outf.close()
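The reader helper is likewise unshown; a sketch assuming the same thin wrapper style:

import csv

def get_csv_reader(fname, delimiter, encoding):
    # Assumed counterpart of get_csv_writer: return the csv reader
    # together with the open file handle so the caller controls closing.
    inf = open(fname, 'r', encoding=encoding, newline='')
    reader = csv.reader(inf, delimiter=delimiter)
    return reader, inf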
Example #6
    def _get_provider_file(self, resource_id, resource_url, unique_providers):
        """
        @summary: Get or create a CSV writer for the given resource,
                  caching writers and open files for reuse and cleanup.
        """
        try:
            writer, outf = unique_providers[(resource_id, resource_url)]
        except KeyError:
            outfname = os.path.join(self.pth,
                                    resource_id.replace(',', '_') + '.csv')
            if os.path.exists(outfname):
                fmode = 'a'
            else:
                fmode = 'w'
            writer, outf = get_csv_writer(outfname, self.delimiter, ENCODING,
                                          fmode)
            # Cache the new writer so the next lookup for this resource hits
            unique_providers[(resource_id, resource_url)] = (writer, outf)
            self._files[outfname] = outf
        return writer
Example #7
    def fix_bison_data(self, infile, outfile, resource_key, resource_pvals):
        if not os.path.exists(infile):
            raise Exception('File {} does not exist'.format(infile))

        action = resource_pvals['action']
        new_res_id = resource_pvals['resource_id']
        const_res_name = resource_pvals['resource_name']
        const_res_url = resource_pvals['resource_url']
        if not const_res_name:
            raise Exception('{} (resource_id {}) must have a resource_name'.format(
                resource_key, new_res_id))

        if action in PROVIDER_ACTIONS:
            # Step 1: rewrite with updated resource/provider values
            self.loginfo("""{} for ticket {},
                infile {} to outfile {}
                with name {}, id {}""".format(action, resource_key, infile,
                                              outfile, const_res_name,
                                              new_res_id))

            dl_fields = list(BISON2020_FIELD_DEF.keys())
            # Initialize so the finally-block closes cannot hit unbound names
            inf = outf = None
            try:
                # Open incomplete BISON CSV file as input
                dict_reader, inf = get_csv_dict_reader(infile, BISON_DELIMITER,
                                                       ENCODING)
                header = next(dict_reader)
                csv_writer, outf = get_csv_writer(outfile, BISON_DELIMITER,
                                                  ENCODING)
                csv_writer.writerow(header)
                recno = 0
                for rec in dict_reader:
                    recno += 1
                    self._remove_internal_delimiters(rec)

                    row = makerow(rec, dl_fields)
                    csv_writer.writerow(row)
            finally:
                # Close both files whether or not the rewrite succeeded;
                # exceptions still propagate to the caller
                if inf is not None:
                    inf.close()
                if outf is not None:
                    outf.close()
        else:
            self.loginfo('Unknown action {} for input {}, ({})'.format(
                action, const_res_name, resource_key))
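A sketch of the dict-reader helper under the same assumptions. Caveat: if it really is a plain csv.DictReader, then header = next(dict_reader) in Example #7 yields the first data record rather than the header row, so the actual helper may treat the header line differently:

import csv

def get_csv_dict_reader(fname, delimiter, encoding):
    # Assumed wrapper; csv.DictReader consumes the first row as its
    # fieldnames automatically.
    inf = open(fname, 'r', encoding=encoding, newline='')
    reader = csv.DictReader(inf, delimiter=delimiter)
    return reader, inf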
Example #8
    def split_sorted(self):
        """
        @summary: Split original data file with chunks of sorted data into
                  multiple sorted files.
        @note: Replicate the original header on each smaller sorted file
        """
        reader, inf = get_csv_reader(self.messyfile, self.delimiter, ENCODING)
        self._files[self.messyfile] = inf
        header = next(reader)

        splitIdx = 0
        splitname = '{}_{}.csv'.format(self.splitBase, splitIdx)
        writer, outf = get_csv_writer(splitname, self.delimiter, ENCODING)
        self._files[splitname] = outf
        writer.writerow(header)

        currid = -1
        for row in reader:
            currid += 1
            try:
                gbifid = int(row[self.sort_idx])
            except Exception:
                self._log.warning(
                    'Sort column value {} is not an integer on record {}'.format(
                        row[self.sort_idx], reader.line_num))
            else:
                if gbifid >= currid:
                    writer.writerow(row)
                else:
                    self._log.info('Start new chunk on record {}'.format(
                        reader.line_num))
                    # close this chunk and start new
                    writer, splitname, splitIdx = \
                            self._switchOutput(splitname, self.splitBase, splitIdx)
                    writer.writerow(header)
                    writer.writerow(row)
                currid = gbifid
        self.closeOne(self.messyfile)
Example #9
    def write_resolved_taxkeys(self, lut_fname, name_fails, nametaxa):
        """
        @summary: Create lookup table for:
                  BISON canonicalName from GBIF scientificName and/or taxonKey
        """
        csvwriter, f = get_csv_writer(lut_fname,
                                      BISON_DELIMITER,
                                      ENCODING,
                                      fmode='a')
        count = 0
        tax_resolved = []
        gbifapi = GbifAPI()
        try:
            for badname in name_fails:
                taxonkeys = nametaxa[badname]
                for tk in taxonkeys:
                    canonical = gbifapi.find_canonical(taxkey=tk)
                    if canonical is not None:
                        count += 1
                        csvwriter.writerow([tk, canonical])
                        self._log.info(
                            'Appended {} taxonKey/clean_provided_scientific_name to {}'
                            .format(count, lut_fname))
                        tax_resolved.append(badname)
                        break
        except Exception as e:
            self._log.error(
                'Failed to resolve and write taxonKeys, ({})'.format(e))
        finally:
            f.close()
        self._log.info(
            'Wrote {} taxkey/canonical pairs ({} failed) to {}'.format(
                len(tax_resolved),
                len(name_fails) - len(tax_resolved), lut_fname))
        for tres in tax_resolved:
            name_fails.remove(tres)
        return name_fails
Example #10
    def _get_group_file(self, grpval):
        basefname = '{}_{}.csv'.format(self._dataname, grpval)
        grp_fname = os.path.join(self._basepath, basefname)
        writer, outf = get_csv_writer(grp_fname, self.delimiter, ENCODING)
        self._files[grp_fname] = outf
        return writer