def _open_group_file(self, grpval, out_delimiter):
    # Open a CSV file for this group value and cache the open handle for later closing
    basefname = '{}_{}.csv'.format(self._dataname, grpval)
    grp_fname = os.path.join(self._basepath, basefname)
    writer, outf = get_csv_writer(grp_fname, out_delimiter, ENCODING)
    writer.writerow(self.header)
    self._files[grp_fname] = outf
    return writer
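# NOTE: the get_csv_writer / get_csv_reader helpers are assumed, from the call
# sites in this module, to open the file and return a (csv writer/reader,
# open file handle) pair so the caller can close the handle later. A minimal,
# hypothetical sketch of that assumed behavior (not the project's actual
# implementation; the name _sketch_get_csv_writer is illustrative only):
import csv

def _sketch_get_csv_writer(datafile, delimiter, encoding, fmode='w'):
    # Open the file and wrap it in a csv.writer; the caller owns the handle
    outf = open(datafile, fmode, encoding=encoding, newline='')
    writer = csv.writer(outf, delimiter=delimiter)
    return writer, outf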
def write_lookup(self, fname, header, delimiter):
    # Write scientific names and taxonKeys found with them in raw data
    fmode = 'w'
    if os.path.exists(fname):
        fmode = 'a'
    outf = None
    try:
        if self.valtype == VAL_TYPE.DICT:
            # Write all vals in dict; assumes each dictionary value has the same keys
            if header is None:
                header = self._get_field_names()
                writer, outf = get_csv_dict_writer(
                    fname, delimiter, self.encoding, header, fmode=fmode)
                if fmode == 'w':
                    writer.writeheader()
                for key, ddict in self.lut.items():
                    writer.writerow(ddict)
            # Write values from dict for header fields, insert '' when missing
            else:
                writer, outf = get_csv_writer(
                    fname, delimiter, self.encoding, fmode=fmode)
                writer.writerow(header)
                for key, rec in self.lut.items():
                    row = makerow(rec, header)
                    writer.writerow(row)
        # Non-dictionary lookup
        else:
            writer, outf = get_csv_writer(
                fname, delimiter, self.encoding, fmode=fmode)
            if fmode == 'w' and header is not None:
                writer.writerow(header)
            if self.valtype in (VAL_TYPE.SET, VAL_TYPE.TUPLE):
                for key, val in self.lut.items():
                    row = [key] + list(val)
                    writer.writerow(row)
    except Exception as e:
        print('Failed to write data to {}, ({})'.format(fname, e))
    finally:
        # Close the output file only if it was successfully opened
        if outf is not None:
            outf.close()
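# NOTE: makerow is assumed, from its use above, to build an ordered row from a
# record dict for the given header fields, inserting '' for missing values.
# A hypothetical sketch of that behavior (illustration only):
def _sketch_makerow(rec, header):
    # One output column per header field, empty string when the field is absent
    return [rec.get(fld, '') for fld in header]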
def _switchOutput(self, currname, basename, idx):
    # Close this chunk and start a new one
    self.closeOne(currname)
    idx += 1
    newname = '{}_{}.csv'.format(basename, idx)
    # Get writer and save open file for later closing
    writer, outf = get_csv_writer(
        newname, self.delimiter, ENCODING, doAppend=True)
    self._files[newname] = outf
    return writer, newname, idx
def merge(self):
    """
    @summary: Merge sorted files into a single larger sorted file.
    """
    rdrRecs = self._getSplitReadersFirstRecs()
    writer, outf = get_csv_writer(self.tidyfile, self.delimiter, ENCODING)
    self._files[self.tidyfile] = outf
    # Write the header, then repeatedly pull the smallest record from the
    # open split-file readers until all are exhausted
    rec = self._getHeader()
    while rec is not None:
        writer.writerow(rec)
        rec = self._getSmallestRec(rdrRecs)
    self.closeOne(self.tidyfile)
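# The merge step above repeatedly pulls the smallest current record across the
# open split-file readers (_getSmallestRec). A minimal, self-contained sketch
# of that k-way merge idea using heapq, assuming each split file is already
# sorted on sort_idx and carries a replicated header (illustration only, not
# the class's implementation; _sketch_merge_sorted is a hypothetical name):
import csv
import heapq

def _sketch_merge_sorted(split_paths, out_path, sort_idx, delimiter=','):
    files = [open(p, newline='') for p in split_paths]
    readers = []
    for f in files:
        rdr = csv.reader(f, delimiter=delimiter)
        next(rdr)                      # skip the replicated header
        readers.append(rdr)
    with open(out_path, 'w', newline='') as outf:
        writer = csv.writer(outf, delimiter=delimiter)
        # heapq.merge keeps only one pending row per reader in memory
        for row in heapq.merge(*readers, key=lambda r: int(r[sort_idx])):
            writer.writerow(row)
    for f in files:
        f.close()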
def gather_groupvals(self, fname):
    """
    @summary: Count the records for each group value in the original data
              file, then write each group value and its count to a summary file.
    """
    groups = {}
    inf = None
    try:
        reader, inf = get_csv_reader(self.messyfile, self.delimiter, ENCODING)
        header = next(reader)  # skip header
        grpval = None
        grpcount = 0
        for row in reader:
            try:
                currval = row[self.sort_idx]
            except Exception as e:
                self._log.warn(
                    'Failed to get column {} from record {}'.format(
                        self.sort_idx, reader.line_num))
            else:
                if grpval is None:
                    grpval = currval
                if currval != grpval:
                    self._log.info(
                        'Start new group {} on record {}'.format(
                            currval, reader.line_num))
                    try:
                        groups[grpval] += grpcount
                    except KeyError:
                        groups[grpval] = grpcount
                    grpcount = 1
                    grpval = currval
                else:
                    grpcount += 1
        # Record the count for the final group
        if grpval is not None:
            try:
                groups[grpval] += grpcount
            except KeyError:
                groups[grpval] = grpcount
    except Exception as e:
        self._log.warn('Failed to read {}, ({})'.format(self.messyfile, e))
    finally:
        if inf is not None:
            inf.close()
    outf = None
    try:
        writer, outf = get_csv_writer(fname, self.delimiter, ENCODING)
        writer.writerow(['groupvalue', 'count'])
        for grpval, grpcount in groups.items():
            writer.writerow([grpval, grpcount])
    except Exception as e:
        self._log.warn('Failed to write {}, ({})'.format(fname, e))
    finally:
        if outf is not None:
            outf.close()
def _get_provider_file(self, resource_id, resource_url, unique_providers):
    """
    @summary: Return a CSV writer for the output file of this resource,
              using the writer cached in unique_providers if present,
              otherwise opening (or appending to) the file.
    """
    try:
        writer, outf = unique_providers[(resource_id, resource_url)]
    except KeyError:
        outfname = os.path.join(
            self.pth, resource_id.replace(',', '_') + '.csv')
        if os.path.exists(outfname):
            fmode = 'a'
        else:
            fmode = 'w'
        writer, outf = get_csv_writer(outfname, self.delimiter, ENCODING, fmode)
        self._files[outfname] = outf
    return writer
def fix_bison_data(self, infile, outfile, resource_key, resource_pvals):
    if not os.path.exists(infile):
        raise Exception('File {} does not exist'.format(infile))
    action = resource_pvals['action']
    new_res_id = resource_pvals['resource_id']
    const_res_name = resource_pvals['resource_name']
    const_res_url = resource_pvals['resource_url']
    if not const_res_name:
        raise Exception('{} (resource_id {}) must have a resource_name'.format(
            resource_key, new_res_id))
    if action in PROVIDER_ACTIONS:
        # Step 1: rewrite with updated resource/provider values
        self.loginfo(
            '{} for ticket {}, infile {} to outfile {} with name {}, id {}'.format(
                action, resource_key, infile, outfile, const_res_name, new_res_id))
        dl_fields = list(BISON2020_FIELD_DEF.keys())
        inf = outf = None
        try:
            # Open incomplete BISON CSV file as input
            dict_reader, inf = get_csv_dict_reader(
                infile, BISON_DELIMITER, ENCODING)
            # Use the reader's field names for the header so the first data
            # record is not consumed (assumes the helper returns a standard
            # csv.DictReader)
            header = dict_reader.fieldnames
            csv_writer, outf = get_csv_writer(outfile, BISON_DELIMITER, ENCODING)
            csv_writer.writerow(header)
            recno = 0
            for rec in dict_reader:
                recno += 1
                self._remove_internal_delimiters(rec)
                row = makerow(rec, dl_fields)
                csv_writer.writerow(row)
        finally:
            if inf is not None:
                inf.close()
            if outf is not None:
                outf.close()
    else:
        self.loginfo('Unknown action {} for input {}, ({})'.format(
            action, const_res_name, resource_key))
def split_sorted(self):
    """
    @summary: Split original data file with chunks of sorted data into
              multiple sorted files.
    @note: Replicate the original header on each smaller sorted file
    """
    reader, inf = get_csv_reader(self.messyfile, self.delimiter, ENCODING)
    self._files[self.messyfile] = inf
    header = next(reader)

    splitIdx = 0
    splitname = '{}_{}.csv'.format(self.splitBase, splitIdx)
    writer, outf = get_csv_writer(splitname, self.delimiter, ENCODING)
    self._files[splitname] = outf
    writer.writerow(header)

    currid = -1
    for row in reader:
        currid += 1
        try:
            gbifid = int(row[self.sort_idx])
        except Exception:
            self._log.warn(
                'First column {} is not an integer on record {}'.format(
                    row[self.sort_idx], reader.line_num))
        else:
            if gbifid >= currid:
                writer.writerow(row)
            else:
                self._log.info('Start new chunk on record {}'.format(
                    reader.line_num))
                # close this chunk and start new
                writer, splitname, splitIdx = self._switchOutput(
                    splitname, self.splitBase, splitIdx)
                writer.writerow(header)
                writer.writerow(row)
            # Track the last id seen so a decrease signals a new sorted chunk
            currid = gbifid
    self.closeOne(self.messyfile)
def write_resolved_taxkeys(self, lut_fname, name_fails, nametaxa):
    """
    @summary: Create lookup table for: BISON canonicalName from GBIF
              scientificName and/or taxonKey
    """
    csvwriter, f = get_csv_writer(lut_fname, BISON_DELIMITER, ENCODING, fmode='a')
    count = 0
    tax_resolved = []
    gbifapi = GbifAPI()
    try:
        for badname in name_fails:
            # Try each taxonKey found with this name until one resolves to a
            # canonical name
            taxonkeys = nametaxa[badname]
            for tk in taxonkeys:
                canonical = gbifapi.find_canonical(taxkey=tk)
                if canonical is not None:
                    count += 1
                    csvwriter.writerow([tk, canonical])
                    self._log.info(
                        'Appended {} taxonKey/clean_provided_scientific_name to {}'
                        .format(count, lut_fname))
                    tax_resolved.append(badname)
                    break
    except Exception as e:
        self._log.warn('Failed to resolve taxonKeys, ({})'.format(e))
    finally:
        f.close()
    self._log.info(
        'Wrote {} taxkey/canonical pairs ({} failed) to {}'.format(
            len(tax_resolved), len(name_fails) - len(tax_resolved), lut_fname))
    # Remove resolved names from the failure list and return the remainder
    for tres in tax_resolved:
        name_fails.remove(tres)
    return name_fails
def _get_group_file(self, grpval):
    basefname = '{}_{}.csv'.format(self._dataname, grpval)
    grp_fname = os.path.join(self._basepath, basefname)
    writer, outf = get_csv_writer(grp_fname, self.delimiter, ENCODING)
    self._files[grp_fname] = outf
    return writer