def test(self):
    """
    @summary: Test merged/sorted file
    """
    self._log.info('Testing file {}'.format(self.tidyfile))
    reccount = 0
    reader, inf = get_csv_reader(self.tidyfile, self.delimiter, ENCODING)
    self._files[self.tidyfile] = inf
    header = next(reader)
    if header[self.sort_idx] != 'gbifID':
        self._log.error('Bad header in {}'.format(self.tidyfile))
    currid = 0
    for row in reader:
        reccount += 1
        try:
            gbifid = int(row[self.sort_idx])
        except ValueError:
            self._log.error('Bad gbifID on rec {}'.format(reader.line_num))
        else:
            if gbifid < currid:
                self._log.error('Bad sort gbifID {} on rec {}'.format(
                    gbifid, reader.line_num))
                break
            elif gbifid == currid:
                self._log.error('Duplicate gbifID {} on rec {}'.format(
                    gbifid, reader.line_num))
            else:
                currid = gbifid
    self._log.info('File contained {} records'.format(reccount))
    self.closeOne(self.tidyfile)

def write_group_files(self):
    """
    @summary: Split large file into multiple files, each containing a header
              and records of a single group value.
    @note: The number of group files must be small enough for the system to
           have them all open at the same time.
    @note: Use "gather" to evaluate the dataset first.
    """
    try:
        reader, inf = get_csv_reader(self.messyfile, self.delimiter, ENCODING)
        header = next(reader)
        # {groupval: csvwriter}
        groupfiles = {}
        for row in reader:
            try:
                grpval = row[self.sort_idx]
            except Exception:
                self._log.warning(
                    'Failed to get column {} from record {}'.format(
                        self.sort_idx, reader.line_num))
            else:
                try:
                    wtr = groupfiles[grpval]
                except KeyError:
                    # First record for this group: open its file and
                    # replicate the header
                    wtr = self._get_group_file(grpval)
                    groupfiles[grpval] = wtr
                    wtr.writerow(header)
                wtr.writerow(row)
    finally:
        inf.close()

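# A minimal usage sketch (hypothetical driver code; the owning class name
# and constructor arguments are assumptions, as they are not shown here).
# The summary written by gather_groupvals lets the caller confirm that the
# number of distinct group values is small enough before write_group_files
# opens one file handle per group:
#
#     sorter = Sorter('occurrence.csv', delimiter='\t', sort_col='gbifID')
#     sorter.gather_groupvals('group_counts.csv')
#     sorter.write_group_files()
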
def _read_sortvals(self, group_cols):
    """
    @summary: Gather the unique values of the group columns from the
              messy file.
    """
    self._log.info('Gathering unique sort values from file {}'.format(
        self.messyfile))
    reader, inf = get_csv_reader(self.messyfile, self.indelimiter, ENCODING)
    group_idxs = self._get_sortidxs(reader, group_cols)
    sortvals = set()
    try:
        for row in reader:
            vals = [row[idx] for idx in group_idxs]
            sortvals.add(tuple(vals))
    except Exception as e:
        self._log.error('Exception reading infile {}: {}'.format(
            self.messyfile, e))
    finally:
        inf.close()
    self._log.info('File contained {} unique sort values'.format(
        len(sortvals)))
    return sortvals

def read_lookup(self, fname, prioritized_keyfld_lst, delimiter,
                ignore_quotes=True):
    """
    @summary: Read and populate dictionary with key = uuid and
              val = dictionary of record values
    """
    if os.path.exists(fname):
        if self.valtype == VAL_TYPE.DICT:
            try:
                rdr, inf = get_csv_dict_reader(
                    fname, delimiter, self.encoding,
                    ignore_quotes=ignore_quotes)
            except Exception as e:
                print('Failed reading data in {}: {}'.format(fname, e))
            else:
                try:
                    for data in rdr:
                        # Key each record by its first non-empty key field
                        datakey = None
                        for keyfld in prioritized_keyfld_lst:
                            datakey = data[keyfld]
                            if datakey:
                                self.lut[datakey] = data
                                break
                        if not datakey:
                            print('No key among {} for record {}'.format(
                                prioritized_keyfld_lst, data))
                finally:
                    inf.close()
        elif self.valtype == VAL_TYPE.SET:
            recno = 0
            try:
                rdr, inf = get_csv_reader(fname, delimiter, self.encoding)
            except Exception as e:
                print('Failed reading data in {}: {}'.format(fname, e))
            else:
                try:
                    # Skip the header line
                    line, recno = getLine(rdr, recno)
                    # Read lookup values into dictionary
                    while line is not None:
                        line, recno = getLine(rdr, recno)
                        if line and len(line) > 0:
                            try:
                                # First item is scientificName,
                                # the rest are taxonKeys
                                self.lut[line[0]] = set(line[1:])
                            except Exception:
                                print('Failed to parse line {} {}'.format(
                                    recno, line))
                except Exception as e:
                    print('Failed reading data in {}: {}'.format(fname, e))
                finally:
                    inf.close()

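# A small illustration of the prioritized-key logic above (hypothetical
# call; the 'Lookup' constructor and the field names are assumptions):
#
#     lookup = Lookup(valtype=VAL_TYPE.DICT, encoding=ENCODING)
#     lookup.read_lookup('names.csv', ['acceptedTaxonKey', 'taxonKey'], ',')
#
# Each record is indexed by its first non-empty key field, so a record with
# an acceptedTaxonKey is keyed by that value even when taxonKey is also
# populated.
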
def gather_groupvals(self, fname):
    """
    @summary: Count the records in each chunk of sorted data in the messy
              file, and write the group values with their counts to a
              summary file.
    @note: Use this to evaluate the dataset before write_group_files.
    """
    groups = {}
    try:
        reader, inf = get_csv_reader(self.messyfile, self.delimiter, ENCODING)
        next(reader)    # skip the header
        grpval = None
        grpcount = 0
        for row in reader:
            try:
                currval = row[self.sort_idx]
            except Exception:
                self._log.warning(
                    'Failed to get column {} from record {}'.format(
                        self.sort_idx, reader.line_num))
            else:
                if grpval is None:
                    grpval = currval
                if currval != grpval:
                    self._log.info(
                        'Start new group {} on record {}'.format(
                            currval, reader.line_num))
                    groups[grpval] = groups.get(grpval, 0) + grpcount
                    grpcount = 1
                    grpval = currval
                else:
                    grpcount += 1
        # Record the count for the final group
        if grpval is not None:
            groups[grpval] = groups.get(grpval, 0) + grpcount
    except Exception as e:
        self._log.error('Exception reading infile {}: {}'.format(
            self.messyfile, e))
    finally:
        inf.close()

    try:
        writer, outf = get_csv_writer(fname, self.delimiter, ENCODING)
        writer.writerow(['groupvalue', 'count'])
        for grpval, grpcount in groups.items():
            writer.writerow([grpval, grpcount])
    except Exception as e:
        self._log.error('Exception writing outfile {}: {}'.format(fname, e))
    finally:
        outf.close()

def split_sorted(self):
    """
    @summary: Split original data file with chunks of sorted data into
              multiple sorted files.
    @note: Replicate the original header on each smaller sorted file
    """
    reader, inf = get_csv_reader(self.messyfile, self.delimiter, ENCODING)
    self._files[self.messyfile] = inf
    header = next(reader)
    splitIdx = 0
    splitname = '{}_{}.csv'.format(self.splitBase, splitIdx)
    writer, outf = get_csv_writer(splitname, self.delimiter, ENCODING)
    self._files[splitname] = outf
    writer.writerow(header)
    currid = -1
    for row in reader:
        try:
            gbifid = int(row[self.sort_idx])
        except ValueError:
            self._log.warning(
                'Sort column value {} is not an integer on record {}'.format(
                    row[self.sort_idx], reader.line_num))
        else:
            if gbifid >= currid:
                writer.writerow(row)
            else:
                self._log.info('Start new chunk on record {}'.format(
                    reader.line_num))
                # Close this chunk and start a new one
                writer, splitname, splitIdx = self._switchOutput(
                    splitname, self.splitBase, splitIdx)
                writer.writerow(header)
                writer.writerow(row)
            # Track the last gbifID seen; a decrease marks a chunk boundary
            currid = gbifid
    self.closeOne(self.messyfile)

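# For example (hypothetical IDs), sort-column values of 10, 12, 15, 3, 7, 20
# in the messy file produce two split files: one for the sorted run
# [10, 12, 15] and one for [3, 7, 20], because the drop from 15 to 3 marks
# the start of a new chunk.
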
def _getSplitReadersFirstRecs(self):
    """
    @summary: Find, open, and get CSVReaders for all split files.
    """
    rdrRecs = {}
    idx = 0
    splitname = '{}_{}.csv'.format(self.splitBase, idx)
    while os.path.exists(splitname):
        reader, inf = get_csv_reader(splitname, self.delimiter, ENCODING)
        self._files[splitname] = inf
        row = next(reader)
        # If a header is present, the sort field will not be an integer,
        # so move to the first data record
        try:
            int(row[self.sort_idx])
        except ValueError:
            row = next(reader)
        rdrRecs[splitname] = (reader, row)
        # Increment to the next split file
        idx += 1
        splitname = '{}_{}.csv'.format(self.splitBase, idx)
    return rdrRecs

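# A sketch of how the (reader, first-record) map returned above could drive
# a k-way merge of the split files. This is illustrative only, not this
# module's actual merge implementation; it assumes every row compares by the
# integer value in the sort column:
#
#     import heapq
#
#     def _merge_split_rows(self, rdrRecs):
#         heap = [(int(row[self.sort_idx]), name, row)
#                 for name, (rdr, row) in rdrRecs.items()]
#         heapq.heapify(heap)
#         while heap:
#             gbifid, name, row = heapq.heappop(heap)
#             yield row
#             rdr = rdrRecs[name][0]
#             nextrow = next(rdr, None)
#             if nextrow is not None:
#                 heapq.heappush(
#                     heap, (int(nextrow[self.sort_idx]), name, nextrow))
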
def _getHeader(self):
    reader, inf = get_csv_reader(self.messyfile, self.delimiter, ENCODING)
    header = next(reader)
    inf.close()
    return header

def _get_header(self):
    reader, inf = get_csv_reader(self.messyfile, self.indelimiter, ENCODING)
    header = next(reader)
    inf.close()
    return header