Beispiel #1
0
    def test(self):
        """
        @summary: Test merged/sorted file: check that the header names the
                  sort column 'gbifID', then verify that the integer values
                  in that column are strictly increasing.
        @note: Problems are reported through the logger; bad data does not
               raise.  Checking stops at the first out-of-order record, while
               duplicate or non-integer values are logged and checking
               continues.
        @note: The input file is closed even if reading fails part way.
        """
        self._log.info('Testing file {}'.format(self.tidyfile))
        reccount = 0
        reader, outf = get_csv_reader(self.tidyfile, self.delimiter, ENCODING)
        self._files[self.tidyfile] = outf
        try:
            # An empty file yields no header at all; treat that like a bad
            # header instead of letting StopIteration escape.
            header = next(reader, [])
            try:
                sort_field = header[self.sort_idx]
            except IndexError:
                sort_field = None
            if sort_field != 'gbifID':
                self._log.error('Bad header in {}'.format(self.tidyfile))

            currid = 0
            for row in reader:
                reccount += 1
                try:
                    gbifid = int(row[self.sort_idx])
                except (ValueError, IndexError):
                    # Non-integer value, or the row is too short to hold the
                    # sort column.
                    self._log.error(
                        'Bad gbifID on rec {}'.format(reader.line_num))
                    continue

                if gbifid < currid:
                    self._log.error('Bad sort gbifID {} on rec {}'.format(
                        gbifid, reader.line_num))
                    break
                if gbifid == currid:
                    self._log.error('Duplicate gbifID {} on rec {}'.format(
                        gbifid, reader.line_num))
                else:
                    currid = gbifid

            self._log.info('File contained {} records'.format(reccount))
        finally:
            self.closeOne(self.tidyfile)
Beispiel #2
0
    def write_group_files(self):
        """
        @summary: Split large file into multiple files, each containing a header
                  and records of a single group value.
        @note: The number of group files must be small enough for the system to
               have them all open at the same time.
        @note: Use "gather" to evaluate the dataset first.
        @note: Records that lack the group column are logged and skipped.
        """
        # Open outside the try block: if opening fails there is nothing to
        # close, and the original error must not be masked by a NameError in
        # the cleanup code.
        reader, inf = get_csv_reader(self.messyfile, self.delimiter, ENCODING)
        try:
            header = next(reader)
            # {groupval: csvwriter}
            groupfiles = {}
            for row in reader:
                try:
                    grpval = row[self.sort_idx]
                except Exception:
                    self._log.warn(
                        'Failed to get column {} from record {}'.format(
                            self.sort_idx, reader.line_num))
                    continue

                try:
                    wtr = groupfiles[grpval]
                except KeyError:
                    # First record of this group: open its file and write
                    # the header before any data.
                    wtr = self._get_group_file(grpval)
                    groupfiles[grpval] = wtr
                    wtr.writerow(header)

                wtr.writerow(row)
        finally:
            # NOTE(review): only the input file is closed here; the files
            # behind the group writers are presumably tracked and closed by
            # _get_group_file's owner -- confirm.
            inf.close()
Beispiel #3
0
    def _read_sortvals(self, group_cols):
        """
        @summary: Collect the distinct combinations of values found in the
                  group columns of the input file.
        @param group_cols: columns to gather; passed to _get_sortidxs, which
               resolves them to positional indexes in each row.
        @return: set of tuples, one per unique combination of values.
        @note: An error while iterating over the records is logged and the
               values gathered so far are returned.  An error raised by
               _get_sortidxs propagates, but the input file is still closed.
        """
        self._log.info('Gathering unique sort values from file {}'.format(
            self.messyfile))
        reader, inf = get_csv_reader(self.messyfile, self.indelimiter,
                                     ENCODING)
        sortvals = set()
        try:
            group_idxs = self._get_sortidxs(reader, group_cols)
            try:
                for row in reader:
                    sortvals.add(tuple(row[idx] for idx in group_idxs))
            except Exception as e:
                self._log.error('Exception reading infile {}: {}'.format(
                    self.messyfile, e))
        finally:
            inf.close()
        self._log.info('File contained {} unique sort values'.format(
            len(sortvals)))
        return sortvals
Beispiel #4
0
    def read_lookup(self, fname, prioritized_keyfld_lst, delimiter, ignore_quotes=True):
        '''
        @summary: Read a delimited file and populate the lookup table
                  self.lut.  Does nothing if fname does not exist.
        @param fname: path of the file to read.
        @param prioritized_keyfld_lst: field names tried in order for each
               record (VAL_TYPE.DICT only); the first non-empty value becomes
               the key for that record.
        @param delimiter: field delimiter of the file.
        @param ignore_quotes: passed through to get_csv_dict_reader
               (VAL_TYPE.DICT only).
        @note: For VAL_TYPE.DICT each value is the record dictionary; for
               VAL_TYPE.SET the key is the first item of a line and the value
               is the set of the remaining items.
        @note: Failures are printed, not raised, except for a record missing
               one of the key fields in VAL_TYPE.DICT mode (KeyError).
        '''
        # These counters are never updated below; they are kept only because
        # they are part of the printed summary line.
        no_old_legacy = 0
        no_new_legacy = 0
        if os.path.exists(fname):
            if self.valtype == VAL_TYPE.DICT:
                # Stays None if the reader cannot be opened, so cleanup does
                # not fail on an unbound name.
                inf = None
                try:
                    rdr, inf = get_csv_dict_reader(
                        fname, delimiter, self.encoding,
                        ignore_quotes=ignore_quotes)
                except Exception as e:
                    print('Failed reading data in {}: {}'
                                    .format(fname, e))
                else:
                    for data in rdr:
                        # Reset per record so an empty key-field list cannot
                        # leave these names unbound.
                        datakey = None
                        keyfld = None
                        for keyfld in prioritized_keyfld_lst:
                            datakey = data[keyfld]
                            if datakey:
                                self.lut[datakey] = data
                                break
                        if not datakey:
                            print('No {} for record {}'.format(keyfld, data))
                finally:
                    if inf is not None:
                        inf.close()
                print('no_old_legacy {}  no_new_legacy (default -9999) {}'
                      .format(no_old_legacy, no_new_legacy))

            elif self.valtype == VAL_TYPE.SET:
                recno = 0
                inf = None
                try:
                    rdr, inf = get_csv_reader(fname, delimiter, self.encoding)
                    # get header
                    line, recno = getLine(rdr, recno)
                    # read lookup vals into dictionary
                    while line is not None:
                        line, recno = getLine(rdr, recno)
                        if line:
                            try:
                                # First item is scientificName, rest are taxonKeys
                                self.lut[line[0]] = set(line[1:])
                            except Exception:
                                print('Failed to parse line {} {}'
                                               .format(recno, line))
                except Exception as e:
                    print('Failed reading data in {}: {}'
                                    .format(fname, e))
                finally:
                    if inf is not None:
                        inf.close()
Beispiel #5
0
    def gather_groupvals(self, fname):
        """
        @summary: Count the records for each value of the group column in the
                  input file and write the totals to a CSV file.
        @param fname: path of the output file; it receives a
               'groupvalue','count' header followed by one row per group
               value.
        @note: Records are counted in runs of consecutive identical values;
               runs of the same value are summed, so the input does not need
               to be fully sorted.
        @note: Records that lack the group column are logged and skipped.
               Read and write failures are logged rather than raised, and
               whatever was counted before a read failure is still written.
        """
        groups = {}
        inf = None
        try:
            reader, inf = get_csv_reader(self.messyfile, self.delimiter,
                                         ENCODING)
            # Skip the header record.
            next(reader)

            grpval = None
            grpcount = 0
            for row in reader:
                try:
                    currval = row[self.sort_idx]
                except Exception:
                    self._log.warn(
                        'Failed to get column {} from record {}'.format(
                            self.sort_idx, reader.line_num))
                    continue

                if grpval is None:
                    grpval = currval
                if currval != grpval:
                    self._log.info(
                        'Start new group {} on record {}'.format(
                            currval, reader.line_num))
                    groups[grpval] = groups.get(grpval, 0) + grpcount
                    grpcount = 1
                    grpval = currval
                else:
                    grpcount += 1

            # Flush the run still being counted when the file ends;
            # otherwise the last group would be missing from the output.
            if grpval is not None:
                groups[grpval] = groups.get(grpval, 0) + grpcount
        except Exception as e:
            self._log.error('Failed to gather group values from {}: {}'.format(
                self.messyfile, e))
        finally:
            if inf is not None:
                inf.close()

        outf = None
        try:
            writer, outf = get_csv_writer(fname, self.delimiter, ENCODING)
            writer.writerow(['groupvalue', 'count'])
            for grpval, grpcount in groups.items():
                writer.writerow([grpval, grpcount])
        except Exception as e:
            self._log.error('Failed to write group values to {}: {}'.format(
                fname, e))
        finally:
            if outf is not None:
                outf.close()
Beispiel #6
0
    def split_sorted(self):
        """
        @summary: Split original data file with chunks of sorted data into
                  multiple sorted files.
        @note: Replicate the original header on each smaller sorted file
        @note: A new chunk starts whenever the integer value in the sort
               column is not strictly greater than that of the previously
               written record.
        @note: Records whose sort value is missing or not an integer are
               logged and dropped; they do not affect where chunks break.
        """
        reader, inf = get_csv_reader(self.messyfile, self.delimiter, ENCODING)
        self._files[self.messyfile] = inf
        try:
            header = next(reader)

            splitIdx = 0
            splitname = '{}_{}.csv'.format(self.splitBase, splitIdx)
            writer, outf = get_csv_writer(splitname, self.delimiter, ENCODING)
            self._files[splitname] = outf
            writer.writerow(header)

            # Sort value of the last record written; -1 lets any
            # non-negative first id stay in the first chunk.
            previd = -1
            for row in reader:
                # Fetch the raw value separately so the warning below can
                # never fail on a row that is too short.
                try:
                    rawval = row[self.sort_idx]
                except IndexError:
                    rawval = None
                try:
                    gbifid = int(rawval)
                except (TypeError, ValueError):
                    self._log.warn(
                        'First column {} is not an integer on record {}'.format(
                            rawval, reader.line_num))
                    continue

                if gbifid <= previd:
                    self._log.info('Start new chunk on record {}'.format(
                        reader.line_num))
                    # close this chunk and start new
                    writer, splitname, splitIdx = \
                            self._switchOutput(splitname, self.splitBase, splitIdx)
                    writer.writerow(header)
                writer.writerow(row)
                previd = gbifid
        finally:
            # NOTE(review): only the input file is closed here; the last
            # chunk file is presumably closed later through self._files --
            # confirm.
            self.closeOne(self.messyfile)
Beispiel #7
0
    def _getSplitReadersFirstRecs(self):
        """
        @summary: Find, open, and get CSVReaders for all split files.
        @return: dictionary {split filename: (reader, first data record)}
        @note: Split files are named '<splitBase>_<n>.csv' and are looked up
               for n = 0, 1, 2, ... until a name does not exist.
        @note: Each opened file is registered in self._files and left open.
        @note: StopIteration propagates if a split file is empty or contains
               only a header.
        """
        rdrRecs = {}
        idx = 0
        splitname = '{}_{}.csv'.format(self.splitBase, idx)
        while os.path.exists(splitname):
            reader, inf = get_csv_reader(splitname, self.delimiter, ENCODING)
            self._files[splitname] = inf
            row = next(reader)
            # If header is present, the sort field will not be an integer
            # (or the row is too short), so move to the next record
            try:
                int(row[self.sort_idx])
            except (ValueError, IndexError):
                row = next(reader)

            rdrRecs[splitname] = (reader, row)
            # increment file
            idx += 1
            splitname = '{}_{}.csv'.format(self.splitBase, idx)
        return rdrRecs
Beispiel #8
0
 def _getHeader(self):
     """
     @summary: Return the first record of the input file.
     @note: The file is closed even when reading fails; StopIteration
            propagates if the file has no records.
     """
     reader, inf = get_csv_reader(self.messyfile, self.delimiter)
     try:
         return next(reader)
     finally:
         inf.close()
Beispiel #9
0
 def _get_header(self):
     """
     @summary: Return the first record of the input file.
     @note: The file is closed even when reading fails; StopIteration
            propagates if the file has no records.
     """
     reader, inf = get_csv_reader(self.messyfile, self.indelimiter,
                                  ENCODING)
     try:
         return next(reader)
     finally:
         inf.close()