Example #1
 def _validate_(self, level='min'):
     if level == 'min':
         nrows = 8
     elif level == 'max':
         nrows = None
     else:
         raise ValueError('Nonstandard level value')
     try:
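         # the first two columns are read as a (feature, feature) MultiIndex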
         frame = pd.read_table(self.path,
                               index_col=(0, 1),
                               sep='\t',
                               nrows=nrows)
     except IndexError:
         raise ValidationError('Only one column in pairwise file')
     if frame.empty:
         raise ValidationError('No data in file')
     if level == 'max':
         # validate
         all_features = set([
             feature for feature_pair in frame.index
             for feature in feature_pair
         ])
         all_feature_pairs = set(
             frozenset(pair) for pair in combinations(all_features, 2))
         all_found_pairs = [frozenset(pair) for pair in frame.index]
         if len(all_found_pairs) != len(set(all_found_pairs)):
             raise ValidationError('pair is repeated')
         if all_feature_pairs != set(all_found_pairs):
             raise ValidationError(
                 'Not all features pairs present in table')
Example #2
    def _validate_(self, level):
        # doi.org/10.1371/journal.pone.0031009
        keys_found = set()

        # Can't self.open(mode='rb'), so we defer to the backing pathlib object
        with self.path.open(mode='rb') as fh:
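            # ijson.parse yields a (prefix, event, value) triple per JSON token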
            root_element = None
            for prefix, event, value in ijson.parse(fh):
                if root_element is None:
                    if event != 'start_map':
                        raise ValidationError('Root element of file must be a '
                                              'JSON object')
                    else:
                        root_element = True

                # Skip parsing attributes that could be prohibitively large
                if prefix.startswith('placements') \
                        or prefix.startswith('tree'):
                    continue

                # Restricted to only checking root-level keys
                if event == 'map_key' and prefix == '':
                    keys_found.add(value)

        if keys_found != self.fields:
            raise ValidationError('Expected the following fields: %s, found '
                                  '%s.' %
                                  (sorted(self.fields), sorted(keys_found)))
Example #3
    def _validate_(self, level):
        with self.open() as fh:
            header, records_seen, is_min = None, 0, level == 'min'
            fh_ = csv.reader(fh, delimiter='\t')
            # 'min' sniffs only the first ten rows; 'max' scans the whole file
            file_ = zip(range(1, 11), fh_) if is_min else enumerate(fh_, 1)
            for i, cells in file_:
                if header is None:
                    if len(cells) < 2:
                        raise ValidationError(
                            'Found header on line %d with the following '
                            'columns: %s (length: %d), expected at least 2 '
                            'columns.' % (i, cells, len(cells)))
                    else:
                        header = cells
                else:
                    if len(cells) != len(header):
                        raise ValidationError(
                            'Line %d has %s cells (%s), expected %s.' %
                            (i, len(cells), cells, len(header)))

                    records_seen += 1

            if records_seen == 0:
                raise ValidationError('No records found in file, only '
                                      'observed comments, blank lines, and/or '
                                      'a header row.')
Example #4
    def _validate_(self, level):
        with self.open() as fh:
            header, records_seen, is_min = None, 0, level == 'min'
            fh_ = csv.reader(fh, delimiter='\t')
            # 'min' sniffs only the first ten rows; 'max' scans the whole file
            file_ = zip(range(1, 11), fh_) if is_min else enumerate(fh_, 1)
            for i, cells in file_:
                if header is None:
                    if len(cells) < 2:
                        raise ValidationError(
                            'Found header on line %d with the following '
                            'columns: %s (length: %d), expected at least 2 '
                            'columns.' % (i, cells, len(cells)))
                    else:
                        header = cells
                else:
                    if len(cells) != len(header):
                        raise ValidationError(
                            'Line %d has %s cells (%s), expected %s.' %
                            (i, len(cells), cells, len(header)))

                    records_seen += 1

            # The first non-comment, non-blank row observed will always be the
            # header row. Since we have no requirement on the field names
            # (they are dynamically defined), there is no need to check for
            # the presence (or validity) of a header row at this point.

            if records_seen == 0:
                raise ValidationError('No records found in file, only '
                                      'observed comments, blank lines, and/or '
                                      'a header row.')
Example #5
    def _validate(self, n_records=None):
        with self.open() as fh:
            line = fh.readline().rstrip()
            if line not in ['#SampleID\tDifference', '#SampleID\tDistance']:
                raise ValidationError(
                    "Header line must be TSV with column names '#SampleID' "
                    "and either 'Difference' or 'Distance'. Found the "
                    "following header:\n\n{0!r}".format(line))

            has_data = False
            for line_number, line in enumerate(fh, start=2):
                cells = line.strip().split('\t')
                if len(cells) != 2:
                    # TODO indicate tab separated
                    raise ValidationError(
                        "Expected data record to be TSV with two fields, "
                        "detected {0} fields at line {1}:\n\n{2!r}".format(
                            len(cells), line_number, cells))
                try:
                    float(cells[1])
                except ValueError:
                    raise ValidationError(
                        "Second column must contain only numeric values. "
                        "A non-numeric value ({0!r}) was detected at line "
                        "{1}.".format(cells[1], line_number))

                has_data = True
                if n_records is not None and (line_number - 1) >= n_records:
                    break

            if not has_data:
                raise ValidationError(
                    "There must be at least one data record present in the "
                    "file in addition to the header line.")
Example #6
    def _validate_(self, level):
        with self.open() as fh:
            if level == 'min':
                # Up to 20 lines
                file_ = zip(range(1, 21), fh)
            else:  # level == 'max'
                # All lines
                file_ = enumerate(fh, start=1)

            ids = set()
            for line_num, line in file_:
                try:
                    id, count = line.rstrip('\n').split('\t')
                except ValueError:
                    raise ValidationError(
                        "Invalid format on line %d. Each line must consist "
                        "of a reference sequence ID and its count separated "
                        "by a tab character." % line_num)

                if id in ids:
                    raise ValidationError(
                        "Encountered duplicate reference sequence ID on line "
                        "%d: %s" % (line_num, id))
                else:
                    ids.add(id)

                try:
                    int(count)
                except ValueError:
                    raise ValidationError(
                        "Line %d does not contain an integer as its second "
                        "field: %s" % (line_num, count))
Example #7
    def _validate(self, n_records=None):
        with self.open() as fh:
            # validate header
            # for now we will not validate any information in the header,
            # since column names, counts, etc. are frequently unique to
            # individual estimators. Let's keep this flexible.
            line = fh.readline()

            # validate body
            has_data = False
            for line_number, line in enumerate(fh, start=2):
                # we want to strip each cell, not the original line
                # otherwise empty cells are dropped, causing a TypeError
                cells = [c.strip() for c in line.split('\t')]
                if len(cells) < 2:
                    raise ValidationError(
                        "Expected data record to be TSV with two or more "
                        "fields. Detected {0} fields at line {1}:\n\n{2!r}".
                        format(len(cells), line_number, cells))
                # all values (except the row name) should be numbers; report
                # the offending cell, not just the first data column
                for cell in cells[1:]:
                    try:
                        float(cell)
                    except ValueError:
                        raise ValidationError(
                            "Columns must contain only numeric values. "
                            "A non-numeric value ({0!r}) was detected at line "
                            "{1}.".format(cell, line_number))

                has_data = True
                if n_records is not None and (line_number - 1) >= n_records:
                    break

            _validate_file_not_empty(has_data)
Example #8
 def _validate_(self, level):
     with self.open() as fh:
         try:
             int(fh.readline().rstrip('\n'))
         except (TypeError, ValueError):
             raise ValidationError("File does not contain an integer")
         if fh.readline():
             raise ValidationError("Too many lines in file.")
Example #9
    def _validate_(self, level):
        with open(str(self)) as fh:
            line = fh.readline()
        if len(line.strip()) == 0:
            raise ValidationError("Failed to locate header.")

        header = set(line.strip().split('\t'))
        for column in sorted(self.METADATA_COLUMNS):
            if column not in header:
                raise ValidationError(
                    f"{column} is not a column in the file header.")
Example #10
    def validate(self, level):
        try:
            md = qiime2.Metadata.load(str(self))
        except qiime2.metadata.MetadataFileError as md_exc:
            raise ValidationError(md_exc) from md_exc

        for column in sorted(self.METADATA_COLUMNS):
            try:
                md.get_column(column)
            except ValueError as md_exc:
                raise ValidationError(md_exc) from md_exc
Example #11
 def validate(self, *args):
     col_set = {'db-seq', 'seq-name', 'kmer', 'region',
                'fwd-primer', 'rev-primer', 'kmer-length'}
     map_ = pd.read_csv(str(self), dtype=str, sep='\t')
     if set(map_.columns) != col_set:
         raise ValidationError('The KmerMap does not contain '
                               'the correct columns')
     try:
         map_['kmer-length'].astype(float)
     except (TypeError, ValueError):
         raise ValidationError('The kmer-length column must be numeric')
Example #12
    def validate(self, *args):
        try:
            md = qiime2.Metadata.load(str(self))
        except qiime2.metadata.MetadataFileError as md_exc:
            raise ValidationError(md_exc) from md_exc

        if md.column_count == 0:
            raise ValidationError('Format must contain at least 1 column')

        filtered_md = md.filter_columns(column_type='numeric')
        if filtered_md.column_count != md.column_count:
            raise ValidationError('Must only contain numeric values.')
Example #13
 def _validate_n_ints(self, n):
     with self.open() as fh:
         last_val = None
         for idx, line in enumerate(fh, 1):
             if n is not None and idx >= n:
                 break
             try:
                 val = int(line.rstrip('\n'))
             except (TypeError, ValueError):
                 raise ValidationError("Line %d is not an integer." % idx)
             if last_val is not None and last_val + 3 == val:
                 raise ValidationError("Line %d is 3 more than line %d" %
                                       (idx, idx - 1))
             last_val = val
Example #14
def _validate_line_lengths(seq_len, prev_seq_len, prev_seq_start_line):
    if prev_seq_len != seq_len:
        raise ValidationError('The sequence starting on line '
                              f'{prev_seq_start_line} was length '
                              f'{prev_seq_len}. All previous sequences were '
                              f'length {seq_len}. All sequences must be the '
                              'same length for AlignedDNAFASTAFormat.')
Example #15
    def _validate_(self, level):
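        # 'min' validates only the first 10 records; 'max' validates them all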
        n_records = {'min': 10, 'max': None}[level]
        with self.open() as fh:
            # validate header
            # for now we will not validate any information in the header.
            line = fh.readline()

            # validate body
            has_data = False
            for line_number, line in enumerate(fh, start=2):
                cells = line.strip().split('\t')
                _validate_record_min_len(cells, line_number, 2)
                for cell in cells[1:]:
                    try:
                        float(cell)
                    except ValueError:
                        raise ValidationError(
                            "Expected data to be comprised of float values. "
                            "Found non-float value {0} at line {1}".format(
                                cell, line_number))
                has_data = True
                if n_records is not None and (line_number - 1) >= n_records:
                    break

            _validate_file_not_empty(has_data)
Example #16
 def _validate_(self, level):
     with self.open() as fh:
         for line, idx in zip(fh, range(1, 6)):
             cells = line.rstrip('\n').split('\t')
             if len(cells) != 2:
                 raise ValidationError("Line %d does not have exactly 2 "
                                       "elements seperated by a tab." % idx)
Example #17
    def _validate_(self, level):
        with self.open() as fh:
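            # gzip streams begin with the two-byte magic number 0x1f 0x8b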
            if fh.peek(2)[:2] != b'\x1f\x8b':
                raise ValidationError('File is uncompressed')

        record_count_map = {'min': 5, 'max': None}
        self._check_n_records(record_count_map[level])
Example #18
 def validate(self):
     with self.open() as fh:
         for line, idx in zip(fh, range(1, 6)):
             try:
                 int(line.rstrip('\n'))
             except (TypeError, ValueError):
                 raise ValidationError("Line %d is not an integer." % idx)
Example #19
def _validate_is_numeric(inputvalue, valuedescription, line_number):
    try:
        float(inputvalue)
    except ValueError:
        raise ValidationError(
            "{0}must contain only numeric values. A non-numeric value "
            "({1!r}) was detected at line {2}.".format(
                valuedescription, inputvalue, line_number))
Example #20
def validate_ascending_seq(data: list, level):
    # landmine for testing
    if data == [2021, 8, 24]:
        raise KeyError

    prev = float('-inf')
    for number in data:
        if not number > prev:
            raise ValidationError("%s is not greater than %s" % (number, prev))
Example #21
    def _validate_lines(self, max_lines):
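        # IUPAC nucleotide codes (including degenerate bases) allowed in sequences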
        FASTADNAValidator = re.compile(r'[ACGTURYKMSWBDHVN]+\r?\n?')
        ValidationSet = frozenset(('A', 'C', 'G', 'T', 'U', 'R', 'Y', 'K', 'M',
                                   'S', 'W', 'B', 'D', 'H', 'V', 'N'))

        last_line_was_ID = False
        ids = {}

        with open(str(self), 'rb') as fh:
            try:
                first = fh.read(6)
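                # skip a UTF-8 byte-order mark (EF BB BF) if present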
                if first[:3] == b'\xEF\xBB\xBF':
                    first = first[3:]
                # Empty files should validate
                if first.strip() == b'':
                    return
                if first[0] != ord(b'>'):
                    raise ValidationError("First line of file is not a valid "
                                          "description. Descriptions must "
                                          "start with '>'")
                fh.seek(0)
                for line_number, line in enumerate(fh, 1):
                    if line_number >= max_lines:
                        return
                    line = line.decode('utf-8-sig')
                    if line.startswith('>'):
                        if last_line_was_ID:
                            raise ValidationError('Multiple consecutive '
                                                  'descriptions starting on '
                                                  f'line {line_number-1!r}')
                        line = line.split()
                        if line[0] == '>':
                            if len(line) == 1:
                                raise ValidationError(
                                    f'Description on line {line_number} is '
                                    'missing an ID.')
                            else:
                                raise ValidationError(
                                    f'ID on line {line_number} starts with a '
                                    'space. IDs may not start with spaces')
                        if line[0] in ids:
                            raise ValidationError(
                                f'ID on line {line_number} is a duplicate of '
                                f'another ID on line {ids[line[0]]}.')
                        ids[line[0]] = line_number
                        last_line_was_ID = True
                    elif re.fullmatch(FASTADNAValidator, line):
                        last_line_was_ID = False
                    else:
                        for position, character in enumerate(line):
                            if character not in ValidationSet:
                                raise ValidationError(
                                    f"Invalid character '{character}' at "
                                    f"position {position} on line "
                                    f"{line_number} (does not match IUPAC "
                                    "characters for a DNA sequence).")
            except UnicodeDecodeError as e:
                raise ValidationError(f'utf-8 cannot decode byte on line '
                                      f'{line_number}') from e
Example #22
 def _check_n_records(self, n):
     with open(str(self), 'r') as fh:
         for lineno, record in enumerate(fh):
             if lineno == n:
                 break
             else:
                 if len(record.strip()) == 0:
                     raise ValidationError(
                         'Error on line %d, '
                         'empty lines are not permitted.' % (lineno + 1))
Example #23
 def _validate(self, n_records=None):
     with self.open() as fh:
         # check the header column names
         header = fh.readline()
         comp_columns = [i for i, head in enumerate(header.split('\t'))
                         if 'PC' in head]
         # ensure there is at least one component
         if len(comp_columns) < 1:
             raise ValidationError('No PC# columns present. '
                                   'There should be at least one PC# '
                                   '(i.e. at minimum PC1) in trajectory.')
         # validate the body of the data
         for line_number, line in enumerate(fh, start=2):
             cells = line.split('\t')
             pc_type = [is_float(cells[c].strip()) for c in comp_columns]
             if not all(pc_type):
                 raise ValidationError('Non float values in trajectory.')
             if n_records is not None and (line_number - 1) >= n_records:
                 break
Example #24
 def _check_n_records(self, n):
     with open(str(self)) as fh:
         csv_reader = csv.reader(fh, delimiter='\t')
         for i, row in enumerate(csv_reader):
             if i == n:
                 break
             else:
                 if len(row) != 18:
                     raise ValidationError(
                         'Incorrect number of fields detected on line %d.'
                         ' Should be exactly 18.' % (i + 1))
Example #25
 def _validate(self, n_records=None):
     with self.open() as fh:
         # check the header column names
         header = fh.readline()
         comp_columns = list(header.split('\t'))[1:]
         # ensure both headers are present
         allowed_ = ['Source', 'Sink', 'Source_one', 'Source_two']
         num_col = sum(
             [str(i).replace('\n', '') in allowed_ for i in comp_columns])
         if num_col != 2:
             raise ValidationError('Source or Sink columns are missing.'
                                   ' Got %s' % ', '.join(comp_columns))
         # validate the body of the data
         for line_number, line in enumerate(fh, start=2):
             cells = line.split('\t')
             values_ = [is_str(cells[c].strip()) for c in [1, 2]]
             if not all(values_):
                 err_ = 'Non string values in source-sink map.'
                 raise ValidationError(err_)
             if n_records is not None and (line_number - 1) >= n_records:
                 break
Example #26
    def _check_n_records(self, root, n=None):
        with self.open() as fh:
            header = None
            records_seen = 0
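            # when n is given, only the first n lines are inspected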
            file_ = enumerate(fh) if n is None else zip(range(n), fh)
            for i, line in file_:
                i = i + 1  # For easier reporting
                if line.lstrip(' ') == '\n':
                    continue  # Blank line
                elif line.startswith('#'):
                    continue  # Comment line

                cells = [c.strip() for c in line.rstrip('\n').split(',')]
                if header is None:
                    if cells != self.EXPECTED_HEADER:
                        raise ValidationError(
                            'Found header on line %d with the following '
                            'labels: %s, expected: %s'
                            % (i, cells, self.EXPECTED_HEADER))
                    else:
                        header = cells
                else:
                    if len(cells) != len(header):
                        raise ValidationError(
                            'Line %d has %s cells (%s), expected %s.'
                            % (i, len(cells), cells, len(header)))

                    # Structure checks out, so let's make lookup easy
                    cells = dict(zip(header, cells))

                    # TODO: a bunch of tests in this subpackage aren't well
                    # behaved --- many tests fail on this check because the
                    # test data isn't constructed correctly. As well, there
                    # appear to be framework-related issues preventing us from
                    # making this kind of validation work for the relative
                    # manifest formats at this time.
                    if root == '':
                        fp = os.path.join(root, cells[self.PATH_HEADER_LABEL])
                        if not os.path.exists(os.path.expandvars(fp)):
                            raise ValidationError(
                                'File referenced on line %d could not be '
                                'found (%s).'
                                % (i, fp))

                    if cells['direction'] not in ('forward', 'reverse'):
                        raise ValidationError(
                            'Read direction declared on line %d was %s, '
                            'expected `forward` or `reverse`.'
                            % (i, cells['direction']))

                    records_seen += 1

            if header is None:
                raise ValidationError('No header found, expected: %s.'
                                      % self.EXPECTED_HEADER)

            if records_seen == 0:
                raise ValidationError('No sample records found in manifest, '
                                      'only observed comments, blank lines, '
                                      'and/or a header row.')
Example #27
    def _validate_(self, level):
        sigs = [
            'This is RAxML version', 'Base frequencies',
            'Final GAMMA likelihood'
        ]

        info = self.path.read_text()

        for sig in sigs:
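            # treat each space in the signature as any run of non-word characters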
            new_sig = sig.replace(r' ', r'\W+')
            if not re.search(new_sig, info):
                raise ValidationError('Missing structured content: "%s".' %
                                      sig)
Example #28
    def _validate_(self, level):
        try:
            md = qiime2.Metadata.load(str(self))
        except qiime2.metadata.MetadataFileError as md_exc:
            raise ValidationError(md_exc) from md_exc

        md = md.filter_columns(column_type='categorical')

        md_cols = dict()
        for column in self.METADATA_COLUMNS.keys():
            try:
                md_cols[column] = md.get_column(column)
            except ValueError as md_exc:
                raise ValidationError(md_exc) from md_exc

        filepaths = dict()
        for column_name, column in md_cols.items():
            column = column.to_series()
            for i, (id_, fp) in enumerate(column.items(), start=1):
                # QIIME 2 represents empty cells as np.nan once normalized
                if pd.isna(fp):
                    raise ValidationError(
                        'Missing filepath on line %d and column "%s".' %
                        (i, column_name))
                if not os.path.exists(os.path.expandvars(fp)):
                    raise ValidationError(
                        'Filepath on line %d and column "%s" could not '
                        'be found (%s) for sample "%s".' %
                        (i, column_name, fp, id_))
                if fp in filepaths:
                    old_id, old_col_name, old_row = filepaths[fp]
                    raise ValidationError(
                        'Filepath on line %d and column "%s" (sample "%s") '
                        'has already been registered on line %d and column '
                        '"%s" (sample "%s").' %
                        (i, column_name, id_, old_row, old_col_name, old_id))
                else:
                    filepaths[fp] = (id_, column_name, i)
Example #29
    def _check_n_records(self, n=None):
        with self.open() as fh:
            data_line_count = 0
            header = None

            file_ = enumerate(fh) if n is None else zip(range(n), fh)

            for i, line in file_:
                # Tracks line number for error reporting
                i = i + 1

                if line.lstrip(' ') == '\n':
                    # Blank line
                    continue

                cells = line.strip('\n').split('\t')

                if header is None:
                    if cells[:2] != self.HEADER:
                        raise ValidationError(
                            '%s must be the first two header values. The '
                            'first two header values provided are: %s (on '
                            'line %s).' % (self.HEADER, cells[:2], i))
                    header = cells
                else:
                    if len(cells) != len(header):
                        raise ValidationError(
                            'Number of values on line %s is not the same as '
                            'the number of header values. Found %s values '
                            '(%s), expected %s.'
                            % (i, len(cells), cells, len(header)))

                    data_line_count += 1

            if data_line_count == 0:
                raise ValidationError('No taxonomy records found, only blank '
                                      'lines and/or a header row.')
Example #30
 def _validate_(self, level):
     if level == 'min':
         nrows = 8
     elif level == 'max':
         nrows = None
     else:
         raise ValueError('Nonstandard level value')
     # the squeeze= keyword was removed from read_table in pandas 2.0, so
     # squeeze the single-column result explicitly instead
     series = pd.read_table(self.path,
                            header=None,
                            nrows=nrows,
                            index_col=0).squeeze('columns')
     if not isinstance(series, pd.Series):
         raise ValidationError('File has more than one column: %s' %
                               series.head())