Beispiel #1
0
def validate_number(value, schema_rules):
    """Validate that *value* parses as a number and lies strictly within
    the schema's "minimum"/"maximum" bounds.

    Args:
        value: raw value (usually a string) to validate.
        schema_rules: dict that may contain "minimum" and/or "maximum".

    Returns:
        float: the parsed numeric value.

    Raises:
        ValidationError: if the value is not numeric or is out of bounds.
    """
    try:
        num_value = float(value)
    except (TypeError, ValueError):
        # TypeError covers non-stringy inputs (e.g. None or a list), which
        # previously escaped as an uncaught exception instead of a
        # ValidationError.
        raise ValidationError("{} must be a number".format(value))
    # NOTE: bounds are exclusive -- a value equal to the limit is rejected,
    # matching the "larger than"/"smaller than" wording of the messages.
    if "minimum" in schema_rules and num_value <= schema_rules["minimum"]:
        raise ValidationError(
            "{} must be larger than the minimum value: {}".format(value, schema_rules["minimum"])
        )
    if "maximum" in schema_rules and num_value >= schema_rules["maximum"]:
        raise ValidationError(
            "{} must be smaller than the maximum value: {}".format(value, schema_rules["maximum"])
        )
    return num_value
Beispiel #2
0
def validate_boolean(value):
    """Map a truthy/falsy string onto a real bool, case-insensitively.

    Raises:
        ValidationError: if the value matches neither set.
    """
    normalized = value.lower()
    if normalized in truthy_values():
        return True
    if normalized in falsy_values():
        return False
    raise ValidationError('{} must be either "true" or "false"'.format(value))
Beispiel #3
0
def fastfastq(file_path):
    """Yield raw 4-line FASTQ records (as bytes) from *file_path*.

    Compression is handled transparently based on the file extension
    (gzip or bzip2); anything else is opened as a plain binary file.
    A trailing partial record (fewer than 4 lines) is silently ignored,
    matching the original behavior.

    Raises:
        ValidationError: if a record does not begin with "@".
    """
    _, extension = os.path.splitext(file_path)

    if extension in {".gz", ".gzip"}:
        opener = gzip.open
    elif extension in {".bz", ".bz2", ".bzip", ".bzip2"}:
        opener = bz2.open
    else:
        opener = io.open

    with opener(file_path, "rb") as handle:
        line_no = 0
        pending = []

        for raw_line in handle:
            line_no += 1
            pending.append(raw_line)

            if len(pending) < 4:
                continue

            record = b"".join(pending)
            pending = []

            if not record.startswith(b"@"):
                raise ValidationError(
                    "FASTQ record line {} does not start with @".format(
                        line_no))

            yield record
Beispiel #4
0
def validate_datetime(value):
    """Return *value* unchanged after checking ISO 8601 compliance.

    Raises:
        ValidationError: if the value is not a valid ISO 8601 datetime.
    """
    if is_iso_8601_compliant(value):
        return value
    raise ValidationError(
        '"{}" must be formatted in iso8601 compliant date format. Example: "2018-05-15T16:21:36+00:00"'
        .format(value))
Beispiel #5
0
    def close(self):
        """Close the underlying file, first verifying every byte was consumed.

        Raises:
            ValidationError: if any bytes were left unread.
        """
        # A non-zero remainder means the caller stopped reading early,
        # which would silently truncate the upload/validation.
        if self.bytes_left != 0:
            raise ValidationError('Failed to properly read file: {}/{} bytes unread.'.format(
                self.bytes_left, self.total_size))

        self.file_obj.close()
Beispiel #6
0
 def _set_read(self, file_obj):
     self.reads = file_obj
     self.reads.seek(0)
     assert self.reads.tell() == 0
     self.total_size = os.fstat(self.reads.fileno()).st_size
     if self.total_size < 70:
         raise ValidationError('{} is too small to be analyzed: {} bytes'.format(
                               self.reads.name, self.total_size))
Beispiel #7
0
def validate_tags(appendables, api):
    """Validate each tag in appendables["tags"] against the API schema's
    name maxLength, appending valid tags to appendables["valid_tags"].

    Args:
        appendables: dict possibly containing "tags" (list of strings) and
            "valid_tags" (list this function appends to).
        api: API object exposing the Tags resource schema.

    Raises:
        ValidationError: if any tag exceeds the allowed length.
    """
    if "tags" not in appendables:
        return

    tag_array = appendables["tags"]
    if not tag_array:
        return

    # The schema lookup is loop-invariant; fetch it once instead of on
    # every iteration.
    name_property = api.Tags._resource._schema["properties"]["name"]
    max_length = name_property.get("maxLength", 1000)

    for tag in tag_array:
        if len(tag) > max_length:
            raise ValidationError("{} is too long".format(tag))

        appendables["valid_tags"].append({"name": tag})
Beispiel #8
0
def validate_tags(appendables, api):
    """Validate each tag in appendables['tags'] against the API schema's
    name maxLength, appending valid tags to appendables['valid_tags'].

    Args:
        appendables: dict possibly containing 'tags' (list of strings) and
            'valid_tags' (list this function appends to).
        api: API object exposing the Tags resource schema.

    Raises:
        ValidationError: if any tag exceeds the allowed length.
    """
    if 'tags' not in appendables:
        return

    tag_array = appendables['tags']
    if not tag_array:
        return

    # The schema lookup is loop-invariant; fetch it once instead of on
    # every iteration.
    name_property = api.Tags._resource._schema['properties']['name']
    max_length = name_property.get('maxLength', 1000)

    for tag in tag_array:
        if len(tag) > max_length:
            raise ValidationError('{} is too long'.format(tag))

        appendables['valid_tags'].append({'name': tag})
Beispiel #9
0
def validate_enum(value, schema_rules):
    """Return *value* if it appears in schema_rules['enum'].

    Raises:
        ValidationError: listing the allowed options when *value* is not
            one of them.
    """
    allowed = schema_rules['enum']
    if value in allowed:
        return value

    # Coerce truthy options to str so Python 2 unicode markers (u'...')
    # do not leak into the message; falsy options are kept verbatim.
    error_array = [str(rule) if rule else rule for rule in allowed]
    raise ValidationError(
        '{} is not a valid value for this key. Value must be one of the following options: {}'
        .format(value, error_array))
Beispiel #10
0
def validate_metadata(appendables, api):
    """Split appendables["metadata"] into schema-backed and custom values.

    Keys found in the API metadata schema are validated against it and
    stored in appendables["valid_metadata"]; unknown keys are coerced and
    stored under appendables["valid_metadata"]["custom"].

    Raises:
        ValidationError: if a key is blacklisted from manual updates.
    """
    if "metadata" not in appendables:
        return

    schema_props = metadata_properties(api)
    for key, value in appendables["metadata"].items():
        if is_blacklisted(key):
            raise ValidationError("{} cannot be manually updated".format(key))

        if key in schema_props:
            appendables["valid_metadata"][key] = validate_metadata_against_schema(
                schema_props, key, value)
        else:
            appendables["valid_metadata"]["custom"][key] = coerce_custom_value(value)
Beispiel #11
0
    def read(self, n=-1):
        """Read up to *n* validated bytes (all remaining bytes if n < 0).

        Pulls records from the underlying record iterator(s) into
        ``self.checked_buffer`` until the request can be satisfied,
        interleaving forward and reverse records when a pair is attached.

        Raises:
            ValidationError: if paired read files have differing record counts.
        """
        if self.reads_pair is None:
            # Single-ended: keep appending records until the buffer holds
            # at least n bytes (or the iterator is exhausted).
            while len(self.checked_buffer) < n or n < 0:
                try:
                    record = next(self.reads_iter)
                except StopIteration:
                    record = None

                if record is not None:
                    self.checked_buffer.write(record)
                elif record is None:
                    # Iterator exhausted: seal the buffer and stop pulling.
                    self.checked_buffer.close()
                    break

                if self.progress_callback is not None:
                    self.progress_callback(self.reads.name, self.reads.processed_size,
                                           validation=(not self.reads.validate))
        else:
            # Paired: advance both iterators in lockstep, writing forward
            # then reverse record so output stays interleaved.
            while len(self.checked_buffer) < n or n < 0:
                try:
                    record = next(self.reads_iter)
                except StopIteration:
                    record = None
                try:
                    record_pair = next(self.reads_pair_iter)
                except StopIteration:
                    record_pair = None

                if record is not None and record_pair is not None:
                    self.checked_buffer.write(record)
                    self.checked_buffer.write(record_pair)
                elif record is None and record_pair is None:
                    # Both exhausted together: clean end of input.
                    self.checked_buffer.close()
                    break
                else:
                    # Exactly one side ran out -- the pair files are unbalanced.
                    raise ValidationError("Paired read files do not have the "
                                          "same number of records")

                if self.progress_callback is not None:
                    bytes_uploaded = self.reads.processed_size + self.reads_pair.processed_size
                    self.progress_callback(self.reads.name, bytes_uploaded,
                                           validation=(not self.reads.validate))

        # Drain the requested amount from the checked buffer and track totals.
        bytes_reads = self.checked_buffer.read(n)
        self.total_written += len(bytes_reads)
        return bytes_reads
Beispiel #12
0
    def _set_file_obj(self, file_obj, check_filename=True):
        """
        Transparently decompress files and determine what kind of file they are (FASTA/Q).
        """
        if not hasattr(file_obj, 'name'):
            # can't do the checks if there's not filename
            check_filename = False

        # detect if gzipped/bzipped and uncompress transparently
        start = file_obj.read(1)
        if start == b'\x1f':
            if check_filename and not file_obj.name.endswith(('.gz', '.gzip')):
                raise ValidationError('{} is gzipped, but lacks a ".gz" ending'.format(self.name))
            file_obj.seek(0)
            file_obj = gzip.GzipFile(fileobj=file_obj)
            start = file_obj.read(1)
        elif start == b'\x42' and hasattr(bz2, 'open'):
            if check_filename and not file_obj.name.endswith(('.bz2', '.bz', '.bzip')):
                raise ValidationError('{} is bzipped, but lacks a ".bz2" ending'.format(self.name))
            # we can only read BZ2 files in python 3.3 and above
            file_obj.seek(0)
            patched_name = file_obj.name
            file_obj = bz2.open(file_obj)
            file_obj.name = patched_name
            start = file_obj.read(1)
        elif check_filename and file_obj.name.endswith(('.gz', '.gzip')):
            raise ValidationError('{} is not gzipped but has a ".gz" file extension.'.format(self.name))
        elif check_filename and file_obj.name.endswith(('.bz2', '.bz', '.bzip')):
            raise ValidationError('{} is not gzipped but has a ".bz2" file extension.'.format(self.name))

        # determine if a FASTQ or a FASTA
        if start == b'>':
            self.file_type = 'FASTA'
            if check_filename and not ('.fa' in file_obj.name or
                                       '.fna' in file_obj.name or
                                       '.fasta' in file_obj.name):
                raise ValidationError('{} is FASTA, but lacks a ".fa" ending'.format(self.name))
        elif start == b'@':
            self.file_type = 'FASTQ'
            if check_filename and not ('.fq' in file_obj.name or
                                       '.fastq' in file_obj.name):
                raise ValidationError('{} is FASTQ, but lacks a ".fq" ending'.format(self.name))
        else:
            raise ValidationError('{} is not valid FASTX'.format(self.name))

        self.file_obj = file_obj
Beispiel #13
0
    def _set_total_size(self):
        """Determine the input's total size and pick a read-buffer size.

        BytesIO inputs are measured by reading the whole buffer; real files
        use fstat. Files smaller than 70 bytes are rejected.
        """
        if isinstance(self.file_obj, BytesIO):
            self.file_obj.seek(0)
            self.total_size = len(self.file_obj.read())
            # NOTE(review): seek(1), not seek(0) -- presumably restores the
            # position past the single magic byte consumed during file-type
            # detection; confirm against _set_file_obj.
            self.file_obj.seek(1)
        else:
            try:
                self.total_size = os.fstat(self.file_obj.fileno()).st_size
                if self.total_size < 70:
                    raise ValidationError('{} is too small to be analyzed: {} bytes'.format(
                        self.name, self.total_size
                    ))
            except IOError:
                # NOTE(review): if fstat/fileno fails, total_size is assumed
                # to have been set elsewhere before the comparison below --
                # verify, otherwise this can raise AttributeError.
                pass

        # Set the buffer size, 16MB by default for files >32MB
        if self.total_size >= (1024 * 1024 * 32):
            self.buffer_read_size = 1024 * 1024 * 16  # 16MB
        else:
            self.buffer_read_size = 1024 * 16  # 16KB small chunk
Beispiel #14
0
    def _validate_record(self, rec):
        # TODO: if there are quality scores, make sure they're in range
        # FIXME: fail if reads aren't interleaved and an override flag isn't passed?
        seq_id, seq, seq_id2, qual = rec['id'], rec['seq'], rec.get('id2', b''), rec.get('qual')
        if not self.validate:
            return seq_id, seq, seq_id2, qual

        if b'\t' in seq_id or b'\t' in seq_id2:
            self._warn_once('{} can not have tabs in headers; autoreplacing'.format(self.name))
            seq_id = seq_id.replace(b'\t', b'|')

        # Match then search is ~5-10% faster than just searching
        if not bool(self.valid_bases_match.match(seq)):
            chars = b','.join(set(self.valid_bases.findall(seq)))
            raise ValidationError('{} contains non-nucleic acid characters: {}'.format(self.name,
                                                                                       chars))

        # Only search for OTHER_BASES if we're allowing them above in the first place
        if self.allow_iupac and OTHER_BASES.search(seq) is not None:
            self._warn_once('Translating other bases in {} (X->N,U->T)'.format(self.name))
            seq = seq.translate(OTHER_BASE_TRANS)

        return seq_id, seq, seq_id2, qual
Beispiel #15
0
def cli(
    ctx,
    classification_id,
    fastx,
    reverse,
    tax_ids,
    with_children,
    subset_pairs_independently,
    exclude_reads,
    include_lowconf,
    out,
    validate,
):
    """Subset a FASTX file (optionally paired) to reads whose classification
    matches (or, with --exclude-reads, does not match) the given tax IDs.

    Streams the FASTX record(s) in lockstep with the classification's
    read-level TSV, writing selected records to the output file(s).

    Raises:
        OneCodexException: missing tax IDs or an unsupported file extension.
        ValidationError: missing classification or record-count mismatches.
    """
    if ctx.info_name == "filter_reads":
        warnings.warn(
            "filter_reads will be removed in a future version. Please use subset_reads instead!"
        )

    if not len(tax_ids):
        raise OneCodexException("You must supply at least one tax ID")

    # fetch classification result object from API
    classification = ctx.obj["API"].Classifications.get(classification_id)
    if classification is None:
        raise ValidationError(
            "Classification {} not found.".format(classification_id))

    # if with children, expand tax_ids by referring to the taxonomic tree
    if with_children:
        tax_id_map = make_taxonomy_dict(classification)

        new_tax_ids = []

        for t_id in tax_ids:
            new_tax_ids.extend(recurse_taxonomy_map(tax_id_map, t_id))

        tax_ids = new_tax_ids

    # set membership tests below are O(1) per row
    tax_ids = set(tax_ids)

    # pull the classification result TSV
    tsv_url = classification._readlevel()["url"]
    readlevel_path = get_download_dest("./", tsv_url)
    if not os.path.exists(readlevel_path):
        download_file_helper(tsv_url, "./")
    else:
        click.echo(
            "Using cached read-level results: {}".format(readlevel_path),
            err=True)

    # count the number of rows in the TSV file
    with gzip.open(readlevel_path, "rt") as tsv:
        try:
            tsv_row_count = 0
            for _ in tsv:
                tsv_row_count += 1
            tsv_row_count -= 1  # discount header line
        except EOFError:
            # a truncated gzip download surfaces here
            click.echo(
                "\nWe encountered an error while processing the read "
                "level results. Please delete {} and try again.".format(
                    readlevel_path),
                err=True,
            )
            raise

    if reverse:
        if tsv_row_count % 2 != 0:
            raise ValidationError(
                "Classification results cannot have odd number of records if using --reverse/-r"
            )

        # paired mode consumes two TSV rows per FASTX record pair
        tsv_row_count = int(tsv_row_count / 2.0)

    # determine the name of the output file(s)
    filtered_filename, ext = get_filtered_filename(fastx)
    filtered_filename = os.path.join(out, filtered_filename)
    if reverse:
        rev_filtered_filename = get_filtered_filename(reverse)[0]
        rev_filtered_filename = os.path.join(out, rev_filtered_filename)

    if ext in {".fa", ".fna", ".fasta"}:
        io_kwargs = {"format": "fasta"}
    elif ext in {".fq", ".fastq"}:
        io_kwargs = {"format": "fastq", "variant": "illumina1.8"}
    else:
        raise OneCodexException(
            "{}: extension must be one of .fa, .fna, .fasta, .fq, .fastq".
            format(fastx))

    # do the actual filtering
    save_msg = "Saving subsetted reads: {}".format(filtered_filename)
    if reverse:
        save_msg += " and {}".format(rev_filtered_filename)
    click.echo(save_msg, err=True)

    # see mainline/#3513. we must set idx=0 here for cases where the fastx file is empty
    idx = 0

    with click.progressbar(length=tsv_row_count) as bar, gzip.open(
            readlevel_path, "rt") as tsv:
        reader = csv.DictReader(tsv, delimiter="\t")

        if reverse:
            # skip per-record validation for FASTQ when not requested;
            # fastfastq is the fast raw-record path
            if not validate and io_kwargs["format"] == "fastq":
                fwd_iter = fastfastq(fastx)
                rev_iter = fastfastq(reverse)
            else:
                fwd_iter = validating_parser(fastx, **io_kwargs)
                rev_iter = validating_parser(reverse, **io_kwargs)

            with io.open(filtered_filename, "wb") as out_file, io.open(
                    rev_filtered_filename, "wb") as rev_out_file:  # noqa

                for idx, (fwd, rev) in enumerate(zip(fwd_iter, rev_iter)):
                    if idx == tsv_row_count:
                        too_many_fastx_records()
                    if idx % 1000 == 0:
                        bar.update(1000)
                    # two TSV rows per pair: forward then reverse
                    row = next(
                        reader)  # necessary to do it this way for py2 compat
                    row2 = next(reader)

                    # branch on the three independent flags; each leaf writes
                    # the records that satisfy the selection criteria
                    if subset_pairs_independently:
                        if include_lowconf:
                            if exclude_reads:
                                if row["Tax ID"] not in tax_ids:
                                    out_file.write(fwd)
                                if row2["Tax ID"] not in tax_ids:
                                    rev_out_file.write(rev)
                            else:
                                if row["Tax ID"] in tax_ids:
                                    out_file.write(fwd)
                                if row2["Tax ID"] in tax_ids:
                                    rev_out_file.write(rev)
                        else:
                            if exclude_reads:
                                if (row.get("Passed Filter", "T") == "T"
                                        and row["Tax ID"] not in tax_ids):
                                    out_file.write(fwd)
                                if (row2.get("Passed Filter", "T") == "T"
                                        and row2["Tax ID"] not in tax_ids):
                                    rev_out_file.write(rev)
                            else:
                                if (row.get("Passed Filter", "T") == "T"
                                        and row["Tax ID"] in tax_ids):
                                    out_file.write(fwd)
                                if (row2.get("Passed Filter", "T") == "T"
                                        and row2["Tax ID"] in tax_ids):
                                    rev_out_file.write(rev)
                    else:
                        # pairs stay together: either both records are
                        # written or neither is
                        if include_lowconf:
                            if exclude_reads:
                                if row["Tax ID"] not in tax_ids or row2[
                                        "Tax ID"] not in tax_ids:
                                    out_file.write(fwd)
                                    rev_out_file.write(rev)
                            else:
                                if row["Tax ID"] in tax_ids or row2[
                                        "Tax ID"] in tax_ids:
                                    out_file.write(fwd)
                                    rev_out_file.write(rev)
                        else:
                            if exclude_reads:
                                if (row.get("Passed Filter", "T") == "T"
                                        and row["Tax ID"] not in tax_ids
                                    ) or (row2.get("Passed Filter", "T") == "T"
                                          and row2["Tax ID"] not in tax_ids):
                                    out_file.write(fwd)
                                    rev_out_file.write(rev)
                            else:
                                if (row.get("Passed Filter", "T") == "T"
                                        and row["Tax ID"] in tax_ids
                                    ) or (row2.get("Passed Filter", "T") == "T"
                                          and row2["Tax ID"] in tax_ids):
                                    out_file.write(fwd)
                                    rev_out_file.write(rev)
        else:
            if not validate and io_kwargs["format"] == "fastq":
                fwd_iter = fastfastq(fastx)
            else:
                fwd_iter = validating_parser(fastx, **io_kwargs)

            with io.open(filtered_filename, "wb") as out_file:
                for idx, (fwd, row) in enumerate(zip(fwd_iter, reader)):
                    if idx == tsv_row_count:
                        too_many_fastx_records()
                    if idx % 1000 == 0:
                        bar.update(1000)
                    if include_lowconf:
                        if exclude_reads:
                            if row["Tax ID"] not in tax_ids:
                                out_file.write(fwd)
                        else:
                            if row["Tax ID"] in tax_ids:
                                out_file.write(fwd)
                    else:
                        if exclude_reads:
                            if (row.get("Passed Filter", "T") == "T"
                                    and row["Tax ID"] not in tax_ids):
                                out_file.write(fwd)
                        else:
                            if row.get(
                                    "Passed Filter",
                                    "T") == "T" and row["Tax ID"] in tax_ids:
                                out_file.write(fwd)

        if idx < tsv_row_count - 1:  # 0-based idx, 1-based tsv_row_count
            raise ValidationError(
                "FASTX file(s) provided have fewer records than the classification results"
            )

        bar.finish()
Beispiel #16
0
def cli(ctx, classification_id, fastx, reverse, tax_id, split_pairs, out):
    """Filter a FASTX file (optionally paired) down to reads whose
    classification matches the given tax ID(s), writing matches to *out*.

    Raises:
        OneCodexException: missing tax IDs or a record-count mismatch.
        ValidationError: when the classification cannot be found.
    """
    tax_ids = tax_id  # rename
    if not len(tax_ids):
        raise OneCodexException('You must supply at least one tax ID')

    classification = ctx.obj['API'].Classifications.get(classification_id)
    if classification is None:
        raise ValidationError(
            'Classification {} not found.'.format(classification_id))

    # pull the read-level TSV, using a cached copy when one is present
    tsv_url = classification.readlevel()['url']
    readlevel_path = get_download_dest('./', tsv_url)
    if not os.path.exists(readlevel_path):
        download_file_helper(tsv_url, './')
    else:
        click.echo(
            'Using cached read-level results: {}'.format(readlevel_path),
            err=True)

    filtered_rows = []
    tsv_row_count = 0
    with gzip.open(readlevel_path, 'rt') as tsv:
        try:
            tsv_row_count = get_record_count(tsv) - 1  # discount header line
        except EOFError:
            # a truncated gzip download surfaces here
            click.echo('\nWe encountered an error while processing the read '
                       'level results. Please delete {} and try again.'.format(
                           readlevel_path),
                       err=True)
            raise
        else:
            # rewind after the row count pass, then collect the 0-based
            # indices of rows matching the requested tax IDs
            tsv.seek(0)
            reader = csv.DictReader(tsv, delimiter='\t')
            click.echo('Selecting results matching tax ID(s): {}'.format(
                ', '.join(tax_ids)),
                       err=True)
            filtered_rows = with_progress_bar(tsv_row_count,
                                              filter_rows_by_taxid, reader,
                                              tax_ids)

    filtered_filename = get_filtered_filename(fastx)[0]
    filtered_filename = os.path.join(out, filtered_filename)
    if reverse:
        rev_filtered_filename = get_filtered_filename(reverse)[0]
        rev_filtered_filename = os.path.join(out, rev_filtered_filename)

    # verify the FASTX record count matches the classification row count
    fastx_record_count = 0
    with open(fastx, 'rb') as fastx_file:
        try:
            fastx_record_count = get_record_count(
                FASTXTranslator(fastx_file, validate=False))
        except ValidationError as e:
            raise OneCodexException(e.message)

    if reverse:
        # paired runs produce two classification rows per record pair
        fastx_record_count = fastx_record_count * 2

    if tsv_row_count != fastx_record_count:
        # the cached TSV cannot belong to this FASTX input; drop it
        os.remove(readlevel_path)
        raise OneCodexException('The supplied file has a different number of '
                                'records than the requested Classification')

    save_msg = 'Saving filtered reads: {}'.format(filtered_filename)
    if reverse:
        save_msg += ' and {}'.format(rev_filtered_filename)
    click.echo(save_msg, err=True)

    # counter tracks the classification row index; paired mode advances by 2
    counter = 0
    if reverse:
        with open(fastx, 'rb') as fastx_file, \
                open(reverse, 'rb') as reverse_file, \
                open(filtered_filename, 'wb') as out_file, \
                open(rev_filtered_filename, 'wb') as rev_out_file:
            if split_pairs:
                # each mate is written independently when its own row matched
                for fwd, rev in FASTXTranslator(fastx_file,
                                                reverse_file,
                                                validate=False):
                    if counter in filtered_rows:
                        out_file.write(fwd)
                    if (counter + 1) in filtered_rows:
                        rev_out_file.write(rev)
                    counter += 2
            else:
                # pairs stay together: either both mates are written or neither
                for fwd, rev in FASTXTranslator(fastx_file,
                                                reverse_file,
                                                validate=False):
                    if counter in filtered_rows or \
                       (counter + 1) in filtered_rows:
                        out_file.write(fwd)
                        rev_out_file.write(rev)
                    counter += 2
    else:
        with open(fastx, 'rb') as fastx_file, \
                open(filtered_filename, 'wb') as out_file:
            for seq in FASTXTranslator(fastx_file, validate=False):
                if counter in filtered_rows:
                    out_file.write(seq)
                counter += 1
Beispiel #17
0
def too_many_fastx_records():
    """Raise a ValidationError flagging more FASTX records than TSV rows."""
    raise ValidationError(
        "FASTX file(s) provided have more records than the classification results"
    )
Beispiel #18
0
 def _set_pair(self, pair, **kwargs):
     """Attach *pair* as the reverse-read input and check type consistency.

     The file is wrapped in a FASTXNuclIterator (kwargs forwarded) and must
     have the same detected file type (FASTA vs FASTQ) as the forward reads.

     Raises:
         ValidationError: if forward and reverse file types differ.
     """
     self.reads_pair = FASTXNuclIterator(pair, **kwargs)
     self.reads_pair_iter = iter(self.reads_pair)
     if self.reads.file_type != self.reads_pair.file_type:
         raise ValidationError('Paired read files are different types (FASTA/FASTQ)')