def validate_number(value, schema_rules):
    try:
        num_value = float(value)
    except ValueError:
        raise ValidationError("{} must be a number".format(value))
    if "minimum" in schema_rules and num_value <= schema_rules["minimum"]:
        raise ValidationError(
            "{} must be larger than the minimum value: {}".format(value, schema_rules["minimum"])
        )
    if "maximum" in schema_rules and num_value >= schema_rules["maximum"]:
        raise ValidationError(
            "{} must be smaller than the maximum value: {}".format(value, schema_rules["maximum"])
        )
    return num_value

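# Usage sketch (not from the source): validate_number coerces the input to
# float and enforces any "minimum"/"maximum" bounds in the schema rules. Note
# the bounds are exclusive as written (<=/>=), so a value equal to the minimum
# or maximum is rejected. The rules dict below is hypothetical.
rules = {"minimum": 0, "maximum": 100}
assert validate_number("42", rules) == 42.0
try:
    validate_number("not-a-number", rules)
except ValidationError as err:
    print(err)  # not-a-number must be a number
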
def validate_boolean(value):
    if value.lower() in truthy_values():
        return True
    elif value.lower() in falsy_values():
        return False
    else:
        raise ValidationError('{} must be either "true" or "false"'.format(value))

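# Hedged sketch: truthy_values() and falsy_values() are helpers defined
# elsewhere in the source. Plausible stand-ins consistent with the error
# message above might look like this (an assumption, not the actual sets):
def truthy_values():
    return {"true", "1", "t", "y", "yes"}


def falsy_values():
    return {"false", "0", "f", "n", "no"}


assert validate_boolean("True") is True
assert validate_boolean("no") is False
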
def fastfastq(file_path):
    _, ext = os.path.splitext(file_path)
    if ext in {".gz", ".gzip"}:
        open_func = gzip.open
    elif ext in {".bz", ".bz2", ".bzip", ".bzip2"}:
        open_func = bz2.open
    else:
        open_func = io.open

    with open_func(file_path, "rb") as fp:
        idx = 0
        buf = []
        for line in fp:
            idx += 1
            buf.append(line)
            if len(buf) == 4:
                buf = b"".join(buf)
                if not buf.startswith(b"@"):
                    # report the header line of the record, not the line just read
                    raise ValidationError(
                        "FASTQ record line {} does not start with @".format(idx - 3)
                    )
                yield buf
                buf = []

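# Usage sketch: fastfastq is a generator yielding raw 4-line FASTQ records as
# bytes, transparently opening gzip/bzip2 inputs by file extension. The path
# below is hypothetical.
for record in fastfastq("reads.fastq.gz"):
    header = record.split(b"\n", 1)[0]
    print(header.decode())
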
def validate_datetime(value):
    if not is_iso_8601_compliant(value):
        raise ValidationError(
            '"{}" must be formatted in iso8601 compliant date format. '
            'Example: "2018-05-15T16:21:36+00:00"'.format(value)
        )
    return value

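# Hedged stand-in: is_iso_8601_compliant is defined elsewhere in the source.
# A minimal approximation for Python 3.7+ (an assumption, not the actual
# helper; fromisoformat accepts only a subset of ISO 8601, including the
# example format above) could be:
import datetime


def is_iso_8601_compliant(value):
    try:
        datetime.datetime.fromisoformat(value)
        return True
    except ValueError:
        return False
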
def close(self):
    # did we read everything?
    if self.bytes_left != 0:
        raise ValidationError('Failed to properly read file: {}/{} bytes unread.'.format(
            self.bytes_left, self.total_size
        ))
    # actually close the file
    self.file_obj.close()

def _set_read(self, file_obj):
    self.reads = file_obj
    self.reads.seek(0)
    assert self.reads.tell() == 0
    self.total_size = os.fstat(self.reads.fileno()).st_size
    if self.total_size < 70:
        raise ValidationError('{} is too small to be analyzed: {} bytes'.format(
            self.reads.name, self.total_size
        ))

def validate_tags(appendables, api): if "tags" not in appendables: return tag_array = appendables["tags"] for tag in tag_array: name_property = api.Tags._resource._schema["properties"]["name"] if len(tag) > name_property.get("maxLength", 1000): raise ValidationError("{} is too long".format(tag)) appendables["valid_tags"].append({"name": tag})
def validate_enum(value, schema_rules):
    if value not in schema_rules['enum']:
        # This is gross, but is necessary for string comparison between
        # Python 2 ([u'Illumina HiSeq']) and Python 3 (['Illumina HiSeq']).
        # On the plus side, it makes the error message more human-readable.
        error_array = []
        for rule in schema_rules['enum']:
            if rule:
                error_array.append(str(rule))
            else:
                error_array.append(rule)
        raise ValidationError(
            '{} is not a valid value for this key. Value must be one of the '
            'following options: {}'.format(value, error_array)
        )
    return value

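# Usage sketch with a hypothetical schema: values outside the enum raise with
# a readable list of the allowed options.
rules = {'enum': ['Illumina HiSeq', 'Illumina MiSeq']}
assert validate_enum('Illumina HiSeq', rules) == 'Illumina HiSeq'
try:
    validate_enum('PacBio', rules)
except ValidationError as err:
    print(err)
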
def validate_metadata(appendables, api):
    if "metadata" not in appendables:
        return

    schema_props = metadata_properties(api)
    for key, value in appendables["metadata"].items():
        if is_blacklisted(key):
            raise ValidationError("{} cannot be manually updated".format(key))
        if key in schema_props:
            settable_value = validate_metadata_against_schema(schema_props, key, value)
            appendables["valid_metadata"][key] = settable_value
        else:
            coerced_value = coerce_custom_value(value)
            appendables["valid_metadata"]["custom"][key] = coerced_value

def read(self, n=-1):
    if self.reads_pair is None:
        while len(self.checked_buffer) < n or n < 0:
            try:
                record = next(self.reads_iter)
            except StopIteration:
                record = None

            if record is not None:
                self.checked_buffer.write(record)
            else:
                self.checked_buffer.close()
                break

        if self.progress_callback is not None:
            self.progress_callback(self.reads.name, self.reads.processed_size,
                                   validation=(not self.reads.validate))
    else:
        while len(self.checked_buffer) < n or n < 0:
            try:
                record = next(self.reads_iter)
            except StopIteration:
                record = None
            try:
                record_pair = next(self.reads_pair_iter)
            except StopIteration:
                record_pair = None

            if record is not None and record_pair is not None:
                self.checked_buffer.write(record)
                self.checked_buffer.write(record_pair)
            elif record is None and record_pair is None:
                self.checked_buffer.close()
                break
            else:
                raise ValidationError("Paired read files do not have the "
                                      "same number of records")

        if self.progress_callback is not None:
            bytes_uploaded = self.reads.processed_size + self.reads_pair.processed_size
            self.progress_callback(self.reads.name, bytes_uploaded,
                                   validation=(not self.reads.validate))

    bytes_reads = self.checked_buffer.read(n)
    self.total_written += len(bytes_reads)
    return bytes_reads

def _set_file_obj(self, file_obj, check_filename=True):
    """Transparently decompress files and determine what kind of file they are (FASTA/Q)."""
    if not hasattr(file_obj, 'name'):
        # can't do the checks if there's no filename
        check_filename = False

    # detect if gzipped/bzipped and uncompress transparently
    start = file_obj.read(1)
    if start == b'\x1f':
        if check_filename and not file_obj.name.endswith(('.gz', '.gzip')):
            raise ValidationError('{} is gzipped, but lacks a ".gz" ending'.format(self.name))
        file_obj.seek(0)
        file_obj = gzip.GzipFile(fileobj=file_obj)
        start = file_obj.read(1)
    elif start == b'\x42' and hasattr(bz2, 'open'):
        if check_filename and not file_obj.name.endswith(('.bz2', '.bz', '.bzip')):
            raise ValidationError('{} is bzipped, but lacks a ".bz2" ending'.format(self.name))
        # we can only read BZ2 files in Python 3.3 and above
        file_obj.seek(0)
        patched_name = file_obj.name
        file_obj = bz2.open(file_obj)
        file_obj.name = patched_name
        start = file_obj.read(1)
    elif check_filename and file_obj.name.endswith(('.gz', '.gzip')):
        raise ValidationError('{} is not gzipped but has a ".gz" file extension.'.format(self.name))
    elif check_filename and file_obj.name.endswith(('.bz2', '.bz', '.bzip')):
        raise ValidationError('{} is not bzipped but has a ".bz2" file extension.'.format(self.name))

    # determine if a FASTQ or a FASTA
    if start == b'>':
        self.file_type = 'FASTA'
        if check_filename and not ('.fa' in file_obj.name or '.fna' in file_obj.name or
                                   '.fasta' in file_obj.name):
            raise ValidationError('{} is FASTA, but lacks a ".fa" ending'.format(self.name))
    elif start == b'@':
        self.file_type = 'FASTQ'
        if check_filename and not ('.fq' in file_obj.name or '.fastq' in file_obj.name):
            raise ValidationError('{} is FASTQ, but lacks a ".fq" ending'.format(self.name))
    else:
        raise ValidationError('{} is not valid FASTX'.format(self.name))

    self.file_obj = file_obj

def _set_total_size(self):
    if isinstance(self.file_obj, BytesIO):
        self.file_obj.seek(0)
        self.total_size = len(self.file_obj.read())
        # the first byte was already consumed during file type detection
        self.file_obj.seek(1)
    else:
        try:
            self.total_size = os.fstat(self.file_obj.fileno()).st_size
            if self.total_size < 70:
                raise ValidationError('{} is too small to be analyzed: {} bytes'.format(
                    self.name, self.total_size
                ))
        except IOError:
            pass

    # Set the buffer size: 16MB for files >= 32MB, 16KB otherwise
    if self.total_size >= (1024 * 1024 * 32):
        self.buffer_read_size = 1024 * 1024 * 16  # 16MB
    else:
        self.buffer_read_size = 1024 * 16  # 16KB small chunk

def _validate_record(self, rec):
    # TODO: if there are quality scores, make sure they're in range
    # FIXME: fail if reads aren't interleaved and an override flag isn't passed?
    seq_id, seq, seq_id2, qual = rec['id'], rec['seq'], rec.get('id2', b''), rec.get('qual')
    if not self.validate:
        return seq_id, seq, seq_id2, qual

    if b'\t' in seq_id or b'\t' in seq_id2:
        self._warn_once('{} cannot have tabs in headers; autoreplacing'.format(self.name))
        seq_id = seq_id.replace(b'\t', b'|')
        seq_id2 = seq_id2.replace(b'\t', b'|')

    # Match then search is ~5-10% faster than just searching
    if not bool(self.valid_bases_match.match(seq)):
        chars = b','.join(set(self.valid_bases.findall(seq)))
        raise ValidationError('{} contains non-nucleic acid characters: {}'.format(self.name, chars))

    # Only search for OTHER_BASES if we're allowing them above in the first place
    if self.allow_iupac and OTHER_BASES.search(seq) is not None:
        self._warn_once('Translating other bases in {} (X->N,U->T)'.format(self.name))
        seq = seq.translate(OTHER_BASE_TRANS)

    return seq_id, seq, seq_id2, qual

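# Hedged sketch of the module-level constants referenced above. These are
# assumptions inferred from the "(X->N,U->T)" warning text, not the source
# definitions:
import re

OTHER_BASES = re.compile(b'[xXuU]')
OTHER_BASE_TRANS = bytes.maketrans(b'xXuU', b'nNtT')

assert b'AXU'.translate(OTHER_BASE_TRANS) == b'ANT'
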
def cli(
    ctx,
    classification_id,
    fastx,
    reverse,
    tax_ids,
    with_children,
    subset_pairs_independently,
    exclude_reads,
    include_lowconf,
    out,
    validate,
):
    if ctx.info_name == "filter_reads":
        warnings.warn(
            "filter_reads will be removed in a future version. Please use subset_reads instead!"
        )

    if not len(tax_ids):
        raise OneCodexException("You must supply at least one tax ID")

    # fetch classification result object from API
    classification = ctx.obj["API"].Classifications.get(classification_id)
    if classification is None:
        raise ValidationError("Classification {} not found.".format(classification_id))

    # if --with-children, expand tax_ids by referring to the taxonomic tree
    if with_children:
        tax_id_map = make_taxonomy_dict(classification)
        new_tax_ids = []
        for t_id in tax_ids:
            new_tax_ids.extend(recurse_taxonomy_map(tax_id_map, t_id))
        tax_ids = new_tax_ids

    tax_ids = set(tax_ids)

    # pull the classification result TSV
    tsv_url = classification._readlevel()["url"]
    readlevel_path = get_download_dest("./", tsv_url)
    if not os.path.exists(readlevel_path):
        download_file_helper(tsv_url, "./")
    else:
        click.echo("Using cached read-level results: {}".format(readlevel_path), err=True)

    # count the number of rows in the TSV file
    with gzip.open(readlevel_path, "rt") as tsv:
        try:
            tsv_row_count = 0
            for _ in tsv:
                tsv_row_count += 1
            tsv_row_count -= 1  # discount header line
        except EOFError:
            click.echo(
                "\nWe encountered an error while processing the read "
                "level results. Please delete {} and try again.".format(readlevel_path),
                err=True,
            )
            raise

    if reverse:
        if tsv_row_count % 2 != 0:
            raise ValidationError(
                "Classification results cannot have an odd number of records if using --reverse/-r"
            )
        tsv_row_count = int(tsv_row_count / 2.0)

    # determine the name of the output file(s)
    filtered_filename, ext = get_filtered_filename(fastx)
    filtered_filename = os.path.join(out, filtered_filename)
    if reverse:
        rev_filtered_filename = get_filtered_filename(reverse)[0]
        rev_filtered_filename = os.path.join(out, rev_filtered_filename)

    if ext in {".fa", ".fna", ".fasta"}:
        io_kwargs = {"format": "fasta"}
    elif ext in {".fq", ".fastq"}:
        io_kwargs = {"format": "fastq", "variant": "illumina1.8"}
    else:
        raise OneCodexException(
            "{}: extension must be one of .fa, .fna, .fasta, .fq, .fastq".format(fastx)
        )

    # do the actual filtering
    save_msg = "Saving subsetted reads: {}".format(filtered_filename)
    if reverse:
        save_msg += " and {}".format(rev_filtered_filename)
    click.echo(save_msg, err=True)

    # see mainline/#3513. we must set idx=0 here for cases where the fastx file is empty
    idx = 0
    with click.progressbar(length=tsv_row_count) as bar, gzip.open(readlevel_path, "rt") as tsv:
        reader = csv.DictReader(tsv, delimiter="\t")

        if reverse:
            if not validate and io_kwargs["format"] == "fastq":
                fwd_iter = fastfastq(fastx)
                rev_iter = fastfastq(reverse)
            else:
                fwd_iter = validating_parser(fastx, **io_kwargs)
                rev_iter = validating_parser(reverse, **io_kwargs)

            with io.open(filtered_filename, "wb") as out_file, io.open(rev_filtered_filename, "wb") as rev_out_file:  # noqa
                for idx, (fwd, rev) in enumerate(zip(fwd_iter, rev_iter)):
                    if idx == tsv_row_count:
                        too_many_fastx_records()
                    if idx % 1000 == 0:
                        bar.update(1000)
                    row = next(reader)  # necessary to do it this way for py2 compat
                    row2 = next(reader)
                    if subset_pairs_independently:
                        if include_lowconf:
                            if exclude_reads:
                                if row["Tax ID"] not in tax_ids:
                                    out_file.write(fwd)
                                if row2["Tax ID"] not in tax_ids:
                                    rev_out_file.write(rev)
                            else:
                                if row["Tax ID"] in tax_ids:
                                    out_file.write(fwd)
                                if row2["Tax ID"] in tax_ids:
                                    rev_out_file.write(rev)
                        else:
                            if exclude_reads:
                                if row.get("Passed Filter", "T") == "T" and row["Tax ID"] not in tax_ids:
                                    out_file.write(fwd)
                                if row2.get("Passed Filter", "T") == "T" and row2["Tax ID"] not in tax_ids:
                                    rev_out_file.write(rev)
                            else:
                                if row.get("Passed Filter", "T") == "T" and row["Tax ID"] in tax_ids:
                                    out_file.write(fwd)
                                if row2.get("Passed Filter", "T") == "T" and row2["Tax ID"] in tax_ids:
                                    rev_out_file.write(rev)
                    else:
                        if include_lowconf:
                            if exclude_reads:
                                if row["Tax ID"] not in tax_ids or row2["Tax ID"] not in tax_ids:
                                    out_file.write(fwd)
                                    rev_out_file.write(rev)
                            else:
                                if row["Tax ID"] in tax_ids or row2["Tax ID"] in tax_ids:
                                    out_file.write(fwd)
                                    rev_out_file.write(rev)
                        else:
                            if exclude_reads:
                                if (row.get("Passed Filter", "T") == "T" and row["Tax ID"] not in tax_ids) or (
                                    row2.get("Passed Filter", "T") == "T" and row2["Tax ID"] not in tax_ids
                                ):
                                    out_file.write(fwd)
                                    rev_out_file.write(rev)
                            else:
                                if (row.get("Passed Filter", "T") == "T" and row["Tax ID"] in tax_ids) or (
                                    row2.get("Passed Filter", "T") == "T" and row2["Tax ID"] in tax_ids
                                ):
                                    out_file.write(fwd)
                                    rev_out_file.write(rev)
        else:
            if not validate and io_kwargs["format"] == "fastq":
                fwd_iter = fastfastq(fastx)
            else:
                fwd_iter = validating_parser(fastx, **io_kwargs)

            with io.open(filtered_filename, "wb") as out_file:
                for idx, (fwd, row) in enumerate(zip(fwd_iter, reader)):
                    if idx == tsv_row_count:
                        too_many_fastx_records()
                    if idx % 1000 == 0:
                        bar.update(1000)
                    if include_lowconf:
                        if exclude_reads:
                            if row["Tax ID"] not in tax_ids:
                                out_file.write(fwd)
                        else:
                            if row["Tax ID"] in tax_ids:
                                out_file.write(fwd)
                    else:
                        if exclude_reads:
                            if row.get("Passed Filter", "T") == "T" and row["Tax ID"] not in tax_ids:
                                out_file.write(fwd)
                        else:
                            if row.get("Passed Filter", "T") == "T" and row["Tax ID"] in tax_ids:
                                out_file.write(fwd)

        if idx < tsv_row_count - 1:  # 0-based idx, 1-based tsv_row_count
            raise ValidationError(
                "FASTX file(s) provided have fewer records than the classification results"
            )

        bar.finish()

def cli(ctx, classification_id, fastx, reverse, tax_id, split_pairs, out):
    tax_ids = tax_id  # rename
    if not len(tax_ids):
        raise OneCodexException('You must supply at least one tax ID')

    classification = ctx.obj['API'].Classifications.get(classification_id)
    if classification is None:
        raise ValidationError('Classification {} not found.'.format(classification_id))

    tsv_url = classification.readlevel()['url']
    readlevel_path = get_download_dest('./', tsv_url)
    if not os.path.exists(readlevel_path):
        download_file_helper(tsv_url, './')
    else:
        click.echo('Using cached read-level results: {}'.format(readlevel_path), err=True)

    filtered_rows = []
    tsv_row_count = 0
    with gzip.open(readlevel_path, 'rt') as tsv:
        try:
            tsv_row_count = get_record_count(tsv) - 1  # discount header line
        except EOFError:
            click.echo('\nWe encountered an error while processing the read '
                       'level results. Please delete {} and try again.'.format(readlevel_path),
                       err=True)
            raise
        else:
            tsv.seek(0)
            reader = csv.DictReader(tsv, delimiter='\t')
            click.echo('Selecting results matching tax ID(s): {}'.format(', '.join(tax_ids)),
                       err=True)
            filtered_rows = with_progress_bar(tsv_row_count, filter_rows_by_taxid, reader, tax_ids)

    filtered_filename = get_filtered_filename(fastx)[0]
    filtered_filename = os.path.join(out, filtered_filename)
    if reverse:
        rev_filtered_filename = get_filtered_filename(reverse)[0]
        rev_filtered_filename = os.path.join(out, rev_filtered_filename)

    fastx_record_count = 0
    with open(fastx, 'rb') as fastx_file:
        try:
            fastx_record_count = get_record_count(FASTXTranslator(fastx_file, validate=False))
        except ValidationError as e:
            raise OneCodexException(e.message)

    if reverse:
        fastx_record_count = fastx_record_count * 2

    if tsv_row_count != fastx_record_count:
        os.remove(readlevel_path)
        raise OneCodexException('The supplied file has a different number of '
                                'records than the requested Classification')

    save_msg = 'Saving filtered reads: {}'.format(filtered_filename)
    if reverse:
        save_msg += ' and {}'.format(rev_filtered_filename)
    click.echo(save_msg, err=True)

    counter = 0
    if reverse:
        with open(fastx, 'rb') as fastx_file, \
                open(reverse, 'rb') as reverse_file, \
                open(filtered_filename, 'wb') as out_file, \
                open(rev_filtered_filename, 'wb') as rev_out_file:
            if split_pairs:
                for fwd, rev in FASTXTranslator(fastx_file, reverse_file, validate=False):
                    if counter in filtered_rows:
                        out_file.write(fwd)
                    if (counter + 1) in filtered_rows:
                        rev_out_file.write(rev)
                    counter += 2
            else:
                for fwd, rev in FASTXTranslator(fastx_file, reverse_file, validate=False):
                    if counter in filtered_rows or (counter + 1) in filtered_rows:
                        out_file.write(fwd)
                        rev_out_file.write(rev)
                    counter += 2
    else:
        with open(fastx, 'rb') as fastx_file, open(filtered_filename, 'wb') as out_file:
            for seq in FASTXTranslator(fastx_file, validate=False):
                if counter in filtered_rows:
                    out_file.write(seq)
                counter += 1

def too_many_fastx_records():
    raise ValidationError(
        "FASTX file(s) provided have more records than the classification results"
    )

def _set_pair(self, pair, **kwargs):
    self.reads_pair = FASTXNuclIterator(pair, **kwargs)
    self.reads_pair_iter = iter(self.reads_pair)
    if self.reads.file_type != self.reads_pair.file_type:
        raise ValidationError('Paired read files are different types (FASTA/FASTQ)')