def open2(path1, path2):
    """Open up to two output files for writing with ``xopen``.

    *path2* is only looked at when *path1* is given. A path that is
    ``None`` (or is not opened) yields ``None`` in the returned tuple.

    Returns:
        A ``(file1, file2)`` tuple of open file objects or ``None``.

    Raises:
        Whatever ``xopen`` raises. If opening *path2* fails, the already
        opened first file is closed before the error propagates.
    """
    file1 = file2 = None
    if path1 is not None:
        file1 = xopen(path1, 'w')
        if path2 is not None:
            try:
                file2 = xopen(path2, 'w')
            except Exception:
                # Do not leak the first handle when the second open fails.
                file1.close()
                raise
    return file1, file2
def run_pe_p7_bc(self):
    """Run the index1 step, then the per-file barcode step, on paired-end input.

    Directory structure noted by the original author::

        |--_tmp
        |--index1
        |    |--barcode
        |    |    |--file.fq

    Steps:
        1. ``self.index_pe`` reads ``self.fq1``/``self.fq2`` and writes
           into ``<outdir>/_tmp``.
        2. ``self.wrap_dir`` returns the resulting files; they are sorted
           and every second entry (starting with the first) is handed to
           ``self.run_pe_barcode_single`` in a process pool.
        3. ``self.wrap_read_count`` and ``self.wrap_file`` finish up.
    """
    # step1. index1
    outdir1 = os.path.join(self.outdir, '_tmp')
    with xopen(self.fq1, 'rt') as r1, xopen(self.fq2, 'rt') as r2:
        self.index_pe(r1, r2, outdir1)

    # rename files in dirs/
    flist1 = sorted(self.wrap_dir(outdir1, mode='index1'))  # sort, read1/read2
    # presumably sorting places read1 before read2 for each pair, so the
    # even positions are the read1 files -- verify against wrap_dir
    read1_files = flist1[0::2]

    # step2. index2
    # step3. barcode
    # Pool(processes=0) raises ValueError, so only start a pool when
    # there is at least one file to process.
    if read1_files:
        n_jobs = min(self.parallel_jobs, len(read1_files))  # number of jobs
        with Pool(processes=n_jobs) as pool:
            pool.map(self.run_pe_barcode_single, read1_files)

    # step4. rename files
    self.wrap_read_count()
    self.wrap_file()
def __init__(self, model):
    """Set up F1 tracking state and load the dev-set label matrix.

    Reads the module-level ``args`` for configuration:

    * ``args.print_lr``: when true, builds ``self.lr`` from the
      optimizer's warmup/decay attributes.
    * ``args.label_mapping`` / ``args.dev_all`` / ``args.dev``: select
      which dev file is read and whether a label mapping is loaded.

    The dev file's first line is a JSON pair ``[example_count, label_dim]``;
    for each following line, element ``[1]`` of the decoded JSON is used
    as the column index/indices set to 1 in ``self.all_labels``.
    """
    if args.print_lr:
        optimizer = model.optimizer
        t = K.cast(optimizer.iterations, K.floatx()) + 1
        self.lr = K.switch(
            t <= optimizer.warmup_steps,
            optimizer.lr * (t / optimizer.warmup_steps),
            optimizer.min_lr + (optimizer.lr - optimizer.min_lr)
            * (1.0 - K.minimum(t, optimizer.decay_steps) / optimizer.decay_steps),
        )

    self.best_f1 = 0
    self.best_f1_epoch = 0
    self.best_f1_threshold = 0

    use_mapping = args.label_mapping is not None
    file_name = args.dev_all if use_mapping else args.dev
    with xopen(file_name, "rt") as f:
        example_count, label_dim = json.loads(f.readline())
        self.all_labels = lil_matrix((example_count, label_dim), dtype='b')
        for i, line in tqdm(enumerate(f), desc="Reading dev labels"):
            self.all_labels[i, json.loads(line)[1]] = 1
    print("Dev labels shape:", self.all_labels.shape)

    # Guard on the file that is actually opened. The original tested
    # ``args.dev_all`` here, which tried to open ``None`` when only
    # ``dev_all`` was configured.
    if use_mapping:
        with xopen(args.label_mapping) as f:
            self.labels_mapping = json.load(f)
def open2(path1, path2):
    """Open up to two binary output files with ``xopen``.

    Both files are opened in ``'wb'`` mode with the ``compression_level``
    taken from the enclosing scope. *path2* is only looked at when
    *path1* is given. A path that is ``None`` (or is not opened) yields
    ``None`` in the returned tuple.

    Returns:
        A ``(file1, file2)`` tuple of open file objects or ``None``.

    Raises:
        Whatever ``xopen`` raises. If opening *path2* fails, the already
        opened first file is closed before the error propagates.
    """
    file1 = file2 = None
    if path1 is not None:
        file1 = xopen(path1, 'wb', compresslevel=compression_level)
        if path2 is not None:
            try:
                file2 = xopen(path2, 'wb', compresslevel=compression_level)
            except Exception:
                # Do not leak the first handle when the second open fails.
                file1.close()
                raise
    return file1, file2
def _random_generate(self):
    """Generate sorted random number tables.

    For bootstrap 0 a ``.randtable.gz`` file is written per chromosome in
    which every line id gets a count of 1. Then, for each of
    ``self.no_of_bootstraps`` bootstraps, ``self.total_no_of_processed_reads``
    read ids are drawn with replacement and, per chromosome, every line
    whose read was drawn at least once is written with its draw count.

    The per-bootstrap seeds are derived from ``RANDOM_SEED``, so repeated
    runs produce the same tables.
    """
    table_suffix = '.{0}.bootstrap_{1:02d}.randtable.gz'

    # Bootstrap 0: the identity table (every line counted once).
    for chrom, no_of_reads in self.no_of_reads_by_chromosome:
        with xopen(self.out_prefix + table_suffix.format(chrom, 0), 'wb') as fw:
            for line_id, read_id in enumerate(self.all_line_id_to_read_id[chrom]):
                fw.write("{0}\t{1}\n".format(line_id, 1).encode('utf-8'))

    # Draw all seeds up front so each bootstrap is reproducible on its own.
    random.seed(a=RANDOM_SEED, version=2)
    bootstrap_seeds = [
        random.randrange(self.total_no_of_processed_reads)
        for _ in range(self.no_of_bootstraps)
    ]

    for i in range(self.no_of_bootstraps):
        # The original message used {0} twice, printing the timestamp as
        # the bootstrap number and the bootstrap number as the BAM file.
        logging.debug(
            "[{0}] Generating random number tables for bootstrap {1}, BAM file {2}".format(
                time.ctime(), i + 1, self.in_bam))
        random.seed(a=bootstrap_seeds[i], version=2)
        # count_table[read_id] = how often that read was drawn.
        count_table = array.array('l', [0] * self.total_no_of_processed_reads)
        for _ in range(self.total_no_of_processed_reads):
            count_table[random.randrange(self.total_no_of_processed_reads)] += 1

        for chrom, no_of_reads in self.no_of_reads_by_chromosome:
            with xopen(self.out_prefix + table_suffix.format(chrom, i + 1), mode='wb') as fw:
                for line_id, read_id in enumerate(self.all_line_id_to_read_id[chrom]):
                    if count_table[read_id]:
                        fw.write("{0}\t{1}\n".format(line_id, count_table[read_id]).encode('utf-8'))
def concat(self):
    """Concatenate each group of split files into its target file.

    ``self.concat_dict`` maps a target path to the list of source paths
    whose lines are appended, in order, to the target (opened in ``'wb'``
    mode with ``compresslevel=9``).
    """
    for target_path, part_paths in self.concat_dict.items():
        with xopen(target_path, 'wb', compresslevel=9) as merged:
            emit = merged.write
            for part_path in part_paths:
                with xopen(part_path, 'rb') as part:
                    for raw_line in part:
                        emit(raw_line)
def run(self):
    """Read chunks from the input file(s) and hand them to the workers.

    When ``self.stdin_fd`` is not -1, ``sys.stdin`` is replaced by a file
    object for that descriptor first. After all chunks have been sent,
    one poison pill (-1) is sent per connection, each to the worker index
    read from ``self.queue``. On any exception, every connection receives
    -2 followed by ``(exception, formatted traceback)``.
    """
    if self.stdin_fd != -1:
        sys.stdin.close()
        sys.stdin = os.fdopen(self.stdin_fd)
    try:
        with xopen(self.file, 'rb') as first:
            if not self.file2:
                single_chunks = dnaio.read_chunks(first, self.buffer_size)
                for index, data in enumerate(single_chunks):
                    self.send_to_worker(index, data)
            else:
                with xopen(self.file2, 'rb') as second:
                    paired_chunks = dnaio.read_paired_chunks(
                        first, second, self.buffer_size)
                    for index, (data1, data2) in enumerate(paired_chunks):
                        self.send_to_worker(index, data1, data2)

        # Send poison pills to all workers
        for _ in self.connections:
            ready_worker = self.queue.get()
            self.connections[ready_worker].send(-1)
    except Exception as e:
        # TODO better send this to a common "something went wrong" Queue
        for conn in self.connections:
            conn.send(-2)
            conn.send((e, traceback.format_exc()))
def test_append():
    """Appending twice in binary mode yields the concatenated content.

    Runs for plain and gzip files, plus bz2 (Python 3 only) and xz when
    the respective modules are available.
    """
    extensions = ["", ".gz"]
    # BZ2 does NOT support append in Py 2.
    if bz2 and sys.version_info > (3,):
        extensions.append(".bz2")
    if lzma:
        extensions.append(".xz")

    for ext in extensions:
        # On Py3, need to send BYTES, not unicode. Let's do it for all.
        text = "AB".encode("utf-8")
        reference = text + text
        with temporary_path('truncated.fastq' + ext) as path:
            # Start from a clean slate; a missing file is fine.
            try:
                os.unlink(path)
            except OSError:
                pass
            for _ in range(2):
                with xopen(path, 'ab') as f:
                    f.write(text)
            with xopen(path, 'r') as f:
                # Keep only the last line of the file.
                for appended in f:
                    pass
            try:
                reference = reference.decode("utf-8")
            except AttributeError:
                pass
            assert appended == reference
def fq_merge(self, fout, qlist):
    """Merge several fastq files into one output file.

    Each path in *qlist* is opened in ``'rb'`` mode and copied, in order,
    into *fout* (opened in ``'wb'`` mode).
    """
    with xopen(fout, 'wb') as merged:
        for source_path in qlist:
            with xopen(source_path, 'rb') as source:
                shutil.copyfileobj(source, merged)
def compress_output(self, f_in):
    """Copy *f_in* to ``<self.outdir>/<basename of f_in>.gz``.

    Both files are opened through ``xopen``; the destination path is
    logged before copying.
    """
    gz_name = os.path.basename(f_in) + '.gz'
    f_out = os.path.join(self.outdir, gz_name)
    log.info('Saving file: {}'.format(f_out))
    # pigz faster than gzip
    with xopen(f_in, 'rb') as source, xopen(f_out, 'wb') as target:
        shutil.copyfileobj(source, target)
def run(self, processes=8):
    """Run all precompute jobs in a pool, then merge their split outputs.

    Every object in ``self.precompute_thread_list`` has its ``run``
    method submitted to a ``multiprocessing.Pool`` of *processes* workers
    and is waited for. Afterwards each entry of ``self.concat_dict`` is
    concatenated into its target file and the split files are removed.
    """
    with multiprocessing.Pool(processes) as pool:
        pending = []
        for job in self.precompute_thread_list:
            pending.append(pool.apply_async(job.run, args=()))
        # Wait for every job; .get() re-raises a worker's exception here.
        for handle in pending:
            handle.get()

    for merged_path, part_paths in self.concat_dict.items():
        with xopen(merged_path, 'wb', compresslevel=9) as merged:
            emit = merged.write
            for part_path in part_paths:
                with xopen(part_path, 'rb') as part:
                    for raw_line in part:
                        emit(raw_line)
                # The part is fully copied and closed; drop it.
                os.remove(part_path)
def run_pe_bc(self):
    """Run the barcode step on paired-end input and finalize the outputs.

    ``self.barcode_pe`` reads ``self.fq1``/``self.fq2`` and writes into
    ``<outdir>/_tmp``; ``self.wrap_dir`` is then called in ``'barcode'``
    mode, followed by ``self.wrap_read_count`` and ``self.wrap_file``.
    """
    # step1. index1
    tmp_dir = os.path.join(self.outdir, '_tmp')
    with xopen(self.fq1, 'rt') as mate1, xopen(self.fq2, 'rt') as mate2:
        self.barcode_pe(mate1, mate2, tmp_dir)

    # rename files in dirs/ (the returned file list is not needed here)
    self.wrap_dir(tmp_dir, mode='barcode')

    # save files
    self.wrap_read_count()
    self.wrap_file()
def test_append_text(ext, tmp_path):
    """Appending text twice yields the concatenated text on read-back."""
    piece = "AB"
    expected = piece + piece
    target = tmp_path / f"the-file{ext}"
    for _ in range(2):
        with xopen(target, "at") as handle:
            handle.write(piece)
    with xopen(target, "rt") as handle:
        # Keep only the last line of the file.
        for appended in handle:
            pass
    assert appended == expected
def test_append_text(ext, tmpdir):
    """Appending text twice yields the concatenated text on read-back."""
    piece = "AB"
    expected = piece + piece
    target = str(tmpdir.join("the-file" + ext))
    for _ in range(2):
        with xopen(target, "at") as handle:
            handle.write(piece)
    with xopen(target, "rt") as handle:
        # Keep only the last line of the file.
        for appended in handle:
            pass
    assert appended == expected
def test_write_with_xopen(tmp_path, fileformat, extension):
    """A record written via dnaio into an xopen handle reads back intact."""
    out_fastq = tmp_path / ("out." + fileformat + extension)
    record = dnaio.SequenceRecord('name', 'ACGT', 'HHHH')
    with xopen(out_fastq, 'wb') as raw_out:
        with dnaio.open(raw_out, mode='w', fileformat=fileformat) as writer:
            writer.write(record)

    if fileformat == "fasta":
        expected = ">name\nACGT\n"
    else:
        expected = "@name\nACGT\n+\nHHHH\n"
    with xopen(out_fastq) as result:
        assert result.read() == expected
def write_fastq_multi(fastq_list, outputfile, compressed=True):
    """Append paired reads to ``<outputfile>.1.fastq`` / ``.2.fastq``.

    Each item of *fastq_list* provides the text for file 1 at index 0 and
    for file 2 at index 1. With *compressed* (the default) the files get
    a ``.gz`` suffix, are opened through ``xopen`` in ``'ab'`` mode and
    receive the encoded text; otherwise plain files are appended to.
    """
    if not compressed:
        with open(outputfile + ".1.fastq", "a") as out1, \
                open(outputfile + ".2.fastq", "a") as out2:
            for pair in fastq_list:
                out1.write(pair[0])
                out2.write(pair[1])
        return

    with xopen(outputfile + ".1.fastq.gz", "ab") as out1, \
            xopen(outputfile + ".2.fastq.gz", "ab") as out2:
        for pair in fastq_list:
            out1.write(pair[0].encode())
            out2.write(pair[1].encode())
def test_append(ext, tmpdir):
    """Appending bytes twice yields the concatenated content on read-back."""
    piece = b"AB"
    target = str(tmpdir.join("the-file" + ext))
    for _ in range(2):
        with xopen(target, "ab") as handle:
            handle.write(piece)
    with xopen(target, "r") as handle:
        # Keep only the last line of the file.
        for appended in handle:
            pass
    # The file is read back in text mode, so compare against text.
    expected = (piece + piece).decode("utf-8")
    assert appended == expected
def test_append(ext, tmp_path):
    """Appending bytes twice yields the concatenated content on read-back."""
    piece = b"AB"
    target = tmp_path / f"the-file{ext}"
    for _ in range(2):
        with xopen(target, "ab") as handle:
            handle.write(piece)
    with xopen(target, "r") as handle:
        # Keep only the last line of the file.
        for appended in handle:
            pass
    # The file is read back in text mode, so compare against text.
    expected = (piece + piece).decode("utf-8")
    assert appended == expected
def reader_process(file, file2, connections, queue, buffer_size, stdin_fd):
    """
    Read chunks of FASTA or FASTQ data from *file* (and *file2*, if given)
    and distribute them to workers.

    queue -- a Queue of worker indices. A worker writes its own index into
        this queue to notify the reader that it is ready to receive more data.
    connections -- a list of Connection objects, one for each worker.
    stdin_fd -- if not -1, sys.stdin is re-opened from this descriptor first.

    For every chunk, a worker index is taken from the queue and the chunk
    index followed by the chunk bytes is sent over that worker's
    connection. Finally a "poison pill" (the value -1) is sent once per
    connection. On any exception, each connection receives -2 followed by
    a tuple of the exception and the formatted traceback.
    """
    if stdin_fd != -1:
        sys.stdin.close()
        sys.stdin = os.fdopen(stdin_fd)

    def dispatch(index, *payloads):
        # Determine the worker that should get this chunk
        target = connections[queue.get()]
        target.send(index)
        for payload in payloads:
            target.send_bytes(payload)

    try:
        with xopen(file, 'rb') as first:
            if not file2:
                for index, data in enumerate(dnaio.read_chunks(first, buffer_size)):
                    dispatch(index, data)
            else:
                with xopen(file2, 'rb') as second:
                    paired = dnaio.read_paired_chunks(first, second, buffer_size)
                    for index, (data1, data2) in enumerate(paired):
                        dispatch(index, data1, data2)

        # Send poison pills to all workers
        for _ in connections:
            connections[queue.get()].send(-1)
    except Exception as e:
        # TODO better send this to a common "something went wrong" Queue
        for conn in connections:
            conn.send(-2)
            conn.send((e, traceback.format_exc()))
def import_contigs(contigs_path):
    """Import raw contigs from a FASTA file.

    Each record's sequence is upper-cased and validated against
    ``FASTA_DNA_SEQUENCE_PATTERN``.

    Args:
        contigs_path: Path to the (optionally compressed) FASTA file; it
            is converted with ``str()`` before being passed to ``xopen``.

    Returns:
        A list of contig dicts with the keys ``id``, ``description``,
        ``sequence``, ``length``, ``complete``, ``type`` and ``topology``.

    Raises:
        ValueError: If a sequence does not fully match
            ``FASTA_DNA_SEQUENCE_PATTERN``.
    """
    contigs = []
    with xopen(str(contigs_path), threads=0) as fh:
        for record in SeqIO.parse(fh, 'fasta'):
            seq = str(record.seq).upper()
            if FASTA_DNA_SEQUENCE_PATTERN.fullmatch(seq) is None:
                # The original call had a %s placeholder but no argument,
                # so the offending id never made it into the log.
                log.error(
                    'import: Fasta sequence contains invalid DNA characters! id=%s',
                    record.id
                )
                raise ValueError(
                    f'Fasta sequence contains invalid DNA characters! id={record.id}'
                )
            contig = {
                'id': record.id,
                'description': record.description,
                'sequence': seq,
                'length': len(seq),
                'complete': False,
                'type': bc.REPLICON_CONTIG,
                'topology': bc.TOPOLOGY_LINEAR
            }
            log.info(
                'imported: id=%s, length=%i, complete=%s, topology=%s, description=%s',
                contig['id'], contig['length'], contig['complete'],
                contig['topology'], contig['description']
            )
            contigs.append(contig)
    return contigs
def test_has_iter_method(ext, tmp_path):
    """A file opened for writing exposes an ``__iter__`` attribute."""
    target = tmp_path / f"out{ext}"
    with xopen(target, mode="w") as handle:
        # Writing anything isn’t strictly necessary, but if we don’t, then
        # pbzip2 causes a delay of one second
        handle.write("hello")
        assert hasattr(handle, "__iter__")
def check_haplotag_list_information(haplotag_list, exit_stack):
    """
    Open the haplotag list and pick a line parser from its first line.

    The first line is split on tabs: with at least 4 columns (assumed to
    be read name, haplotype, phaseset, chromosome) the four-column parser
    is chosen and chromosome information is reported as present; with at
    least 2 columns the two-column parser is chosen. The file is rewound
    afterwards so that a header-less file is processed from its first line.

    :param haplotag_list: Tab-separated file with at least 2 or 4 columns
    :param exit_stack: context stack the opened file is registered with
    :return: tuple of (open file, has_chrom_info, line parser)
    :raises ValueError: if the first line has fewer than 2 tab-separated columns
    """
    haplo_list = exit_stack.enter_context(xopen(haplotag_list))
    first_line = haplo_list.readline().strip()
    # rewind to make sure a header-less file is processed correctly
    haplo_list.seek(0)

    column_count = len(first_line.split("\t"))
    if column_count >= 4:
        return haplo_list, True, _four_column_parser
    if column_count >= 2:
        return haplo_list, False, _two_column_parser
    raise ValueError(
        "First line of haplotag list file does not have "
        "at least 2 columns, or it is not tab-separated: {}".format(first_line)
    )
def __init__(self, file):
    """Wrap *file* for writing.

    A ``str`` is treated as a path and opened with ``xopen`` in ``'w'``
    mode; ``_close_on_exit`` is then ``True``. Any other object is stored
    unchanged and ``_close_on_exit`` is ``False``.
    """
    opened_here = isinstance(file, str)
    self._file = xopen(file, 'w') if opened_here else file
    self._close_on_exit = opened_here
def test_readinto(fname):
    """``readinto`` fills a buffer with the file's content and returns its size."""
    expected = CONTENT.encode("utf-8")
    # Oversize the buffer so a single call can take the whole content.
    buffer = bytearray(len(expected) + 100)
    with xopen(fname, "rb") as handle:
        n_read = handle.readinto(buffer)
    assert n_read == len(expected)
    assert buffer[:n_read] == expected
def identify(input_files: Tuple, output: os.PathLike = "duplicates.json"):
    """
    Identifies fragments with duplicated sequences.

    Merges the hashed dictionaries (in json format) generated by the "parse" subcommand and
    identifies read with exactly the same sequence (share an identical hash). Duplicated read
    identifiers (hashed) are output in json format. The "remove" subcommand uses this dictionary
    to remove duplicates from fastq files.

    \f
    Args:
     input_files (Tuple): Paths to json files containing dictionaries with hashed read ids as the keys
                          and hashed sequences as the values.
     output (os.PathLike, optional): Duplicate read ids identified. Defaults to "duplicates.json".
    """
    # NOTE(review): this shuffles a temporary array, not ``input_files``,
    # so the iteration order below is unaffected -- confirm the intent.
    np.random.shuffle(np.array(input_files))

    all_read_ids = set()
    read_id_by_sequence = dict()
    for path in input_files:
        hashes = load_json(path)  # {READ_NAME_HASH: SEQUENCE_HASH}
        all_read_ids.update(hashes)
        # {SEQUENCE_HASH: READ_NAME_HASH}; later files overwrite earlier ones
        read_id_by_sequence.update(invert_dict(hashes))

    # Every read id that is not the retained one for its sequence.
    duplicated_ids = all_read_ids - set(read_id_by_sequence.values())

    # Release the large intermediates before serializing.
    del all_read_ids
    del read_id_by_sequence

    with xopen(output, "w") as sink:
        ujson.dump(dict.fromkeys(duplicated_ids), sink)
def align_query_genome(config, dna_fragments_path, dna_fragments, ref_genome_id):
    """Perform per-genome calculation of ANI/conserved DNA values.

    The reference genome ``<ref_genome_id>.fna.gz`` is unpacked from
    ``config['db_path']`` into a fresh temporary directory, the fragments
    are aligned against it via ``execute_nucmer`` and the temporary
    directory is removed again, also when an error occurs.

    :param config: a global config object encapsulating global runtime vars
    :param dna_fragments_path: path of the fragments file, passed through
        to ``execute_nucmer``.
    :param dna_fragments: A dict comprising information on fragments.
    :param ref_genome_id: reference genome id.

    :rtype: A tuple ``(ref_genome_id, ani, conserved_dna)``.
    """
    tmp_dir = Path(tempfile.mkdtemp())
    try:
        reference_genome_zipped_path = config['db_path'].joinpath(
            f'{ref_genome_id}.fna.gz')
        reference_genome_path = tmp_dir.joinpath(f'{ref_genome_id}.fna')
        with reference_genome_path.open(mode='w') as fh_out, xopen(
                str(reference_genome_zipped_path), threads=0) as fh_in:
            fh_out.writelines(fh_in)

        dna_fragment_matches = execute_nucmer(
            config, tmp_dir, dna_fragments, dna_fragments_path,
            reference_genome_path)
    finally:
        # Always clean up, otherwise a failing alignment leaves the
        # unpacked genome behind in the temp directory.
        shutil.rmtree(str(tmp_dir))

    ani = calculate_ani(dna_fragment_matches)
    conserved_dna = calculate_conserved_dna(dna_fragments, dna_fragment_matches)
    return (ref_genome_id, ani, conserved_dna)
def test_override_output_format(tmp_path):
    """``format="gz"`` produces gzip output even without a ``.gz`` suffix."""
    target = tmp_path / "test_gzip_compressed"
    payload = b"test"
    with xopen(target, mode="wb", format="gz") as handle:
        handle.write(payload)
    raw = target.read_bytes()
    assert raw.startswith(b"\x1f\x8b")  # Gzip magic
    assert gzip.decompress(raw) == payload
def test_truncated_iter(extension, create_truncated_file):
    """Iterating over a truncated file raises ``EOFError`` or ``IOError``."""
    broken_path = create_truncated_file(extension)
    with pytest.raises((EOFError, IOError)):
        handle = xopen(broken_path, "r")
        for _ in handle:
            pass
        handle.close()  # pragma: no cover
def test_truncated_gz():
    """Read a truncated ``.gz`` file completely under a 2-second timeout."""
    with temporary_path('truncated.gz') as broken_path:
        create_truncated_file(broken_path)
        with timeout(seconds=2):
            handle = xopen(broken_path, 'r')
            handle.read()
            handle.close()
def make_random_fasta(path, n_records):
    """Write *n_records* random FASTA records to *path*.

    Records are named ``sequence_<i>`` and each consists of 300 bases
    drawn from ``ACGT``.
    """
    from random import choice

    with xopen(path, "w") as out:
        for index in range(n_records):
            header = "sequence_{}".format(index)
            bases = [choice("ACGT") for _ in range(300)]
            print(">", header, "\n", "".join(bases), sep="", file=out)
def reader_process(file, file2, connections, queue, buffer_size, stdin_fd):
    """
    Read chunks of FASTA or FASTQ data from *file* (and *file2*, if given)
    and distribute them to workers.

    queue -- a Queue of worker indices. A worker writes its own index into
        this queue to notify the reader that it is ready to receive more data.
    connections -- a list of Connection objects, one for each worker.
    stdin_fd -- if not -1, sys.stdin is re-opened from this descriptor first.

    For every chunk, a worker index is taken from the queue and the chunk
    index followed by the chunk bytes is sent over that worker's
    connection. Finally a "poison pill" (the value -1) is sent once per
    connection. On any exception, each connection receives -2 followed by
    a tuple of the exception and the formatted traceback.
    """
    if stdin_fd != -1:
        sys.stdin.close()
        sys.stdin = os.fdopen(stdin_fd)

    def dispatch(index, *payloads):
        # Determine the worker that should get this chunk
        target = connections[queue.get()]
        target.send(index)
        for payload in payloads:
            target.send_bytes(payload)

    try:
        with xopen(file, 'rb') as first:
            if not file2:
                for index, data in enumerate(dnaio.read_chunks(first, buffer_size)):
                    dispatch(index, data)
            else:
                with xopen(file2, 'rb') as second:
                    paired = dnaio.read_paired_chunks(first, second, buffer_size)
                    for index, (data1, data2) in enumerate(paired):
                        dispatch(index, data1, data2)

        # Send poison pills to all workers
        for _ in connections:
            connections[queue.get()].send(-1)
    except Exception as e:
        # TODO better send this to a common "something went wrong" Queue
        for conn in connections:
            conn.send(-2)
            conn.send((e, traceback.format_exc()))
def __init__(self, file):
    """
    file is a path or a file-like object. A path is opened with xopen
    and ``_close_on_exit`` is set to True; a file-like object is stored
    as given. In both cases, the file may be compressed (.gz, .bz2, .xz).
    """
    if not isinstance(file, basestring):
        self._file = file
        return
    handle = xopen(file)
    self._close_on_exit = True
    self._file = handle
def __init__(self, file, colorspace=False, skip_color=0):
    """
    file is a filename or a file-like object.
    If file is a filename, it is opened with xopen, so .gz files are
    supported.

    colorspace -- Usually (when this is False), there must be n characters
        in the sequence and n quality values. When this is True, there must
        be n+1 characters in the sequence and n quality values.
    skip_color -- stored unchanged as ``self.skip_color``.
    """
    source = xopen(file, "r") if isinstance(file, basestring) else file
    self.fp = source
    self.colorspace = colorspace
    self.skip_color = skip_color
    self.twoheaders = False
def __init__(self, file, wholefile=False, keep_linebreaks=False):
    """
    file is a filename or a file-like object.
    If file is a filename, it is opened with xopen, so .gz files are
    supported.

    wholefile -- if True, it is ok to read the entire file into memory.
        This is faster when there are many newlines in the file, but may
        obviously need a lot of memory.
    keep_linebreaks -- whether to keep the newline characters in the
        sequence; cannot be combined with wholefile.
    """
    source = xopen(file, "r") if isinstance(file, basestring) else file
    self.fp = source
    self.wholefile = wholefile
    self.keep_linebreaks = keep_linebreaks
    both_requested = wholefile and keep_linebreaks
    assert not both_requested, "not supported"
def parse(fname):
    """Parse a multi-record fasta file and yield ``Fasta`` objects.

    Blank lines are skipped. A line starting with ``>`` begins a new
    record whose name is the rest of that line; all other lines are
    joined to form the record's sequence.

    The file is opened with ``xopen`` and closed again when the generator
    is exhausted, closed or garbage-collected.
    """
    name = ''
    seq = []
    handle = xopen(fname, 'r')
    try:
        for line in handle:
            line = line.strip()
            if not line:
                continue
            if line.startswith('>'):
                # A new header ends the record collected so far.
                if name or seq:
                    yield Fasta(name, ''.join(seq))
                name = line[1:]
                seq = []
            else:
                seq.append(line)
        # Emit the final record, which no following header terminates.
        if name or seq:
            yield Fasta(name, ''.join(seq))
    finally:
        # The original never closed the handle.
        handle.close()
def parse(qseqfile, fmt="I"):
    """Parse a qseq file and yield one ``Fastq`` record per non-blank line.

    Each line must consist of exactly 11 tab-separated fields (machine,
    run id, lane, tile, x, y, index, read id, sequence, quality, filter).
    The record name is built as
    ``mach:runid:lane:tile:x:y readid:fil:index`` where ``fil`` is ``N``
    when the filter field is ``1`` and ``Y`` otherwise.

    fmt -- quality format code (case-insensitive); when it is in
        ``PHRED64_FORMAT`` the quality string is translated with the
        table returned by ``phred64to33()``.

    The file is closed when the generator is exhausted, closed or
    garbage-collected.
    """
    fmt = fmt.upper()
    # NOTE(review): the file is opened in binary mode but lines are split
    # on a str tab; that only works where bytes and str are the same type
    # (Python 2) -- confirm the supported interpreter.
    handle = xopen(qseqfile, "rb")
    try:
        table_64_to_33 = phred64to33()
        needs_translation = fmt in PHRED64_FORMAT
        for line in handle:
            line = line.strip()
            if not line:
                continue
            (mach, runid, lane, tile, x, y, index, readid, seq,
             qual, fil) = line.split("\t")
            # if fil value is 1 pass filter, 0 not
            fil = "N" if fil == "1" else "Y"
            if needs_translation:
                # trans phred64 quality to phred33 quality
                qual = qual.translate(table_64_to_33)
            name = "{0}:{1}:{2}:{3}:{4}:{5} {6}:{7}:{8}".format(
                mach, runid, lane, tile, x, y, readid, fil, index)
            yield Fastq(name, seq, qual)
    finally:
        # The original never closed the handle.
        handle.close()
def parse(fname, qtype='S'):
    """Parse a fastq file and yield ``Fastq`` records.

    Multi-line sequence and quality blocks are supported: a record is
    only considered complete when the accumulated quality length equals
    the accumulated sequence length, so a quality line that happens to
    start with ``@`` is not mistaken for the next header.

    qtype -- quality type code; when it is in ``PHRED64_TYPE`` each
        quality string is passed through ``phred64to33`` before the
        record is yielded.

    Raises ValueError when the first non-blank, non-``#`` line does not
    start with ``@``, or when sequence and quality lengths disagree.

    An empty file yields nothing. The file is closed when the generator
    is exhausted, closed or garbage-collected.
    """
    seq = ''
    qual = ''
    name = ''
    # NOTE: slen/qlen are running totals over the whole file; they are
    # not reset per record, so the checks below compare cumulative sums.
    slen = qlen = 0
    if qtype in PHRED64_TYPE:
        need_trans = True
        trans = phred64to33
    else:
        need_trans = False
    is_seq_block = False  # True as Seq block, False Qual block
    handle = xopen(fname, 'r')
    try:
        # Pre-set so a completely empty file hits the "empty" return
        # below instead of raising UnboundLocalError.
        line = ''
        # read head lines to check is or not fastq file
        for line in handle:
            line = line.rstrip()
            if not line or line.startswith('#'):
                continue
            if not line.startswith('@'):
                raise ValueError('{0} is not in fastq format'.format(fname))
            break  # quit cycle
        # check is a empty fastq file or not
        if not line:
            return
        is_seq_block = True
        name = line[1:]
        for line in handle:
            line = line.rstrip()  # trim right endof \n \r
            if not line:  # ignore blank line
                continue
            if is_seq_block:  # deal with seq block
                if line.startswith('+'):  # next is qual block
                    is_seq_block = False
                else:  # deal with seq block
                    seq += line
                    slen += len(line)
            else:  # deal with quality block
                if qlen > slen:  # check qual length <= seq length
                    raise ValueError('Error while Parsing {0}'.format(name))
                if line.startswith('@'):  # switch to sequence block
                    # at beginning of next fastq
                    if seq and slen == qlen:
                        if need_trans:
                            qual = trans(qual)
                        yield Fastq(name, seq, qual)
                        seq = ''
                        qual = ''
                        name = line[1:]
                        is_seq_block = True  # next is seq block
                    elif not seq and not qual:  # start to generate fastq
                        name = line[1:]
                        is_seq_block = True  # next is seq block
                    else:  # just a qual line begin with @
                        qual += line  # renew quality value
                        qlen += len(line)
                else:
                    qual += line
                    qlen += len(line)
        # yield last fastq record
        if name or seq:
            if slen != qlen:  # check the last fastq record
                raise ValueError('parsing wrong with {0}'.format(name))
            if need_trans:  # trans qual
                qual = trans(qual)
            yield Fastq(name, seq, qual)
    finally:
        # The original never closed the handle.
        handle.close()
def main(cmdlineargs=None, default_outfile=sys.stdout):
    """
    Main function that evaluates command-line parameters and iterates
    over all reads.

    cmdlineargs -- list of command-line arguments; ``sys.argv[1:]`` is used
        when this is None.
    default_outfile is the file to which trimmed reads are sent if the ``-o``
    parameter is not used.

    The function validates the option combination (reporting problems via
    ``parser.error``), opens the input reader and all output writers,
    builds the list of filters and the per-read modifier pipelines, runs
    the pipeline, closes the opened files and finally prints the report
    unless ``--quiet`` was given.

    Exits with status 130 on KeyboardInterrupt, with status 1 on a broken
    pipe (EPIPE), and with an error message on ``seqio.FormatError`` or
    ``EOFError`` raised while the pipeline runs.
    """
    parser = get_option_parser()
    if cmdlineargs is None:
        cmdlineargs = sys.argv[1:]
    options, args = parser.parse_args(args=cmdlineargs)
    # Setup logging only if there are not already any handlers (can happen when
    # this function is being called externally such as from unit tests)
    if not logging.root.handlers:
        setup_logging(stdout=bool(options.output), quiet=options.quiet)

    if len(args) == 0:
        parser.error("At least one parameter needed: name of a FASTA or FASTQ file.")
    elif len(args) > 2:
        parser.error("Too many parameters.")
    input_filename = args[0]
    if input_filename.endswith('.qual'):
        parser.error("If a .qual file is given, it must be the second argument.")

    # Find out which 'mode' we need to use.
    # Default: single-read trimming (neither -p nor -A/-G/-B/-U/--interleaved given)
    # `paired` is tri-state: False (single-end), 'first' (legacy) or 'both'.
    paired = False
    if options.paired_output:
        # Modify first read only, keep second in sync (-p given, but not -A/-G/-B/-U).
        # This exists for backwards compatibility ('legacy mode').
        paired = 'first'
    # Any of these options switch off legacy mode
    if (options.adapters2 or options.front2 or options.anywhere2 or
        options.cut2 or options.interleaved or options.pair_filter or
        options.too_short_paired_output or options.too_long_paired_output):
        # Full paired-end trimming when both -p and -A/-G/-B/-U given
        # Read modifications (such as quality trimming) are applied also to second read.
        paired = 'both'

    if paired and len(args) == 1 and not options.interleaved:
        parser.error("When paired-end trimming is enabled via -A/-G/-B/-U/"
            "--interleaved or -p, two input files are required.")
    if not paired:
        if options.untrimmed_paired_output:
            parser.error("Option --untrimmed-paired-output can only be used when "
                "trimming paired-end reads (with option -p).")

    # With --interleaved, input is interleaved when only one input file is
    # given and output is interleaved when no -p file is given.
    interleaved_input = False
    interleaved_output = False
    if options.interleaved:
        interleaved_input = len(args) == 1
        interleaved_output = not options.paired_output
        if not interleaved_input and not interleaved_output:
            parser.error("When --interleaved is used, you cannot provide both two input files and two output files")

    # Assign input_paired_filename and quality_filename
    input_paired_filename = None
    quality_filename = None
    if paired:
        if not interleaved_input:
            input_paired_filename = args[1]
        if not interleaved_output:
            if not options.paired_output:
                parser.error("When paired-end trimming is enabled via -A/-G/-B/-U, "
                    "a second output file needs to be specified via -p (--paired-output).")
            if not options.output:
                parser.error("When you use -p or --paired-output, you must also "
                    "use the -o option.")

        if bool(options.untrimmed_output) != bool(options.untrimmed_paired_output):
            parser.error("When trimming paired-end reads, you must use either none "
                "or both of the --untrimmed-output/--untrimmed-paired-output options.")
        if options.too_short_output and not options.too_short_paired_output:
            parser.error("When using --too-short-output with paired-end "
                "reads, you also need to use --too-short-paired-output")
        if options.too_long_output and not options.too_long_paired_output:
            parser.error("When using --too-long-output with paired-end "
                "reads, you also need to use --too-long-paired-output")
    elif len(args) == 2:
        # Single-end with two arguments: the second one is a quality file.
        quality_filename = args[1]
        if options.format is not None:
            parser.error("If a pair of .fasta and .qual files is given, the -f/--format parameter cannot be used.")

    if options.format is not None and options.format.lower() not in ['fasta', 'fastq', 'sra-fastq']:
        parser.error("The input file format must be either 'fasta', 'fastq' or "
            "'sra-fastq' (not '{0}').".format(options.format))

    # Open input file(s)
    try:
        reader = seqio.open(input_filename, file2=input_paired_filename,
                qualfile=quality_filename, colorspace=options.colorspace,
                fileformat=options.format, interleaved=interleaved_input)
    except (seqio.UnknownFileType, IOError) as e:
        parser.error(e)

    # Quality cutoff is "N" or "M,N"; a single value N becomes [0, N].
    if options.quality_cutoff is not None:
        cutoffs = options.quality_cutoff.split(',')
        if len(cutoffs) == 1:
            try:
                cutoffs = [0, int(cutoffs[0])]
            except ValueError as e:
                parser.error("Quality cutoff value not recognized: {0}".format(e))
        elif len(cutoffs) == 2:
            try:
                cutoffs = [int(cutoffs[0]), int(cutoffs[1])]
            except ValueError as e:
                parser.error("Quality cutoff value not recognized: {0}".format(e))
        else:
            parser.error("Expected one value or two values separated by comma for the quality cutoff")
    else:
        cutoffs = None

    # All output writers share the reader's quality setting and colorspace flag.
    open_writer = functools.partial(seqio.open, mode='w',
        qualities=reader.delivers_qualities, colorspace=options.colorspace)

    if options.pair_filter is None:
        options.pair_filter = 'any'
    min_affected = 2 if options.pair_filter == 'both' else 1
    # Pick the wrapper used around each filter, depending on the mode.
    if not paired:
        filter_wrapper = Redirector
    elif paired == 'first':
        filter_wrapper = LegacyPairedRedirector
    elif paired == 'both':
        filter_wrapper = functools.partial(PairedRedirector, min_affected=min_affected)
    filters = []
    # TODO open_files = []
    too_short_writer = None  # too short reads go here
    # TODO pass file name to TooShortReadFilter, add a .close() method?
    if options.minimum_length > 0:
        if options.too_short_output:
            too_short_writer = open_writer(options.too_short_output, options.too_short_paired_output)
        filters.append(filter_wrapper(too_short_writer, TooShortReadFilter(options.minimum_length)))
    too_long_writer = None  # too long reads go here
    if options.maximum_length < sys.maxsize:
        if options.too_long_output is not None:
            too_long_writer = open_writer(options.too_long_output, options.too_long_paired_output)
        filters.append(filter_wrapper(too_long_writer, TooLongReadFilter(options.maximum_length)))

    if options.max_n != -1:
        filters.append(filter_wrapper(None, NContentFilter(options.max_n)))

    if int(options.discard_trimmed) + int(options.discard_untrimmed) + int(options.untrimmed_output is not None) > 1:
        parser.error("Only one of the --discard-trimmed, --discard-untrimmed "
            "and --untrimmed-output options can be used at the same time.")
    demultiplexer = None
    untrimmed_writer = None
    writer = None
    # A '{name}' placeholder in the -o path selects demultiplexing.
    if options.output is not None and '{name}' in options.output:
        if options.discard_trimmed:
            parser.error("Do not use --discard-trimmed when demultiplexing.")
        if paired:
            parser.error("Demultiplexing not supported for paired-end files, yet.")
        untrimmed = options.output.replace('{name}', 'unknown')
        if options.untrimmed_output:
            untrimmed = options.untrimmed_output
        if options.discard_untrimmed:
            untrimmed = None
        demultiplexer = Demultiplexer(options.output, untrimmed,
            qualities=reader.delivers_qualities, colorspace=options.colorspace)
        filters.append(demultiplexer)
    else:
        # Set up the remaining filters to deal with --discard-trimmed,
        # --discard-untrimmed and --untrimmed-output. These options
        # are mutually exclusive in order to avoid brain damage.
        if options.discard_trimmed:
            filters.append(filter_wrapper(None, DiscardTrimmedFilter()))
        elif options.discard_untrimmed:
            filters.append(filter_wrapper(None, DiscardUntrimmedFilter()))
        elif options.untrimmed_output:
            untrimmed_writer = open_writer(options.untrimmed_output, options.untrimmed_paired_output)
            filters.append(filter_wrapper(untrimmed_writer, DiscardUntrimmedFilter()))

        # Finally, figure out where the reads that passed all the previous
        # filters should go.
        if options.output is not None:
            writer = open_writer(options.output, options.paired_output, interleaved=interleaved_output)
        else:
            writer = open_writer(default_outfile, interleaved=interleaved_output)
        if not paired:
            filters.append(NoFilter(writer))
        else:
            filters.append(PairedNoFilter(writer))

    # --maq is a shortcut that switches on several colorspace options.
    if options.maq:
        options.colorspace = True
        options.double_encode = True
        options.trim_primer = True
        options.strip_suffix.append('_F3')
        options.suffix = "/1"
    if options.zero_cap is None:
        options.zero_cap = options.colorspace
    if options.trim_primer and not options.colorspace:
        parser.error("Trimming the primer makes only sense in colorspace.")
    if options.double_encode and not options.colorspace:
        parser.error("Double-encoding makes only sense in colorspace.")
    if options.anywhere and options.colorspace:
        parser.error("Using --anywhere with colorspace reads is currently not supported (if you think this may be useful, contact the author).")
    if not (0 <= options.error_rate <= 1.):
        parser.error("The maximum error rate must be between 0 and 1.")
    if options.overlap < 1:
        parser.error("The overlap must be at least 1.")

    # The file-name options are replaced in place by opened file objects.
    if options.rest_file is not None:
        options.rest_file = xopen(options.rest_file, 'w')
        rest_writer = RestFileWriter(options.rest_file)
    else:
        rest_writer = None
    if options.info_file is not None:
        options.info_file = xopen(options.info_file, 'w')
    if options.wildcard_file is not None:
        options.wildcard_file = xopen(options.wildcard_file, 'w')

    if options.colorspace:
        if options.match_read_wildcards:
            parser.error('IUPAC wildcards not supported in colorspace')
        options.match_adapter_wildcards = False

    adapter_parser = AdapterParser(
        colorspace=options.colorspace,
        max_error_rate=options.error_rate,
        min_overlap=options.overlap,
        read_wildcards=options.match_read_wildcards,
        adapter_wildcards=options.match_adapter_wildcards,
        indels=options.indels)

    try:
        adapters = adapter_parser.parse_multi(options.adapters, options.anywhere, options.front)
        adapters2 = adapter_parser.parse_multi(options.adapters2, options.anywhere2, options.front2)
    except IOError as e:
        # Only a missing file is reported as a usage error; anything else propagates.
        if e.errno == errno.ENOENT:
            parser.error(e)
        raise
    except ValueError as e:
        parser.error(e)
    if options.debug:
        for adapter in adapters + adapters2:
            adapter.enable_debug()

    # Create the single-end processing pipeline (a list of "modifiers")
    modifiers = []
    if options.cut:
        if len(options.cut) > 2:
            parser.error("You cannot remove bases from more than two ends.")
        # Two cut values with the same sign would refer to the same end.
        if len(options.cut) == 2 and options.cut[0] * options.cut[1] > 0:
            parser.error("You cannot remove bases from the same end twice.")
        for cut in options.cut:
            if cut != 0:
                modifiers.append(UnconditionalCutter(cut))

    if options.nextseq_trim is not None:
        modifiers.append(NextseqQualityTrimmer(options.nextseq_trim, options.quality_base))

    if cutoffs:
        modifiers.append(QualityTrimmer(cutoffs[0], cutoffs[1], options.quality_base))
    if adapters:
        adapter_cutter = AdapterCutter(adapters, options.times,
                options.wildcard_file, options.info_file,
                rest_writer, options.action)
        modifiers.append(adapter_cutter)

    # Modifiers that apply to both reads of paired-end reads unless in legacy mode
    modifiers_both = []
    if options.length is not None:
        modifiers_both.append(Shortener(options.length))
    if options.trim_n:
        modifiers_both.append(NEndTrimmer())
    if options.length_tag:
        modifiers_both.append(LengthTagModifier(options.length_tag))
    if options.strip_f3:
        options.strip_suffix.append('_F3')
    for suffix in options.strip_suffix:
        modifiers_both.append(SuffixRemover(suffix))
    if options.prefix or options.suffix:
        modifiers_both.append(PrefixSuffixAdder(options.prefix, options.suffix))
    if options.double_encode:
        modifiers_both.append(DoubleEncoder())
    if options.zero_cap and reader.delivers_qualities:
        modifiers_both.append(ZeroCapper(quality_base=options.quality_base))
    if options.trim_primer:
        # NOTE(review): unlike the other modifiers, PrimerTrimmer is appended
        # without being called -- confirm it is meant to be used as-is.
        modifiers_both.append(PrimerTrimmer)
    modifiers.extend(modifiers_both)

    # For paired-end data, create a second processing pipeline.
    # However, if no second-read adapters were given (via -A/-G/-B/-U), we need to
    # be backwards compatible and *no modifications* are done to the second read.
    modifiers2 = []
    if paired == 'both':
        if options.cut2:
            if len(options.cut2) > 2:
                parser.error("You cannot remove bases from more than two ends.")
            if len(options.cut2) == 2 and options.cut2[0] * options.cut2[1] > 0:
                parser.error("You cannot remove bases from the same end twice.")
            for cut in options.cut2:
                if cut != 0:
                    modifiers2.append(UnconditionalCutter(cut))

        if cutoffs:
            modifiers2.append(QualityTrimmer(cutoffs[0], cutoffs[1], options.quality_base))
        if adapters2:
            # The wildcard, info and rest outputs are not passed for the second read.
            adapter_cutter2 = AdapterCutter(adapters2, options.times,
                    None, None, None, options.action)
            modifiers2.append(adapter_cutter2)
        else:
            adapter_cutter2 = None
        modifiers2.extend(modifiers_both)

    if paired:
        pipeline = PairedEndPipeline(reader, modifiers, modifiers2, filters)
    else:
        pipeline = SingleEndPipeline(reader, modifiers, filters)

    logger.info("This is cutadapt %s with Python %s", __version__, platform.python_version())
    logger.info("Command line parameters: %s", " ".join(cmdlineargs))
    logger.info("Trimming %s adapter%s with at most %.1f%% errors in %s mode ...",
        len(adapters) + len(adapters2), 's' if len(adapters) + len(adapters2) != 1 else '',
        options.error_rate * 100,
        { False: 'single-end', 'first': 'paired-end legacy', 'both': 'paired-end' }[paired])

    if paired == 'first' and (modifiers_both or cutoffs):
        logger.warning('\n'.join(textwrap.wrap('WARNING: Requested read '
            'modifications are applied only to the first '
            'read since backwards compatibility mode is enabled. '
            'To modify both reads, also use any of the -A/-B/-G/-U options. '
            'Use a dummy adapter sequence when necessary: -A XXX')))

    # NOTE(review): time.clock() was removed in Python 3.8; this line and
    # the elapsed-time computation below fail there with AttributeError.
    start_time = time.clock()
    try:
        stats = pipeline.run()
    except KeyboardInterrupt as e:
        print("Interrupted", file=sys.stderr)
        sys.exit(130)
    except IOError as e:
        # A closed output pipe ends the program with status 1.
        if e.errno == errno.EPIPE:
            sys.exit(1)
        raise
    except (seqio.FormatError, EOFError) as e:
        sys.exit("cutadapt: error: {0}".format(e))

    # close open files
    # NOTE(review): options.info_file appears twice in this list, so it is
    # closed twice when set.
    for f in [writer, untrimmed_writer,
            options.rest_file, options.wildcard_file,
            options.info_file, too_short_writer, too_long_writer,
            options.info_file, demultiplexer]:
        if f is not None and f is not sys.stdin and f is not sys.stdout:
            f.close()

    elapsed_time = time.clock() - start_time
    if not options.quiet:
        stats.collect((adapters, adapters2), elapsed_time,
            modifiers, modifiers2, filters)
        # send statistics to stderr if result was sent to stdout
        stat_file = sys.stderr if options.output is None else None
        with redirect_standard_output(stat_file):
            print_report(stats, (adapters, adapters2))
from xopen import xopen import glob import json import pandas as pd import numpy files = glob.glob("colorsinart_/2018*xz") #load all files in directory to be parsed through the xopen read content = "" counter = 1 list_item = [] list_header = ['description', 'date_posted', 'likes', 'comments', 'post_id', 'username', 'is_connected_fb', 'is_video'] for file in files: with xopen(file) as f: if counter == 1: #identify first record content = content + "[" + str(f.read()) + ", \n" counter += 1 elif counter == len(files): #identify last record content = content + str(f.read()) + "]" counter += 1 else: content = content + str(f.read()) + ", \n" counter += 1 #convert string to listed dict format data = json.loads(content) #extract the required metric from the json data for data in data: list_item += [(data['node']['edge_media_to_caption']['edges'][0]['node']['text'], datetime.datetime.fromtimestamp(data['node']['taken_at_timestamp']).strftime('%Y-%m-%d %H:%M:%S'), #transform the date format from ms
def _build_option_parser():
    """Create the command-line parser with all options understood by main().

    Returns the HelpfulOptionParser instance; the caller still needs it after
    parsing in order to report usage errors via ``parser.error()``.
    """
    parser = HelpfulOptionParser(usage=__doc__, version=__version__)

    parser.add_option("-f", "--format", default=None,
        help="Input file format; can be either 'fasta', 'fastq' or 'sra-fastq'. "
            "Ignored when reading csfasta/qual files (default: auto-detect from file name extension).")

    group = OptionGroup(parser, "Options that influence how the adapters are found",
        description="Each of the following three parameters (-a, -b, -g) can be used "
            "multiple times and in any combination to search for an entire set of "
            "adapters of possibly different types. All of the "
            "given adapters will be searched for in each read, but only the best "
            "matching one will be trimmed (but see the --times option).")
    group.add_option("-a", "--adapter", action="append", metavar="ADAPTER", dest="adapters", default=[],
        help="Sequence of an adapter that was ligated to the 3' end. The adapter itself and anything that follows is trimmed.")
    group.add_option("-b", "--anywhere", action="append", metavar="ADAPTER", default=[],
        help="Sequence of an adapter that was ligated to the 5' or 3' end. If the adapter is found within the read or overlapping the 3' end of the read, the behavior is the same as for the -a option. If the adapter overlaps the 5' end (beginning of the read), the initial portion of the read matching the adapter is trimmed, but anything that follows is kept.")
    group.add_option("-g", "--front", action="append", metavar="ADAPTER", default=[],
        help="Sequence of an adapter that was ligated to the 5' end. If the "
            "adapter sequence starts with the character '^', the adapter is "
            "'anchored'. An anchored adapter must appear in its entirety at the "
            "5' end of the read (it is a prefix of the read). A non-anchored adapter may "
            "appear partially at the 5' end, or it may occur within the read. If it is "
            "found within a read, the sequence preceding the adapter is also trimmed. "
            "In all cases the adapter itself is trimmed.")
    group.add_option("-e", "--error-rate", type=float, default=0.1,
        help="Maximum allowed error rate (no. of errors divided by the length of the matching region) (default: %default)")
    group.add_option("-n", "--times", type=int, metavar="COUNT", default=1,
        help="Try to remove adapters at most COUNT times. Useful when an adapter gets appended multiple times (default: %default).")
    # The two sentences below used to be concatenated without a separating
    # space ("...not modified.This reduces..."); the trailing space fixes that.
    group.add_option("-O", "--overlap", type=int, metavar="LENGTH", default=3,
        help="Minimum overlap length. If the overlap between the read and the adapter is shorter than LENGTH, the read is not modified. "
            "This reduces the no. of bases trimmed purely due to short random adapter matches (default: %default).")
    group.add_option("--match-read-wildcards", action="store_true", default=False,
        help="Allow 'N's in the read as matches to the adapter (default: %default).")
    group.add_option("-N", "--no-match-adapter-wildcards", action="store_false",
        default=True, dest='match_adapter_wildcards',
        help="Do not treat 'N' in the adapter sequence as wildcards. This is needed when you want to search for literal 'N' characters.")
    parser.add_option_group(group)

    group = OptionGroup(parser, "Options for filtering of processed reads")
    group.add_option("--discard-trimmed", "--discard", action='store_true', default=False,
        help="Discard reads that contain the adapter instead of trimming them. Also use -O in order to avoid throwing away too many randomly matching reads!")
    group.add_option("-m", "--minimum-length", type=int, default=0, metavar="LENGTH",
        help="Discard trimmed reads that are shorter than LENGTH. Reads that are too short even before adapter removal are also discarded. In colorspace, an initial primer is not counted (default: 0).")
    group.add_option("-M", "--maximum-length", type=int, default=sys.maxsize, metavar="LENGTH",
        help="Discard trimmed reads that are longer than LENGTH. "
            "Reads that are too long even before adapter removal "
            "are also discarded. In colorspace, an initial primer "
            "is not counted (default: no limit).")
    parser.add_option_group(group)

    group = OptionGroup(parser, "Options that influence what gets output to where")
    group.add_option("-o", "--output", default=None, metavar="FILE",
        help="Write the modified sequences to this file instead of standard output and send the summary report to standard output. "
            "The format is FASTQ if qualities are available, FASTA otherwise. (default: standard output)")
    group.add_option("-r", "--rest-file", default=None, metavar="FILE",
        help="When the adapter matches in the middle of a read, write the rest (after the adapter) into a file. Use - for standard output.")
    group.add_option("--wildcard-file", default=None, metavar="FILE",
        help="When the adapter has wildcard bases ('N's) write adapter bases matching wildcard "
            "positions to FILE. Use - for standard output.")
    group.add_option("--too-short-output", default=None, metavar="FILE",
        help="Write reads that are too short (according to length specified by -m) to FILE. (default: discard reads)")
    group.add_option("--untrimmed-output", default=None, metavar="FILE",
        help="Write reads that do not contain the adapter to FILE, instead "
            "of writing them to the regular output file. (default: output "
            "to same file as trimmed)")
    parser.add_option_group(group)

    group = OptionGroup(parser, "Additional modifications to the reads")
    group.add_option("-q", "--quality-cutoff", type=int, default=None, metavar="CUTOFF",
        help="Trim low-quality ends from reads before adapter removal. "
            "The algorithm is the same as the one used by BWA "
            "(Subtract CUTOFF from all qualities; "
            "compute partial sums from all indices to the end of the "
            "sequence; cut sequence at the index at which the sum "
            "is minimal) (default: %default)")
    group.add_option("--quality-base", type=int, default=33,
        help="Assume that quality values are encoded as ascii(quality + QUALITY_BASE). The default (33) is usually correct, "
            "except for reads produced by some versions of the Illumina pipeline, where this should be set to 64. (default: %default)")
    group.add_option("-x", "--prefix", default='',
        help="Add this prefix to read names")
    group.add_option("-y", "--suffix", default='',
        help="Add this suffix to read names")
    group.add_option("-c", "--colorspace", action='store_true', default=False,
        help="Colorspace mode: Also trim the color that is adjacent to the found adapter.")
    group.add_option("-d", "--double-encode", action='store_true', default=False,
        help="When in color space, double-encode colors (map 0,1,2,3,4 to A,C,G,T,N).")
    group.add_option("-t", "--trim-primer", action='store_true', default=False,
        help="When in color space, trim primer base and the first color "
            "(which is the transition to the first nucleotide)")
    group.add_option("--strip-f3", action='store_true', default=False,
        help="For color space: Strip the _F3 suffix of read names")
    group.add_option("--maq", "--bwa", action='store_true', default=False,
        help="MAQ- and BWA-compatible color space output. This enables -c, -d, -t, --strip-f3, -y '/1' and -z.")
    group.add_option("--length-tag", default=None, metavar="TAG",
        help="Search for TAG followed by a decimal number in the name of the read "
            "(description/comment field of the FASTA or FASTQ file). Replace the "
            "decimal number with the correct length of the trimmed read. "
            "For example, use --length-tag 'length=' to search for fields "
            "like 'length=123'.")
    group.add_option("--zero-cap", "-z", action='store_true', default=False,
        help="Change negative quality values to zero (workaround to avoid segmentation faults in BWA)")
    parser.add_option_group(group)
    return parser


def _close_output_files(files):
    """Close every file in *files*, leaving the standard streams open.

    A broken pipe reported while closing is ignored so that cleanup does not
    mask the return value of main(); any other I/O error is re-raised.
    """
    for f in files:
        if f is None or f is sys.stdout or f is sys.stdin:
            continue
        try:
            f.close()
        except IOError as e:
            if e.errno != errno.EPIPE:
                raise


def main(cmdlineargs=None):
    """Main function that evaluates command-line parameters and contains the
    main loop over all reads.

    cmdlineargs -- list of command-line arguments that is handed to the option
        parser as ``args``; the default of None is passed through unchanged.

    Returns 0 on success and 1 if no adapter/quality cutoff was given, if the
    input is malformed (seqio.FormatError) or if writing a read fails with
    EPIPE. Invalid option combinations are reported through ``parser.error()``
    (presumably exiting like optparse's OptionParser.error -- confirm against
    HelpfulOptionParser).

    All files opened here are closed again on every exit path; sys.stdout and
    sys.stdin are never closed.
    """
    parser = _build_option_parser()
    options, args = parser.parse_args(args=cmdlineargs)

    if len(args) == 0:
        parser.error("At least one parameter needed: name of a FASTA or FASTQ file.")
    elif len(args) > 2:
        parser.error("Too many parameters.")

    input_filename = args[0]
    quality_filename = None
    if len(args) == 2:
        quality_filename = args[1]
        if input_filename.endswith('.qual') and quality_filename.endswith('fasta'):
            parser.error("FASTA and QUAL file given, but the FASTA file must be first.")

    if options.format is not None and options.format.lower() not in ['fasta', 'fastq', 'sra-fastq']:
        parser.error("The input file format must be either 'fasta', 'fastq' or 'sra-fastq' (not '{0}').".format(options.format))

    # TODO should this really be an error?
    if options.format is not None and quality_filename is not None:
        parser.error("If a pair of .fasta and .qual files is given, the -f/--format parameter cannot be used.")

    # Every file opened below is registered here so that the ``finally``
    # clause can close it no matter how this function is left.
    opened_files = []

    def open_output(path):
        f = xopen(path, 'w')
        opened_files.append(f)
        return f

    try:
        # default output files (overwritten below)
        trimmed_outfile = sys.stdout  # reads with adapters go here
        too_short_outfile = None  # too short reads go here

        if options.output is not None:
            trimmed_outfile = open_output(options.output)
        untrimmed_outfile = trimmed_outfile  # reads without adapters go here
        if options.untrimmed_output is not None:
            untrimmed_outfile = open_output(options.untrimmed_output)
        if options.too_short_output is not None:
            too_short_outfile = open_output(options.too_short_output)

        # --maq/--bwa is a shortcut that switches on several other options
        if options.maq:
            options.colorspace = True
            options.double_encode = True
            options.trim_primer = True
            options.strip_f3 = True
            options.suffix = "/1"
            options.zero_cap = True
        if options.trim_primer and not options.colorspace:
            parser.error("Trimming the primer makes only sense in color space.")
        if options.double_encode and not options.colorspace:
            parser.error("Double-encoding makes only sense in color space.")
        if options.colorspace and options.front and not options.trim_primer:
            parser.error("Currently, when you want to trim a 5' adapter in colorspace, you must also specify the --trim-primer option")
        if options.anywhere and options.colorspace:
            parser.error("Using --anywhere with color space reads is currently not supported  (if you think this may be useful, contact the author).")
        if not (0 <= options.error_rate <= 1.):
            parser.error("The maximum error rate must be between 0 and 1.")
        if options.overlap < 1:
            parser.error("The overlap must be at least 1.")

        # From here on, options.rest_file/options.wildcard_file hold file
        # objects instead of file names.
        if options.rest_file is not None:
            options.rest_file = open_output(options.rest_file)
        if options.wildcard_file is not None:
            options.wildcard_file = open_output(options.wildcard_file)

        adapters = []

        def append_adapters(adapter_list, where):
            for seq in adapter_list:
                seq = seq.strip()
                w = where
                # A leading '^' marks an anchored 5' adapter
                if w == FRONT and seq.startswith('^'):
                    seq = seq[1:]
                    w = PREFIX
                adapters.append(Adapter(seq, w, options.error_rate,
                    options.overlap, options.match_read_wildcards,
                    options.colorspace, options.match_adapter_wildcards,
                    options.wildcard_file, options.rest_file))

        append_adapters(options.adapters, BACK)
        append_adapters(options.anywhere, ANYWHERE)
        append_adapters(options.front, FRONT)

        # make sure these aren't used by accident
        del options.adapters
        del options.anywhere
        del options.front

        if not adapters and options.quality_cutoff is None:
            print("You need to provide at least one adapter sequence.", file=sys.stderr)
            return 1

        modifiers = []
        if options.length_tag:
            modifiers.append(LengthTagModifier(options.length_tag))
        if options.strip_f3:
            modifiers.append(SuffixRemover('_F3'))
        if options.prefix or options.suffix:
            modifiers.append(PrefixSuffixAdder(options.prefix, options.suffix))
        if options.double_encode:
            modifiers.append(DoubleEncoder())
        if options.zero_cap:
            modifiers.append(ZeroCapper(quality_base=options.quality_base))

        cutter = AdapterCutter(adapters, options.times, options.rest_file,
            options.colorspace, options.wildcard_file)
        readfilter = ReadFilter(options.minimum_length, options.maximum_length,
            too_short_outfile, options.discard_trimmed, cutter.stats)  # TODO stats?
        try:
            twoheaders = None
            reader = read_sequences(input_filename, quality_filename,
                colorspace=options.colorspace, fileformat=options.format)
            for read in reader:
                # In colorspace, the first character is the last nucleotide of the primer base
                # and the second character encodes the transition from the primer base to the
                # first real base of the read.
                if options.trim_primer:
                    read.sequence = read.sequence[2:]
                    if read.qualities is not None:  # TODO
                        read.qualities = read.qualities[1:]
                    initial = ''
                elif options.colorspace:
                    # The primer base is set aside and re-attached before writing
                    initial = read.sequence[0]
                    read.sequence = read.sequence[1:]
                else:
                    initial = ''

                if options.quality_cutoff is not None:
                    index = quality_trim_index(read.qualities, options.quality_cutoff, options.quality_base)
                    read = read[:index]

                read, trimmed = cutter.cut(read)
                for modifier in modifiers:
                    read = modifier.apply(read)
                # Looked up inside the loop, as in the original code: the
                # attribute is queried only once the first read has been
                # delivered, and readers without it count as False.
                if twoheaders is None:
                    try:
                        twoheaders = reader.twoheaders
                    except AttributeError:
                        twoheaders = False
                if readfilter.keep(read, trimmed):
                    read.sequence = initial + read.sequence
                    try:
                        write_read(read, trimmed_outfile if trimmed else untrimmed_outfile, twoheaders)
                    except IOError as e:
                        # The consumer of our output went away
                        if e.errno == errno.EPIPE:
                            return 1
                        raise
        except seqio.FormatError as e:
            print("Error:", e, file=sys.stderr)
            return 1

        # send statistics to stderr if result was sent to stdout
        stat_file = sys.stderr if options.output is None else None
        cutter.stats.print_statistics(options.error_rate, file=stat_file)
        return 0
    finally:
        _close_output_files(opened_files)