def get_max_reads_length(reads_file, log, num_checked):
    """Return the largest record length among the leading reads of a file.

    Args:
        reads_file: path of the reads file; its type is derived from the
            path by SeqIO.get_read_file_type.
        log: logger; receives the result through ``info`` and is passed to
            ``error`` when no file type could be derived.
        num_checked: upper bound on how many leading records are examined.

    Returns:
        The maximum of ``len(record)`` over the examined records.

    Raises:
        ValueError: from ``max`` when no record is read at all.
    """
    detected_type = SeqIO.get_read_file_type(reads_file)
    if not detected_type:
        # NOTE(review): execution continues past this call if error()
        # returns -- presumably it aborts; confirm against its definition.
        error('Incorrect extension of reads file: ' + reads_file, log)

    all_records = SeqIO.parse(SeqIO.Open(reads_file, "r"), detected_type)
    leading_records = itertools.islice(all_records, num_checked)
    longest = max(len(record) for record in leading_records)

    summary = reads_file + ': max reads length: ' + str(longest)
    log.info(summary)
    return longest
def collect_contigs(dataset, barcodes_dir, output_base, format):
    """Merge the per-barcode contig files into one output file.

    For every barcode in ``dataset`` the file
    ``<barcodes_dir>/<barcode.id>/truseq_long_reads.<format>`` is read if it
    exists; each contig id is prefixed with ``"<barcode.id>-"`` and the
    contig is written to ``<output_base>.<format>``. Barcodes whose file is
    missing are skipped silently.

    Args:
        dataset: iterable of objects exposing an ``id`` attribute.
        barcodes_dir: directory holding one sub-directory per barcode id.
        output_base: output path without extension.
        format: used both as the file extension and as the format name
            handed to SeqIO.parse / SeqIO.write.
    """
    # Context managers guarantee that the output file and every input file
    # are closed, even when parsing or writing raises part-way through.
    with open(output_base + "." + format, "w") as output:
        for barcode in dataset:
            contigs_path = os.path.join(
                barcodes_dir, barcode.id, "truseq_long_reads." + format)
            if not os.path.exists(contigs_path):
                continue
            with open(contigs_path) as contigs_handle:
                for contig in SeqIO.parse(contigs_handle, format):
                    contig.id = barcode.id + "-" + contig.id
                    SeqIO.write(contig, output, format)
def check_file_not_empty(input_filename, message="", log=None):
    """Report an error when a reads file yields no records.

    Files whose detected type is ``'bam'`` are not inspected. Any exception
    raised while opening or parsing the file is reported through ``error``
    together with the traceback; a literal ``{FILE}`` placeholder in the
    exception text or traceback is replaced by the resolved file name.

    Args:
        input_filename: path of the reads file (``~`` is expanded and the
            path is made absolute before the file is opened).
        message: extra context appended to the "file is empty" report.
        log: logger forwarded to ``get_read_file_type`` and ``error``.
    """
    filename = abspath(expanduser(input_filename))
    file_type = get_read_file_type(input_filename, log)
    if file_type == 'bam':
        return
    try:
        reads_iterator = SeqIO.parse(SeqIO.Open(filename, "r"), file_type)
        if next(reads_iterator, None) is None:
            error("file is empty: %s (%s)" % (filename, message), log=log)
    except Exception as inst:
        # Capture the traceback before any further exception handling so it
        # always describes the original failure.
        trace_text = traceback.format_exc()
        # args[0] is not always a string (OSError carries an int errno
        # there) and args may be empty; calling str.format on it, or on a
        # traceback that happens to contain braces, would raise inside this
        # handler and hide the real problem. Substitute the placeholder
        # literally and fall back to str(inst) instead.
        try:
            reason = inst.args[0].replace("{FILE}", filename)
        except (IndexError, AttributeError, TypeError):
            reason = str(inst)
        error(reason + "\n\n" + trace_text.replace("{FILE}", filename),
              log=log)
def get_max_reads_length(reads_file, log, num_checked):
    """Return the largest record length among the leading reads of a file.

    When ``reads_file`` has an entry in ``options_storage.dict_of_prefixes``
    the file type is derived from that entry; otherwise it is derived from
    ``reads_file`` itself.

    Args:
        reads_file: path of the reads file.
        log: logger; receives the result through ``info`` and is passed to
            ``error`` when no file type could be derived.
        num_checked: upper bound on how many leading records are examined.

    Returns:
        The maximum of ``len(record)`` over the examined records.

    Raises:
        ValueError: from ``max`` when no record is read at all.
    """
    known_prefixes = options_storage.dict_of_prefixes
    type_source = (known_prefixes[reads_file]
                   if reads_file in known_prefixes else reads_file)
    detected_type = SeqIO.get_read_file_type(type_source)
    if not detected_type:
        # NOTE(review): execution continues past this call if error()
        # returns -- presumably it aborts; confirm against its definition.
        error("incorrect extension of reads file: %s" % reads_file, log)

    all_records = SeqIO.parse(SeqIO.Open(reads_file, "r"), detected_type)
    leading_records = itertools.islice(all_records, num_checked)
    longest = max(len(record) for record in leading_records)

    log.info("%s: max reads length: %s" % (reads_file, str(longest)))
    return longest
def moleculo_postprocessing(contigs_file, output_file, sam_files, log):
    """Break and filter scaffolds using read alignments, then write results.

    Loads all FASTA records from ``contigs_file``, builds a SAM chain over
    ``sam_files``, runs ``generate_quality.GenerateQuality`` on them, and
    passes the contigs through ``SplitAndFilter`` with coverage, length,
    N-run and pattern breakers/filters. Results are written twice through
    ``OutputResults``: once as "fasta" and once as "fastq".

    ``pattern`` and ``rc_pattern`` are not parameters; they are resolved
    from the enclosing scope.

    Args:
        contigs_file: path of the FASTA file with scaffolds.
        output_file: output base name handed to ``OutputResults``; the
            final log message refers to ``output_file + ".fastq"``.
        sam_files: iterable of SAM file paths.
        log: logger used for progress messages.
    """
    import sys

    log.info("===== Starting postprocessing based on read alignment")
    log.info("Processing scaffolds from " + contigs_file)
    log.info("Using read alignments to break and filter scaffolds")

    # The "U" open-mode flag was removed in Python 3.11 (plain text mode
    # already translates newlines on Python 3); it is kept on Python 2 only,
    # where it is what enables universal newlines.
    read_mode = "r" if sys.version_info[0] >= 3 else "rU"
    # Materialise the records inside the with-block so the handle is closed
    # as soon as parsing is done.
    with open(contigs_file, read_mode) as contigs_handle:
        contigs = list(SeqIO.parse(contigs_handle, "fasta"))

    sam = sam_parser.SamChain(
        [sam_parser.Samfile(sam_file) for sam_file in sam_files])
    generate_quality.GenerateQuality(contigs, sam)

    # NOTE(review): the numeric arguments below (1500, 100, 50, 150, 3) are
    # passed through unchanged; their meaning is defined by the respective
    # classes -- confirm there before tuning.
    pattern_filter = moleculo_filter_contigs.PatternContigFilter(
        contigs, sam, pattern, rc_pattern)
    length_filter = moleculo_filter_contigs.ContigLengthFilter(1500)
    coverage_breaker = break_by_coverage.ContigBreaker(contigs, sam, 100, 50)
    pattern_breaker = break_by_coverage.PatternBreaker(
        pattern, rc_pattern, 150)
    n_breaker = break_by_coverage.NBreaker(3)

    result = SplitAndFilter(contigs, coverage_breaker, length_filter,
                            n_breaker, pattern_breaker, pattern_filter)
    OutputResults(output_file, "fasta", result)
    OutputResults(output_file, "fastq", result)
    log.info("===== Postprocessing finished. Results can be found in "
             + output_file + ".fastq")
def get_max_reads_length(reads_file, log, num_checked):
    """Return the largest record length among the leading reads of a file.

    Any exception raised while opening, parsing or measuring the reads
    (including ``max`` failing because no record was read) is reported
    through ``error`` together with the traceback; a literal ``{FILE}``
    placeholder in the exception text or traceback is replaced by
    ``reads_file``.

    Args:
        reads_file: path of the reads file.
        log: logger forwarded to ``get_read_file_type`` and ``error``; on
            success it receives the result through ``info``.
        num_checked: upper bound on how many leading records are examined.

    Returns:
        The maximum of ``len(record)`` over the examined records, or 0 when
        a failure was reported and ``error`` returned.
    """
    file_type = get_read_file_type(reads_file, log)
    max_reads_length = 0
    try:
        leading_records = itertools.islice(
            SeqIO.parse(SeqIO.Open(reads_file, "r"), file_type), num_checked)
        max_reads_length = max(len(rec) for rec in leading_records)
    except Exception as inst:
        # Capture the traceback before any further exception handling so it
        # always describes the original failure.
        trace_text = traceback.format_exc()
        # args[0] is not always a string (OSError carries an int errno
        # there) and args may be empty; calling str.format on it, or on a
        # traceback that happens to contain braces, would raise inside this
        # handler and hide the real problem. Substitute the placeholder
        # literally and fall back to str(inst) instead.
        try:
            reason = inst.args[0].replace("{FILE}", reads_file)
        except (IndexError, AttributeError, TypeError):
            reason = str(inst)
        error(reason + "\n\n" + trace_text.replace("{FILE}", reads_file),
              log=log)
    else:
        log.info("%s: max reads length: %s"
                 % (reads_file, str(max_reads_length)))
    return max_reads_length