def findmitoscaf(args):
    if args.__calling == 'findmitoscaf':
        if not args.from_megahit:
            logger.log(2, 'Remapping reads to contigs since contigs are not assembled from pipeline.')
            fastfilter_bin = path.abspath(
                path.join(path.dirname(__file__), 'assemble', 'fastfilter'))
            filtered_fasta = path.join(
                args.findmitoscaf_dir, f'{args.workname}.filtered.fa')
            shell_call(fastfilter_bin, i=args.fastafile, o=filtered_fasta,
                       l=f"{configurations.assemble.min_length},{configurations.assemble.max_length}",
                       d=0)
            fq1, fq2 = args.fastq1, args.fastq2
            if not (fq1 or fq2):
                raise RuntimeError("At least one fastq file should be specified!")
            if not fq1:
                fq1, fq2 = fq2, fq1
            # Remapping to calculate average depth.
            from findmitoscaf.findmitoscaf import remap_sequence
            args.fastafile = remap_sequence(args.workname, args.findmitoscaf_dir,
                                            filtered_fasta, fq1, fq2, args.threads)
        else:
            logger.log(2, "Remapping skipped since from-megahit is specified, no tagging needed.")

    from findmitoscaf.findmitoscaf import findmitoscaf as _findmitoscaf
    picked_fa = _findmitoscaf(
        thread_number=args.threads,
        clade=args.clade,
        relaxing=args.taxa_tolerance,
        gene_code=args.genetic_code,
        multi=args.min_abundance,
        taxa=args.required_taxa if not args.disable_taxa else None,
        prefix=args.workname,
        basedir=args.findmitoscaf_dir,
        contigs_file=args.fastafile,
        merge_method=args.merge_method,
        merge_overlapping=args.merge_overlap,
        merge_search=args.merge_start)

    # Move results into place only when called directly.
    if args.__calling == 'findmitoscaf':
        os.rename(picked_fa, path.join(
            args.result_dir, path.basename(picked_fa)))
    return picked_fa
def get_rank(taxa_name=None):
    name_dict = ncbi.get_name_translator([taxa_name])
    if taxa_name not in name_dict:
        # Fall back to the first token of the name (usually the genus)
        taxa_name = taxa_name.split(' ')[0]
        name_dict = ncbi.get_name_translator([taxa_name])

    rank_dict = {
        'kingdom': 'NA',
        'phylum': 'NA',
        'class': 'NA',
        'order': 'NA',
        'family': 'NA',
        'genus': 'NA',
        'species': 'NA'
    }

    if taxa_name in name_dict:
        for taxid in ncbi.get_lineage(name_dict[taxa_name][0]):
            rank = ncbi.get_rank([taxid])[taxid]
            taxa = ncbi.get_taxid_translator([taxid])[taxid]
            if rank in rank_dict:
                rank_dict[rank] = taxa
    else:
        logger.log(
            2, f'Query name {taxa_name} was skipped because no result was found in the NCBI database.')

    return list(rank_dict.items())
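# A minimal usage sketch (hedged: 'Homo sapiens' is a hypothetical query, and
# this assumes ete3's local NCBI taxonomy database has been downloaded):
#
#     ranks = dict(get_rank('Homo sapiens'))
#     ranks['order']   # -> 'Primates'
#     ranks['family']  # -> 'Hominidae'
#
# Names absent from NCBI fall back to their first token (usually the genus),
# and any rank that still cannot be resolved stays 'NA'.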
def local(self, current_kmer, next_kmer):
    logger.log(2, f'Local assembly for k = {current_kmer}')
    shell_call(self.MEGAHIT_CORE, 'local',
               c=self._contig_prefix(current_kmer) + '.contigs.fa',
               l=self.read_lib, t=self.threads,
               o=self._contig_prefix(current_kmer) + '.local.fa',
               kmax=next_kmer)
def bim(args):
    # Also a WIP idea.
    # MITObim uses MIRA as both mapper and assembler, which is clearly outperformed
    # by bwa along with the modified MEGAHIT. If we can reuse the current pipeline,
    # we can surely make a more powerful MITObim.
    # raise RuntimeError("This module is still work in progress, in later versions it may be completed.")
    args.cleanq1 = 'clean.1.fq'
    args.cleanq2 = 'clean.2.fq'
    if configurations.filter_rawdata.compress_output_in_all:
        args.cleanq1 += '.gz'
        args.cleanq2 += '.gz'

    if not args.disable_filter:
        args.fastq1, args.fastq2 = filter(args)

    from bim.bim import bwa_map, cal_insert
    from assemble.assemble import assemble

    fasta_path = path.join(args.temp_dir, f'{args.workname}.bait.fa')
    shutil.copy(args.fastafile, fasta_path)
    args.fastafile = fasta_path

    for i in range(args.max_iteration):
        logger.log(2, f"Iteration {i} starts.")
        if len(os.listdir(args.assemble_dir)) != 0:
            logger.log(2, "Removing data from the previous iteration.")
            os.system(f"rm -rf {args.assemble_dir}/*")
        bam, fq1, fq2 = bwa_map(args.threads, args.fastafile, args.assemble_dir,
                                args.workname, args.fastq1, args.fastq2)
        if args.insert_size_auto:
            args.insert_size = cal_insert(bam, args.assemble_dir, args.workname)
        next_generation = assemble(
            threads=args.threads,
            base_dir=args.assemble_dir,
            work_prefix=args.workname,
            fastq1=fq1, fastq2=fq2,
            disable_local=args.disable_local,
            prune_level=args.prune_level,
            prune_depth=args.prune_depth,
            keep_temp=args.keep_temp,
            insert_size=args.insert_size,
            no_scaf=args.disable_scaffolding or i % (args.scaffolding_spare + 1) != 0,
            kmer_list=args.kmer_list,
            depth_list=args.depth_list)
        if args.iteration_ignore < i:
            # Criteria for breaking the cycle:
            # 1. No extension can be made after an iteration.
            # 2. The currently assembled genome is of sufficient
            #    quality and has passed some tests.
            args.from_megahit = True
            filtered_seq = findmitoscaf(args)
        next_fasta = path.join(args.temp_dir, f'{args.workname}.bait.fa')
        os.rename(next_generation, next_fasta)
        args.fastafile = next_fasta
def cal_insert(bam: str, basedir: str, prefix: str) -> int:
    stat_file = path.join(basedir, prefix + ".stats")
    log(2, "Measuring insert size of alignments.")
    # `samtools stats` emits the insert-size histogram on lines starting with
    # IS; after cut, column 1 is the insert size and column 2 the pair count.
    cmd = f'samtools stats {bam} | tee {stat_file} | grep ^IS | cut -f 2-'
    stats = [[int(y) for y in x.split("\t")][:2]
             for x in direct_call(cmd).split("\n") if x]
    avg_ins = sum(size * count for size, count in stats) / \
        sum(count for _, count in stats)
    log(2, f"Measured insert size is {avg_ins}")
    return int(avg_ins)
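# The estimate above is a count-weighted mean over the `IS` histogram that
# `samtools stats` prints. The same arithmetic in isolation, with made-up
# numbers:
#
#     hist = [(250, 10), (300, 80), (350, 10)]  # (insert size, pair count)
#     avg = sum(size * count for size, count in hist) / sum(c for _, c in hist)
#     assert round(avg) == 300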
def post(args):
    if args is None:
        return

    if hasattr(args, 'keep_temp') and not args.keep_temp and \
            args.__calling != 'filter' and hasattr(args, 'cleanq1'):
        # Not removed until here, since cleanq1 and cleanq2 have many
        # other uses besides assembling.
        logger.log(1, 'Removing filtered data files.')
        os.remove(args.cleanq1)
        if args.fastq2 is not None:
            os.remove(args.cleanq2)

    logger.log(2, f'All done! Time elapsed : {time.time()-start_time:.2f}s.')
    logger.finalize()
def fix_circular(fa_file: str):
    genome = list(SeqIO.parse(fa_file, 'fasta'))
    if len(genome) != 1:
        return False

    info, seq = list(check_circular(final_seqs=genome))[0]
    if info is not None:
        logger.log(2, f'An overlapped region was found starting at {info[0]} with length {info[2]}. Trimming it.')
        seq = seq[info[0]:len(seq) - 500 + info[1]]
        SeqIO.write([seq], fa_file, 'fasta')
        return True

    return False
def iterate(self, current_kmer, next_kmer):
    logger.log(
        2, f'Extracting iterative edges from k = {current_kmer} to {next_kmer}')
    shell_call(self.MEGAHIT_CORE, 'iterate',
               c=self._contig_prefix(current_kmer) + '.contigs.fa',
               b=self._contig_prefix(current_kmer) + '.bubble_seq.fa',
               t=self.threads,
               s=next_kmer - current_kmer,
               o=self._graph_prefix(next_kmer),
               r=self.read_lib + '.bin',
               k=current_kmer)
def graph(self, current_kmer, next_kmer):
    options = {
        'k': next_kmer,
        'host_mem': self.available_memory,
        'mem_flag': 1,
        'output_prefix': self._graph_prefix(next_kmer),
        'num_cpu_threads': self.threads,
        'need_mercy': not self.no_mercy and current_kmer == self.kmin,
        'kmer_from': current_kmer,
        'useconv': False
    }

    if current_kmer == 0:
        # Indicating it's the first graph
        if not self.one_pass:
            logger.log(2, f"Extracting solid (k+1)-mers for k = {next_kmer}")
            count_opts = options.copy()
            count_opts['m'] = self.min_multi
            count_opts['read_lib_file'] = self.read_lib
            count_opts.pop('need_mercy')
            count_opts.pop('kmer_from')
            logger.log(0, f"Extract options : {count_opts}")
            shell_call(self.MEGAHIT_CORE, 'count', **count_opts)

    file_size = 0
    if path.exists(self._graph_prefix(next_kmer) + '.edges.0'):
        options['input_prefix'] = self._graph_prefix(next_kmer)
        file_size += path.getsize(self._graph_prefix(next_kmer) + '.edges.0')
    if path.exists(self._contig_prefix(current_kmer) + '.addi.fa'):
        options['addi_contig'] = self._contig_prefix(current_kmer) + '.addi.fa'
        file_size += path.getsize(self._contig_prefix(current_kmer) + '.addi.fa')
    if path.exists(self._contig_prefix(current_kmer) + '.local.fa'):
        options['local_contig'] = self._contig_prefix(current_kmer) + '.local.fa'
        file_size += path.getsize(self._contig_prefix(current_kmer) + '.local.fa')
    if path.exists(self._contig_prefix(current_kmer) + '.contigs.fa'):
        options['contig'] = self._contig_prefix(current_kmer) + '.contigs.fa'
        options['bubble'] = self._contig_prefix(current_kmer) + '.bubble_seq.fa'
        file_size += path.getsize(self._contig_prefix(current_kmer) + '.contigs.fa')

    if file_size == 0 and current_kmer != 0:
        raise EmptyGraph

    logger.log(2, f'Building graph for k = {next_kmer}')
    logger.log(0, f'Build options : {options}')
    shell_call(self.MEGAHIT_CORE, 'seq2sdbg', **options)

    if file_size != 0 and current_kmer != 0 and not self.keep_temp:
        os.system(f"rm -r {path.join(self.temp_dir, f'k{current_kmer}')}")
def assemble(self, kmer) -> Tuple[ContigInfo, ContigInfo]:
    min_standalone = max(
        min(self.kmax * 3 - 1, int(self.min_length * 1.5)),
        self.min_length)

    options = {
        's': self._graph_prefix(kmer),
        'o': self._contig_prefix(kmer),
        't': self.threads,
        'min_standalone': min_standalone,
        'prune_level': self.prune_level,
        'merge_len': 20,
        'merge_similar': 0.95,
        'cleaning_rounds': 5,
        'disconnect_ratio': 0.1,
        'low_local_ratio': 0.2,
        'min_depth': self.prune_depth,
        'bubble_level': 2,
        'max_tip_len': max(1, self.min_length * 1.5 + 1 - kmer)
        if kmer * 3 - 1 > self.min_length * 1.5 else -1,
        'careful_bubble': kmer < self.kmax,
        'is_final_round': kmer == self.kmax,
        'output_standalone': self.no_local,
        'useconv': False
    }

    logger.log(2, f'Assembling contigs from SdBG for k = {kmer}')
    logger.log(0, f'Assemble arguments : {options}')
    shell_call(self.MEGAHIT_CORE, 'assemble', **options)

    with open(self._contig_prefix(kmer) + '.contigs.fa.info', 'r') as c, \
            open(self._contig_prefix(kmer) + '.addi.fa.info', 'r') as a:
        return ContigInfo(c), ContigInfo(a)
def scaf(self) -> str:
    if self.lib_file is None:
        raise RuntimeError("Lib was not built before scaffolding!")

    kmer = int(self.read_length / 2)
    prefix = path.join(self.basedir, f'k{kmer}')

    # Prepare
    logger.log(2, "Constructing graph for SOAPdenovo-127.")
    shell_call(soap_fusion, D=True, s=self.lib_file,
               p=self.threads, K=kmer, g=prefix, c=self.contigs)

    # Map
    logger.log(2, "Mapping sequences.")
    shell_call(soap_127, 'map', s=self.lib_file, p=self.threads, g=prefix)

    # Scaffold
    logger.log(2, "Scaffolding.")
    shell_call(soap_127, 'scaff', p=self.threads, g=prefix)

    # Convert
    logger.log(2, "Converting output scaffolds back.")
    scaf2mega(prefix + '.scafSeq',
              path.join(path.dirname(self.contigs), 'scaf.fa'),
              overlay=kmer)
    return path.join(path.dirname(self.contigs), 'scaf.fa')
def all(args):
    # Go filtering
    #
    # Why I'm NOT using the .gz extension here even though I have implemented it:
    # 1. flate2 is slow; compressing data takes a long time when single-threaded.
    # 2. Plugging in an SSD is much easier than adding a CPU.
    # 3. Some methods use only plain-text data, so an extra (de)compression
    #    round-trip gains nothing in the process.
    # 4. Some downstream code may only accept plain-text input, and I'm not
    #    adding gzip support to it.
    args.cleanq1 = 'clean.1.fq'
    args.cleanq2 = 'clean.2.fq'
    if configurations.filter_rawdata.compress_output_in_all:
        args.cleanq1 += '.gz'
        args.cleanq2 += '.gz'

    if not args.disable_filter:
        args.fastq1, args.fastq2 = filter(args)

    args.fastafile = assemble(args)
    args.fastafile = findmitoscaf(args)

    if not args.disable_annotation:
        (args.pos_json, args.circular,
         args.annotated_cds, args.annotated_rna) = annotate(args)

    # Visualization is impossible without annotation.
    args.circos_png, args.circos_svg = visualize(
        args) if not args.disable_visualization else (None, None)

    # Check the calling command in case there's something further.
    # If you wrapped the 'all' module in another task or workflow,
    # the results will be retained where they are, since we don't
    # know what you want to do with them.
    if args.__calling == 'all':
        def move_to_result(*files):
            for file in files:
                if path.isfile(str(file)):
                    os.rename(file, path.join(
                        args.result_dir, path.basename(file)))
        # Collect all the results generated in the whole process
        move_to_result(args.circos_png, args.circos_svg, args.pos_json,
                       args.fastafile, args.annotated_cds, args.annotated_rna)
        logger.log(2, f'Results dumped at {args.result_dir}')
def filter_taxanomy(taxa=None, fasta_file=None, hmm_frame: pandas.DataFrame = None,
                    basedir=None, prefix=None, dbfile=None, gene_code=9,
                    relaxing=0, threads=8):
    logger.log(1, 'Filtering taxonomy with tblastn.')

    # Extract sequences from the input fasta file according to the hmm frame,
    # then run tblastn to search out the possible taxonomy of each gene.
    blast_file = tk.tblastn_multi(dbfile=dbfile, infile=fasta_file,
                                  genetic_code=gene_code, basedir=basedir,
                                  prefix=prefix, threads=threads)
    blast_frame_unfiltered, _ = tk.blast_to_csv(blast_file)
    blast_frame = tk.wash_blast_results(blast_frame_unfiltered)

    # The required rank does not depend on the row, so resolve it once.
    required_rank = get_rank(taxa)
    required_id = ncbi.get_name_translator([taxa])[taxa][0]
    required_class = ncbi.get_rank([required_id])[required_id]
    required_index = rank_list.index(required_class)

    # Drop the sequences which don't have even one gene related to the taxa.
    by_seqid = dict(tuple(blast_frame.groupby(['sseq'])))
    to_save = []
    for key, frame in by_seqid.items():
        is_in = False
        for _, row in frame.iterrows():
            qseq = str(row.qseq).split('_')
            taxa_name = ' '.join([qseq[4], qseq[5]])
            taxa_rank = get_rank(taxa_name)
            # Get the last index among the matching ranks
            matches = [idx
                       for idx, ((_, tax_name), (_, req_name))
                       in enumerate(zip(taxa_rank, required_rank))
                       if req_name == tax_name != 'NA']
            matches.append(-1)
            matched_rank = max(matches)
            if matched_rank + relaxing >= required_index:
                is_in = True
                break
        if is_in:
            to_save.append(key)

    filtered_frame = hmm_frame[hmm_frame['target'].isin(to_save)]
    filtered_frame.to_csv(
        path.join(basedir, f'{prefix}.taxa.csv'), index=False)
    logger.log(
        1, f'{len(filtered_frame.index)} records were selected after the taxonomy filtering.')
    return filtered_frame
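# How the relaxation test above behaves, on made-up ranks (assuming rank_list
# orders the seven ranks from kingdom down to species):
#
#     rank_list = ['kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']
#     matched_rank = rank_list.index('order')     # deepest rank shared with the hit
#     required_index = rank_list.index('family')  # rank the user required
#     # The hit only matches down to 'order', so it passes a 'family'
#     # requirement only when relaxing >= 1:
#     assert not (matched_rank + 0 >= required_index)
#     assert matched_rank + 1 >= required_index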
def build_lib(self):
    # Write reads info
    with open(self.read_lib, 'w') as l:
        fifos = []
        if self.fq1 and self.fq2:
            print(self.fq1, self.fq2, sep=',', file=l)
            fq1, fq2 = (self.fq1 if not self.fq1.endswith('gz')
                        else path.join(self.temp_dir, 'pipe.pe1'),
                        self.fq2 if not self.fq2.endswith('gz')
                        else path.join(self.temp_dir, 'pipe.pe2'))

            if self.fq1.endswith('gz'):
                fifo1 = path.join(self.temp_dir, 'pipe.pe1')
                os.mkfifo(fifo1)
                fifos.append(
                    subprocess.Popen(f'gzip -dc {self.fq1} > {fifo1}',
                                     shell=True, preexec_fn=os.setsid))

            if self.fq2.endswith('gz'):
                fifo2 = path.join(self.temp_dir, 'pipe.pe2')
                os.mkfifo(fifo2)
                fifos.append(
                    subprocess.Popen(f'gzip -dc {self.fq2} > {fifo2}',
                                     shell=True, preexec_fn=os.setsid))

            print('pe', fq1, fq2, file=l)
        else:
            print(self.fq1, file=l)
            fq1 = self.fq1
            if self.fq1.endswith('gz'):
                # Mirror the paired-end branch: feed the decompressed
                # stream through a fifo, otherwise nothing writes to it.
                fq1 = path.join(self.temp_dir, 'pipe.se')
                os.mkfifo(fq1)
                fifos.append(
                    subprocess.Popen(f'gzip -dc {self.fq1} > {fq1}',
                                     shell=True, preexec_fn=os.setsid))
            print('se', fq1, file=l)

    logger.log(1, "Converting reads to binary library.")
    shell_call(self.MEGAHIT_CORE, 'buildlib', self.read_lib, self.read_lib)

    if any(x.wait() != 0 for x in fifos):
        raise RuntimeError("Error occurred while reading input fifos")

    with open(self.read_lib + '.lib_info') as ri:
        info = [x.split(' ') for x in ri.readlines()]
        return LibInfo(info)
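# The gzip-to-FIFO trick above, in isolation (a sketch with hypothetical
# paths): the decompressor writes into a named pipe in the background, so a
# tool that only reads plain FASTQ can consume .gz input without a temp file.
#
#     import os, subprocess
#     fifo = '/tmp/pipe.pe1'
#     os.mkfifo(fifo)
#     # The writer blocks until a reader opens the pipe, hence Popen.
#     p = subprocess.Popen('gzip -dc reads.1.fq.gz > ' + fifo, shell=True)
#     # ... hand `fifo` to the consumer as if it were a plain file ...
#     p.wait()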
def tblastn_multi(dbfile=None, infile=None, genetic_code=9, basedir=None,
                  prefix=None, threads=8):
    infile = path.abspath(infile)
    dbfile = path.abspath(dbfile)
    truncated_call('makeblastdb', '-in', infile, dbtype='nucl')

    tasks = []
    protein_data_dir = path.join(basedir, 'tblastn_data')
    try:
        os.mkdir(protein_data_dir)
    except FileExistsError:
        raise RuntimeError(
            "Folder is already created, please make sure the working folder is clean.")

    logger.log(1, f'Making {threads} small datasets for calling tblastn.')
    tblastn_db = np.array_split(list(SeqIO.parse(dbfile, 'fasta')), threads)
    for idx, data in enumerate(tblastn_db):
        if len(data) > 0:
            logger.log(0, f'Dataset {idx} has {len(data)} queries.')
            dataset_path = path.join(protein_data_dir, f'dataset_{idx}.fasta')
            SeqIO.write(list(data), dataset_path, 'fasta')
            tasks.append(
                f'tblastn -evalue 1e-5 -outfmt 6 -seg no -db_gencode {genetic_code} '
                f'-db {infile} -query {dataset_path}')

    logger.log(1, 'Generating map for calling tblastn.')
    pool = multiprocessing.Pool(processes=threads)
    out_blast = path.join(path.abspath(basedir), f'{prefix}.blast')
    with open(out_blast, 'w') as f:
        pool.map_async(direct_call, tasks,
                       callback=lambda x: f.write(''.join(x)))
        logger.log(1, 'Waiting for all processes to finish.')
        pool.close()
        # Join inside the `with` so the callback writes before the file closes.
        pool.join()

    logger.log(1, 'Cleaning generated temp files.')
    shell_call('rm -r', protein_data_dir)
    os.remove(f'{infile}.nhr')
    os.remove(f'{infile}.nin')
    os.remove(f'{infile}.nsq')
    return out_blast
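# map_async's callback receives the list of *all* task outputs once every
# worker has finished, which is why a single ''.join(...) write produces the
# merged .blast file. A tiny self-contained illustration (run it under a
# __main__ guard on spawn-based platforms):
#
#     from multiprocessing import Pool
#     with Pool(2) as p:
#         p.map_async(len, ['ab', 'cde'], callback=print)  # prints [2, 3]
#         p.close()
#         p.join()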
def remap_sequence(prefix=None, basedir=None, fasta_file=None, fastq1=None,
                   fastq2=None, threads=8):
    # Remap reads back onto the fasta file. This can be a non-trivial task,
    # so a portion of the threads is given to samtools view and samtools sort.
    logger.log(2, "Mapping fastq reads back onto fasta file.")
    shell_call('bwa index', fasta_file)
    bam_file = path.join(basedir, f'{prefix}.bam')
    check_output(
        f'bwa mem -t {max(1, int(threads*0.75))} {fasta_file} {fastq1} '
        f'{fastq2 if fastq2 is not None else ""} '
        f'|samtools view -bS -q 30 -h -@ {max(1, int(threads*0.25))} -o {bam_file} -',
        shell=True)
    bam_sorted_file = path.join(basedir, f'{prefix}.sorted.bam')
    check_output(f'samtools sort -@ {threads} -o {bam_sorted_file} {bam_file}',
                 shell=True)

    logger.log(2, "Calculating average depth for each sequence.")
    gene_depth_file = path.join(basedir, f'{prefix}.dep')
    avgdep_bin = path.join(path.abspath(path.dirname(__file__)), 'avgdep_bin')
    check_output(
        f'samtools depth -aa {bam_sorted_file} |{avgdep_bin} -o {gene_depth_file}',
        shell=True)
    mapping = {k: v for k, v in map(str.split, open(gene_depth_file))}

    logger.log(2, "Retagging sequences for later processing.")
    sequences = []
    for seq in SeqIO.parse(fasta_file, 'fasta'):
        seq.description = f"flag=1 multi={mapping[seq.id]}"
        sequences.append(seq)
    SeqIO.write(sequences, path.join(basedir, path.basename(fasta_file)), 'fasta')

    return fasta_file
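# avgdep_bin is a bundled helper; an equivalent pure-Python reduction of the
# `samtools depth -aa` stream (columns: sequence, position, depth) might look
# like this sketch (hypothetical file name):
#
#     from collections import defaultdict
#     total, count = defaultdict(int), defaultdict(int)
#     for line in open('example.depth'):
#         seq, _, depth = line.split()
#         total[seq] += int(depth)
#         count[seq] += 1
#     avg = {seq: total[seq] / count[seq] for seq in total}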
def bwa_map(threads: int, fasta_file: str, basedir: str, prefix: str,
            fastq1: str, fastq2: str, quality: int = 30) -> Tuple[str, str, str]:
    index = path.join(basedir, prefix)
    direct_call(f'bwa index -p {index} {fasta_file}')

    fq1 = path.join(basedir, prefix + '.1.fq')
    fq2 = path.join(basedir, prefix + '.2.fq') if fastq2 is not None else None
    bam = path.join(basedir, prefix + ".bam")

    logger.log(2, "Mapping and extracting reads from bwa mem.")
    direct_call(f'\
        bwa mem -t {threads} {index} {fastq1} {fastq2 if fastq2 is not None else ""} |\
        samtools view -bS -q {quality} -h - |\
        tee {bam}|\
        samtools fastq -1 {fq1} {f"-2 {fq2}" if fq2 is not None else ""} -')
    return bam, fq1, fq2
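# The pipeline above writes the filtered alignments once and uses them twice:
# `tee` persists the BAM while `samtools fastq` re-extracts the reads that
# survived the mapping-quality cut. Roughly, with hypothetical names:
#
#     bwa mem -t 8 idx r1.fq r2.fq \
#       | samtools view -bS -q 30 -h - \
#       | tee out.bam \
#       | samtools fastq -1 out.1.fq -2 out.2.fq -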
def filter(self, kmer=None, min_depth=3, min_length=0, max_length=20000,
           force_filter=False, deny_number=a_conf.filter_keep) -> Tuple[int, int, int]:
    logger.log(2, f'Filtering output contig files of k = {kmer}')
    results = [0, 0, 0]

    if not a_conf.no_filter or force_filter:
        for idx, suffix in enumerate(
                ['.contigs.fa', '.addi.fa', '.bubble_seq.fa']):
            if path.exists(self._contig_prefix(kmer) + suffix):
                results[idx] = int(
                    shell_call(self.FAST_FILTER,
                               i=self._contig_prefix(kmer) + suffix,
                               o=self._contig_prefix(kmer) + '.filtered' + suffix,
                               l=f"{min_length},{max_length}",
                               d=min_depth))
                # Too few contigs survived the depth filter; rerun with
                # m=deny_number instead.
                if results[idx] <= deny_number and idx == 0:
                    results[idx] = int(
                        shell_call(self.FAST_FILTER,
                                   i=self._contig_prefix(kmer) + suffix,
                                   o=self._contig_prefix(kmer) + '.filtered' + suffix,
                                   l=f"{min_length},{max_length}",
                                   m=deny_number))
                shell_call('mv',
                           self._contig_prefix(kmer) + '.filtered' + suffix,
                           self._contig_prefix(kmer) + suffix)

    return tuple(results)
def blastn_multi(dbfile=None, infile=None, basedir=None, prefix=None, threads=8):
    infile = path.abspath(infile)
    dbfile = path.abspath(dbfile)
    truncated_call('makeblastdb', '-in', infile, dbtype='nucl')

    nucl_data_dir = path.join(basedir, "blastn_data")
    try:
        os.mkdir(nucl_data_dir)
    except FileExistsError:
        raise RuntimeError(
            "Folder is already created, please make sure the working folder is clean.")

    logger.log(1, f'Making {threads} small datasets for calling blastn.')
    file_names = [path.join(nucl_data_dir, f'dataset_{x}.fasta')
                  for x in range(threads)]
    tasks = [f'blastn -evalue 1e-5 -outfmt 6 -db {infile} -query {dataset_path}'
             for dataset_path in file_names]

    seqs = [[] for _ in range(threads)]
    for i, seq in enumerate(SeqIO.parse(dbfile, 'fasta')):
        seqs[i % threads].append(seq)
    for i in range(threads):
        SeqIO.write(seqs[i], file_names[i], 'fasta')

    logger.log(1, 'Generating map for calling blastn.')
    pool = multiprocessing.Pool(processes=threads)
    out_blast = path.join(path.abspath(basedir), f'{prefix}.blast')
    with open(out_blast, 'w') as f:
        pool.map_async(direct_call, tasks,
                       callback=lambda x: f.write(''.join(x)))
        pool.close()
        logger.log(1, "Waiting for all processes to finish.")
        # Join inside the `with` so the callback writes before the file closes.
        pool.join()

    logger.log(1, 'Cleaning generated temp files.')
    shell_call('rm -r', nucl_data_dir)
    os.remove(f'{infile}.nhr')
    os.remove(f'{infile}.nin')
    os.remove(f'{infile}.nsq')
    return out_blast
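# Unlike tblastn_multi's np.array_split, sequences here are dealt round-robin,
# which keeps dataset sizes balanced even when the input is sorted by length:
#
#     seqs = [[] for _ in range(3)]
#     for i, s in enumerate('abcdefg'):
#         seqs[i % 3].append(s)
#     # -> [['a', 'd', 'g'], ['b', 'e'], ['c', 'f']]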
def nhmmer_search(fasta_file=None, thread_number=None, nhmmer_profile=None,
                  prefix=None, basedir=None):
    logger.log(1, 'Calling nhmmer.')

    # Call nhmmer
    hmm_out = os.path.join(basedir, f'{prefix}.nhmmer.out')
    hmm_tbl = os.path.join(basedir, f'{prefix}.nhmmer.tblout')
    logger.log(1, f'Out file : o={hmm_out}, tbl={hmm_tbl}')
    shell_call('nhmmer', o=hmm_out, tblout=hmm_tbl,
               cpu=thread_number, appending=[nhmmer_profile, fasta_file])

    # Process data into a pandas-readable table
    hmm_tbl_pd = f'{hmm_tbl}.readable'
    with open(hmm_tbl, 'r') as fin, open(hmm_tbl_pd, 'w') as fout:
        for line in fin:
            fields = line.strip().split()
            # Drop the free-text gene description nhmmer appends to each row
            print(' '.join(fields[:15]), file=fout)

    # Read the table with pandas
    hmm_frame = pandas.read_csv(
        hmm_tbl_pd, comment='#', delimiter=' ',
        names=[
            'target', 'accession1', 'query', 'accession2',
            'hmmfrom', 'hmmto', 'alifrom', 'alito',
            'envfrom', 'envto', 'sqlen', 'strand',
            'e', 'score', 'bias'
        ])
    hmm_frame = hmm_frame.drop(columns=['accession1', 'accession2'])

    # Deduplicate multiple hits on the same gene of the same sequence
    hmm_frame = hmm_frame.drop_duplicates(
        subset=['target', 'query'], keep='first')
    hmm_frame.to_csv(f'{hmm_tbl}.dedup.csv', index=False)
    logger.log(1, f'HMM query returned {len(hmm_frame.index)} results.')

    return hmm_frame
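# nhmmer's --tblout is whitespace-padded and ends each row with a free-text
# target description, so the raw row width varies. Keeping only the first 15
# fields fixes the width; e.g. (abridged, made-up values):
#
#     'ctg1 - COX1 - 1 1530 12 1541 1 1560 16000 + 1.2e-30 980.3 10.1 cytochrome c oxidase ...'
#     -> 'ctg1 - COX1 - 1 1530 12 1541 1 1560 16000 + 1.2e-30 980.3 10.1'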
def annotate(basedir=None, prefix=None, ident=30, fastafile=None,
             genetic_code=9, clade=None, thread_number=8,
             wildcard_profile=False, trna_overlapping=40,
             hmmer_search=True, score=5, e_value=0.005):
    logger.log(2, 'Entering annotation module.')

    if wildcard_profile:
        logger.log(
            3, 'Wildcard protein profile is used, results may not be accurate.')
        # Once we can confirm the sequences are from the clade we want,
        # we don't need to use the overall database.
        logger.log(2, 'Updating the general protein database.')
        lc = 0
        with open(path.join(profile_dir_tbn, 'Animal.fa'), 'w') as fout:
            for protein_fas in os.listdir(profile_dir_tbn):
                if protein_fas.endswith('.fa') and protein_fas != 'Animal.fa':
                    with open(path.join(profile_dir_tbn, protein_fas)) as fin:
                        for line in fin:
                            fout.write(line)
                            lc += 1
        logger.log(1, f'Generation finished with {lc} writes.')

    tbn_profile = path.join(
        profile_dir_tbn, f'{clade if not wildcard_profile else "Animal"}.fa')
    blast_file = tk.tblastn_multi(dbfile=tbn_profile, infile=fastafile,
                                  genetic_code=genetic_code, basedir=basedir,
                                  prefix=prefix, threads=thread_number)
    blast_frame, _ = tk.blast_to_csv(blast_file, ident=ident, score=score)
    try:
        washed_frame = tk.wash_blast_results(blast_frame)
    except Exception:
        raise RuntimeError(
            f"Empty blast frame during annotation, annotation can't continue. Please check the {fastafile} .")

    if configurations.annotation.redirection:
        logger.log(2, 'Checking genome directions.')
        if tk.redirect_genome(fasta_file=fastafile, blast_frame=blast_frame):
            # Not fixing the frame directly, since the tRNA and rRNA frames
            # would then need to be fixed as well...
            logger.log(2, "Genome is reversed, launching a second annotation to fix gene locations.")
            blast_file = tk.tblastn_multi(dbfile=tbn_profile, infile=fastafile,
                                          genetic_code=genetic_code,
                                          basedir=basedir, prefix=prefix,
                                          threads=thread_number)
            blast_frame, _ = tk.blast_to_csv(blast_file, ident=ident, score=score)
            washed_frame = tk.wash_blast_results(blast_frame)

    wise_frame, _, _ = tk.genewise(
        basedir=basedir, prefix=prefix, wises=washed_frame,
        infile=fastafile, dbfile=tbn_profile, cutoff=0.5)

    # Add an extra washing here, since pandas will show some strange
    # behaviour processing data this large...
    # Also prevent some mutation of ['plus']
    wise_frame = tk.wash_blast_results(wise_frame, mut_plus=False)

    # Vote for the most probable species using per-PCG best hits
    taxa_data = {}
    for _, row in wise_frame.iterrows():
        splited = str(row.qseq).split('_')
        PCG = splited[3]
        taxa_name = ' '.join(splited[4:6])
        taxa_score = float(row.score)
        if PCG not in taxa_data or taxa_data[PCG][1] < taxa_score:
            taxa_data[PCG] = (taxa_name, taxa_score)

    score_data = {}
    for _, (taxa_name, taxa_score) in taxa_data.items():
        score_data[taxa_name] = score_data.get(taxa_name, 0) + 1
    most_possible = max(score_data, key=score_data.get)
    logger.log(2, f'Determined most possible species : {most_possible}')

    if configurations.annotation.reloc_genes:
        logger.log(2, 'Relocating genes.')
        wise_frame = tk.reloc_genes(fasta_file=fastafile, wises=wise_frame,
                                    code=genetic_code)

    cds_indexes = {}
    cds_found = []
    with open(path.join(profile_dir_hmm, 'required_cds.json')) as f:
        cds_indexes = json.load(f)[clade]
    for _, row in wise_frame.iterrows():
        cds = str(row.qseq).split('_')[3]
        cds_found.append(cds)

    hmmer_frame = None
    cds_notfound = [x for x in cds_indexes if x not in cds_found]
    logger.log(2, f'PCGs found in annotation : {cds_found}')
    if cds_notfound and not hmmer_search:
        logger.log(3, f'Expected PCG {cds_notfound} not found!')
    elif cds_notfound and hmmer_search:
        logger.log(
            3, f'Expected PCG {cds_notfound} not found, turning to nhmmer search.')
        hmmer_frame = tk.nhmmer_search(fasta_file=fastafile,
                                       thread_number=thread_number,
                                       nhmmer_profile=profile_dir_hmm + f'/{clade}.hmm',
                                       prefix=prefix, basedir=basedir)
        hmmer_frame = hmmer_frame[~hmmer_frame['query'].isin(cds_found)]
        hmmer_frame = hmmer_frame[hmmer_frame['e'] < e_value]
        hmmer_frame = hmmer_frame[hmmer_frame['score'] > score]
        logger.log(2, 'Recovered PCGs : \n' + str(hmmer_frame))

    trna_out_dir = path.join(basedir, 'trna')
    os.makedirs(trna_out_dir, exist_ok=True)
    # Silence some annoying Biopython warnings
    with warnings.catch_warnings():
        warnings.simplefilter('ignore', BiopythonWarning)
        query_dict, missing_trna = tk.trna_search(
            fastafile, profile_dir_trna, trna_out_dir, prefix,
            genetic_code, 0.01, overlap_cutoff=trna_overlapping)
    logger.log(2, f'tRNAs found : {list(query_dict.keys())}')
    if missing_trna:
        logger.log(3, f'Missing tRNAs : {missing_trna}')

    rrna_out_dir = path.join(basedir, 'rrna')
    os.makedirs(rrna_out_dir, exist_ok=True)
    result_12, result_16 = tk.rrna_search(
        fastafile, profile_dir_rrna, rrna_out_dir, prefix, 0.01)
    if not result_12:
        logger.log(3, '12s rRNA was not found!')
    if not result_16:
        logger.log(3, '16s rRNA was not found!')

    locs_file = path.join(basedir, 'locs.json')
    annotation_json = {}
    sequence_data = {x.id: x for x in SeqIO.parse(fastafile, 'fasta')}

    annotated_fa = path.join(basedir, f'{prefix}.annotated.cds.fa')
    annotated_frag = []
    start = end = -1
    for _, row in wise_frame.iterrows():
        cds = str(row.qseq).split('_')[3]
        if cds in annotation_json:
            count = sum(x.startswith(cds) for x in annotation_json.keys())
            cds = f'{cds}{"_" if count > 0 else ""}{count}'
        start, end = (min(int(row.wise_min_start), int(row.wise_max_end)),
                      max(int(row.wise_min_start), int(row.wise_max_end)))
        frag = sequence_data[str(row.sseq)][start - 1:end]
        frag.description = (f'gene={cds} start={start} end={end} '
                            f'from={row.sseq} strand={"+" if row.plus else "-"}')
        annotated_frag.append(frag)
        annotation_json[cds] = (start, end, 0, str(row.sseq),
                                "+" if row.plus else "-")

    if hmmer_frame is not None:
        for _, row in hmmer_frame.iterrows():
            start, end = (min(int(row.envfrom), int(row.envto)),
                          max(int(row.envfrom), int(row.envto)))
            frag = sequence_data[str(row.target)][start - 1:end]
            frag.description = (f'gene={str(row.query)} start={start} end={end} '
                                f'from={row.target} strand={row.strand}')
            annotated_frag.append(frag)
            annotation_json[str(row.query)] = (
                start, end, 0, str(row.target), str(row.strand))

    SeqIO.write(annotated_frag, annotated_fa, 'fasta')

    annotated_rnas = path.join(basedir, f'{prefix}.annotated.rna.fa')
    annotated_frag.clear()
    for key, value in query_dict.items():
        start, end = (min(value.seqfrom, value.seqto),
                      max(value.seqfrom, value.seqto))
        frag = sequence_data[value.sequence][start - 1:end]
        frag.description = f'gene=trn{key} start={start} end={end}'
        annotated_frag.append(frag)
        annotation_json[f'trn{key}'] = (
            start, end, 1, value.sequence, '+' if value.plus else '-')

    if result_12:
        start, end = (min(result_12.seqfrom, result_12.seqto),
                      max(result_12.seqfrom, result_12.seqto))
        logger.log(2, f'12s rRNA found from {start} to {end}')
        frag = sequence_data[result_12.sequence][start - 1:end]
        frag.description = f'gene=rrnS start={start} end={end}'
        annotated_frag.append(frag)
        annotation_json['rrnS'] = (
            start, end, 2, result_12.sequence, '+' if result_12.plus else '-')

    if result_16:
        start, end = (min(result_16.seqfrom, result_16.seqto),
                      max(result_16.seqfrom, result_16.seqto))
        logger.log(2, f'16s rRNA found from {start} to {end}')
        frag = sequence_data[result_16.sequence][start - 1:end]
        frag.description = f'gene=rrnL start={start} end={end}'
        annotated_frag.append(frag)
        annotation_json['rrnL'] = (
            start, end, 2, result_16.sequence, '+' if result_16.plus else '-')

    SeqIO.write(annotated_frag, annotated_rnas, 'fasta')

    with open(locs_file, 'w') as f:
        json.dump(annotation_json, f, indent=4, separators=(',', ': '))

    return locs_file, annotated_fa, annotated_rnas
def visualize(fasta_file=None, fastq1=None, fastq2=None, pos_json=None,
              prefix=None, basedir=None, threads=8, circular=False):
    logger.log(2, 'Entering visualize module.')

    # Validate the paths
    fasta_file = path.abspath(fasta_file)
    fastq1 = path.abspath(fastq1)
    if fastq2 is not None:
        fastq2 = path.abspath(fastq2)
    basedir = path.abspath(basedir)
    pos_json = path.abspath(pos_json)

    # Rename sequences to an easier form
    fa_copy = path.join(basedir, f'{prefix}.fasta')
    list_conv = []
    counter = 1
    index_list = {}
    for seq in SeqIO.parse(fasta_file, 'fasta'):
        index_list[seq.id] = f'mt{counter}'
        seq.id_old = seq.id
        seq.id = f'mt{counter}'
        seq.description = ''
        list_conv.append(seq)
        counter += 1
    SeqIO.write(list_conv, fa_copy, 'fasta')

    with open(pos_json, 'r') as f:
        poses = json.load(f)

    # Gene name file
    logger.log(1, 'Generating gene name and feature files.')
    gene_name_file = path.join(basedir, f'{prefix}.gene.txt')
    with open(gene_name_file, 'w') as gn_f:
        for key, value in poses.items():
            start, end, gene_type, seqid, _ = value
            seqid_conv = index_list[seqid]
            print(seqid_conv, start, end,
                  key.split('_')[0] if '_' in key else key,
                  sep='\t', file=gn_f)

    # Gene feature file
    gene_feature_file = path.join(basedir, f'{prefix}.features.txt')
    with open(gene_feature_file, 'w') as gf_f:
        for key, value in poses.items():
            start, end, gene_type, seqid, plus = value
            plus = plus == '+'
            r0 = 0.965 if plus else 1
            r1 = 1 if plus else 1.035
            seqid_conv = index_list[seqid]
            print(seqid_conv, start, start,
                  f'fill_color=black,r0={r0}r,r1={r1}r', file=gf_f, sep='\t')
            print(seqid_conv, start, end,
                  f'fill_color={circos_config.fill_colors[int(gene_type)]},r0={r0}r,r1={r1}r',
                  file=gf_f, sep='\t')
            print(seqid_conv, end, end,
                  f'fill_color=black,r0={r0}r,r1={r1}r', file=gf_f, sep='\t')

    logger.log(1, 'Generating depth files.')
    # Using check_output directly, being too lazy to remove the decoder
    from subprocess import check_output
    shell_call('bwa index', fa_copy)
    bam_file = path.join(basedir, f'{prefix}.bam')
    mem_count = max(int(threads * 0.8), 1)
    view_count = max(threads - mem_count, 1)
    check_output(
        f'bwa mem -t {mem_count} {fa_copy} {fastq1} {fastq2 if fastq2 is not None else ""} '
        f'|samtools view -bS -@ {view_count} -q 30 -h -o {bam_file} -',
        shell=True)
    bam_sorted_file = path.join(basedir, f'{prefix}.sorted.bam')
    check_output(f'samtools sort -@ {threads} -o {bam_sorted_file} {bam_file}',
                 shell=True)
    gene_depth_file = path.join(basedir, f'{prefix}.dep')
    check_output(f'samtools depth -aa {bam_sorted_file} > {gene_depth_file}',
                 shell=True)

    # Convert the depth output into a Circos track, recording the maximum depth
    circos_depth_file = path.join(basedir, f'{prefix}.depth.txt')
    max_gene_depth = 0
    with open(gene_depth_file, 'r') as gdf, open(circos_depth_file, 'w') as cdf:
        for line in gdf:
            content = line.rstrip().split()
            print(' '.join([content[0], content[1], content[1], content[2]]),
                  file=cdf)
            if int(content[2]) > max_gene_depth:
                max_gene_depth = int(content[2])

    # GC content
    # Reusing list_conv here, as it's not deleted in the scope
    gc_content_file = path.join(basedir, f'{prefix}.gc.txt')
    with open(gc_content_file, 'w') as gc_f:
        for seq in list_conv:
            # Walk through the sequence in windows of 50 bp
            for s in range(0, len(seq), 50):
                seq_slice = seq[s:s + 50]
                gc_num = sum(x == 'G' or x == 'C' for x in seq_slice)
                gc_per = gc_num / len(seq_slice)
                print(seq.id, s, s + len(seq_slice), gc_per, file=gc_f)

    # Karyotype
    logger.log(1, 'Generating chr files.')
    karyotype_file = path.join(basedir, f'{prefix}.karyotype.txt')
    with open(karyotype_file, 'w') as ky_f:
        for seq in list_conv:
            chr_name = seq.id.replace('mt', 'chr')
            print(f'{chr_name} - {seq.id}\t{seq.id_old}\t0\t{len(seq)}\tgrey',
                  file=ky_f)

    # Plus strand marker
    logger.log(1, 'Generating plus.')
    plus_file = path.join(basedir, f'{prefix}.plus.txt')
    with open(plus_file, 'w') as p_f:
        print('mt1\t0\t300\t+\tr0=1r-150p,r1=1r-100p', file=p_f)

    # Fill in the values
    logger.log(1, 'Generating circos config file.')
    generated_config = circos_config.circos_conf
    generated_config.ideogram.spacing._break = "0.5r" if not circular else "0.01r"
    generated_config.image.dir = basedir
    generated_config.karyotype = karyotype_file
    generated_config.plots['plot', 0].file = gene_name_file
    generated_config.plots['plot', 1].file = plus_file
    generated_config.plots['plot', 2].file = gc_content_file
    with generated_config.plots['plot', 3] as depth_plot:
        depth_plot.file = circos_depth_file
        depth_plot.max = max_gene_depth
        depth_plot.rules['rule', 0].condition = \
            f'var(value) > {int(max_gene_depth*0.9)}'
        depth_plot.rules['rule', 1].condition = \
            f'var(value) < {int(max_gene_depth*0.1)}'
    generated_config.highlights['highlight', 0].file = gene_feature_file

    # Write out the final config.
    # I guess it would be better to use an f-string formatted cfg, but
    # well, this is fine.
    cfg_dict = circos.collapse(generated_config)
    cfg_file = path.join(basedir, 'circos.conf')
    with open(cfg_file, 'w') as cfg_f:
        cfg_f.write('<<include etc/colors_fonts_patterns.conf>>\n')
        cfg_f.write(circos.dict2circos(cfg_dict) + '\n')
        cfg_f.write('<<include etc/housekeeping.conf>>')

    logger.log(1, 'Running Circos.')
    try:
        check_output('circos', shell=True, cwd=basedir)
    except Exception:
        logger.log(4, "Running circos errored, no graph was output!")

    return path.join(basedir, 'Circos.png'), path.join(basedir, 'Circos.svg')
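# The GC track above walks each renamed sequence in 50 bp windows. The same
# computation in isolation (hypothetical sequence; note the last window may
# be shorter, which is why the denominator is the slice length):
#
#     seq = 'ATGCGC' * 40 + 'AT'
#     rows = [(s, s + len(seq[s:s + 50]),
#              sum(c in 'GC' for c in seq[s:s + 50]) / len(seq[s:s + 50]))
#             for s in range(0, len(seq), 50)]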
def pre(args):
    # Initialize the logger.
    if hasattr(args, 'work_dir') and hasattr(args, 'workname'):
        logger.init(path.join(args.work_dir, f'{args.workname}.log'))
    else:
        logger.init(path.join(os.getcwd(), 'summary.log'))
    if hasattr(args, 'level'):
        logger.set_level(args.level)
    logger.log(
        2, f'MitoFlex {VERSION}, run {args.workname if hasattr(args, "workname") else "1"}')

    arg_dict = vars(args)
    logger.log(2, 'Arguments after parsing :')
    logger.log(2, f'{[f"{key}={value}" for key, value in arg_dict.items()]}')

    if hasattr(args, 'disable_filter') and args.disable_filter:
        logger.log(3, 'Filtering is not enabled, files will only be truncated.')
    if hasattr(args, 'disable_annotation') and args.disable_annotation:
        logger.log(3, 'Annotation is not enabled.')

    def runtime_error_logger(exception_type, value, tb):
        if exception_type == RuntimeError:
            logger.log(4, value)
            logger.log(
                4, 'A RuntimeError occurred. This is already considered in the code,'
                ' but since it is thought to come from parts outside of what MitoFlex can'
                ' handle, it is NOT a bug caused by MitoFlex itself. Please check the error'
                ' message and try to fix the possible cause of the crash; only as a last'
                ' resort, open a GitHub issue with a rerun with the logger level set to 0.')
            logger.finalize()
            sys.exit()
        else:
            if exception_type != KeyboardInterrupt:
                logger.log(
                    4, "An unexpected error happened inside MitoFlex. This could be a bug in"
                    " the program, so please report it if you see this message in the log.")
                logger.log(
                    4, f"Error type : {exception_type.__name__}, value : {value}")
                logger.log(4, "Traceback :")
                logger.__log('\n'.join(traceback.format_tb(tb=tb)))
                logger.log(4, "Logging additional information")
                import psutil
                curp = psutil.Process()
                logger.log(4, curp.open_files())
                logger.log(4, curp.environ())
                logger.log(4, curp.memory_full_info())
                logger.log(4, "Logging ignored logs.")
                for l in logger.__ignored:
                    logger.log(4, l)
            else:
                logger.log(2, "This run was terminated manually.")
            logger.finalize()
            sys.__excepthook__(exception_type, value, tb)

    sys.excepthook = runtime_error_logger
def load_modules(args):
    try:
        logger.log(2, 'Loading filter module.')
        from filter.filter import filter_pe, filter_se
        logger.log(2, 'Loading assemble module.')
        from assemble.assemble import assemble
        logger.log(2, 'Loading findmitoscaf module.')
        from findmitoscaf.findmitoscaf import findmitoscaf
        logger.log(2, 'Loading annotation module.')
        from annotation.annotation import annotate
        logger.log(2, 'Loading visualize module.')
        from visualize.visualize import visualize
    except Exception:
        logger.log(4, 'Cannot load module!')
        raise
    else:
        # `else`, not `finally`: only report success when every import worked.
        logger.log(2, 'All modules are loaded correctly.')
def trna_search(fasta_file=None, profile_dir=None, basedir=None, prefix=None,
                gene_code=9, e_value=0.001, overlap_cutoff=40):
    # Make sure everything is an absolute path
    fasta_file = path.abspath(fasta_file)
    profile_dir = path.abspath(profile_dir)
    basedir = path.abspath(basedir)

    codon_table = CodonTable.generic_by_id[gene_code]
    forward_table = codon_table.forward_table

    infernal_file = path.join(basedir, f'{prefix}.infernal.out')
    query_results = []
    for idx, cm in enumerate(os.listdir(profile_dir)):
        indexed = f'{infernal_file}.{idx}'
        truncated_call('cmsearch', E=e_value, o=indexed,
                       appending=[path.join(profile_dir, cm), fasta_file])
        query_results.append(infernal.Infernal(indexed))

    gene_map = []
    for result in query_results:
        for align in result.alignments:
            loop = align.alignment
            # Get the main loop of the tRNA
            main = [x for x in loop.components
                    if isinstance(x, wuss.MultiLoop)]
            if not main:
                continue
            main = main[0]
            # Get the three hairpin loops of the main loop
            hairpins = [x for x in main.components
                        if isinstance(x, wuss.HairpinLoop)]
            if len(hairpins) < 2:
                continue
            # Get the center hairpin loop (the anticodon arm);
            # no gap is allowed
            center = hairpins[1]
            if len(center.hairpin.sequence) != 7:
                continue
            # Can't read the central tri-base codon
            if '-' in center.hairpin.to_str()[2:5]:
                logger.log(
                    1, f'Unqualified fold discarded, central hairpin : {center.hairpin.to_str()}, sequence : {center.sequence}')
                continue
            code = Seq(center.hairpin.to_str()[2:5]).reverse_complement()
            amino = forward_table[str(code)]
            align.amino = amino
            align.length = max(align.seqfrom, align.seqto) - \
                min(align.seqfrom, align.seqto)
            gene_map.append((align.seqfrom, align))
            gene_map.append((align.seqto, align))

    gene_map.sort(key=lambda x: x[0])
    gene_map = [x[1] for x in gene_map]

    # Then resolve overlaps, keeping the higher-scoring alignment
    def overlapped(mapping: list):
        def pairwise(iterable):
            a, b = tee(iterable)
            next(b, None)
            return zip(a, b)
        for gene_loc, pair_loc in pairwise(mapping):
            dist = max(gene_loc.seqfrom, gene_loc.seqto) - \
                min(pair_loc.seqfrom, pair_loc.seqto)
            if gene_loc != pair_loc and dist >= overlap_cutoff and \
                    (dist <= gene_loc.length or dist <= pair_loc.length):
                if gene_loc.score >= pair_loc.score:
                    logger.log(
                        0, f'conflict of {gene_loc.amino} and {pair_loc.amino}, removing {pair_loc.amino}, score:{gene_loc.score}, {pair_loc.score}, overlapping : {dist}')
                    while pair_loc in mapping:
                        mapping.remove(pair_loc)
                else:
                    logger.log(
                        0, f'conflict of {gene_loc.amino} and {pair_loc.amino}, removing {gene_loc.amino}, score:{gene_loc.score}, {pair_loc.score}, overlapping : {dist}')
                    while gene_loc in mapping:
                        mapping.remove(gene_loc)
                return True
        return False

    while overlapped(gene_map):
        pass
    gene_map = list(set(gene_map))

    # Normalize the results
    query_dict = {}
    for gene in gene_map:
        if gene.amino not in query_dict:
            query_dict[gene.amino] = gene
        else:
            query_dict[gene.amino +
                       str(sum(x.startswith(gene.amino)
                               for x in query_dict.keys()) + 1)] = gene

    missing_trnas = [x for x in codon_table.back_table
                     if x not in query_dict and x]
    return query_dict, missing_trnas
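# Decoding the anticodon, in isolation: the three bases at positions 2:5 of
# the central hairpin are reverse-complemented into the codon, then looked up
# in the genetic-code table (table 9 shown to match the default gene_code;
# the anticodon value is made up):
#
#     from Bio.Seq import Seq
#     from Bio.Data import CodonTable
#     table = CodonTable.generic_by_id[9].forward_table
#     anticodon = 'TCA'
#     table[str(Seq(anticodon).reverse_complement())]  # 'TGA' -> 'W'
#
# `pairwise` inside `overlapped` is the classic itertools recipe:
# pairwise('ABCD') yields ('A','B'), ('B','C'), ('C','D'), so every alignment
# is compared against its neighbour in the position-sorted gene_map.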
def filter_se(fqiabs=None, fqoabs=None, Ns=10, quality=55, limit=0.2,
              start=None, end=None, trim=0, trunc=False):
    fsin = path.getsize(fqiabs)
    logger.log(level=1, info='Start filtering single-end rawdata.')
    logger.log(level=0, info=f'Input file has {fsin} bytes.')
    logger.log(level=1,
               info=f'Using arguments : Ns={Ns}, quality={quality}, limit={limit}, '
                    f'start={start}, end={end}, trimming={trim}, trunc={trunc}')
    try:
        shell_call(path.join(filter_dir, 'filter_v2'),
                   cleanq1=f'"{fqoabs}"', fastq1=f'"{fqiabs}"',
                   n=Ns, q=quality, l=limit, s=start, e=end, t=trim,
                   truncate_only=trunc)
    except Exception as identifier:
        logger.log(level=4,
                   info=f'Error occurred when running filter, cause : {identifier}')
        logger.log(level=1, info=f'Input file : {fqiabs}')
        logger.log(level=1, info=f'Output file : {fqoabs}')
        sys.exit("Error occurred when running filter!")

    fsot = path.getsize(fqoabs)
    logger.log(level=0, info=f'Output file has {fsot} bytes.')
    logger.log(level=0,
               info=f'Filtered {fsin - fsot} bytes, ratio {fsot/fsin}.')
    return fqoabs
def filter_pe(fq1=None, fq2=None, o1=None, o2=None, dedup=False,
              start=None, end=None, n=10, q=55, l=0.2, trim=0, trunc=False):
    fsin1, fsin2 = path.getsize(fq1), path.getsize(fq2)
    logger.log(level=1, info='Start filtering pair-end rawdata.')
    logger.log(level=0,
               info=f'Input file 1 has {fsin1} bytes, 2 has {fsin2} bytes.')
    if fsin1 != fsin2:
        logger.log(level=3,
                   info='Input files 1 and 2 have different sizes! This could mean '
                        'a loss of raw data, or it could even crash the program.')
    logger.log(level=1,
               info=f'Using arguments : Ns={n}, quality={q}, start={start}, '
                    f'end={end}, limit={l}, trimming={trim}')
    try:
        shell_call(path.join(filter_dir, 'filter_v2'),
                   _1=f'"{fq1}"', _2=f'"{fq2}"', _3=f'"{o1}"', _4=f'"{o2}"',
                   d=dedup, s=start, e=end, n=n, q=q, l=l, t=trim,
                   truncate_only=trunc)
    except Exception as identifier:
        logger.log(level=4,
                   info=f'Error occurred when running filter, cause : {identifier}')
        logger.log(level=1, info=f'Input files : {fq1} , {fq2}')
        logger.log(level=1, info=f'Output files : {o1} , {o2}')
        sys.exit("Error occurred when running filter!")

    fsot1 = path.getsize(o1)
    logger.log(level=0, info=f'Output file has {fsot1} bytes.')
    logger.log(level=1,
               info=f'Filtered {fsin1 - fsot1} bytes, ratio {100*fsot1/fsin1:.2f}%.')
    return o1, o2
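# A hedged usage sketch (hypothetical paths; the filter_v2 binary must exist
# under filter_dir):
#
#     clean1, clean2 = filter_pe(fq1='raw.1.fq', fq2='raw.2.fq',
#                                o1='clean.1.fq', o2='clean.2.fq',
#                                dedup=True, n=10, q=55, l=0.2)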
def initialize(self):
    self.basedir = path.abspath(self.basedir)
    self.fq1 = path.abspath(self.fq1)
    if self.fq2:
        self.fq2 = path.abspath(self.fq2)

    # Check if the POPCNT instruction is supported
    if self.use_popcnt:
        if shell_call('megahit_core checkpopcnt').rstrip() != '1':
            self.use_popcnt = False
            logger.log(3, "POPCNT is disabled since the feature was not detected.")
        else:
            self.hwaccel = shell_call("megahit_core checkcpu").rstrip() == '1'
            logger.log(
                2, f"Using megahit with {'hardware acceleration' if self.hwaccel else 'POPCNT'} support.")
    else:
        logger.log(2, "POPCNT disabled by argument.")

    if self.one_pass:
        logger.log(3, "Using 1-pass mode.")

    self.result_dir = safe_makedirs(
        path.join(self.basedir, f'{self.prefix}.result'), False)

    if not path.isdir(str(a_conf.external_temp)):
        self.temp_dir = safe_makedirs(
            path.join(self.basedir, f'{self.prefix}.temp'), False)
    else:
        self.temp_dir = safe_makedirs(
            path.join(a_conf.external_temp, str(uuid.uuid4()),
                      f'{self.prefix}.temp'), False)

    self.read_lib = path.join(self.temp_dir, 'reads.lib')
    self.contig_dir = safe_makedirs(
        path.join(self.temp_dir, 'intermediate_contigs'), False)

    vm = psutil.virtual_memory()
    logger.log(
        1, f"System memory status : {', '.join([f'{k}={v/(1024**2):.2f}MB' for k, v in vm._asdict().items() if isinstance(v, int)])}")
    self.available_memory = int(vm.available * a_conf.max_mem_percent)
    logger.log(2, f'Scheduled {self.available_memory/(1024**2):.2f}MB to use.')