Example #1
def findmitoscaf(args):

    if args.__calling == 'findmitoscaf':

        if not args.from_megahit:
            logger.log(2, 'Remapping reads to contigs since the contigs were not assembled by this pipeline.')
            fastfilter_bin = path.abspath(path.join(path.dirname(__file__), 'assemble', 'fastfilter'))
            filtered_fasta = path.join(args.findmitoscaf_dir, f'{args.workname}.filtered.fa')
            shell_call(fastfilter_bin, i=args.fastafile, o=filtered_fasta,
                       l=f"{configurations.assemble.min_length},{configurations.assemble.max_length}",
                       d=0)
            fq1, fq2 = args.fastq1, args.fastq2
            if not (fq1 or fq2):
                raise RuntimeError("At least one fastq file should be specified!")
            if not fq1:
                fq1, fq2 = fq2, fq1
            # Remapping to calculate average depth.
            from findmitoscaf.findmitoscaf import remap_sequence
            args.fastafile = remap_sequence(args.workname, args.findmitoscaf_dir, filtered_fasta, fq1, fq2, args.threads)
        else:
            logger.log(2, "Remapping skipped since from-megahit is specified, no tagging needed.")

    from findmitoscaf.findmitoscaf import findmitoscaf as _findmitoscaf
    picked_fa = _findmitoscaf(
        thread_number=args.threads, clade=args.clade, relaxing=args.taxa_tolerance, gene_code=args.genetic_code,
        multi=args.min_abundance, taxa=args.required_taxa if not args.disable_taxa else None,
        prefix=args.workname, basedir=args.findmitoscaf_dir, contigs_file=args.fastafile,
        merge_method=args.merge_method, merge_overlapping=args.merge_overlap, merge_search=args.merge_start)

    # Further processing for calling directly
    if args.__calling == 'findmitoscaf':
        os.rename(picked_fa, path.join(
            args.result_dir, path.basename(picked_fa)))
    return picked_fa
Example #2
def get_rank(taxa_name=None):
    name_dict = ncbi.get_name_translator([taxa_name])

    if taxa_name not in name_dict:
        # Fall back to the genus, i.e. the first word of the name
        taxa_name = taxa_name.split(' ')[0]
        name_dict = ncbi.get_name_translator([taxa_name])

    rank_dict = {
        'kingdom': 'NA',
        'phylum': 'NA',
        'class': 'NA',
        'order': 'NA',
        'family': 'NA',
        'genus': 'NA',
        'species': 'NA'
    }

    if taxa_name in name_dict:
        for taxid in ncbi.get_lineage(name_dict[taxa_name][0]):
            rank = ncbi.get_rank([taxid])[taxid]
            taxa = ncbi.get_taxid_translator([taxid])[taxid]
            if rank in rank_dict:
                rank_dict[rank] = taxa
    else:
        logger.log(
            2, f'Query name {taxa_name} was skipped because no result found in NCBI database.')

    return list(rank_dict.items())
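For orientation, a minimal usage sketch of get_rank (hypothetical; it assumes the same ete3 NCBITaxa instance `ncbi` used above and a locally downloaded taxonomy database):

# Hypothetical call; the lineage shown is only illustrative.
ranks = get_rank('Drosophila melanogaster')
# Returns a list of (rank, name) pairs, with 'NA' for unresolved ranks, e.g.
# [('kingdom', 'Metazoa'), ('phylum', 'Arthropoda'), ..., ('species', 'Drosophila melanogaster')]
for rank_name, taxon in ranks:
    print(rank_name, taxon)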
 def local(self, current_kmer, next_kmer):
     logger.log(2, f'Local assembly for k = {current_kmer}')
     shell_call(self.MEGAHIT_CORE,
                'local',
                c=self._contig_prefix(current_kmer) + '.contigs.fa',
                l=self.read_lib,
                t=self.threads,
                o=self._contig_prefix(current_kmer) + '.local.fa',
                kmax=next_kmer)
Example #4
def bim(args):
    # Still a work-in-progress idea.
    # MITObim uses MIRA as both mapper and assembler, which is clearly outperformed
    # by bwa along with the modified MEGAHIT. If we can reuse the current pipeline,
    # then we surely can make a more powerful MITObim.
    # raise RuntimeError("This module is still work in progress, in later versions it may be completed.")

    args.cleanq1 = 'clean.1.fq'
    args.cleanq2 = 'clean.2.fq'

    if configurations.filter_rawdata.compress_output_in_all:
        args.cleanq1 += '.gz'
        args.cleanq2 += '.gz'

    if not args.disable_filter:
        args.fastq1, args.fastq2 = filter(args)

    from bim.bim import bwa_map, cal_insert
    from assemble.assemble import assemble

    fasta_path = path.join(args.temp_dir, f'{args.workname}.bait.fa')
    shutil.copy(args.fastafile, fasta_path)
    args.fastafile = fasta_path

    for i in range(args.max_iteration):
        logger.log(2, f"Iteration {i} starts.")

        if len(os.listdir(args.assemble_dir)) != 0:
            logger.log(2, f"Removing data in previous iteration.")
            os.system(f"rm -rf {args.assemble_dir}/*")

        bam, fq1, fq2 = bwa_map(args.threads, args.fastafile, args.assemble_dir, args.workname, args.fastq1, args.fastq2)
        if args.insert_size_auto:
            args.insert_size = cal_insert(bam, args.assemble_dir, args.workname)

        next_generation = assemble(
            threads=args.threads, base_dir=args.assemble_dir, work_prefix=args.workname,
            fastq1=fq1, fastq2=fq2, disable_local=args.disable_local,
            prune_level=args.prune_level, prune_depth=args.prune_depth, keep_temp=args.keep_temp,
            insert_size=args.insert_size, no_scaf=args.disable_scaffolding or i % (args.scaffolding_spare + 1) != 0,
            kmer_list=args.kmer_list, depth_list=args.depth_list)

        if args.iteration_ignore < i:
            # Criteria for breaking the cycle:
            # 1. No extension can be made after an iteration.
            # 2. The currently assembled genome is of sufficient
            #    quality and passes some tests.

            args.from_megahit = True
            filtered_seq = findmitoscaf(args)

        next_fasta = path.join(args.temp_dir, f'{args.workname}.bait.fa')
        os.rename(next_generation, next_fasta)
        args.fastafile = next_fasta
Example #5
def cal_insert(bam: str, basedir: str, prefix: str) -> int:
    stat_file = path.join(basedir, prefix + ".stats")
    log(2, "Measuring insert size of alignments.")
    stats = [[int(y) for y in x.split("\t")][:2] for x in direct_call(f'\
        samtools stats {bam}|\
        tee {stat_file}|\
        grep ^IS|\
        cut -f 2-').split("\n") if x]
    avg_ins = sum(a * b for a, b in stats) / sum(list(zip(*stats))[1])
    log(2, f"Measured insert size is {avg_ins}")
    return int(avg_ins)  # match the declared int return type
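The `IS` section of `samtools stats` yields (insert size, pair count) rows, so the value above is a count-weighted mean. A self-contained arithmetic check with made-up numbers:

stats = [(250, 10), (300, 30), (350, 10)]   # (insert size, pair count)
avg_ins = sum(size * count for size, count in stats) / sum(count for _, count in stats)
print(avg_ins)  # 300.0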
Example #6
def post(args):

    if args is None:
        return
    if hasattr(args, 'keep_temp') and not args.keep_temp and args.__calling != 'filter' and hasattr(args, 'cleanq1'):
        # Not removing them until here, since cleanq1 and cleanq2 have many uses other than assembling
        logger.log(1, 'Removing filtered data files.')
        os.remove(args.cleanq1)
        if args.fastq2 is not None:
            os.remove(args.cleanq2)
    logger.log(2, f'All done! Time elapsed : {time.time()-start_time:.2f}s.')
    logger.finalize()
Example #7
def fix_circular(fa_file: str):

    genome = list(SeqIO.parse(fa_file, 'fasta'))
    if len(genome) != 1:
        return False
    info, seq = list(check_circular(final_seqs=genome))[0]
    if info is not None:
        logger.log(2, f'An overlapped region was found starting at {info[0]} with length {info[2]}. Trimming it.')
        seq = seq[info[0]:len(seq) - 500 + info[1]]
        SeqIO.write([seq], fa_file, 'fasta')
        return True

    return False
 def iterate(self, current_kmer, next_kmer):
     logger.log(
         2,
         f'Extracting iterative edges from k = {current_kmer} to {next_kmer}'
     )
     shell_call(self.MEGAHIT_CORE,
                'iterate',
                c=self._contig_prefix(current_kmer) + '.contigs.fa',
                b=self._contig_prefix(current_kmer) + '.bubble_seq.fa',
                t=self.threads,
                s=next_kmer - current_kmer,
                o=self._graph_prefix(next_kmer),
                r=self.read_lib + '.bin',
                k=current_kmer)
    def graph(self, current_kmer, next_kmer):
        options = {
            'k': next_kmer,
            'host_mem': self.available_memory,
            'mem_flag': 1,
            'output_prefix': self._graph_prefix(next_kmer),
            'num_cpu_threads': self.threads,
            'need_mercy': not self.no_mercy and current_kmer == self.kmin,
            'kmer_from': current_kmer,
            'useconv': False
        }

        if current_kmer == 0:  # Indicating it's the first graph
            if not self.one_pass:
                logger.log(2, f"Extracting solid (k+1)-mers for k={next_kmer}")
                count_opts = options.copy()
                count_opts['m'] = self.min_multi
                count_opts['read_lib_file'] = self.read_lib
                count_opts.pop('need_mercy')
                count_opts.pop('kmer_from')
                logger.log(0, f"Extract options : {count_opts}")
                shell_call(self.MEGAHIT_CORE, 'count', **count_opts)

        file_size = 0

        if path.exists(self._graph_prefix(next_kmer) + '.edges.0'):
            options['input_prefix'] = self._graph_prefix(next_kmer)
            file_size += path.getsize(
                self._graph_prefix(next_kmer) + '.edges.0')

        if path.exists(self._contig_prefix(current_kmer) + '.addi.fa'):
            options['addi_contig'] = \
                self._contig_prefix(current_kmer) + '.addi.fa'
            file_size += path.getsize(
                self._contig_prefix(current_kmer) + '.addi.fa')

        if path.exists(self._contig_prefix(current_kmer) + '.local.fa'):
            options['local_contig'] = \
                self._contig_prefix(current_kmer) + '.local.fa'
            file_size += path.getsize(
                self._contig_prefix(current_kmer) + '.local.fa')

        if path.exists(self._contig_prefix(current_kmer) + '.contigs.fa'):
            options['contig'] = \
                self._contig_prefix(current_kmer) + '.contigs.fa'
            options['bubble'] = \
                self._contig_prefix(current_kmer) + '.bubble_seq.fa'
            file_size += path.getsize(
                self._contig_prefix(current_kmer) + '.contigs.fa')

        if file_size == 0 and current_kmer != 0:
            raise EmptyGraph

        logger.log(2, f'Building graph for k={next_kmer}')
        logger.log(0, f'Build options : {options}')

        shell_call(self.MEGAHIT_CORE, 'seq2sdbg', **options)

        if file_size != 0 and current_kmer != 0 and not self.keep_temp:
            os.system(f"rm -r {path.join(self.temp_dir, f'k{current_kmer}')}")
    def assemble(self, kmer) -> Tuple[ContigInfo, ContigInfo]:
        min_standalone = max(
            min(self.kmax * 3 - 1, int(self.min_length * 1.5)),
            self.min_length)

        options = {
            's': self._graph_prefix(kmer),
            'o': self._contig_prefix(kmer),
            't': self.threads,
            'min_standalone': min_standalone,
            'prune_level': self.prune_level,
            'merge_len': 20,
            'merge_similar': 0.95,
            'cleaning_rounds': 5,
            'disconnect_ratio': 0.1,
            'low_local_ratio': 0.2,
            'min_depth': self.prune_depth,
            'bubble_level': 2,
            'max_tip_len': max(1, self.min_length * 1.5 + 1 - kmer) if kmer * 3 - 1 > self.min_length * 1.5 else -1,
            'careful_bubble': kmer < self.kmax,
            'is_final_round': kmer == self.kmax,
            'output_standalone': self.no_local,
            'useconv': False
        }

        logger.log(2, f'Assembling contigs from SdBG for k = {kmer}')
        logger.log(0, f'Assemble arguments : {options}')

        shell_call(self.MEGAHIT_CORE, 'assemble', **options)
        with open(self._contig_prefix(kmer) + '.contigs.fa.info', 'r') as c, \
                open(self._contig_prefix(kmer) + '.addi.fa.info', 'r') as a:
            return ContigInfo(c), ContigInfo(a)
    def scaf(self) -> str:
        if self.lib_file is None:
            raise RuntimeError("Lib was not built before scaffolding!")

        kmer = int(self.read_length / 2)
        prefix = path.join(self.basedir, f'k{kmer}')

        # Prepare
        logger.log(2, "Constructing graph for SOAPdenovo-127.")
        shell_call(soap_fusion,
                   D=True,
                   s=self.lib_file,
                   p=self.threads,
                   K=kmer,
                   g=prefix,
                   c=self.contigs)

        # Map
        logger.log(2, "Mapping sequences.")
        shell_call(soap_127, 'map', s=self.lib_file, p=self.threads, g=prefix)

        # Scaff
        logger.log(2, "Scaffolding.")
        shell_call(soap_127, 'scaff', p=self.threads, g=prefix)

        # Convert
        logger.log(2, "Converting output scaffolds back.")
        scaf2mega(prefix + '.scafSeq',
                  path.join(path.dirname(self.contigs), 'scaf.fa'),
                  overlay=kmer)
        return path.join(path.dirname(self.contigs), 'scaf.fa')
Example #12
def all(args):

    # Go filtering
    #
    # Why the .gz extension is NOT used here even though it is implemented:
    # 1. flate2 is slow; compressing the data costs a lot when single-threaded.
    # 2. Plugging in an SSD is much easier than adding a CPU.
    # 3. Some methods only use plain-text data, so compression just adds an extra
    #    (de)compression step that gains nothing in the process.
    # 4. Some downstream code may only accept plain-text input, and gzip support
    #    is not being added to it.

    args.cleanq1 = 'clean.1.fq'
    args.cleanq2 = 'clean.2.fq'
    if configurations.filter_rawdata.compress_output_in_all:
        args.cleanq1 += '.gz'
        args.cleanq2 += '.gz'

    if not args.disable_filter:
        args.fastq1, args.fastq2 = filter(args)

    args.fastafile = assemble(args)
    args.fastafile = findmitoscaf(args)

    if not args.disable_annotation:
        (args.pos_json, args.circular,
         args.annotated_cds, args.annotated_rna) = annotate(args)

        # Visualization is impossible without annotation.
        args.circos_png, args.circos_svg = visualize(
            args) if not args.disable_visualization else (None, None)

    # Add a command check in case there is further processing.
    # If you wrapped the 'all' module in another task or workflow,
    # the results will be retained in place, since we don't know
    # what you want to do with them.
    if args.__calling == 'all':
        def move_to_result(*files):
            for file in files:
                if path.isfile(str(file)):
                    os.rename(file, path.join(
                        args.result_dir, path.basename(file)))
        # Iteratively collects all the results generated in the whole process
        move_to_result(args.circos_png, args.circos_svg,
                       args.pos_json, args.fastafile,
                       args.annotated_cds, args.annotated_rna)
        logger.log(2, f'Results dumped at {args.result_dir}')
Example #13
def filter_taxanomy(taxa=None, fasta_file=None, hmm_frame: pandas.DataFrame = None, basedir=None,
                    prefix=None, dbfile=None, gene_code=9, relaxing=0, threads=8):

    logger.log(1, 'Filtering taxonomy with tblastn.')
    # Extract sequences from input fasta file according to hmm frame

    # Run tblastn to search for the possible taxonomy of the genes
    blast_file = tk.tblastn_multi(dbfile=dbfile, infile=fasta_file,
                                  genetic_code=gene_code, basedir=basedir, prefix=prefix, threads=threads)
    blast_frame_unfiltered, _ = tk.blast_to_csv(blast_file)

    blast_frame = tk.wash_blast_results(blast_frame_unfiltered)

    # Drop sequences that don't have even one gene related to the required taxa
    by_seqid = dict(tuple(blast_frame.groupby(['sseq'])))
    to_save = []
    for key, frame in by_seqid.items():
        is_in = False
        for _, row in frame.iterrows():
            qseq = str(row.qseq).split('_')
            taxa_name = ' '.join([qseq[4], qseq[5]])
            taxa_rank = get_rank(taxa_name)
            required_rank = get_rank(taxa)
            required_id = ncbi.get_name_translator([taxa])[taxa][0]
            required_class = ncbi.get_rank([required_id])[required_id]
            required_index = rank_list.index(required_class)
            # Get last index for the matching rank
            matches = [idx
                       for idx, ((tax_id, tax_name), (required_id, required_name))
                       in enumerate(zip(taxa_rank, required_rank))
                       if required_name == tax_name != 'NA']
            matches.append(-1)
            matched_rank = max(matches)
            if matched_rank + relaxing >= required_index:
                is_in = True
                break
        if is_in:
            to_save.append(key)

    filtered_frame = hmm_frame[hmm_frame['target'].isin(to_save)]
    filtered_frame.to_csv(
        path.join(basedir, f'{prefix}.taxa.csv'), index=False)
    logger.log(
        1, f'{len(filtered_frame.index)} records were selected after the taxonomy filtering.')
    return filtered_frame
    def build_lib(self):

        # Write reads info
        with open(self.read_lib, 'w') as l:
            fifos = []

            if self.fq1 and self.fq2:
                print(self.fq1, self.fq2, sep=',', file=l)
                fq1, fq2 = (self.fq1 if not self.fq1.endswith('gz') else
                            path.join(self.temp_dir, 'pipe.pe1'),
                            self.fq2 if not self.fq2.endswith('gz') else
                            path.join(self.temp_dir, 'pipe.pe2'))

                if self.fq1.endswith('gz'):
                    fifo1 = path.join(self.temp_dir, 'pipe.pe1')
                    os.mkfifo(fifo1)
                    fifos.append(
                        subprocess.Popen(f'gzip -dc {self.fq1} > {fifo1}',
                                         shell=True,
                                         preexec_fn=os.setsid))

                if self.fq2.endswith('gz'):
                    fifo2 = path.join(self.temp_dir, 'pipe.pe2')
                    os.mkfifo(fifo2)
                    fifos.append(
                        subprocess.Popen(f'gzip -dc {self.fq2} > {fifo2}',
                                         shell=True,
                                         preexec_fn=os.setsid))

                print('pe', fq1, fq2, file=l)
            else:
                print(self.fq1, file=l)
                fq1 = self.fq1 if not self.fq1.endswith('gz') else path.join(
                    self.temp_dir, 'pipe.se')
                print('se', fq1, file=l)

        logger.log(1, "Converting reads to binary library.")
        shell_call(self.MEGAHIT_CORE, 'buildlib', self.read_lib, self.read_lib)

        if any(x.wait() != 0 for x in fifos):
            raise RuntimeError("Error occurred while reading input fifos")

        with open(self.read_lib + '.lib_info') as ri:
            info = [x.split(' ') for x in ri.readlines()]
            return LibInfo(info)
def tblastn_multi(dbfile=None, infile=None, genetic_code=9, basedir=None,
                  prefix=None, threads=8):

    infile = path.abspath(infile)
    dbfile = path.abspath(dbfile)

    truncated_call('makeblastdb', '-in', infile, dbtype='nucl')

    tasks = []

    protein_data_dir = path.join(basedir, 'tblastn_data')

    try:
        os.mkdir(protein_data_dir)
    except FileExistsError:
        raise RuntimeError(
            "Folder is already created, please make sure the working folder is clean.")

    logger.log(1, f'Making {threads} small datasets for calling tblastn.')
    tblastn_db = np.array_split(list(SeqIO.parse(dbfile, 'fasta')), threads)
    for idx, data in enumerate(tblastn_db):
        if data.any():
            logger.log(0, f'Dataset {idx} has {len(data)} queries.')
            dataset_path = path.join(protein_data_dir, f'dataset_{idx}.fasta')
            SeqIO.write(data, dataset_path, 'fasta')
            tasks.append(
                f'tblastn -evalue 1e-5 -outfmt 6 -seg no -db_gencode {genetic_code} -db {infile} -query {dataset_path}')
    logger.log(1, f'Generating map for calling tblastn.')
    pool = multiprocessing.Pool(processes=threads)

    out_blast = path.join(path.abspath(basedir), f'{prefix}.blast')
    with open(out_blast, 'w') as f:
        pool.map_async(direct_call, tasks, callback=lambda x: f.write(''.join(x)))
        logger.log(1, f'Waiting for all processes to finish.')
        pool.close()
        pool.join()

    logger.log(1, f'Cleaning generated temp files.')
    shell_call('rm -r', protein_data_dir)
    os.remove(f'{infile}.nhr')
    os.remove(f'{infile}.nin')
    os.remove(f'{infile}.nsq')
    return out_blast
Example #16
def remap_sequence(prefix=None, basedir=None, fasta_file=None, fastq1=None, fastq2=None, threads=8):

    # Remap the reads back onto the fasta file.
    # This can be a non-trivial task, so a portion of the threads is
    # given to samtools view and samtools sort.
    logger.log(2, "Mapping fastq reads back onto fasta file.")
    shell_call('bwa index', fasta_file)
    bam_file = path.join(basedir, f'{prefix}.bam')
    check_output(
        f'bwa mem -t {max(1, int(threads*0.75))} {fasta_file} {fastq1} {fastq2 if fastq2 is not None else ""} |samtools view -bS -q 30 -h -@ {max(1, int(threads*0.25))} -o {bam_file} -', shell=True)
    bam_sorted_file = path.join(basedir, f'{prefix}.sorted.bam')
    check_output(f'samtools sort -@ {threads} -o {bam_sorted_file} {bam_file}', shell=True)

    logger.log(2, "Calculating average depth for each sequence.")
    gene_depth_file = path.join(basedir, f'{prefix}.dep')
    avgdep_bin = path.join(path.abspath(path.dirname(__file__)), 'avgdep_bin')
    check_output(
        f'samtools depth -aa {bam_sorted_file} |{avgdep_bin} -o {gene_depth_file}', shell=True)

    mapping = {k: v for k, v in map(str.split, open(gene_depth_file))}

    logger.log(2, "Retagging sequences for latter processing.")
    sequences = []
    for seq in SeqIO.parse(fasta_file, 'fasta'):
        seq.description = f"flag=1 multi={mapping[seq.id]}"
        sequences.append(seq)
    SeqIO.write(sequences, path.join(basedir, path.basename(fasta_file)), 'fasta')

    return fasta_file
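A small standalone sketch of how the depth file is parsed above; it assumes the `.dep` file holds one "<sequence id> <average depth>" pair per line (the records below are made up):

from io import StringIO

dep_file = StringIO("contig_1 35.2\ncontig_2 12.8\n")   # stand-in for open(gene_depth_file)
mapping = {k: v for k, v in map(str.split, dep_file)}
print(mapping)  # {'contig_1': '35.2', 'contig_2': '12.8'}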
Example #17
def bwa_map(threads: int,
            fasta_file: str,
            basedir: str,
            prefix: str,
            fastq1: str,
            fastq2: str,
            quality: int = 30) -> Tuple[str, str, str]:
    index = path.join(basedir, prefix)
    direct_call(f'bwa index -p {index} {fasta_file}')
    fq1 = path.join(basedir, prefix + '.1.fq')
    fq2 = path.join(basedir, prefix + '.2.fq') if fastq2 is not None else None
    bam = path.join(basedir, prefix + ".bam")
    logger.log(2, "Mapping and extracting reads from bwa mem.")
    direct_call(f'\
        bwa mem -t {threads} {index} {fastq1} {fastq2 if fastq2 is not None else ""} |\
        samtools view -bS -q {quality} -h - |\
        tee {bam}|\
        samtools fastq -1 {fq1} {f"-2 {fq2}" if fq2 is not None else ""} -')

    return bam, fq1, fq2
    def overlapped(mapping: list):
        def pairwise(iterable):
            a, b = tee(iterable)
            next(b, None)
            return zip(a, b)

        for gene_loc, pair_loc in pairwise(mapping):
            dist = max(gene_loc.seqfrom, gene_loc.seqto) - \
                min(pair_loc.seqfrom, pair_loc.seqto)
            if gene_loc != pair_loc and dist >= overlap_cutoff and (dist <= gene_loc.length or dist <= pair_loc.length):
                if gene_loc.score >= pair_loc.score:
                    logger.log(
                        0, f'conflict of {gene_loc.amino} and {pair_loc.amino}, removing {pair_loc.amino}, score:{gene_loc.score}, {pair_loc.score}, overlapping : {dist}')
                    while pair_loc in mapping:
                        mapping.remove(pair_loc)
                else:
                    logger.log(
                        0, f'conflict of {gene_loc.amino} and {pair_loc.amino}, removing {gene_loc.amino}, score:{gene_loc.score}, {pair_loc.score}, overlapping : {dist}')
                    while gene_loc in mapping:
                        mapping.remove(gene_loc)
                return True
        return False
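The nested `pairwise` helper used above is the standard itertools recipe, pairing each element with its successor; a minimal standalone illustration:

from itertools import tee

def pairwise(iterable):
    a, b = tee(iterable)
    next(b, None)
    return zip(a, b)

print(list(pairwise([1, 2, 3, 4])))  # [(1, 2), (2, 3), (3, 4)]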
    def filter(self,
               kmer=None,
               min_depth=3,
               min_length=0,
               max_length=20000,
               force_filter=False,
               deny_number=a_conf.filter_keep) -> Tuple[int, int, int]:
        logger.log(2, f'Filtering output contig files of k = {kmer}')

        results = [0, 0, 0]
        if not a_conf.no_filter or force_filter:
            for idx, suffix in enumerate(
                ['.contigs.fa', '.addi.fa', '.bubble_seq.fa']):
                if path.exists(self._contig_prefix(kmer) + suffix):
                    results[idx] = int(
                        shell_call(self.FAST_FILTER,
                                   i=self._contig_prefix(kmer) + suffix,
                                   o=self._contig_prefix(kmer) + '.filtered' +
                                   suffix,
                                   l=f"{min_length},{max_length}",
                                   d=min_depth))

                    if results[idx] <= deny_number and idx == 0:
                        results[idx] = int(
                            shell_call(self.FAST_FILTER,
                                       i=self._contig_prefix(kmer) + suffix,
                                       o=self._contig_prefix(kmer) +
                                       '.filtered' + suffix,
                                       l=f"{min_length},{max_length}",
                                       m=deny_number))

                    shell_call(
                        'mv',
                        self._contig_prefix(kmer) + '.filtered' + suffix,
                        self._contig_prefix(kmer) + suffix)

        return tuple(results)
def blastn_multi(dbfile=None, infile=None, basedir=None, prefix=None, threads=8):
    infile = path.abspath(infile)
    dbfile = path.abspath(dbfile)

    truncated_call('makeblastdb', '-in', infile, dbtype='nucl')

    nucl_data_dir = path.join(basedir, "blastn_data")

    try:
        os.mkdir(nucl_data_dir)
    except FileExistsError:
        raise RuntimeError("Folder is already created, please make sure the working folder is clean.")

    logger.log(1, f'Making {threads} small datasets for calling blastn.')

    file_names = [path.join(nucl_data_dir, f'dataset_{x}.fasta') for x in range(threads)]

    tasks = [f'blastn -evalue 1e-5 -outfmt 6 -db {infile} -query {dataset_path}' for dataset_path in file_names]
    seqs = [[] for i in range(threads)]

    for i, seq in enumerate(SeqIO.parse(dbfile, 'fasta')):
        seqs[i % threads].append(seq)

    for i in range(threads):
        SeqIO.write(seqs[i], file_names[i], 'fasta')

    logger.log(1, 'Generating map for calling blastn.')
    pool = multiprocessing.Pool(processes=threads)

    out_blast = path.join(path.abspath(basedir), f'{prefix}.blast')
    with open(out_blast, 'w') as f:
        pool.map_async(direct_call, tasks, callback=lambda x: f.write(''.join(x)))
        pool.close()
        logger.log(1, "Waiting for all processes to finish.")
        pool.join()

    logger.log(1, f'Cleaning generated temp files.')

    shell_call('rm -r', nucl_data_dir)
    os.remove(f'{infile}.nhr')
    os.remove(f'{infile}.nin')
    os.remove(f'{infile}.nsq')

    return out_blast
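A minimal illustration of the round-robin split used above to spread query sequences evenly across the per-thread datasets (plain strings stand in for SeqRecord objects):

threads = 3
records = [f'seq{i}' for i in range(7)]
buckets = [[] for _ in range(threads)]
for i, rec in enumerate(records):
    buckets[i % threads].append(rec)
print(buckets)  # [['seq0', 'seq3', 'seq6'], ['seq1', 'seq4'], ['seq2', 'seq5']]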
def nhmmer_search(fasta_file=None, thread_number=None, nhmmer_profile=None,
                  prefix=None, basedir=None):

    logger.log(1, 'Calling nhmmer.')

    # Call nhmmer
    hmm_out = os.path.join(basedir, f'{prefix}.nhmmer.out')
    hmm_tbl = os.path.join(basedir, f'{prefix}.nhmmer.tblout')
    logger.log(1, f'Out file : o={hmm_out}, tbl={hmm_tbl}')
    shell_call('nhmmer', o=hmm_out, tblout=hmm_tbl,
               cpu=thread_number, appending=[nhmmer_profile, fasta_file])

    # Process data to pandas readable table
    hmm_tbl_pd = f'{hmm_tbl}.readable'
    with open(hmm_tbl, 'r') as fin, open(hmm_tbl_pd, 'w') as fout:
        for line in fin:
            fields = line.strip().split()
            # Drop the free-text gene description that nhmmer appends after the 15 fixed columns
            print(' '.join(fields[:15]), file=fout)

    # Read table with pandas
    hmm_frame = pandas.read_csv(hmm_tbl_pd, comment='#', delimiter=' ',
                                names=[
                                    'target', 'accession1', 'query',
                                    'accession2', 'hmmfrom', 'hmmto',
                                    'alifrom', 'alito', 'envfrom', 'envto',
                                    'sqlen', 'strand', 'e', 'score',
                                    'bias'
                                ])
    hmm_frame = hmm_frame.drop(columns=['accession1', 'accession2'])

    # Deduplicate multiple hits on the same gene of same sequence
    hmm_frame = hmm_frame.drop_duplicates(
        subset=['target', 'query'], keep='first')
    hmm_frame.to_csv(f'{hmm_tbl}.dedup.csv', index=False)

    logger.log(1, f'HMM query has {len(hmm_frame.index)} results.')
    return hmm_frame
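A small sketch of the field-limiting trick used above: nhmmer tblout rows carry a free-text description after the 15 fixed columns, so only the first 15 whitespace-separated fields are kept (the line below is made up):

line = "contig_1 - COX1 - 1 1530 12 1540 10 1542 16000 + 1.2e-50 170.3 5.1 putative cytochrome oxidase"
print(' '.join(line.split()[:15]))
# contig_1 - COX1 - 1 1530 12 1540 10 1542 16000 + 1.2e-50 170.3 5.1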
Example #22
def annotate(basedir=None, prefix=None, ident=30, fastafile=None,
             genetic_code=9, clade=None, thread_number=8,
             wildcard_profile=False, trna_overlapping=40, hmmer_search=True, score=5, e_value=0.005):
    logger.log(2, 'Entering annotation module.')
    if wildcard_profile:
        logger.log(
            3, 'Wildcard protein profile is used, results may not be accurate.')

    # Once we can confirm the sequences come from the clade we want,
    # we no longer need to use the overall database.
    if wildcard_profile:
        # Reusing the concatenation code here for simplicity
        logger.log(2, 'Updating the general protein database.')
        lc = 0
        with open(path.join(profile_dir_tbn, 'Animal.fa'), 'w') as fout:
            for protein_fas in os.listdir(profile_dir_tbn):
                if protein_fas.endswith('.fa') and protein_fas != 'Animal.fa':
                    with open(path.join(profile_dir_tbn, protein_fas)) as fin:
                        for line in fin:
                            fout.write(line)
                            lc += 1
        logger.log(1, f'Generation finished after writing {lc} lines.')

    tbn_profile = path.join(
        profile_dir_tbn, f'{clade if not wildcard_profile else "Animal"}.fa')

    blast_file = tk.tblastn_multi(dbfile=tbn_profile, infile=fastafile, genetic_code=genetic_code,
                                  basedir=basedir, prefix=prefix, threads=thread_number)
    blast_frame, _ = tk.blast_to_csv(blast_file, ident=ident, score=score)

    try:
        washed_frame = tk.wash_blast_results(blast_frame)
    except Exception:
        raise RuntimeError(
            f"Empty blast frame while annotation, annotation can't continue. Please check the {fastafile} .")

    if configurations.annotation.redirection:
        logger.log(2, 'Checking genome directions.')
        if tk.redirect_genome(fasta_file=fastafile, blast_frame=blast_frame):
            # Not fixing the frame directly here, since the tRNA and rRNA frames would then also need fixing...
            logger.log(2, "Genome is reversed, launching a second annotation to fix gene locations.")
            blast_file = tk.tblastn_multi(dbfile=tbn_profile, infile=fastafile, genetic_code=genetic_code,
                                          basedir=basedir, prefix=prefix, threads=thread_number)
            blast_frame, _ = tk.blast_to_csv(blast_file, ident=ident, score=score)
            washed_frame = tk.wash_blast_results(blast_frame)

    wise_frame, _, _ = tk.genewise(
        basedir=basedir, prefix=prefix, wises=washed_frame,
        infile=fastafile, dbfile=tbn_profile, cutoff=0.5)

    # Do an extra washing here, since pandas shows some strange
    # behaviour when processing data this large...
    # This also prevents unwanted mutation of the ['plus'] column.
    wise_frame = tk.wash_blast_results(wise_frame, mut_plus=False)

    taxa_data = {}
    for _, row in wise_frame.iterrows():
        splited = str(row.qseq).split('_')
        PCG = splited[3]
        taxa_name = ' '.join(splited[4:6])
        taxa_score = float(row.score)

        if PCG not in taxa_data or taxa_data[PCG][1] < taxa_score:
            taxa_data[PCG] = (taxa_name, taxa_score)

    score_data = {}
    for _, (taxa_name, taxa_score) in taxa_data.items():
        score_data[taxa_name] = score_data.get(taxa_name, 0) + 1

    most_possible = list(score_data.keys())[0]
    for taxa_name, taxa_score in score_data.items():
        if score_data[most_possible] < taxa_score:
            most_possible = taxa_name

    logger.log(2, f'Determined most possible species : {most_possible}')

    if configurations.annotation.reloc_genes:
        logger.log(2, 'Relocating genes.')
        wise_frame = tk.reloc_genes(fasta_file=fastafile,
                                    wises=wise_frame, code=genetic_code)

    cds_indexes = {}
    cds_found = []
    with open(path.join(profile_dir_hmm, 'required_cds.json')) as f:
        cds_indexes = json.load(f)[clade]

    for _, row in wise_frame.iterrows():
        cds = str(row.qseq).split('_')[3]
        cds_found.append(cds)

    hmmer_frame = None

    cds_notfound = [x for x in cds_indexes if x not in cds_found]
    logger.log(2, f'PCGs found in annotation : {cds_found}')
    if cds_notfound and not hmmer_search:
        logger.log(3, f'Expected PCG {cds_notfound} not found!')
    elif cds_notfound and hmmer_search:
        logger.log(
            3, f'Expected PCG {cds_notfound} not found, turning to nhmmer search.'
        )
        hmmer_frame = tk.nhmmer_search(fasta_file=fastafile, thread_number=thread_number,
                                       nhmmer_profile=profile_dir_hmm + f'/{clade}.hmm', prefix=prefix, basedir=basedir)
        hmmer_frame = hmmer_frame[~hmmer_frame['query'].isin(cds_found)]
        hmmer_frame = hmmer_frame[hmmer_frame['e'] < e_value]
        hmmer_frame = hmmer_frame[hmmer_frame['score'] > score]
        logger.log(2, 'Recovered PCGs : \n' + str(hmmer_frame))

    trna_out_dir = path.join(basedir, 'trna')
    os.makedirs(trna_out_dir, exist_ok=True)

    # Disable some annoying warning
    with warnings.catch_warnings():
        warnings.simplefilter('ignore', BiopythonWarning)
        query_dict, missing_trna = tk.trna_search(
            fastafile, profile_dir_trna, trna_out_dir, prefix, genetic_code, 0.01, overlap_cutoff=trna_overlapping)

    logger.log(2, f'tRNAs found : {list(query_dict.keys())}')
    if missing_trna:
        logger.log(3, f'Missing tRNAs : {missing_trna}')

    rrna_out_dir = path.join(basedir, 'rrna')
    os.makedirs(rrna_out_dir, exist_ok=True)
    result_12, result_16 = tk.rrna_search(
        fastafile, profile_dir_rrna, rrna_out_dir, prefix, 0.01)

    if not result_12:
        logger.log(3, '12s rRNA is not found!')

    if not result_16:
        logger.log(3, '16s rRNA is not found!')

    locs_file = path.join(basedir, 'locs.json')
    annotation_json = {}

    sequence_data = {x.id: x for x in SeqIO.parse(fastafile, 'fasta')}

    annotated_fa = path.join(basedir, f'{prefix}.annotated.cds.fa')
    annotated_frag = []
    start = end = -1
    for _, row in wise_frame.iterrows():
        cds = str(row.qseq).split('_')[3]
        if cds in annotation_json:
            count = sum(x.startswith(cds) for x in annotation_json.keys())
            cds = f'{cds}{"_" if count > 0 else ""}{count}'
        start, end = (min(int(row.wise_min_start), int(row.wise_max_end)),
                      max(int(row.wise_min_start), int(row.wise_max_end)))
        frag = sequence_data[str(row.sseq)][start - 1:end]
        frag.description = f'gene={cds} start={start} end={end} from={row.sseq} strand={"+" if row.plus else "-"}'
        annotated_frag.append(frag)
        annotation_json[cds] = (start, end, 0, str(
            row.sseq), "+" if row.plus else "-")

    if hmmer_frame is not None:
        for _, row in hmmer_frame.iterrows():
            start, end = (min(int(row.envfrom), int(row.envto)),
                          max(int(row.envfrom), int(row.envto)))
            frag = sequence_data[str(row.target)][start - 1:end]
            frag.description = f'gene={str(row.query)} start={start} end={end} from={row.target} strand={row.strand}'
            annotated_frag.append(frag)
            annotation_json[str(row.query)] = (
                start, end, 0, str(row.target), str(row.strand))

    SeqIO.write(annotated_frag, annotated_fa, 'fasta')

    annotated_rnas = path.join(basedir, f'{prefix}.annotated.rna.fa')
    annotated_frag.clear()
    for key, value in query_dict.items():
        start, end = (min(value.seqfrom, value.seqto),
                      max(value.seqfrom, value.seqto))
        frag = sequence_data[value.sequence][start - 1:end]
        frag.description = f'gene=trn{key} start={start} end={end}'
        annotated_frag.append(frag)
        annotation_json[f'trn{key}'] = (
            start, end, 1, value.sequence, '+' if value.plus else '-')

    if result_12:
        start, end = (min(result_12.seqfrom, result_12.seqto),
                      max(result_12.seqfrom, result_12.seqto))
        logger.log(
            2, f'12s rRNA found from {start} to {end}')
        frag = sequence_data[result_12.sequence][start - 1:end]
        frag.description = f'gene=rrnS start={start} end={end}'
        annotated_frag.append(frag)
        annotation_json['rrnS'] = (
            start, end, 2, result_12.sequence, '+' if result_12.plus else '-')

    if result_16:
        start, end = (min(result_16.seqfrom, result_16.seqto),
                      max(result_16.seqfrom, result_16.seqto))
        logger.log(
            2, f'16s rRNA found from {start} to {end}')
        frag = sequence_data[result_16.sequence][start - 1:end]
        frag.description = f'gene=rrnL start={start} end={end}'
        annotated_frag.append(frag)
        annotation_json['rrnL'] = (
            start, end, 2, result_16.sequence, '+' if result_16.plus else '-')

    SeqIO.write(annotated_frag, annotated_rnas, 'fasta')
    with open(locs_file, 'w') as f:
        json.dump(annotation_json, f, indent=4, separators=(',', ': '))

    return locs_file, annotated_fa, annotated_rnas
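For clarity, a standalone sketch of the species vote performed in the middle of annotate() above: each PCG contributes the taxon of its best-scoring hit, and the taxon with the most votes is reported (the hits below are made up):

taxa_data = {
    'COX1': ('Apis mellifera', 210.0),
    'ND5': ('Apis mellifera', 150.0),
    'CYTB': ('Bombus terrestris', 90.0),
}
score_data = {}
for taxa_name, _ in taxa_data.values():
    score_data[taxa_name] = score_data.get(taxa_name, 0) + 1
most_possible = max(score_data, key=score_data.get)
print(most_possible)  # Apis mellifera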
Example #23
def visualize(fasta_file=None,
              fastq1=None,
              fastq2=None,
              pos_json=None,
              prefix=None,
              basedir=None,
              threads=8,
              circular=False):
    logger.log(2, 'Entering visualize module.')
    # Validate the paths
    fasta_file = path.abspath(fasta_file)
    fastq1 = path.abspath(fastq1)
    if fastq2 is not None:
        fastq2 = path.abspath(fastq2)
    basedir = path.abspath(basedir)
    pos_json = path.abspath(pos_json)

    fa_copy = path.join(basedir, f'{prefix}.fasta')
    list_conv = []
    counter = 1

    # Rename to an easier form
    index_list = {}
    for seq in SeqIO.parse(fasta_file, 'fasta'):
        index_list[seq.id] = f'mt{counter}'
        seq.id_old = seq.id
        seq.id = f'mt{counter}'
        seq.description = ''
        list_conv.append(seq)
        counter += 1
    SeqIO.write(list_conv, fa_copy, 'fasta')

    with open(pos_json, 'r') as f:
        poses = json.load(f)

    # Gene name files
    logger.log(1, 'Generating gene name and feature files.')
    gene_name_file = path.join(basedir, f'{prefix}.gene.txt')
    with open(gene_name_file, 'w') as gn_f:
        for key, value in poses.items():
            # value = (start, end, gene type, source sequence id, strand sign)
            start, end, gene_type, seqid, _ = value
            seqid_conv = index_list[seqid]
            print(seqid_conv,
                  start,
                  end,
                  key.split('_')[0] if '_' in key else key,
                  sep='\t',
                  file=gn_f)

    # Gene feature files
    gene_feature_file = path.join(basedir, f'{prefix}.features.txt')
    with open(gene_feature_file, 'w') as gf_f:
        for key, value in poses.items():
            start, end, gene_type, seqid, plus = value
            plus = plus == '+'
            r0 = 0.965 if plus else 1
            r1 = 1 if plus else 1.035
            seqid_conv = index_list[seqid]
            print(seqid_conv,
                  start,
                  start,
                  f'fill_color=black,r0={r0}r,r1={r1}r',
                  file=gf_f,
                  sep='\t')
            print(
                seqid_conv,
                start,
                end,
                f'fill_color={circos_config.fill_colors[int(gene_type)]},r0={r0}r,r1={r1}r',
                file=gf_f,
                sep='\t')
            print(seqid_conv,
                  end,
                  end,
                  f'fill_color=black,r0={r0}r,r1={r1}r',
                  file=gf_f,
                  sep='\t')

    logger.log(1, 'Generating depth files.')
    # Using check_output directly here instead of the shell wrappers to avoid their output decoding
    from subprocess import check_output

    shell_call('bwa index', fa_copy)
    bam_file = path.join(basedir, f'{prefix}.bam')

    mem_count = max(int(threads * 0.8), 1)
    view_count = max(threads - mem_count, 1)

    check_output(
        f'bwa mem -t {mem_count} {fa_copy} {fastq1} {fastq2 if fastq2!=None else ""} |samtools view -bS -@ {view_count} -q 30 -h -o {bam_file} -',
        shell=True)
    bam_sorted_file = path.join(basedir, f'{prefix}.sorted.bam')
    check_output(f'samtools sort -@ {threads} -o {bam_sorted_file} {bam_file}',
                 shell=True)
    gene_depth_file = path.join(basedir, f'{prefix}.dep')
    check_output(f'samtools depth -aa {bam_sorted_file} > {gene_depth_file}',
                 shell=True)

    # Calculate the things
    circos_depth_file = path.join(basedir, f'{prefix}.depth.txt')
    max_gene_depth = 0
    with open(gene_depth_file, 'r') as gdf, open(circos_depth_file,
                                                 'w') as cdf:
        for line in gdf:
            content = str(line).rstrip().split()
            print(' '.join([content[0], content[1], content[1], content[2]]),
                  file=cdf)
            if int(content[2]) > max_gene_depth:
                max_gene_depth = int(content[2])

    # GC content
    # Reusing list_conv here, as it is still in scope
    gc_content_file = path.join(basedir, f'{prefix}.gc.txt')
    with open(gc_content_file, 'w') as gc_f:
        for seq in list_conv:
            # Walk through the sequence in 50 bp windows
            for s in range(0, len(seq), 50):
                seq_slice = seq[s:s + 50]
                gc_num = sum(x == 'G' or x == 'C' for x in seq_slice)
                gc_per = gc_num / len(seq_slice)
                print(seq.id, s, s + len(seq_slice), gc_per, file=gc_f)

    # Karyotype
    logger.log(1, 'Generating chr files.')
    karyotype_file = path.join(basedir, f'{prefix}.karyotype.txt')
    with open(karyotype_file, 'w') as ky_f:
        for seq in list_conv:
            chr_name = seq.id.replace('mt', 'chr')
            print(f'{chr_name} - {seq.id}\t{seq.id_old}\t0\t{len(seq)}\tgrey',
                  file=ky_f)

    # Plus generation
    logger.log(1, 'Generating plus.')
    plus_file = path.join(basedir, f'{prefix}.plus.txt')
    with open(plus_file, 'w') as p_f:
        print('mt1\t0\t300\t+\tr0=1r-150p,r1=1r-100p', file=p_f)

    # Giving the values
    logger.log(1, 'Generating circos config file.')
    generated_config = circos_config.circos_conf
    generated_config.ideogram.spacing._break = "0.5r" if not circular else "0.01r"
    generated_config.image.dir = basedir
    generated_config.karyotype = karyotype_file
    generated_config.plots['plot', 0].file = gene_name_file
    generated_config.plots['plot', 1].file = plus_file
    generated_config.plots['plot', 2].file = gc_content_file
    with generated_config.plots['plot', 3] as depth_plot:
        depth_plot.file = circos_depth_file
        depth_plot.max = max_gene_depth
        depth_plot.rules[
            'rule', 0].condition = f'var(value) > {int(max_gene_depth*0.9)}'
        depth_plot.rules[
            'rule', 1].condition = f'var(value) < {int(max_gene_depth*0.1)}'

    generated_config.highlights['highlight', 0].file = gene_feature_file

    # Writing to final
    # It might be better to use an f-string formatted cfg, but
    # this is fine for now.
    cfg_dict = circos.collapse(generated_config)
    cfg_file = path.join(basedir, 'circos.conf')
    with open(cfg_file, 'w') as cfg_f:
        cfg_f.write('<<include etc/colors_fonts_patterns.conf>>\n')
        cfg_f.write(circos.dict2circos(cfg_dict) + '\n')
        cfg_f.write('<<include etc/housekeeping.conf>>')

    logger.log(1, 'Running Circos.')
    try:
        check_output('circos', shell=True, cwd=basedir)
    except Exception:
        logger.log(4, "Running circos errored, no graph is outputted!")

    return path.join(basedir, 'Circos.png'), path.join(basedir, 'Circos.svg')
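A self-contained sketch of the 50 bp sliding-window GC computation used for the Circos GC track above (the id and sequence below are made up):

seq = "ATGCGCGCATATATGCGCGCATATATGCGGCC" * 4
for start in range(0, len(seq), 50):
    window = seq[start:start + 50]
    gc = sum(base in 'GC' for base in window) / len(window)
    print('mt1', start, start + len(window), round(gc, 3))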
Example #24
def pre(args):

    # Initialize the logger.
    if hasattr(args, 'work_dir') and hasattr(args, 'workname'):
        logger.init(path.join(args.work_dir, f'{args.workname}.log'))
    else:
        logger.init(path.join(os.getcwd(), 'summary.log'))
    if hasattr(args, 'level'):
        logger.set_level(args.level)
    logger.log(
        2, f'MitoFlex {VERSION}, run {args.workname if hasattr(args, "workname") else "1"}')

    arg_dict = vars(args)
    logger.log(2, 'Arguments after parsing :')
    logger.log(2, f'{[f"{key}={value}" for key, value in arg_dict.items()]}')

    if hasattr(args, 'disable_filter') and args.disable_filter:
        logger.log(3, 'Filtering is not enabled, files will only be truncated.')

    if hasattr(args, 'disable_annotation') and args.disable_annotation:
        logger.log(3, 'Annotation is not enabled.')

    def runtime_error_logger(exception_type, value, tb):
        if exception_type == RuntimeError:
            logger.log(4, value)
            logger.log(
                4,
                'A RuntimeError occurred. This case is already handled in the code,'
                ' but it is thought to come from errors in parts that MitoFlex cannot handle, so it is'
                ' NOT a bug caused by MitoFlex itself. Please check the error message'
                ' and try to fix the possible cause of the crash; only as a last resort,'
                ' open a GitHub issue with a rerun with the logger level set to 0.'
            )
            logger.finalize()
            sys.exit()
        else:
            if exception_type != KeyboardInterrupt:
                logger.log(
                    4,
                    "An unexpected error was happened in the MitoFlex, this could be a bug in the program,"
                    " so please report it if you see this message in log.")
                logger.log(
                    4, f"Error type : {exception_type.__name__}, value : {value}")
                logger.log(
                    4, f"Traceback :")
                logger.__log('\n'.join(traceback.format_tb(tb=tb)))

                logger.log(4, "Logging additional information")
                import psutil
                curp = psutil.Process()
                logger.log(4, curp.open_files())
                logger.log(4, curp.environ())
                logger.log(4, curp.memory_full_info())
                logger.log(4, "Logging ignored logs.")
                for l in logger.__ignored:
                    logger.log(4, l)
            else:
                logger.log(2, "This run was terminated manually.")
            logger.finalize()
            sys.__excepthook__(exception_type, value, tb)

    sys.excepthook = runtime_error_logger
Example #25
def load_modules(args):
    try:
        logger.log(2, 'Loading filter module.')
        from filter.filter import filter_pe, filter_se
        logger.log(2, 'Loading assemble module.')
        from assemble.assemble import assemble
        logger.log(2, 'Loading findmitoscaf module.')
        from findmitoscaf.findmitoscaf import findmitoscaf
        logger.log(2, 'Loading annotation module.')
        from annotation.annotation import annotate
        logger.log(2, 'Loading visualize module.')
        from visualize.visualize import visualize
    except Exception:
        logger.log(4, 'Cannot load module!')
    else:
        logger.log(2, 'All modules are loaded correctly.')
Example #26
    def runtime_error_logger(exception_type, value, tb):
        if exception_type == RuntimeError:
            logger.log(4, value)
            logger.log(
                4,
                'A RuntimeError occurred. This case is already handled in the code,'
                ' but it is thought to come from errors in parts that MitoFlex cannot handle, so it is'
                ' NOT a bug caused by MitoFlex itself. Please check the error message'
                ' and try to fix the possible cause of the crash; only as a last resort,'
                ' open a GitHub issue with a rerun with the logger level set to 0.'
            )
            logger.finalize()
            sys.exit()
        else:
            if exception_type != KeyboardInterrupt:
                logger.log(
                    4,
                    "An unexpected error was happened in the MitoFlex, this could be a bug in the program,"
                    " so please report it if you see this message in log.")
                logger.log(
                    4, f"Error type : {exception_type.__name__}, value : {value}")
                logger.log(
                    4, f"Traceback :")
                logger.__log('\n'.join(traceback.format_tb(tb=tb)))

                logger.log(4, "Logging additional information")
                import psutil
                curp = psutil.Process()
                logger.log(4, curp.open_files())
                logger.log(4, curp.environ())
                logger.log(4, curp.memory_full_info())
                logger.log(4, "Logging ignored logs.")
                for l in logger.__ignored:
                    logger.log(4, l)
            else:
                logger.log(2, "This run was terminated manually.")
            logger.finalize()
            sys.__excepthook__(exception_type, value, tb)
def trna_search(fasta_file=None, profile_dir=None, basedir=None, prefix=None, gene_code=9, e_value=0.001, overlap_cutoff=40):
    # Make sure it's the absolute path
    fasta_file = path.abspath(fasta_file)
    profile_dir = path.abspath(profile_dir)
    basedir = path.abspath(basedir)

    codon_table = CodonTable.generic_by_id[gene_code]
    forward_table = codon_table.forward_table

    infernal_file = path.join(basedir, f'{prefix}.infernal.out')

    query_results = []
    for idx, cm in enumerate(os.listdir(profile_dir)):
        indexed = f'{infernal_file}.{idx}'
        truncated_call('cmsearch', E=e_value, o=indexed, appending=[
                       path.join(profile_dir, cm), fasta_file])
        query_results.append(infernal.Infernal(indexed))

    gene_map = []

    for result in query_results:
        for align in result.alignments:
            loop = align.alignment

            # Get the main loop of tRNA
            main = [x for x in loop.components if isinstance(
                x, wuss.MultiLoop)]
            if not main:
                continue
            main = main[0]

            # Get the three hairpin loops of the main loop
            hairpins = [x for x in main.components if isinstance(
                x, wuss.HairpinLoop)]
            if len(hairpins) < 2:
                continue

            # Get the center hairpin loop (anticodon arm)
            # No gap is allowed
            center = hairpins[1]
            if len(center.hairpin.sequence) != 7:
                continue

            # Skip if the central tri-base anticodon cannot be read
            if '-' in center.hairpin.to_str()[2:5]:
                logger.log(
                    1, f'Unqualified fold discarded, central hairpin : {center.hairpin.to_str()}, sequence : {center.sequence}')
                continue

            code = Seq(center.hairpin.to_str()[2:5]).reverse_complement()
            amino = forward_table[code]

            align.amino = amino
            align.length = max(align.seqfrom, align.seqto) - \
                min(align.seqfrom, align.seqto)
            gene_map.append((align.seqfrom, align))
            gene_map.append((align.seqto, align))

    gene_map.sort(key=lambda x: x[0])

    gene_map = [x[1] for x in gene_map]

    # Then keep only the most plausible, non-overlapping hits
    def overlapped(mapping: list):
        def pairwise(iterable):
            a, b = tee(iterable)
            next(b, None)
            return zip(a, b)

        for gene_loc, pair_loc in pairwise(mapping):
            dist = max(gene_loc.seqfrom, gene_loc.seqto) - \
                min(pair_loc.seqfrom, pair_loc.seqto)
            if gene_loc != pair_loc and dist >= overlap_cutoff and (dist <= gene_loc.length or dist <= pair_loc.length):
                if gene_loc.score >= pair_loc.score:
                    logger.log(
                        0, f'conflict of {gene_loc.amino} and {pair_loc.amino}, removing {pair_loc.amino}, score:{gene_loc.score}, {pair_loc.score}, overlapping : {dist}')
                    while pair_loc in mapping:
                        mapping.remove(pair_loc)
                else:
                    logger.log(
                        0, f'conflict of {gene_loc.amino} and {pair_loc.amino}, removing {gene_loc.amino}, score:{gene_loc.score}, {pair_loc.score}, overlapping : {dist}')
                    while gene_loc in mapping:
                        mapping.remove(gene_loc)
                return True
        return False

    while overlapped(gene_map):
        pass

    gene_map = list(set(gene_map))

    # Normalize the results
    query_dict = {}
    for gene in gene_map:
        if gene.amino not in query_dict:
            query_dict[gene.amino] = gene
        else:
            query_dict[gene.amino + str(sum(x.startswith(gene.amino)
                                            for x in query_dict.keys()) + 1)] = gene

    missing_trnas = [
        x for x in codon_table.back_table if x not in query_dict and x]
    return query_dict, missing_trnas
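A minimal check of the anticodon decoding used in trna_search above: the tri-base anticodon read from the central hairpin is reverse-complemented into a codon and looked up in the genetic-code table (table 9, the function's default):

from Bio.Seq import Seq
from Bio.Data import CodonTable

forward_table = CodonTable.generic_by_id[9].forward_table
anticodon = 'CAT'
codon = str(Seq(anticodon).reverse_complement())  # 'ATG'
print(codon, forward_table[codon])                # ATG M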
Example #28
def filter_se(fqiabs=None, fqoabs=None, Ns=10, quality=55, limit=0.2, start=None, end=None, trim=0, trunc=False):
    fsin = path.getsize(fqiabs)
    logger.log(level=1, info='Start filtering single-end rawdata.')
    logger.log(level=0, info=f'Input file has {fsin} bytes.')
    logger.log(level=1,
               info=f'Using argument : Ns={Ns}, quality={quality}, limit={limit}, start={start}, end={end}, trimming={trim}, trunc={trunc}')
    try:
        shell_call(path.join(filter_dir, 'filter_v2'), cleanq1=f'"{fqoabs}"', fastq1=f'"{fqiabs}"',
                   n=Ns, q=quality, l=limit, s=start, e=end, t=trim, truncate_only=trunc)
    except Exception as identifier:
        logger.log(
            level=4, info=f'Error occurred when running filter, cause : {identifier}')
        logger.log(level=1, info=f'Input file : {fqiabs}')
        logger.log(level=1, info=f'Output file : {fqoabs}')

        sys.exit("Error occured when running filter!")

    fsot = path.getsize(fqoabs)
    logger.log(level=0, info=f'Output file has {fsot} bytes.')
    logger.log(level=0,
               info=f'Filtered {fsin - fsot} bytes, ratio {fsot/fsin}.')

    return fqoabs
Example #29
def filter_pe(fq1=None, fq2=None, o1=None, o2=None,
              dedup=False, start=None, end=None,
              n=10, q=55, l=0.2, trim=0, trunc=False):
    fsin1, fsin2 = path.getsize(fq1), path.getsize(fq2)
    logger.log(level=1, info='Start filtering pair-end rawdata.')
    logger.log(
        level=0, info=f'Input file 1 has {fsin1} bytes, 2 has {fsin2} bytes.')
    if fsin1 != fsin2:
        logger.log(
            level=3, info=f'Input file 1 and 2 have different sizes! This could cause loss of raw data, or even crash the program.')
    logger.log(
        level=1, info=f'Using argument : Ns={n}, quality={q}, start={start}, end={end}, limit={l}, trimming={trim}')
    try:
        shell_call(path.join(filter_dir, 'filter_v2'),
                   _1=f'"{fq1}"', _2=f'"{fq2}"', _3=f'"{o1}"', _4=f'"{o2}"', d=dedup, s=start,
                   e=end, n=n, q=q, l=l, t=trim, truncate_only=trunc)
    except Exception as identifier:
        logger.log(
            level=4, info=f'Error occurred when running filter, cause : {identifier}')
        logger.log(level=1, info=f'Input file : {fq1} , {fq2}')
        logger.log(level=1, info=f'Output file : {o1} , {o2}')
        sys.exit("Error occured when running filter!")

    fsot1 = path.getsize(o1)
    logger.log(level=0, info=f'Output file has {fsot1} bytes.')
    logger.log(level=1,
               info=f'Filtered {fsin1 - fsot1} bytes, ratio {100*fsot1/fsin1:.2f}%.')
    return o1, o2
    def initialize(self):
        self.basedir = path.abspath(self.basedir)
        self.fq1 = path.abspath(self.fq1)
        if self.fq2:
            self.fq2 = path.abspath(self.fq2)

        # Check if POPCNT command is supported
        if self.use_popcnt:
            if shell_call('megahit_core checkpopcnt').rstrip() != '1':
                self.use_popcnt = False
                logger.log(3, "POPCNT is disabled since no features detected.")
            else:
                self.hwaccel = shell_call(
                    "megahit_core checkcpu").rstrip() == '1'

                logger.log(
                    2,
                    f"Using megahit with {'POPCNT' if not self.hwaccel else 'hardware acceleration'} support."
                )
        else:
            logger.log(2, "POPCNT disabled by argument.")

        if self.one_pass:
            logger.log(3, "Using 1-pass mode.")

        self.result_dir = safe_makedirs(
            path.join(self.basedir, f'{self.prefix}.result'), False)

        if not path.isdir(str(a_conf.external_temp)):
            self.temp_dir = safe_makedirs(
                path.join(self.basedir, f'{self.prefix}.temp'), False)
        else:
            self.temp_dir = safe_makedirs(
                path.join(a_conf.external_temp, str(uuid.uuid4()),
                          f'{self.prefix}.temp'), False)

        self.read_lib = path.join(self.temp_dir, 'reads.lib')
        self.contig_dir = safe_makedirs(
            path.join(self.temp_dir, 'intermediate_contigs'), False)

        vm = psutil.virtual_memory()
        logger.log(
            1,
            f"System memory status : {', '.join([f'{k}={v/(1024**2):.2f}MB' for k,v in vm._asdict().items() if type(v) is int])}"
        )
        self.available_memory = int(vm.available * a_conf.max_mem_percent)
        logger.log(
            2, f'Scheduled {self.available_memory/(1024**2):.2f}MB to use.')