コード例 #1
0
 def write_fasta(self, fout, ids = None, seqs = None, write_all = False, fasta = None) -> None:
     """
     Write gRNA sequences to FASTA file.
     
     Arguments:
         fout (str): required, path to output file
         ids (list): list of IDs (str) of gRNA to write
         seqs (list): list of sequences (str) of gRNA to write, overrides ``ids``
         write_all (bool): write all gRNA sequences, overrides ``seqs`` and ``ids``
         fasta (str): optional, path to FASTA file, used for renaming gRNA
     """
     ## normalise mutable defaults (None means "not provided")
     ids = [] if ids is None else ids
     seqs = [] if seqs is None else seqs
     ## get relevant gRNA sequences (precedence: write_all > seqs > ids)
     if write_all: gRNA_seqs = self.flatten_gRNAseqs()
     elif seqs: gRNA_seqs = self.get_gRNAseqs_by_seq(*seqs)
     elif ids: gRNA_seqs = self.get_gRNAseqs_by_id(*ids)
     else:
         print("Either 'ids' OR 'seqs' OR 'write_all' is required. Writing empty file.")
         ## context manager ensures the file handle is closed (was leaked before)
         with open(fout, "w+") as f:
             f.write('')
         return
     ## rename sequences per fasta file (if fasta file provided);
     ## fasta_inv maps sequence string -> original seqid in the FASTA file
     fasta_inv = {} if not fasta else {str(seq): k for k, seq in fasta_to_dict(fasta).items()}
     self.assign_seqid(assign_all = False)
     ## prefer the FASTA file's name for a sequence; fall back to the gRNA's own id
     to_write = {fasta_inv.get(gRNA_seq.seq, gRNA_seq.id): gRNA_seq.seq for gRNA_seq in gRNA_seqs}
     if to_write:
         dict_to_fasta(to_write, fout)
     else:
         with open(fout, "w+") as f:
             f.write('')
     return
コード例 #2
0
ファイル: extend_reference.py プロジェクト: rlrq/MINORg
def extend_reference(
        feature: list,
        subfeature: list,
        fout_fasta,
        fout_gff,
        mafft="mafft",
        feature_type="mRNA",
        subfeature_type="CDS",  ## xx_type are currently unused
        thread: int = 1,
        directory=None,
        tmp=True,
        logger=None):
    """
    Extend a reference by aligning subfeature (CDS) sequences to feature
    (genomic) sequences, then writing the combined FASTA and an inferred GFF.

    Arguments:
        feature (list or str): path(s) to FASTA file(s) of genomic sequences
        subfeature (list or str): path(s) to FASTA file(s) of CDS sequences
        fout_fasta (str): path to output FASTA file
        fout_gff (str): path to output GFF file
        mafft (str): MAFFT executable or path to it (default='mafft')
        feature_type (str): currently unused (default='mRNA')
        subfeature_type (str): currently unused (default='CDS')
        thread (int): number of MAFFT threads (default=1)
        directory (str): working directory; a temporary one is created if None
        tmp (bool): remove working directory when done (forced True when auto-created)
        logger: optional logger forwarded to ``group_by_gene``

    Raises:
        InvalidPath: if any input file does not exist
    """
    ## coerce single path strings into lists
    feature = feature if non_string_iter(feature) else [feature]
    subfeature = subfeature if non_string_iter(subfeature) else [subfeature]
    valid_feature = [x for x in feature if os.path.exists(x)]
    valid_subfeature = [x for x in subfeature if os.path.exists(x)]
    ## raise Error for inaccessible files
    ## (bug fix: the exception was previously instantiated but never raised)
    invalid_feature = [x for x in feature if x not in valid_feature]
    invalid_subfeature = [x for x in subfeature if x not in valid_subfeature]
    invalid = invalid_feature + invalid_subfeature
    if invalid: raise InvalidPath(','.join(invalid))
    if valid_feature and valid_subfeature:
        ## create working directory if not provided; auto-created dirs are always cleaned up
        if directory is None:
            directory = tempfile.mkdtemp()
            tmp = True
        ## write one FASTA per gene containing its genomic + CDS sequences
        group_by_gene(subfeature,
                      feature,
                      directory,
                      sep='.',
                      verbose=True,
                      logger=logger)
        ## align each per-gene FASTA with MAFFT (raw strings avoid invalid-escape warnings)
        for fasta in [
                f for f in os.listdir(directory) if re.search(r"\.fasta$", f)
        ]:
            feature_name = re.search(r"^.+(?=\.fasta$)",
                                     os.path.basename(fasta)).group(0)
            with open(os.path.join(directory, f"{feature_name}_aln.fa"),
                      "w+") as f:
                stdout, stderr = MafftCommandline(mafft,
                                                  input=os.path.join(
                                                      directory, fasta),
                                                  quiet=True,
                                                  thread=thread)()
                f.write(stdout)
            os.remove(os.path.join(directory, fasta))
        ## convert alignment to gff
        aln_to_annotation(directory, fout=fout_gff, sep='.', outfmt="gff")
        ## combine genomic files into the output FASTA
        seqs_feature = dict(
            itertools.chain(*[fasta_to_dict(fa).items() for fa in feature]))
        dict_to_fasta(seqs_feature, fout_fasta)
        ## remove temporary directories
        if tmp:
            import shutil
            shutil.rmtree(directory)
    return
コード例 #3
0
def get_merged_seqs(merged_f, fasta, fout, header=None, indv_i=1):
    """
    Extract subsequences described by a merged domain-range file and write
    them to a FASTA file.

    Arguments:
        merged_f (str): path to tab-separated file of merged domain ranges;
            the first row is a header that includes 'molecule', 'start', 'end'
        fasta (str): path to FASTA file with the source sequences
        fout (str): path to output FASTA file
        header: unused; retained for backward compatibility
            (default changed from a mutable ``[]`` to ``None``)
        indv_i (int): individual index used as the first field of output IDs
    """
    ## get domain ranges (dat[0] is the header row used to build the getter)
    dat = [x.split('\t') for x in splitlines(merged_f)]
    get = make_custom_get(dat[0])
    dat = dat[1:]
    ## parse fasta file
    seqs = fasta_to_dict(fasta)
    ## slice each range out of its source sequence
    output = {}
    for i, entry in enumerate(dat):
        seq = seqs[get(entry,
                       "molecule")][get(entry, "start"):get(entry, "end")]
        ## output ID: <indv_i>|<molecule>|<1-based entry index>|<1-based start>-<end>
        key = '|'.join([str(x) for x in \
                        ([indv_i, get(entry, "molecule"), i + 1,
                          f"{get(entry, 'start') + 1}-{get(entry, 'end')}"])])
        output[key] = seq
    dict_to_fasta(output, fout)
    return
コード例 #4
0
ファイル: extend_reference.py プロジェクト: rlrq/MINORg
def group_by_gene(fa_cds,
                  fa_genomic,
                  directory,
                  sep='.',
                  verbose=True,
                  logger=None):
    """
    Group CDS sequences with their parent genomic sequences and write one
    FASTA file per gene into ``directory``.

    CDS sequence IDs are expected to look like '<gene><sep><suffix>'; the
    gene ID is everything before the final ``sep``-delimited field.

    Arguments:
        fa_cds (list): paths to FASTA files of CDS sequences
        fa_genomic (list): paths to FASTA files of genomic sequences
        directory (str): output directory for per-gene FASTA files
        sep (str): separator between gene ID and suffix in CDS IDs (default='.')
        verbose (bool): report orphan sequences (default=True)
        logger: optional logger with ``plain``/``fplain`` methods
    """
    seqs_cds = {
        seqid: seq
        for fname in fa_cds for seqid, seq in fasta_to_dict(fname).items()
    }
    seqs_genomic = {
        seqid: seq
        for fname in fa_genomic for seqid, seq in fasta_to_dict(fname).items()
    }
    genes = {seqid: [] for seqid in seqs_genomic}
    orphan_cds = []
    ## map CDS to GENOMIC by stripping the trailing <sep><suffix> field
    ## (bug fix: sep is now escaped inside the character class as well, so
    ## regex-special separators work; pattern hoisted out of the loop)
    pattern = re.compile(f"^.+?(?={re.escape(sep)}[^{re.escape(sep)}]+$)")
    for seqid in seqs_cds.keys():
        match = pattern.match(seqid)
        ## a CDS ID without sep has no parent gene -> orphan (previously crashed)
        gene = match.group(0) if match is not None else None
        if gene is None or gene not in genes:
            orphan_cds.append(seqid)
        else:
            genes[gene].append(seqid)
    ## print orphans
    orphan_genes = sorted(gene for gene, cds in genes.items() if not cds)

    def log(msg):
        ## route messages according to verbosity and logger availability
        if verbose and logger: logger.plain(msg)
        elif logger: logger.fplain(msg)
        elif verbose: print(msg)

    if orphan_genes:
        log(f"GENOMIC sequences without CDS: {','.join(orphan_genes)}")
    if orphan_cds:
        log(f"CDS sequences without GENOMIC: {','.join(orphan_cds)}")
    ## write one FASTA per gene: genomic sequence first, then its CDS
    for gene, cds in genes.items():
        to_write = {
            gene: seqs_genomic[gene],
            **{seqid_cds: seqs_cds[seqid_cds]
               for seqid_cds in cds}
        }
        dict_to_fasta(to_write, f"{directory}/{gene}.fasta")
    return
コード例 #5
0
def mask_identical(to_mask_fname, fasta_fname, fout_fname, **kwargs):
    """
    Mask regions in ``fasta_fname`` sequences that are identical to the
    sequences in ``to_mask_fname``.

    Sequences made only of standard bases are masked via BLAST; sequences
    containing ambiguous bases (not compatible with BLAST, e.g. runs of 'N'
    in scaffold-level assemblies) are masked via exact search.

    Arguments:
        to_mask_fname (str): path to FASTA file of sequences to mask
        fasta_fname (str): path to FASTA file to search/mask within
        fout_fname (str): path to output file
        **kwargs: additional keyword arguments passed to ``blast_mask``

    Returns:
        list: Masked objects for all masked regions
    """
    seqs_to_mask = fasta_to_dict(to_mask_fname)
    ## separate sequences with ambiguous bases (not compatible with BLAST) and those without
    ##  - ambiguous bases typically present in scaffold-level assemblies as runs of 'N's
    standard_bases = {'A', 'T', 'G', 'C', 'U', 'a', 't', 'g', 'c', 'u'}
    unambig_to_mask = {
        seqid: seq
        for seqid, seq in seqs_to_mask.items()
        if set(seq).issubset(standard_bases)
    }
    ambig_to_mask = {
        seqid: seq
        for seqid, seq in seqs_to_mask.items() if seqid not in unambig_to_mask
    }
    ## if some (but not all) sequences have ambiguous bases, BLAST only the
    ## unambiguous subset via a temporary file
    if (unambig_to_mask and ambig_to_mask):
        import tempfile
        ## bug fix: mkstemp returns (fd, path); close the OS-level descriptor
        ## instead of discarding it (previously leaked)
        fd, tmp_to_mask_fname = tempfile.mkstemp(suffix=".fasta")
        os.close(fd)
        ## write unambig_to_mask to file to be used for BLAST
        dict_to_fasta(unambig_to_mask, tmp_to_mask_fname)
    else:
        tmp_to_mask_fname = None
    ## start masking
    masked = []
    ## BLAST-based masking for sequences without ambiguous bases
    if unambig_to_mask:
        masked.extend(
            blast_mask((tmp_to_mask_fname if tmp_to_mask_fname is not None else
                        to_mask_fname), fasta_fname, fout_fname, **kwargs))
    ## exact-match masking for sequences with ambiguous bases
    if ambig_to_mask:
        masked.extend([
            Masked(hsp) for query_result in find_identical_in_fasta(
                ambig_to_mask, fasta_fname) for hit in query_result
            for hsp in hit.hsps
        ])
    ## remove temporary file if created
    if tmp_to_mask_fname is not None:
        os.remove(tmp_to_mask_fname)
    return masked
コード例 #6
0
 def collapse_query(self, fout_fasta=None, fout_map=None):
     """
     Collapse identical query sequences into a non-redundant FASTA file and
     write a mapping of retained IDs to their duplicates.

     Sets ``self.query_nr`` (path to non-redundant FASTA) and
     ``self.query_nr_map`` (path to tab-separated mapping file).

     Arguments:
         fout_fasta (str): optional, output FASTA path; a temporary file in
             ``self.directory`` is created if not provided
         fout_map (str): optional, output mapping path; a temporary file in
             ``self.directory`` is created if not provided
     """
     import os
     import tempfile
     def _mk_tmp():
         ## bug fix: mkstemp returns (fd, path); close the OS-level
         ## descriptor (previously leaked) -- only the path is needed
         fd, path = tempfile.mkstemp(dir=self.directory)
         os.close(fd)
         return path
     self.query_nr = fout_fasta if fout_fasta is not None else _mk_tmp()
     self.query_nr_map = fout_map if fout_map is not None else _mk_tmp()
     dat = fasta_to_dict(self.query)
     ## group sequence IDs by identical sequence content
     identicals = {
         k: set(seqid for seqid, seq in dat.items() if str(seq) == str(v))
         for k, v in dat.items()
     }
     identical_sets = set(
         map(lambda x: tuple(sorted(x)), identicals.values()))
     ## write nr sequences (first ID of each identical set is retained)
     dict_to_fasta({seqids[0]: dat[seqids[0]]
                    for seqids in identical_sets}, self.query_nr)
     ## write nr mapping: one tab-separated line per identical set
     with open(self.query_nr_map, 'w') as f:
         f.write('\n'.join(['\t'.join(seqids)
                            for seqids in identical_sets]))
     return
コード例 #7
0
## NOTE(review): the list/dict default arguments below are mutable defaults;
## they are never mutated here, but replacing them with None sentinels would
## be safer if the signature is ever revisited.
def remove_non_max_bitscore(fasta,
                            bedtool,
                            genes,
                            relax=False,
                            lvl=0,
                            quiet=True,
                            colnames_blast=[
                                "chrom", "start", "end", "candidate", "cstart",
                                "cend"
                            ],
                            blast_metrics=["bitscore"],
                            colnames_bed=[
                                "bed_chrom", "bed_start", "bed_end", "id",
                                "score", "strand", "source", "feature",
                                "phase", "attributes", "overlap"
                            ],
                            colnames_gff=[
                                "bed_chrom", "source", "feature", "bed_start",
                                "bed_end", "score", "strand", "phase",
                                "attributes", "overlap"
                            ],
                            bedtools='',
                            attribute_mod={}) -> None:
    """
    Remove query sequences for which the subject feature in the query-subject hit with the max bitscore is
    not a target gene/feature. This occurs in-place.
    
    Arguments:
        fasta (str): path to FASTA file of query sequences to reduce
        bedtool (:class:`BedTool`): BedTool object where BLAST hits have been intersected with
            subject GFF3 files
        genes (list): gene/feature IDs of targets
        relax (bool): retain query sequences even if max bitscore hit overlaps with non-target feature so long
            as it also overlaps with a target feature
        lvl (int): printing indentation
        quiet (bool): silence non-essential messages
        colnames_blast (list): column names of BLAST output
        blast_metrics (list): additional column names of metrics in BLAST output
        colnames_bed (list): column names if annotation intersected with is BED format
        colnames_gff (list): column names if annotation intersected with is GFF3 format
        bedtools (str): path to directory containing BEDTools executables if bedtool is not
            in command-search path
        attribute_mod (dict): optional, 
            required only if non-standard attribute field names are present in GFF3 files.
            Dictionary describing attribute modification.
    """
    import itertools
    pybedtools.helpers.set_bedtools_path(path=bedtools)
    printi = make_local_print(quiet=quiet,
                              printf=make_print_preindent(initial_lvl=lvl))
    genes = set(genes)
    ## intersected rows are BLAST columns + metrics + annotation columns;
    ## the annotation part may be BED- or GFF-shaped
    cols_bed = colnames_blast + blast_metrics + colnames_bed
    cols_gff = colnames_blast + blast_metrics + colnames_gff
    ## make get function
    get_bed = make_custom_get(cols_bed)
    get_gff = make_custom_get(cols_gff)

    ## dispatch to the BED or GFF getter based on the entry's column count;
    ## unknown widths fall through to a dummy (suppressed) BED lookup
    def get(data, *args, **kwargs):
        if not data:
            helper_get = get_bed
        else:
            entry_len = len(data) if not isinstance(data[0],
                                                    (list, tuple)) else len(
                                                        data[0])
            if entry_len == len(cols_bed): helper_get = get_bed
            elif entry_len == len(cols_gff): helper_get = get_gff
            else: return get_bed([], "dummy", suppress_print=True)
        return helper_get(data, *args, **kwargs)

    ## collect, per candidate, every (annotation, bitscore) pair for hits on
    ## gene-level features; bedtool may be a single BedTool or an iterable of them
    data = {}
    for entry in (str(bedtool).split('\n') if isinstance(
            bedtool, BedTool) else tuple(
                itertools.chain(
                    *[str(bedtool_obj).split('\n')
                      for bedtool_obj in bedtool]))):
        entry = entry.split('\t')
        if get(entry, "feature") in ("gene", "pseudogene", '.'):
            data[get(entry,
                     "candidate")] = (data.get(get(entry, "candidate"), []) +
                                      [{
                                          "ann":
                                          Annotation(get(entry, *colnames_gff),
                                                     None,
                                                     attr_mod=attribute_mod),
                                          "bitscore":
                                          get(entry, "bitscore")
                                      }])
    ## get largest bitscore for each candidate target
    max_bitscore = {
        candidate: max([entry["bitscore"] for entry in data[candidate]])
        for candidate in data
    }
    ## identify sequences to discard and print warnings if candidate has max bitscore with target and non-target
    ## note that due to the algorithm, candidates that don't overlap with ANY features will also be kept
    throw = []
    for candidate in data:
        # max_bitscore_genes = set(get(entry, "id") for entry in data[candidate]
        #                          if get(entry, "bitscore") == max_bitscore[candidate])
        ## IDs of all features tied for the candidate's max bitscore
        max_bitscore_genes = set(
            entry["ann"].get_attr("ID", fmt=str) for entry in data[candidate]
            if entry["bitscore"] == max_bitscore[candidate])
        if max_bitscore_genes.issubset(
                genes):  ## if max score genes are subset of target genes
            continue
        else:
            if max_bitscore_genes.isdisjoint(
                    genes):  ## if no target genes have max score
                throw.append(candidate)
            else:  ## if overlapping but not subset
                if relax:
                    printi((
                        f"Warning: candidate target '{candidate}' has hit(s) with bitscore"
                        f" {max_bitscore[candidate]} that overlap(s) with target gene(s)"
                        f" {genes & max_bitscore_genes} and non-target gene(s)"
                        f" {max_bitscore_genes - genes}."
                        " This sequence will be retained as 'relax' has been set to True."
                    ))
                else:
                    throw.append(candidate)
                    printi((
                        f"Warning: candidate target '{candidate}' has hit(s) with bitscore"
                        f" {max_bitscore[candidate]} that overlap(s) with target gene(s)"
                        f" {genes & max_bitscore_genes} and non-target gene(s)"
                        f" {max_bitscore_genes - genes}."
                        " This sequence will be removed from the list of candidate targets"
                        " as 'relax' has been set to False."))
    ## read original candidate targets, filter, and write back in-place
    seqs = fasta_to_dict(fasta)
    dict_to_fasta(
        {seq_id: seq
         for seq_id, seq in seqs.items() if seq_id not in throw}, fasta)
    return