Ejemplo n.º 1
0
def write_gene(txt, info, options):
    """Write the gene table to the open TXT handle.

    A header line is written first, then one tab-separated row per CDS
    feature of every cluster number in ``info.clusternrs``.
    Columns: gene ID, gene start, gene end, gene strand, smCOG, locus_tag,
    annotation. ``options`` is accepted but not used by this writer.
    """
    column_names = ("gene ID", "gene start", "gene end", "gene strand",
                    "smCOG", "locus_tag", "annotation")
    txt.write("\t".join(column_names) + "\n")
    for cluster_nr in info.clusternrs:
        cluster = utils.get_cluster_by_nr(info.seq_record, cluster_nr)
        genes = utils.get_cluster_cds_features(cluster, info.seq_record)
        for gene in genes:
            row = [
                # accession / gene ID with any ".version" suffix removed
                utils.get_gene_acc(gene).partition(".")[0],
                str(gene.location.start),
                str(gene.location.end),
                # anything other than strand 1 is reported as "-"
                "+" if gene.strand == 1 else "-",
                "",  # smCOG column is not populated for now
                utils.get_gene_id(gene).partition(".")[0],
                utils.get_gene_annotation(gene),
            ]
            txt.write("\t".join(row) + "\n")
Ejemplo n.º 2
0
def fastaseqlengths(seq_record):
    """Return a dict mapping each CDS accession to its protein length.

    Keys are produced by utils.get_gene_acc(); each value is the length of
    the stringified result of utils.get_aa_sequence() for the same CDS.
    """
    length_by_accession = {}
    for feature in utils.get_cds_features(seq_record):
        protein = str(utils.get_aa_sequence(feature))
        length_by_accession[utils.get_gene_acc(feature)] = len(protein)
    return length_by_accession
Ejemplo n.º 3
0
def load_clusterblast_database(seq_record, searchtype="general"):
    """Load the reference clusters and proteins for a ClusterBlast search.

    An accession lookup (utils.get_gene_acc -> utils.get_gene_accession) is
    built from every CDS in seq_record and handed to
    load_geneclusterproteins().

    Returns a (clusters, proteins) tuple.
    """
    accession_lookup = {}
    for feature in utils.get_cds_features(seq_record):
        accession = utils.get_gene_accession(feature)
        accession_lookup[utils.get_gene_acc(feature)] = accession
    return (load_geneclusters(searchtype),
            load_geneclusterproteins(accession_lookup, searchtype))
Ejemplo n.º 4
0
def load_clusterblast_database(seq_record, searchtype="general"):
    """Load the reference clusters and per-protein data for ClusterBlast.

    An accession lookup (utils.get_gene_acc -> utils.get_gene_accession) is
    built from every CDS in seq_record and handed to
    load_geneclusterproteins().

    Returns a 5-tuple: (clusters, proteinlocations, proteinstrands,
    proteinannotations, proteintags).
    """
    # The returned configuration is not used here; the call is kept so that
    # whatever it does on access still happens.
    config.get_config()
    accession_lookup = {}
    for feature in utils.get_cds_features(seq_record):
        accession = utils.get_gene_accession(feature)
        accession_lookup[utils.get_gene_acc(feature)] = accession
    clusters = load_geneclusters(searchtype)
    protein_data = load_geneclusterproteins(accession_lookup, searchtype)
    locations, strands, annotations, tags = protein_data
    return clusters, locations, strands, annotations, tags
Ejemplo n.º 5
0
def perform_subclusterblast(options, seq_record, clusters, proteinlocations,
                            proteinstrands, proteinannotations, proteintags):
    """Run a SubClusterBlast search for every gene cluster of seq_record.

    For each cluster (in the order given by
    utils.get_sorted_cluster_features) the cluster proteins are written as
    BLAST input, the search is run, the raw output is stored, hits are
    filtered and scored, and the results are written through
    write_clusterblast_output(..., searchtype="subclusters").

    In debug mode (options.debug) a cluster is skipped when
    <options.dbgclusterblast>/subclusterblast/cluster<N>.txt already exists.

    All work happens inside a TemporaryDirectory(change=True) context.
    Returns None.
    """
    logging.info("Running NCBI BLAST+ subcluster searches..")
    geneclusters = utils.get_sorted_cluster_features(seq_record)
    with TemporaryDirectory(change=True):
        for genecluster in geneclusters:
            clusternumber = utils.get_cluster_number(genecluster)
            if options.debug:
                # Build the path once so the existence check and the log
                # message agree. The previous message was assembled with
                # `"%s" % a + b + ...`, which formatted only the directory
                # and appended the rest of the path after "instead".
                debug_file = os.sep.join([
                    options.dbgclusterblast, "subclusterblast",
                    "cluster" + str(clusternumber) + ".txt"
                ])
                if os.path.exists(debug_file):
                    logging.debug(
                        "Skipping SubClusterblast calculations, using results from %s instead",
                        debug_file)
                    continue

            logging.info("   Gene cluster " + str(clusternumber))
            queryclusternames, queryclusterseqs, queryclusterprots = create_blast_inputs(
                genecluster, seq_record)
            write_clusterblast_inputfiles(options, queryclusternames,
                                          queryclusterseqs)
            run_clusterblast_processes(options, searchtype="subclusters")
            blastoutput = read_clusterblast_output(options)
            write_raw_clusterblastoutput(options.full_outputfolder_path,
                                         blastoutput,
                                         searchtype="subclusters")
            logging.info("   Blast search finished. Parsing results...")
            # Hit thresholds passed to parse_blast for subcluster searches
            minseqcoverage = 40
            minpercidentity = 45
            blastdict, querylist, hitclusters = parse_blast(
                blastoutput, seq_record, minseqcoverage, minpercidentity)
            querylist = remove_queries_without_hits(querylist, blastdict)
            allcoregenes = [
                utils.get_gene_acc(cds)
                for cds in utils.get_secmet_cds_features(seq_record)
            ]
            # The third returned value (hit cluster dict) is not needed here
            rankedclusters, rankedclustervalues, _, hitclusterdata = score_clusterblast_output(
                blastdict, querylist, hitclusters, clusters, allcoregenes)

            # Bundle everything the output writer needs in one Storage object
            subclusterblastStorage = utils.Storage()
            subclusterblastStorage.clusternumber = clusternumber
            subclusterblastStorage.queryclusterprots = queryclusterprots
            subclusterblastStorage.clusters = clusters
            subclusterblastStorage.hitclusterdata = hitclusterdata
            subclusterblastStorage.rankedclusters = rankedclusters
            subclusterblastStorage.rankedclustervalues = rankedclustervalues
            subclusterblastStorage.proteintags = proteintags
            subclusterblastStorage.proteinlocations = proteinlocations
            subclusterblastStorage.proteinannotations = proteinannotations
            subclusterblastStorage.proteinstrands = proteinstrands

            write_clusterblast_output(options,
                                      seq_record,
                                      subclusterblastStorage,
                                      searchtype="subclusters")
Ejemplo n.º 6
0
def _first_note_value(notes, marker, prefix):
    """Return the text after ``prefix`` in the first note containing ``marker``.

    Raises IndexError when no note contains ``marker``.
    """
    return [note.partition(prefix)[2] for note in notes if marker in note][0]


def write_RiPP(txt, info, options):
    """Write the RiPP table to the open TXT handle.

    A header line is written first, then one tab-separated row per core
    peptide feature returned by _find_core_peptides() for every cluster
    number in ``info.clusternrs``.
    Columns: RiPP ID, annotation, core peptide, molecular weight,
    monoisotopic_mass, alternative molecular weights, number of bridges.

    The RiPP ID is the accession (text before the first ".") of the first
    cluster CDS overlapping the peptide; it is left empty when no CDS
    overlaps. All values except the alternative weights are required in the
    peptide's 'note' qualifiers; a missing one raises IndexError.
    ``options`` is accepted but not used by this writer.
    """
    txt.write("\t".join([
        "RiPP ID", "annotation", "core peptide", "molecular weight",
        "monoisotopic_mass", "alternative molecular weights",
        "number of bridges"
    ]) + "\n")
    for BGCnr in info.clusternrs:
        cluster_feature = utils.get_cluster_by_nr(info.seq_record, BGCnr)
        cluster_gene_features = utils.get_cluster_cds_features(
            cluster_feature, info.seq_record)
        for peptide in _find_core_peptides(cluster_feature, info.seq_record):
            # Resolve the ID per peptide instead of through a parallel list:
            # a peptide without an overlapping CDS used to shift every later
            # ID by one position (or raise IndexError at the end).
            RiPP_ID = ""
            for cds in cluster_gene_features:
                if utils.features_overlap(cds, peptide):
                    RiPP_ID = utils.get_gene_acc(cds).partition(".")[0]
                    break

            note_quals = peptide.qualifiers['note']
            annotation = _first_note_value(
                note_quals, "predicted class:", "predicted class: ")
            core_peptide = _first_note_value(
                note_quals, "predicted core seq:", "predicted core seq: ")
            mol_weight = _first_note_value(
                note_quals, "molecular weight: ", "molecular weight: ")
            monoiso_mass = _first_note_value(
                note_quals, "monoisotopic mass: ", "monoisotopic mass: ")
            # Optional value. The notes are searched by substring; the old
            # `"alternative weights" in note_quals` compared whole list
            # entries and therefore never matched.
            alt_candidates = [
                qual.partition("alternative weights: ")[2].replace(" ", "")
                for qual in note_quals if "alternative weights:" in qual
            ]
            alt_mol_weights = alt_candidates[0] if alt_candidates else ""
            nr_bridges = _first_note_value(
                note_quals, "number of bridges: ", "number of bridges: ")
            txt.write("\t".join([
                RiPP_ID, annotation, core_peptide, mol_weight, monoiso_mass,
                alt_mol_weights, nr_bridges
            ]) + "\n")
Ejemplo n.º 7
0
def parse_all_clusters(blasttext, minseqcoverage, minpercidentity, seq_record):
    """ Parses blast results and groups them by the query's cluster number

        blasttext: the output from diamond in blast format
        minseqcoverage: the exclusive lower bound of sequence coverage for a match
        minpercidentity: the exclusive lower bound of identity similarity for a match
        seq_record: used to get all gene ids in the cluster, and used as a
                backup to fetch sequence length if missing from seqlengths

        Returns a pair of dicts, both keyed by query cluster number: the
        first maps to an OrderedDict of hit cluster name -> list of Query
        objects, the second to an OrderedDict of query name -> Query.
    """
    seqlengths = fastaseqlengths(seq_record)
    geneclustergenes = [
        utils.get_gene_acc(cds)
        for cds in utils.get_withincluster_cds_features(seq_record)
    ]
    # Collections for the cluster number currently being filled; they are
    # replaced whenever a query with an unseen cluster number shows up.
    active_queries = OrderedDict()
    active_hits = OrderedDict()
    rows = [line.split("\t") for line in blasttext.rstrip().split("\n")]
    latest_query = None
    queries_per_number = {}
    hits_per_number = {}

    for fields in uniqueblasthitfilter(rows):
        query_name = fields[0]
        hit = parse_subject(fields, seqlengths, geneclustergenes, seq_record)

        # both limits are exclusive: a hit sitting exactly on one is dropped
        if hit.perc_ident <= minpercidentity \
                or hit.perc_coverage <= minseqcoverage:
            continue

        if query_name not in active_queries:
            latest_query = Query(query_name, len(active_queries))
            number = latest_query.cluster_number
            if number not in queries_per_number:
                # first query of this cluster number: start fresh collections
                active_queries = OrderedDict()
                active_hits = OrderedDict()
                # the index was taken from the previous collection's size
                latest_query.index = 0
                queries_per_number[number] = active_queries
                hits_per_number[number] = active_hits
            active_queries[query_name] = latest_query

        active_hits.setdefault(hit.genecluster, []).append(latest_query)

        # link the subject to the query
        latest_query.add_subject(hit)

    return hits_per_number, queries_per_number
Ejemplo n.º 8
0
def create_blast_inputs(genecluster, seq_record):
    """Build the BLAST query names and sequences for one gene cluster.

    Every CDS of the cluster gets a "|"-joined name of the form
    input|c<cluster number>|<start>-<end>|<strand>|<accession>|<annotation>,
    using the nofuzzy start/end of the CDS location.

    Returns three parallel lists: query names, amino acid sequences (as
    strings) and CDS accessions.
    """
    names, sequences, accessions = [], [], []
    for cds in utils.get_cluster_cds_features(genecluster, seq_record):
        # anything other than strand 1 is reported as "-"
        strand = "+" if cds.strand == 1 else "-"
        cluster_tag = "c" + str(utils.get_cluster_number(genecluster))
        span = str(cds.location.nofuzzy_start) + "-" + \
            str(cds.location.nofuzzy_end)
        name_fields = [
            "input", cluster_tag, span, strand,
            utils.get_gene_acc(cds), utils.get_gene_annotation(cds)
        ]
        query_name = "|".join(name_fields)
        sequences.append(str(utils.get_aa_sequence(cds)))
        names.append(query_name)
        accessions.append(utils.get_gene_acc(cds))

    return names, sequences, accessions
Ejemplo n.º 9
0
def perform_clusterblast(options, seq_record, clusters, proteinlocations, proteinstrands, proteinannotations, proteintags):
    """Run a DIAMOND ClusterBlast search for every gene cluster of seq_record.

    For each cluster (in the order given by
    utils.get_sorted_cluster_features) the cluster proteins are written to
    "input.fasta", searched with DIAMOND against the plant database when
    options.taxon is "plants" and the general database otherwise, converted
    to tabular form, filtered, scored and written out through
    write_clusterblast_output().

    A non-zero return code from DIAMOND or from the conversion is logged as
    an error; processing then continues with whatever "input.out" holds.

    In debug mode (options.debug) a cluster is skipped when
    <options.dbgclusterblast>/clusterblast/cluster<N>.txt already exists.

    All work happens inside a TemporaryDirectory(change=True) context.
    Returns None.
    """
    logging.info("Running DIAMOND gene cluster searches..")
    geneclusters = utils.get_sorted_cluster_features(seq_record)
    with TemporaryDirectory(change=True) as tempdir:
        for genecluster in geneclusters:
            clusternumber = utils.get_cluster_number(genecluster)
            if options.debug:
                # Build the path once so the existence check and the log
                # message agree. The previous message was assembled with
                # `"%s" % a + b + ...`, which formatted only the directory
                # and appended the rest of the path after "instead".
                debug_file = os.sep.join([
                    options.dbgclusterblast, "clusterblast",
                    "cluster" + str(clusternumber) + ".txt"
                ])
                if os.path.exists(debug_file):
                    logging.debug("Skipping Clusterblast calculations, using results from %s instead", debug_file)
                    continue

            logging.info("   Gene cluster " + str(clusternumber))
            queryclusternames, queryclusterseqs, queryclusterprots = create_blast_inputs(genecluster, seq_record)
            utils.writefasta(queryclusternames, queryclusterseqs, "input.fasta")
            # Only the database name differs between plant and other runs
            if options.taxon == "plants":
                database = "plantgeneclusterprots"
            else:
                database = "geneclusterprots"
            out, err, retcode = run_diamond("input.fasta", path.join(options.clusterblastdir, database), tempdir, options)
            if retcode != 0:
                logging.error("Running diamond failed: returned %s, stderr: %r, stdout: %r", retcode, err, out)
            out, err, retcode = convert_to_tabular(tempdir)
            if retcode != 0:
                logging.error("Converting daa failed: returned %s, stderr: %r, stdout: %r", retcode, err, out)

            with open("input.out", 'r') as fh:
                blastoutput = fh.read()

            write_raw_clusterblastoutput(options.full_outputfolder_path, blastoutput)
            logging.info("   DIAMOND search finished. Parsing results...")
            # Hit thresholds passed to parse_blast for cluster searches
            minseqcoverage = 10
            minpercidentity = 30
            blastdict, querylist, hitclusters = parse_blast(blastoutput, seq_record, minseqcoverage, minpercidentity)
            querylist = remove_queries_without_hits(querylist, blastdict)
            allcoregenes = [utils.get_gene_acc(cds) for cds in utils.get_secmet_cds_features(seq_record)]
            # The third returned value (hit cluster dict) is not needed here
            rankedclusters, rankedclustervalues, _, hitclusterdata = score_clusterblast_output(blastdict, querylist, hitclusters, clusters, allcoregenes)

            # Bundle everything the output writer needs in one Storage object
            clusterblastStorage = utils.Storage()
            clusterblastStorage.clusternumber = clusternumber
            clusterblastStorage.queryclusterprots = queryclusterprots
            clusterblastStorage.clusters = clusters
            clusterblastStorage.hitclusterdata = hitclusterdata
            clusterblastStorage.rankedclusters = rankedclusters
            clusterblastStorage.rankedclustervalues = rankedclustervalues
            clusterblastStorage.proteintags = proteintags
            clusterblastStorage.proteinlocations = proteinlocations
            clusterblastStorage.proteinannotations = proteinannotations
            clusterblastStorage.proteinstrands = proteinstrands

            write_clusterblast_output(options, seq_record, clusterblastStorage)
Ejemplo n.º 10
0
def create_blast_inputs(genecluster, seq_record):
    """Build the BLAST query names and sequences for one gene cluster.

    When the configured taxon is "plants" the cluster's CDS features are
    first passed through filter_overlap(). Every remaining CDS gets a
    "|"-joined name of the form
    input|c<cluster number>|<start>-<end>|<strand>|<accession>|<annotation>,
    where ">" and "<" are removed from the start/end text.

    Returns three parallel lists: query names, amino acid sequences (as
    strings) and CDS accessions.
    """
    def plain(position):
        # drop the ">" / "<" characters from the position's string form
        return str(position).replace(">", "").replace("<", "")

    options = config.get_config()
    plant_mode = options.taxon == "plants"
    cds_features = utils.get_cluster_cds_features(genecluster, seq_record)
    if plant_mode:
        cds_features = filter_overlap(cds_features)

    names, sequences, accessions = [], [], []
    for cds in cds_features:
        # anything other than strand 1 is reported as "-"
        strand = "+" if cds.strand == 1 else "-"
        cluster_tag = "c" + str(utils.get_cluster_number(genecluster))
        span = plain(cds.location.start) + "-" + plain(cds.location.end)
        name_fields = [
            "input", cluster_tag, span, strand,
            utils.get_gene_acc(cds), utils.get_gene_annotation(cds)
        ]
        query_name = "|".join(name_fields)
        sequences.append(str(utils.get_aa_sequence(cds)))
        names.append(query_name)
        accessions.append(utils.get_gene_acc(cds))

    return names, sequences, accessions
Ejemplo n.º 11
0
def write(seq_records, options):
    """Export antiSMASH results as tab-separated TXT tables.

    Nothing is written for protein input (options.input_type == 'prot').
    Otherwise a "txt" subdirectory is created below
    options.full_outputfolder_path if needed, and for every record that has
    at least one cluster feature one file per table is written, named
    <record id up to the first ".">_<table>.txt. The content of each file is
    produced by the matching write_tables.write_<table>(handle, info,
    options) function.

    Returns None.
    """
    logging.debug("Exporting antiSMASH information as txt tables")
    # Don't store TXT tables for protein input
    if options.input_type == 'prot':
        return
    # Locate the output folder, create the TXT subdirectory
    txt_outfolder = options.full_outputfolder_path + os.sep + "txt"
    if not os.path.exists(txt_outfolder):
        os.mkdir(txt_outfolder)
    tables = "genome", "BGC", "signature_gene_info", "gene", "NRPS_PKS", "smCOG", "RiPP", "transltable"
    for seq_record in seq_records:
        clusters = utils.get_cluster_features(seq_record)
        if len(clusters) == 0:
            continue
        txt_files = {}
        try:
            for table in tables:
                txt_files[table] = open(
                    path.join(
                        txt_outfolder, "%s_%s.txt" %
                        (seq_record.id.partition(".")[0], table)), "w")
            # Gather the per-cluster information shared by all table writers
            info = utils.Storage()
            info.clustertypes, info.clustergenes, info.accessions, info.cdsmotifs, info.clusternrs = {}, {}, {}, {}, []
            for cluster in clusters:
                clusternr = utils.get_cluster_number(cluster)
                info.clusternrs.append(clusternr)
                info.clustertypes[clusternr] = utils.get_cluster_type(cluster)
                info.clustergenes[clusternr] = [
                    utils.get_gene_id(cds)
                    for cds in utils.get_cluster_cds_features(
                        cluster, seq_record)
                ]
                info.accessions[clusternr] = [
                    utils.get_gene_acc(cds)
                    for cds in utils.get_cluster_cds_features(
                        cluster, seq_record)
                ]
                info.cdsmotifs[clusternr] = utils.get_all_features_of_type(
                    seq_record, ["CDS_motif"])
            info.seq_record = seq_record
            # Dispatch to write_tables.write_<table> for every table
            for table in tables:
                getattr(write_tables, 'write_' + table)(txt_files[table], info,
                                                        options)
        finally:
            # Close whatever was opened, even if opening a later file or one
            # of the table writers raised.
            for handle in txt_files.values():
                handle.close()
Ejemplo n.º 12
0
def blastparse(blasttext, minseqcoverage, minpercidentity, seq_record):
    """ Parses blast results into queries and the clusters they hit

        blasttext: the output from diamond in blast format
        minseqcoverage: the exclusive lower bound of sequence coverage for a match
        minpercidentity: the exclusive lower bound of identity similarity for a match
        seq_record: used to get all gene ids in the cluster, and used as a
                backup to fetch sequence length if missing from seqlengths

        Returns (queries, clusters): an OrderedDict of query name -> Query
        and an OrderedDict of hit cluster name -> list of Query objects.
    """
    seqlengths = fastaseqlengths(seq_record)
    geneclustergenes = [
        utils.get_gene_acc(cds)
        for cds in utils.get_withincluster_cds_features(seq_record)
    ]
    queries = OrderedDict()
    clusters = OrderedDict()
    rows = [line.split("\t") for line in blasttext.rstrip().split("\n")]
    latest_query = None

    for fields in uniqueblasthitfilter(rows):
        query_name = fields[0]
        hit = parse_subject(fields, seqlengths, geneclustergenes, seq_record)

        # both limits are exclusive: a hit sitting exactly on one is dropped
        if hit.perc_ident <= minpercidentity \
                or hit.perc_coverage <= minseqcoverage:
            continue

        if query_name not in queries:
            latest_query = Query(query_name, len(queries))
            queries[query_name] = latest_query

        clusters.setdefault(hit.genecluster, []).append(latest_query)

        # link the subject to the query
        latest_query.add_subject(hit)

    return queries, clusters
Ejemplo n.º 13
0
def write_signature_gene_info(txt, info, options):
    """Write the signature gene table to the open TXT handle.

    A header line is written first. Then, for every cluster number in
    ``info.clusternrs``, each CDS carrying a 'sec_met' qualifier with a
    'Domains detected: ' entry yields one tab-separated row per
    ";"-separated domain of the first such entry.
    Columns: signature gene, pHMM hits, e-value, bit score, number of seeds.
    ``options`` is accepted but not used by this writer.
    """
    marker = 'Domains detected: '
    column_names = ("signature gene", "pHMM hits", "e-value", "bit score",
                    "number of seeds")
    txt.write("\t".join(column_names) + "\n")
    for cluster_nr in info.clusternrs:
        cluster = utils.get_cluster_by_nr(info.seq_record, cluster_nr)
        cluster_genes = utils.get_cluster_cds_features(
            cluster, info.seq_record)
        secmet_genes = [
            gene for gene in cluster_genes if 'sec_met' in gene.qualifiers
        ]
        for gene in secmet_genes:
            detected = [
                entry for entry in gene.qualifiers['sec_met']
                if entry.startswith(marker)
            ]
            if not detected:
                continue
            gene_ID = utils.get_gene_acc(gene).partition(".")[0]
            # split(";") already returns the whole text as a single item
            # when no ";" is present, so one code path covers both cases
            for domain in detected[0].partition(marker)[2].split(";"):
                fields = [
                    gene_ID,
                    domain.partition(" (")[0].replace(" ", ""),
                    domain.partition("E-value: ")[2].partition(",")[0],
                    domain.partition("bitscore: ")[2].partition(",")[0],
                    domain.partition("seeds: ")[2].partition(")")[0],
                ]
                txt.write("\t".join(fields) + "\n")
0
def perform_clusterblast(options, seq_record, clusters, proteins):
    """Run one DIAMOND ClusterBlast search covering all clusters of seq_record.

    The proteins of every cluster are collected into a single "input.fasta"
    and searched in one DIAMOND run, unless options.dbgclusterblast is set
    and <options.dbgclusterblast>/clusterblastoutput.txt exists, in which
    case that file is used as the search output instead. The output is
    stored via write_raw_clusterblastoutput(), parsed per cluster number,
    scored, and written through write_clusterblast_output() once per
    cluster.

    A non-zero DIAMOND return code is logged as an error; processing then
    continues with whatever "input.out" holds.

    All work happens inside a TemporaryDirectory(change=True) context.
    Returns None.
    """
    geneclusters = utils.get_sorted_cluster_features(seq_record)
    # Resolve the debug file before entering the temporary directory, so a
    # relative options.dbgclusterblast is made absolute against the
    # directory that is current at this point. The path is only built when
    # the option is set: the guard further down already treats the option
    # as possibly empty, and os.path.join() cannot take a None value.
    debug_path = None
    if options.dbgclusterblast:
        debug_path = os.path.abspath(
            os.path.join(options.dbgclusterblast, "clusterblastoutput.txt"))
    with TemporaryDirectory(change=True) as tempdir:
        all_names, all_seqs = [], []
        prots_by_cluster = []
        for genecluster in geneclusters:
            names, seqs, prots = create_blast_inputs(genecluster, seq_record)
            all_names.extend(names)
            all_seqs.extend(seqs)
            prots_by_cluster.append(prots)
        if debug_path is not None and os.path.exists(debug_path):
            logging.debug(
                "Skipping DIAMOND calculations, using results from %s instead",
                debug_path)
            with open(debug_path, "r") as fh:
                blastoutput = fh.read()
            logging.debug("    Parsing results from given file...")
        else:
            logging.debug("Running DIAMOND gene cluster search..")
            utils.writefasta(all_names, all_seqs, "input.fasta")
            out, err, retcode = run_diamond(
                "input.fasta",
                path.join(options.clusterblastdir, "geneclusterprots"),
                tempdir, options)
            if retcode != 0:
                logging.error(
                    "Running diamond failed: returned %s, stderr: %r, stdout: %r",
                    retcode, err, out)
            logging.debug("   DIAMOND search finished. Parsing results...")

            with open("input.out", 'r') as fh:
                blastoutput = fh.read()

        write_raw_clusterblastoutput(options.full_outputfolder_path,
                                     blastoutput)

        # Hit thresholds passed to parse_all_clusters for cluster searches
        minseqcoverage = 10
        minpercidentity = 30
        clusters_by_number, _ = parse_all_clusters(blastoutput, minseqcoverage,
                                                   minpercidentity, seq_record)

        # One Storage object is reused for all clusters; the per-cluster
        # fields are overwritten on every iteration below.
        clusterblastStorage = utils.Storage()
        clusterblastStorage.clusters = clusters
        clusterblastStorage.proteins = proteins

        for genecluster, queryclusterprots in zip(geneclusters,
                                                  prots_by_cluster):
            clusternumber = utils.get_cluster_number(genecluster)
            # clusters without any accepted hit get an empty mapping
            cluster_names_to_queries = clusters_by_number.get(
                clusternumber, {})
            allcoregenes = [
                utils.get_gene_acc(cds)
                for cds in utils.get_secmet_cds_features(seq_record)
            ]
            ranking = score_clusterblast_output(clusters, allcoregenes,
                                                cluster_names_to_queries)

            clusterblastStorage.clusternumber = clusternumber
            clusterblastStorage.queryclusterprots = queryclusterprots
            clusterblastStorage.ranking = ranking

            write_clusterblast_output(options, seq_record, clusterblastStorage)
Ejemplo n.º 15
0
def blastparse(blasttext, minseqcoverage, minpercidentity, seqlengths, seq_record):
    """Parse tabular BLAST output into per-query hit tables.

    blasttext: tab-separated BLAST output, one hit per line, each line
            terminated by a newline
    minseqcoverage, minpercidentity: thresholds handed to
            tresholdblasthitfilter()
    seqlengths: dict of query accession -> protein length; when a query is
            missing, its length is taken from the matching feature of
            seq_record instead
    seq_record: record providing the within-cluster gene accessions and the
            fallback sequence lengths

    Returns a list [blastdict, querylist, hitclusters]:
        blastdict: query -> [subjectlist, querydict], where querydict maps
            each subject to [genecluster, start, end, strand, annotation,
            perc_ident, blastscore, perc_coverage, evalue, locustag]
        querylist: queries in order of first appearance
        hitclusters: hit gene clusters in order of first appearance; for
            the "plants" taxon only clusters with at least one hit above
            60% identity are kept

    Subjects that are themselves genes of a cluster in seq_record are
    prefixed with "h_".
    """
    options = config.get_config()
    geneclustergenes = [utils.get_gene_acc(cds) for cds in utils.get_withincluster_cds_features(seq_record)]
    blastdict = {}
    querylist = []
    hitclusters = []
    percid_per_cluster = {}
    # Fallback lookup for queries missing from seqlengths; built on first
    # use only, instead of once per blast line.
    feature_by_id = None
    blastlines = blasttext.split("\n")[:-1]
    blastlines = uniqueblasthitfilter(blastlines)
    blastlines = tresholdblasthitfilter(blastlines, minseqcoverage, minpercidentity, seqlengths, seq_record)
    for line in blastlines:
        tabs = line.split("\t")
        query = tabs[0]
        subject_parts = tabs[1].split("|")
        subject = subject_parts[4]
        if subject == "no_locus_tag":
            subject = subject_parts[6]
        if subject in geneclustergenes:
            subject = "h_" + subject
        if len(subject_parts) > 6:
            locustag = subject_parts[6]
        else:
            locustag = ""
        subject_genecluster = subject_parts[0] + "_" + subject_parts[1]
        subject_location = subject_parts[2].split("-")
        subject_start = subject_location[0]
        subject_end = subject_location[1]
        subject_strand = subject_parts[3]
        subject_annotation = subject_parts[5]
        # round to the nearest integer
        perc_ident = int(float(tabs[2]) + 0.5)
        evalue = str(tabs[10])
        blastscore = int(float(tabs[11]) + 0.5)
        query_id = query.split("|")[4]
        if query_id in seqlengths:
            seqlength = seqlengths[query_id]
        else:
            if feature_by_id is None:
                feature_by_id = utils.get_feature_dict_protein_id(seq_record)
            seqlength = len(utils.get_aa_sequence(feature_by_id[query_id]))
        perc_coverage = (float(tabs[3]) / seqlength) * 100

        # Every query owns its [subjectlist, querydict] entry from its
        # first hit on. The earlier first/middle/last-line bookkeeping
        # never stored a query whose only hit was the only line, dropped
        # the previous query when the last line started a new one, and
        # filed hits of a non-contiguous query under the wrong query.
        if query not in blastdict:
            querylist.append(query)
            blastdict[query] = [[], {}]
        subjectlist, querydict = blastdict[query]
        subjectlist.append(subject)
        querydict[subject] = [subject_genecluster, subject_start, subject_end, subject_strand, subject_annotation, perc_ident, blastscore, perc_coverage, evalue, locustag]

        if subject_genecluster not in percid_per_cluster:
            hitclusters.append(subject_genecluster)
            percid_per_cluster[subject_genecluster] = [perc_ident]
        else:
            percid_per_cluster[subject_genecluster].append(perc_ident)
    # For plants, filter hitclusters to only keep those with at least one hit > 60% ID
    if options.taxon == "plants":
        hitclusters = [cluster for cluster in hitclusters if any(pid > 60 for pid in percid_per_cluster[cluster])]
    return [blastdict, querylist, hitclusters]
Ejemplo n.º 16
0
def _first_prefixed_value(qualifiers, prefix, separator):
    """Return the text following `separator` in the first matching qualifier.

    The first entry of `qualifiers` that starts with `prefix` is selected and
    everything after the first occurrence of `separator` in it is returned.
    Returns "" when no entry starts with `prefix`, or when the selected entry
    does not contain `separator`.
    """
    for qual in qualifiers:
        if qual.startswith(prefix):
            return qual.partition(separator)[2]
    return ""


def write_NRPS_PKS(txt, info, options):
    """Write the NRPS/PKS domain table to TXT.

    One header line is written, followed by one tab-separated line per
    aSDomain feature of every NRPS/PKS CDS in every cluster of `info`.

    Parameters:
        txt: object with a `write()` method that receives the table text.
        info: object providing `clusternrs` (iterable of cluster numbers) and
            `seq_record` (the record the clusters are looked up in).
        options: unused; kept so all table writers share one signature.
    """
    #TXT columns: cluster ID, NRPS/PKS ID, annotation, aSDomain, score, evalue,
    # domain type, subtype, domain start, domain end, KR activity,
    # KR stereochemistry, NRPSPredictor2, Stachelhaus, Minowa, pkssignature,
    # consensus
    txt.write("\t".join([
        "Cluster_ID", "NRPSPKS_ID", "annotation", "aSDomain", "score",
        "evalue", "domain_type", "subtype", "domain_start", "domain_end",
        "KR activity", "KR stereochemistry", "NRPSPredictor2", "Stachelhaus",
        "Minowa", "pkssignature", "consensus"
    ]) + "\n")

    # (prefix selecting the 'specificity' qualifier, separator preceding the
    # value), listed in the order of the last seven output columns.
    specificity_fields = (
        ("KR activity", "KR activity: "),
        ("KR stereochemistry", "KR stereochemistry: "),
        ("NRPSpredictor2", "NRPSpredictor2 SVM: "),
        ("Stachelhaus", "Stachelhaus code: "),
        ("Minowa", "Minowa: "),
        ("PKS signature", "PKS signature: "),
        ("consensus", "consensus: "),
    )
    subtype_prefix = "NRPS/PKS subtype: "

    for BGCnr in info.clusternrs:
        #Retrieve all data that will be written out
        cluster_feature = utils.get_cluster_by_nr(info.seq_record, BGCnr)
        cluster_gene_features = utils.get_cluster_cds_features(
            cluster_feature, info.seq_record)
        # NOTE(review): the record id is used here including any ".version"
        # suffix, whereas the gene IDs below are cut at the first "." --
        # confirm this is the intended key for joining with other tables.
        cluster_id = "{seq_id}_c{cluster_nr}".format(seq_id=info.seq_record.id,
                                                     cluster_nr=BGCnr)
        NRPSs_PKSs = [
            cds for cds in cluster_gene_features
            if 'sec_met' in cds.qualifiers and any(
                qual.startswith('NRPS/PKS Domain:')
                for qual in cds.qualifiers['sec_met'])
        ]
        if not NRPSs_PKSs:
            continue
        # The domain features depend only on the cluster, so fetch them once
        # per cluster instead of once per CDS; list() makes them re-iterable.
        cluster_domains = list(
            utils.get_cluster_aSDomain_features(cluster_feature,
                                                info.seq_record))
        for cds in NRPSs_PKSs:
            enzyme_ID = utils.get_gene_acc(cds).partition(".")[0]
            # Select and parse with the same prefix, so a qualifier that
            # passes the check is always the one that gets parsed.
            subtype_quals = [
                qual for qual in cds.qualifiers['sec_met']
                if qual.startswith(subtype_prefix)
            ]
            if subtype_quals:
                enzyme_annotation = subtype_quals[0].partition(
                    subtype_prefix)[2]
            else:
                logging.warning("No enzyme annotation for %s", enzyme_ID)
                enzyme_annotation = ""
            gene_id = utils.get_gene_id(cds)
            # A domain belongs to this CDS when it overlaps it and carries its
            # gene id as locus_tag (compared against both the qualifier value
            # itself and its first element).
            aSDomains = [
                dom for dom in cluster_domains
                if utils.features_overlap(cds, dom) and gene_id in
                [dom.qualifiers['locus_tag'], dom.qualifiers['locus_tag'][0]]
            ]
            for aSDomain in aSDomains:
                domtype = aSDomain.qualifiers['domain'][0]
                if "domain_subtype" in aSDomain.qualifiers:
                    subtype = aSDomain.qualifiers['domain_subtype'][0]
                else:
                    subtype = ""
                aSDomain_ID = aSDomain.qualifiers['asDomain_id'][0]
                score = str(aSDomain.qualifiers['score'][0])
                evalue = str(aSDomain.qualifiers['evalue'][0])
                dom_start = str(aSDomain.location.start)
                dom_end = str(aSDomain.location.end)
                # Domains without a 'specificity' qualifier get empty strings
                # in all seven prediction columns.
                if 'specificity' in aSDomain.qualifiers:
                    specificity = aSDomain.qualifiers['specificity']
                else:
                    specificity = []
                predictions = [
                    _first_prefixed_value(specificity, prefix, separator)
                    for prefix, separator in specificity_fields
                ]

                txt.write("\t".join([
                    cluster_id, enzyme_ID, enzyme_annotation, aSDomain_ID,
                    score, evalue, domtype, subtype, dom_start, dom_end
                ] + predictions) + "\n")
Ejemplo n.º 17
0
def write_BGC(txt, info, options):
    """Write the BGC overview table to TXT.

    One header line is written, followed by one tab-separated line per
    cluster number in `info.clusternrs`.

    Parameters:
        txt: object with a `write()` method that receives the table text.
        info: object providing `clusternrs`, `seq_record`, `clustertypes`
            and `accessions` (the latter two indexed by cluster number).
        options: unused; kept so all table writers share one signature.
    """
    #TXT columns: BGC ID, BGC_type, detection_rules_used, BGC_range, genes, subclusters,
    # NRPSs_PKSs, signature_genes, RiPPs, pred_structure, monomers
    txt.write("\t".join([
        "BGC ID", "BGC type", "detection rules used", "BGC_range", "genes",
        "subclusters", "NRPSs/PKSs", "signature_genes", "RiPPs",
        "predicted structure", "monomers"
    ]) + "\n")
    for BGCnr in info.clusternrs:
        #Retrieve all data that will be written out
        BGC_ID = "%s_c%s" % (info.seq_record.id.partition(".")[0], BGCnr)
        cluster_feature = utils.get_cluster_by_nr(info.seq_record, BGCnr)
        cluster_gene_features = utils.get_cluster_cds_features(
            cluster_feature, info.seq_record)
        # Hybrid cluster types are "-"-joined on input and ";"-joined on output
        BGC_type = info.clustertypes[BGCnr].replace("-", ";")
        detection_rules_used = '"' + ";".join(
            get_detection_rules(cluster_feature)) + '"'
        BGC_range = ";".join([
            str(cluster_feature.location.start),
            str(cluster_feature.location.end)
        ])
        genes = ";".join(info.accessions[BGCnr])
        if 'subclusterblast' in cluster_feature.qualifiers:
            # Keep only what follows the first tab of each qualifier
            subclusters = ";".join([
                qual.partition("\t")[2]
                for qual in cluster_feature.qualifiers['subclusterblast']
            ])
        else:
            subclusters = ""
        #TODO The subclusterblast module should probably be changed for the precalcs to provide a list here of the 100% hits instead of all hits
        NRPSs_PKSs = ";".join([
            utils.get_gene_acc(cds).partition(".")[0]
            for cds in cluster_gene_features
            if 'sec_met' in cds.qualifiers and any(
                qual.startswith('NRPS/PKS Domain:')
                for qual in cds.qualifiers['sec_met'])
        ])
        signature_genes = ";".join([
            utils.get_gene_acc(cds).partition(".")[0]
            for cds in cluster_gene_features if 'sec_met' in cds.qualifiers
        ])
        # Look the core peptides up once and reuse the result for both the
        # emptiness check and the iteration.
        core_peptides = _find_core_peptides(cluster_feature, info.seq_record)
        if core_peptides:
            ripp_list = []
            for peptide in core_peptides:
                # Report the first CDS overlapping the peptide; a peptide
                # overlapping no CDS contributes nothing.
                for cds in cluster_gene_features:
                    if utils.features_overlap(cds, peptide):
                        ripp_list.append(
                            utils.get_gene_acc(cds).partition(".")[0])
                        break
            RiPPs = ";".join(ripp_list)
        else:
            RiPPs = "-"
        if 'structure' in cluster_feature.qualifiers:
            pred_structure = ";".join(cluster_feature.qualifiers['structure'])
        else:
            pred_structure = "N/A"
        monomers = utils.get_structure_pred(cluster_feature)
        #Write data to TXT
        txt.write("\t".join([
            BGC_ID, BGC_type, detection_rules_used, BGC_range, genes,
            subclusters, NRPSs_PKSs, signature_genes, RiPPs, pred_structure,
            monomers
        ]) + "\n")
Ejemplo n.º 18
0
def get_description(record, feature, type_, options):
    """Get the description text of a feature.

    Assembles an HTML template from the feature's qualifiers and the enabled
    options, then fills it in with %-formatting from a dict of replacements.

    Parameters:
        record: record containing the feature; its `id` and length are used
            for the genomic context link.
        feature: feature to describe; its `qualifiers` and `location` are read.
        type_: when equal to 'transport', a TransportDB BLAST link is added.
        options: configuration object; reads `smcogs`, `input_type`,
            `knownclusterblast`, `full_outputfolder_path` and
            `searchgtr_links`.

    Returns:
        The filled-in HTML string.
    """

    # Keys the final template does not reference are simply ignored by
    # %-formatting with a mapping, so defaults can stay in place.
    replacements = {
        'locus_tag': ", ".join(feature.qualifiers.get('locus_tag', ['-'])),
        'protein_id': ", ".join(feature.qualifiers.get('protein_id', ['-'])),
        'smcog': '-',
        'ecnumber': '-',
        'transport_blast_line': '',
        'smcog_tree_line': '',
        'searchgtr_line': '',
        # location.start is 0-based; the displayed start is 1-based
        'start': int(feature.location.start) + 1,
        'end': int(feature.location.end),
        'model_details': get_model_details(feature),
        'asf': ''
    }

    blastp_url = "http://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE=Proteins&" \
                 "PROGRAM=blastp&BLAST_PROGRAMS=blastp&QUERY=%s&" \
                 "LINK_LOC=protein&PAGE_TYPE=BlastSearch"
    genomic_context_url = "http://www.ncbi.nlm.nih.gov/projects/sviewer/?" \
                          "Db=gene&DbFrom=protein&Cmd=Link&noslider=1&"\
                          "id=%s&from=%s&to=%s"
    template = '<span class="svgene-tooltip-bold">%(product)s</span><br>\n'
    template += 'Locus-tag: %(locus_tag)s; Protein-ID: %(protein_id)s<br>\n'
    if 'EC_number' in feature.qualifiers:
        template += "EC-number(s): %(ecnumber)s<br>\n"
        replacements['ecnumber'] = ", ".join(feature.qualifiers['EC_number'])
    if options.smcogs:
        template += "smCOG: %(smcog)s<br>\n"
    if options.input_type == 'nucl':
        template += "Location: %(start)s - %(end)s<br><br>\n"
    if 'sec_met' in feature.qualifiers:
        template += '<span class="bold">Signature pHMM hits:</span><br>\n%(model_details)s<br>\n'

    if options.knownclusterblast:

        mibig_homology_path = glob(
            os.path.join(options.full_outputfolder_path, "knownclusterblast",
                         "cluster*",
                         utils.get_gene_acc(feature) + '_mibig_hits.txt'))
        if mibig_homology_path:
            mibig_homology_file = mibig_homology_path[0]
            generate_html_table(mibig_homology_file)
            # Swap only the trailing extension; splitting on '.txt' would cut
            # the path short if '.txt' also occurred in a directory name.
            html_file = os.path.splitext(mibig_homology_file)[0] + '.html'
            # Link relative to the output folder
            replacements['mibig_homology_path'] = html_file[
                len(options.full_outputfolder_path) + 1:]
            template += '<a href="%(mibig_homology_path)s" target="_new">MiBIG Hits</a><br><br>\n'
    template += """
%(transport_blast_line)s
%(searchgtr_line)s
<a href="%(blastp_url)s" target="_new">NCBI BlastP on this gene</a><br>
<a href="%(genomic_context_url)s" target="_new">View genomic context</a><br>
%(smcog_tree_line)s<br>"""
    # Compute the Active Site Finder text once and reuse it for both the
    # template decision and the replacement value.
    asf = get_ASF_predictions(feature)
    if asf != "":
        template += '<span class="bold">Active Site Finder results:</span><br>\n%(asf)s<br><br>\n'
        replacements['asf'] = asf
    template += """AA sequence: <a href="javascript:copyToClipboard('%(sequence)s')">Copy to clipboard</a><br>"""

    replacements['product'] = feature.qualifiers.get('product', ['-'])[0]
    if 'translation' in feature.qualifiers:
        sequence = feature.qualifiers['translation'][0]
    else:
        sequence = str(utils.get_aa_sequence(feature))
    replacements['blastp_url'] = blastp_url % sequence
    replacements['sequence'] = sequence
    if len(sequence) > 2000:
        len_seq = 30
    else:
        # Floor division keeps this an integer on Python 2 and Python 3 alike
        len_seq = (len(sequence) // 80) + 1
    # Not referenced by the template assembled above; retained as before.
    replacements['len_seq'] = len_seq
    # Show roughly 10 kb on either side of the feature, clamped to the record
    replacements['genomic_context_url'] = genomic_context_url % \
                    ( record.id,
                      max(feature.location.start - 9999, 0),
                      min(feature.location.end + 10000, len(record)) )

    if options.smcogs:
        for note in feature.qualifiers.get('note', []):
            if note.startswith('smCOG:') and '(' in note:
                # Text between the 'smCOG:' prefix and the first '(' holds
                # "<id>:<description>"
                text = note[6:].split('(', 1)[0]
                smcog, desc = text.split(':', 1)
                desc = desc.replace('_', ' ')
                replacements['smcog'] = '%s (%s)' % (smcog, desc)
            elif note.startswith('smCOG tree PNG image:'):
                entry = '<a href="%s" target="_new">View smCOG seed phylogenetic tree with this gene</a>'
                url = note.split(':')[-1]
                replacements['smcog_tree_line'] = entry % url

    if type_ == 'transport':
        url = "http://blast.jcvi.org/er-blast/index.cgi?project=transporter;" \
              "program=blastp;database=pub/transporter.pep;" \
              "sequence=sequence%%0A%s" % sequence
        # The anchor is closed explicitly so the link does not run on into
        # the markup that follows it.
        transport_blast_line = '<a href="%s" target="_new">TransportDB BLAST on this gene</a><br>' % url
        replacements['transport_blast_line'] = transport_blast_line

    searchgtr_key = record.id + "_" + utils.get_gene_id(feature)
    if searchgtr_key in options.searchgtr_links:
        url = options.searchgtr_links[searchgtr_key]
        searchgtr_line = '<a href="%s" target="_new">SEARCHGTr on this gene</a><br>' % url
        replacements['searchgtr_line'] = searchgtr_line

    return template % replacements
Ejemplo n.º 19
0
def _write_cell_with_fallback(sheet, row, col, value, fallback):
    """Write `value` into a sheet cell; if that raises, write `fallback` instead."""
    try:
        sheet.write(row, col, value)
    except Exception:
        sheet.write(row, col, fallback)


def write(seq_records, options):
    """Write the gene cluster overview as a TXT file and an XLS workbook.

    Does nothing when `options.input_type` is 'prot'. Otherwise one line per
    cluster is written to "geneclusters.txt" and one row per cluster to the
    first worksheet; the workbook is saved as "<id>.geneclusters.xls", named
    after the last record in `seq_records`. Both files go to
    `options.full_outputfolder_path`.

    Parameters:
        seq_records: iterable of records whose cluster features are exported.
        options: configuration object; reads `input_type`,
            `full_outputfolder_path` and `knownclusterblast`.
    """
    if options.input_type == 'prot':
        return
    outfolder = options.full_outputfolder_path
    too_many_genes = "Too many genes to be contained in Excel cell; see txt file in downloadable zip archive."

    #Set up the XLS workbook with a bold header row
    wb = Workbook()
    font1 = Font()
    style1 = XFStyle()
    style1.font = font1
    font1.bold = True
    ws0 = wb.add_sheet('0')
    ws0.write(0, 0, "Input accession number", style1)
    ws0.write(0, 1, "Input name", style1)
    ws0.write(0, 2, "Gene cluster type", style1)
    ws0.write(0, 3, "Gene cluster genes", style1)
    ws0.write(0, 4, "Gene cluster gene accessions", style1)
    if options.knownclusterblast:
        ws0.write(0, 5, "Compound with gene cluster of highest homology",
                  style1)

    #For each gene cluster, write out info
    row = 1  # row 0 holds the header
    last_record_id = None
    # The context manager closes the TXT file even if writing fails part-way
    with open(path.join(outfolder, "geneclusters.txt"), "w") as txtfile:
        for seq_record in seq_records:
            last_record_id = seq_record.id
            for cluster in utils.get_cluster_features(seq_record):
                clustertype = utils.get_cluster_type(cluster)
                clusternr = utils.get_cluster_number(cluster)
                cluster_cds = list(
                    utils.get_cluster_cds_features(cluster, seq_record))
                clustergenes = [utils.get_gene_id(cds) for cds in cluster_cds]
                accessions = [utils.get_gene_acc(cds) for cds in cluster_cds]
                ws0.write(row, 0, seq_record.id)
                # Cell writes that fail (per the fallback texts, over-long
                # values) get an explanatory placeholder; the TXT file still
                # carries the full data.
                _write_cell_with_fallback(
                    ws0, row, 1, seq_record.description,
                    "Name to long to be contained in Excel cell; see txt file in downloadable zip archive."
                )
                ws0.write(row, 2, clustertype)
                _write_cell_with_fallback(ws0, row, 3,
                                          ";".join(clustergenes),
                                          too_many_genes)
                _write_cell_with_fallback(ws0, row, 4, ";".join(accessions),
                                          too_many_genes)
                if hasattr(seq_record, 'closestcompounddict') and \
                   clusternr in seq_record.closestcompounddict:
                    ws0.write(row, 5,
                              seq_record.closestcompounddict[clusternr])
                row += 1
                txtfile.write("\t".join([
                    seq_record.id, seq_record.description, clustertype,
                    ";".join(clustergenes), ";".join(accessions)
                ]) + "\n")
    if last_record_id is None:
        # No records at all: there is no id to name the workbook after, so
        # only the (empty) TXT file is produced.
        return
    wb.save(path.join(outfolder, "%s.geneclusters.xls" % last_record_id))