Ejemplo n.º 1
0
def analyse_biosynthetic_order(pksnrpsvars, seq_record, options):
    """Determine a gene order for every NRPS/PKS cluster of ``seq_record``.

    For each cluster feature of type "nrps" or "pks" that contains PKS/NRPS
    core genes, the gene order is obtained either from docking domain
    analysis or from ``find_colinear_order`` and is then passed on to
    ``generate_substrates_order``.  ``options.docking`` is filled with one
    entry per processed cluster number: True when docking domain analysis was
    used, False otherwise.
    """
    # Gather NRPS and PKS cluster features, dropping duplicates
    nrps_features = utils.get_cluster_features_of_type(seq_record, "nrps")
    pks_features = utils.get_cluster_features_of_type(seq_record, "pks")
    nrpspksclusters = list(set(nrps_features + pks_features))

    if 'docking' not in options:
        options.docking = {}

    for genecluster in nrpspksclusters:
        core_genes = find_clusterpksnrpsgenes(genecluster,
                                              pksnrpsvars.pksnrpscoregenes)
        if len(core_genes) == 0:
            # Nothing to order in this cluster
            continue

        pksgenes, clusterpksgenes, nrpsgenes, hybridgenes = \
            find_cluster_modular_enzymes(core_genes, pksnrpsvars)
        cluster_number = utils.get_cluster_number(genecluster)

        # Docking domain analysis is only attempted for clusters with
        # 4 to 10 PKS genes and neither NRPS nor hybrid genes
        pks_only = nrpsgenes == 0 and hybridgenes == 0
        if 3 < pksgenes < 11 and pks_only:
            geneorder = perform_docking_domain_analysis(
                options, clusterpksgenes, cluster_number, seq_record,
                pksnrpsvars)
            options.docking[cluster_number] = True
        else:
            geneorder = find_colinear_order(core_genes, seq_record,
                                            pksnrpsvars.domainnamesdict)
            options.docking[cluster_number] = False

        generate_substrates_order(cluster_number, geneorder, pksnrpsvars,
                                  seq_record)
Ejemplo n.º 2
0
    def test_get_cluster_number(self):
        "Test utils.get_cluster_number()"
        # numbered cluster features report their own number
        first_feature = self.features[0]
        self.assertEqual(1, utils.get_cluster_number(first_feature))
        last_feature = self.features[-1]
        self.assertEqual(2, utils.get_cluster_number(last_feature))

        # a cluster feature without a number is expected to yield 0
        unnumbered = FakeFeature('cluster', FeatureLocation(23, 42))
        self.assertEqual(0, utils.get_cluster_number(unnumbered))
Ejemplo n.º 3
0
def convert_clusters(record, annotations, options):
    """Convert cluster SeqFeatures to JSON

    Args:
        record: the sequence record whose cluster features are converted
        annotations: passed through unchanged to convert_cds_features()
        options: run options; only ``options.input_type`` is read directly
            here, the object is also passed to convert_cds_features()

    Returns:
        A list with one dict per cluster feature, holding the keys 'start',
        'end', 'idx', 'orfs', 'borders', 'tta_codons', 'type',
        'knowncluster' and 'BGCid', plus 'probability' and 'unordered' when
        applicable.
    """
    js_clusters = []
    # The misc_feature list does not depend on the cluster, so fetch it once
    all_misc_features = utils.get_all_features_of_type(record, 'misc_feature')

    for cluster in utils.get_cluster_features(record):
        features = utils.get_cluster_cds_features(cluster, record)
        borders = utils.get_cluster_cluster_border_features(cluster, record)

        # misc_features overlapping the cluster that carry a TTA codon note
        tta_codons = []
        for feature in all_misc_features:
            if not utils.features_overlap(cluster, feature):
                continue
            if 'note' not in feature.qualifiers:
                continue
            if any(note.startswith('tta leucine codon')
                   for note in feature.qualifiers['note']):
                tta_codons.append(feature)

        js_cluster = {}
        # +1 on the start only: presumably converts to 1-based inclusive
        # coordinates -- TODO confirm against the consumer of this JSON
        js_cluster['start'] = int(cluster.location.start) + 1
        js_cluster['end'] = int(cluster.location.end)
        js_cluster['idx'] = utils.get_cluster_number(cluster)
        js_cluster['orfs'] = convert_cds_features(record, features,
                                                  annotations, options)
        js_cluster['borders'] = convert_cluster_border_features(borders)
        js_cluster['tta_codons'] = convert_tta_codons(tta_codons)
        js_cluster['type'] = utils.get_cluster_type(cluster)
        if 'probability' in cluster.qualifiers:
            js_cluster['probability'] = cluster.qualifiers['probability'][0]
        if options.input_type == 'prot':
            js_cluster['unordered'] = True
        # Defaults used when no best known-cluster hit can be determined
        js_cluster['knowncluster'] = "-"
        js_cluster['BGCid'] = "-"

        if 'knownclusterblast' in cluster.qualifiers:
            knownclusters = cluster.qualifiers['knownclusterblast']
            bestcluster = [
                kcluster for kcluster in knownclusters
                if kcluster.startswith('1.')
            ]
            # An entry may start with '1.' without having the expected
            # "<rank>. <id>\t<description>" layout; treat that like a parse
            # error instead of failing on a None match object.
            best_hit = None
            if len(bestcluster) == 1:
                best_hit = re.match(r'\d+\. (\S+)\t(.*)', bestcluster[0])
            if best_hit is None:
                logging.warning(
                    "Error parsing best knowncluster hit; knownclusters array = %s. Possibly no significant hits to known biosynthetic gene clusters.",
                    knownclusters)
            else:
                js_cluster['knowncluster'] = best_hit.group(2)
                js_cluster['BGCid'] = best_hit.group(1)
                logging.debug('Found closest cluster "%s" for cluster no. %s',
                              js_cluster['knowncluster'],
                              utils.get_cluster_number(cluster))
        js_clusters.append(js_cluster)

    return js_clusters
Ejemplo n.º 4
0
def write(seq_records, options):
    """Write the proteins of all clusters to one FASTA file

    The file is named after the id of the first record and is created in
    ``options.outputfoldername``.

    Args:
        seq_records (sequence): Bio.SeqRecords to export; the first element
            is accessed by index to derive the file name
        options (argparse.Namespace): The options passed to the program
    """
    basename = seq_records[0].id
    fasta_filename = "%s_genecluster_proteins.fa" % basename
    output_name = path.join(options.outputfoldername, fasta_filename)
    logging.debug("Writing seq_records to %r" % output_name)

    with open(output_name, 'w+') as handle:
        for seq_record in seq_records:
            for cluster in utils.get_cluster_features(seq_record):
                clustertype = utils.get_cluster_type(cluster)
                clusternr = utils.get_cluster_number(cluster)
                cluster_cds = utils.get_cluster_cds_features(cluster,
                                                             seq_record)
                for feature in cluster_cds:
                    qual = feature.qualifiers
                    # Header: >locus_tag:protein_id type #number - product
                    header_fields = (qual['locus_tag'][0],
                                     qual['protein_id'][0], clustertype,
                                     clusternr, qual['product'][0])
                    handle.write('>%s:%s %s #%s - %s\n' % header_fields)
                    # Sequence wrapped at 60 characters per line
                    sequence_lines = textwrap.wrap(qual['translation'][0], 60)
                    handle.write('%s\n' % '\n'.join(sequence_lines))
Ejemplo n.º 5
0
def load_genecluster_info(seq_record, options, searchtype="general"):
    """Gather data on each gene cluster and store it on ``seq_record``.

    ``seq_record.qgeneclusterdata`` is replaced by a dict that maps each
    cluster number to a 20-element list built from the results of
    retrieve_gene_cluster_annotations(), retrieve_clusterblast_info() (only
    when ``options.clusterblast`` is set) and retrieve_pksnrps_info().
    """
    # smCOG identifier lists handed to retrieve_gene_cluster_annotations()
    gtrcoglist = ['SMCOG1045', 'SMCOG1062', 'SMCOG1102']
    transportercoglist = [
        'SMCOG1000', 'SMCOG1005', 'SMCOG1011', 'SMCOG1020', 'SMCOG1029',
        'SMCOG1033', 'SMCOG1035', 'SMCOG1044', 'SMCOG1065', 'SMCOG1067',
        'SMCOG1069', 'SMCOG1074', 'SMCOG1085', 'SMCOG1096', 'SMCOG1106',
        'SMCOG1118', 'SMCOG1131', 'SMCOG1166', 'SMCOG1169', 'SMCOG1184',
        'SMCOG1202', 'SMCOG1205', 'SMCOG1214', 'SMCOG1234', 'SMCOG1243',
        'SMCOG1245', 'SMCOG1252', 'SMCOG1254', 'SMCOG1288'
    ]
    # Only the smCOG dict is needed; the descriptions are not used here
    smcogdict, _ = utils.get_smcog_annotations(seq_record)

    seq_record.qgeneclusterdata = {}
    for genecluster in utils.get_sorted_cluster_features(seq_record):
        number = utils.get_cluster_number(genecluster)

        (genes, cluster_type, annotations, colors, starts, ends, strands,
         pksnrpsprots, gtrs, transporters,
         size) = retrieve_gene_cluster_annotations(
             seq_record, smcogdict, gtrcoglist, transportercoglist, number)

        hit_data = {}
        if options.clusterblast:
            hit_data = retrieve_clusterblast_info(seq_record, number,
                                                  searchtype=searchtype)

        # The domain list and domain details are returned but not stored
        (prot_names, domains, _, _, nrpspredictor_specs, minowa_specs,
         pkssig_specs, consensus_specs, kr_predictions,
         structpred) = retrieve_pksnrps_info(seq_record, number, pksnrpsprots)

        # Element order is part of the stored format; do not reorder
        seq_record.qgeneclusterdata[number] = [
            cluster_type, size, genes, annotations, starts, ends, strands,
            pksnrpsprots, prot_names, domains, nrpspredictor_specs,
            minowa_specs, pkssig_specs, consensus_specs, gtrs, transporters,
            colors, hit_data, structpred, kr_predictions
        ]
Ejemplo n.º 6
0
def convert_clusters(record, annotations, options):
    """Convert cluster SeqFeatures to JSON

    Args:
        record: the sequence record whose cluster features are converted
        annotations: passed through unchanged to convert_cds_features()
        options: run options; ``coexpress``, ``input_type`` and
            ``enable_cdhit`` are read directly here

    Returns:
        A list with one dict per cluster feature, holding the keys 'start',
        'end', 'idx', 'orfs', 'type', 'knowncluster', 'BGCid' and 'domains',
        plus 'geo', 'probability', 'unordered' and 'cdhitclusters' when
        applicable.
    """
    js_clusters = []
    for cluster in utils.get_cluster_features(record):
        features = utils.get_cluster_cds_features(cluster, record)

        js_cluster = {}
        # +1 on the start only: presumably converts to 1-based inclusive
        # coordinates -- TODO confirm against the consumer of this JSON
        js_cluster['start'] = int(cluster.location.start) + 1
        js_cluster['end'] = int(cluster.location.end)
        js_cluster['idx'] = utils.get_cluster_number(cluster)
        js_cluster['orfs'] = convert_cds_features(record, features,
                                                  annotations, options)
        js_cluster['type'] = utils.get_cluster_type(cluster)
        if options.coexpress:
            js_cluster["geo"] = utils.get_geotable_json(features)
        if 'probability' in cluster.qualifiers:
            js_cluster['probability'] = cluster.qualifiers['probability'][0]
        if options.input_type == 'prot':
            js_cluster['unordered'] = True
        # Defaults used when no best known-cluster hit can be determined
        js_cluster['knowncluster'] = "-"
        js_cluster['BGCid'] = "-"
        js_cluster['domains'] = utils.get_cluster_domains(cluster, record)

        if options.enable_cdhit:
            js_cluster['cdhitclusters'] = utils.get_cluster_cdhit_table(
                cluster, record)

        if 'knownclusterblast' in cluster.qualifiers:
            knownclusters = cluster.qualifiers['knownclusterblast']
            bestcluster = [
                kcluster for kcluster in knownclusters
                if kcluster.startswith('1.')
            ]
            # An entry may start with '1.' without having the expected
            # "<rank>. <id>\t<description>" layout; treat that like a parse
            # error instead of failing on a None match object.
            best_hit = None
            if len(bestcluster) == 1:
                best_hit = re.match(r'\d+\. (\S+)\t(.*)', bestcluster[0])
            if best_hit is None:
                logging.warning(
                    "Error parsing best knowncluster hit; knownclusters array = %s. Possibly no significant hits to known biosynthetic gene clusters.",
                    knownclusters)
            else:
                js_cluster['knowncluster'] = best_hit.group(2)
                js_cluster['BGCid'] = best_hit.group(1)
                logging.debug('Found closest cluster "%s" for cluster no. %s',
                              js_cluster['knowncluster'],
                              utils.get_cluster_number(cluster))
        js_clusters.append(js_cluster)

    return js_clusters
Ejemplo n.º 7
0
def _write_and_depict_smiles(geneclusternr, smiles_string, structuresfolder):
    """Write the SMILES string to genecluster<nr>.smi in the current working
    directory and return the status reported by depict_smile()."""
    with open("genecluster" + str(geneclusternr) + ".smi", "w") as smilesfile:
        smilesfile.write(smiles_string)
    return depict_smile(geneclusternr, structuresfolder)


def _number_x_placeholders(smiles_string):
    """Replace every "X" character by a running counter starting at 1."""
    pieces = []
    counter = 1
    for char in smiles_string:
        if char == "X":
            pieces.append(str(counter))
            counter += 1
        else:
            pieces.append(char)
    return "".join(pieces)


def generate_chemical_structure_preds(pksnrpsvars, seq_record, options):
    """Turn the compound predictions of each cluster into SMILES and images.

    Args:
        pksnrpsvars: storage object; ``compound_pred_dict`` (cluster number
            -> predicted compound string) is read and ``failedstructures``
            (list of cluster numbers) is updated
        seq_record: record whose cluster features are processed
        options: run options; ``outputfoldername`` is read and
            ``structuresfolder`` is set to the "structures" subfolder

    Side effects:
        Creates the structures folder if needed, temporarily changes the
        working directory to the NRPeditor folder next to this file (always
        restored on exit), writes genecluster<nr>.smi files there, runs the
        external ``main`` program for non-ectoine predictions and calls
        _update_sec_met_entry() for every cluster with the resulting SMILES
        string ("N/A" when none was produced).
    """
    # Function-scope import so the block does not rely on a module-level one
    import logging

    #Create directory to store structures
    options.structuresfolder = path.abspath(
        path.join(options.outputfoldername, "structures"))
    if not os.path.exists(options.structuresfolder):
        os.mkdir(options.structuresfolder)
    originaldir = os.getcwd()
    structure_drawing_dir = utils.get_full_path(__file__, '') + os.sep + "NRPeditor"
    os.chdir(structure_drawing_dir)
    try:
        #Combine predictions into a prediction of the final chemical structure and generate images
        for genecluster in utils.get_cluster_features(seq_record):
            smiles_string = "N/A"
            geneclusternr = utils.get_cluster_number(genecluster)
            if geneclusternr in pksnrpsvars.compound_pred_dict:
                prediction = pksnrpsvars.compound_pred_dict[geneclusternr]
                if prediction == "ectoine":
                    # ectoine has a predefined SMILES string
                    smiles_string = "CC1=NCCC(N1)C(=O)O"
                    depictstatus = _write_and_depict_smiles(
                        geneclusternr, smiles_string, options.structuresfolder)
                    if depictstatus == "failed":
                        pksnrpsvars.failedstructures.append(geneclusternr)
                    elif geneclusternr in pksnrpsvars.failedstructures:
                        # failedstructures holds cluster numbers, so the
                        # number (not the feature) has to be looked up
                        pksnrpsvars.failedstructures.remove(geneclusternr)
                else:
                    # use information on peptide / polyketide sequence to generate structure image
                    residues = prediction.replace("(", "").replace(")", "").replace(" + ", " ").replace("-", " ")
                    nrresidues = len(residues.split(" "))
                    if nrresidues > 1:
                        # win32 and darwin call "main" as before; every other
                        # platform (linux2, linux, ...) runs it from the
                        # current directory
                        if sys.platform in ('win32', 'darwin'):
                            executable = 'main'
                        else:
                            executable = './main'
                        monomers = "".join(
                            res + " " for res in residues.split(" ")
                            if len(res) > 1)
                        structcommand = (executable +
                                         ' input 100 4000 1000 AA DDV DIM ' +
                                         str(nrresidues + 1) + ' "' +
                                         monomers + 'TE"')
                        pipe = os.popen(structcommand)
                        try:
                            smilesinfo = pipe.read()
                        finally:
                            pipe.close()
                        if "core peptide: " not in smilesinfo:
                            # The external program produced no usable output;
                            # record the failure instead of raising IndexError
                            logging.warning(
                                "Structure generation for cluster %s produced no core peptide",
                                geneclusternr)
                            pksnrpsvars.failedstructures.append(geneclusternr)
                        else:
                            smiles_string = (smilesinfo.split("core peptide: ")[1]).split("\ntermintype")[0]
                            if sys.platform.startswith('linux') or sys.platform == 'darwin':
                                # NOTE(review): the original code called
                                # smiles_string.replace("[X]", "[*:X]") here
                                # but discarded the result, so it never had
                                # an effect. That behaviour is kept; confirm
                                # whether "[*:N]" placeholders were intended.
                                smiles_string = _number_x_placeholders(smiles_string)
                            depictstatus = _write_and_depict_smiles(
                                geneclusternr, smiles_string,
                                options.structuresfolder)
                            if depictstatus == "failed":
                                pksnrpsvars.failedstructures.append(geneclusternr)
            _update_sec_met_entry(genecluster, smiles_string)
    finally:
        # Restore the working directory even if a cluster failed
        os.chdir(originaldir)
Ejemplo n.º 8
0
def _get_transatpks_geneclusters(pksnrpsvars, seq_record):
    """Map the number of each "transatpks" cluster to the gene IDs returned by
    find_clusterpksnrpsgenes() for that cluster."""
    # set() drops duplicate cluster features before they are processed
    transat_clusters = set(
        utils.get_cluster_features_of_type(seq_record, "transatpks"))
    genes_in_cluster = {}
    for cluster in transat_clusters:
        cluster_id = utils.get_cluster_number(cluster)
        core_genes = find_clusterpksnrpsgenes(cluster,
                                              pksnrpsvars.pksnrpscoregenes)
        genes_in_cluster[cluster_id] = list(map(utils.get_gene_id, core_genes))
    return genes_in_cluster
Ejemplo n.º 9
0
def perform_subclusterblast(options, seq_record, clusters, proteinlocations,
                            proteinstrands, proteinannotations, proteintags):
    """Run the BLAST subcluster search for each cluster and write the results.

    For every cluster feature of ``seq_record`` the query proteins are
    written out, BLAST is run with ``searchtype="subclusters"``, the output
    is parsed and scored, and the result is passed to
    write_clusterblast_output().  In debug mode a cluster is skipped when a
    stored result file for it exists below ``options.dbgclusterblast``.

    The ``clusters`` and ``protein*`` arguments are not interpreted here;
    they are forwarded to the scoring step and stored with the results.
    """
    logging.info("Running NCBI BLAST+ subcluster searches..")
    geneclusters = utils.get_sorted_cluster_features(seq_record)
    with TemporaryDirectory(change=True):
        for genecluster in geneclusters:
            clusternumber = utils.get_cluster_number(genecluster)

            # options.dbgclusterblast is only consulted in debug mode
            stored_result = None
            if options.debug:
                stored_result = os.sep.join([
                    options.dbgclusterblast, "subclusterblast",
                    "cluster" + str(clusternumber) + ".txt"
                ])
            if stored_result is not None and os.path.exists(stored_result):
                # Log the complete path inside the message; previously the
                # path tail was appended after "instead" because % binds
                # tighter than +
                logging.debug(
                    "Skipping SubClusterblast calculations, using results from %s instead",
                    stored_result)
                continue

            logging.info("   Gene cluster " + str(clusternumber))
            queryclusternames, queryclusterseqs, queryclusterprots = create_blast_inputs(
                genecluster, seq_record)
            write_clusterblast_inputfiles(options, queryclusternames,
                                          queryclusterseqs)
            run_clusterblast_processes(options, searchtype="subclusters")
            blastoutput = read_clusterblast_output(options)
            write_raw_clusterblastoutput(options.full_outputfolder_path,
                                         blastoutput,
                                         searchtype="subclusters")
            logging.info("   Blast search finished. Parsing results...")
            # Thresholds passed to parse_blast()
            minseqcoverage = 40
            minpercidentity = 45
            blastdict, querylist, hitclusters = parse_blast(
                blastoutput, seq_record, minseqcoverage, minpercidentity)
            querylist = remove_queries_without_hits(querylist, blastdict)
            allcoregenes = [
                utils.get_gene_acc(cds)
                for cds in utils.get_secmet_cds_features(seq_record)
            ]
            # The third return value (hit cluster dict) is not needed here
            rankedclusters, rankedclustervalues, _, hitclusterdata = score_clusterblast_output(
                blastdict, querylist, hitclusters, clusters, allcoregenes)

            # store all clusterblast related data in a utils.Storage object and serialize it
            subclusterblastStorage = utils.Storage()
            subclusterblastStorage.clusternumber = clusternumber
            subclusterblastStorage.queryclusterprots = queryclusterprots
            subclusterblastStorage.clusters = clusters
            subclusterblastStorage.hitclusterdata = hitclusterdata
            subclusterblastStorage.rankedclusters = rankedclusters
            subclusterblastStorage.rankedclustervalues = rankedclustervalues
            subclusterblastStorage.proteintags = proteintags
            subclusterblastStorage.proteinlocations = proteinlocations
            subclusterblastStorage.proteinannotations = proteinannotations
            subclusterblastStorage.proteinstrands = proteinstrands

            write_clusterblast_output(options,
                                      seq_record,
                                      subclusterblastStorage,
                                      searchtype="subclusters")
Ejemplo n.º 10
0
def load_clusterblast_outputdata(seq_record, options):
    """Read in the ClusterBlast data of every cluster of ``seq_record``.

    Resets ``seq_record.queryclusterdata`` and
    ``seq_record.nrhitgeneclusters`` to empty dicts, then for each cluster
    reads its result file, hands the parsed pieces to
    parse_clusterblast_details() and stores the third-from-first returned
    item (the hit cluster list) in the feature's 'clusterblast' qualifier.
    """
    seq_record.queryclusterdata = {}
    seq_record.nrhitgeneclusters = {}
    for cluster_feature in utils.get_sorted_cluster_features(seq_record):
        number = utils.get_cluster_number(cluster_feature)
        (details, top_hits, hit_count, query_genes, query_gene_details,
         accessions) = read_clusterblastfile(seq_record, options, number)
        parse_clusterblast_details(options, seq_record, number, details,
                                   top_hits, hit_count, query_genes,
                                   query_gene_details, accessions)
        cluster_feature.qualifiers['clusterblast'] = top_hits
Ejemplo n.º 11
0
def perform_clusterblast(options, seq_record, clusters, proteinlocations, proteinstrands, proteinannotations, proteintags):
    """Run the DIAMOND gene cluster search for each cluster and write results.

    For every cluster feature of ``seq_record`` the query proteins are
    written to input.fasta in a temporary working directory, DIAMOND is run
    against the database selected by ``options.taxon``, the output is
    converted, parsed and scored, and the result is passed to
    write_clusterblast_output().  In debug mode a cluster is skipped when a
    stored result file for it exists below ``options.dbgclusterblast``.

    The ``clusters`` and ``protein*`` arguments are not interpreted here;
    they are forwarded to the scoring step and stored with the results.
    """
    logging.info("Running DIAMOND gene cluster searches..")
    geneclusters = utils.get_sorted_cluster_features(seq_record)
    with TemporaryDirectory(change=True) as tempdir:
        for genecluster in geneclusters:
            clusternumber = utils.get_cluster_number(genecluster)

            # options.dbgclusterblast is only consulted in debug mode
            stored_result = None
            if options.debug:
                stored_result = os.sep.join([
                    options.dbgclusterblast, "clusterblast",
                    "cluster" + str(clusternumber) + ".txt"
                ])
            if stored_result is not None and os.path.exists(stored_result):
                # Log the complete path inside the message; previously the
                # path tail was appended after "instead" because % binds
                # tighter than +
                logging.debug("Skipping Clusterblast calculations, using results from %s instead", stored_result)
                continue

            logging.info("   Gene cluster " + str(clusternumber))
            queryclusternames, queryclusterseqs, queryclusterprots = create_blast_inputs(genecluster, seq_record)
            utils.writefasta(queryclusternames, queryclusterseqs, "input.fasta")

            # Plant runs search a separate protein database
            if options.taxon == "plants":
                database = "plantgeneclusterprots"
            else:
                database = "geneclusterprots"
            out, err, retcode = run_diamond("input.fasta", path.join(options.clusterblastdir, database), tempdir, options)
            # NOTE(review): failures are only logged and processing goes on;
            # since the working directory is shared by all clusters, a stale
            # input.out from a previous cluster could be read below -- confirm
            # whether the cluster should be skipped instead.
            if retcode != 0:
                logging.error("Running diamond failed: returned %s, stderr: %r, stdout: %r", retcode, err, out)
            out, err, retcode = convert_to_tabular(tempdir)
            if retcode != 0:
                logging.error("Converting daa failed: returned %s, stderr: %r, stdout: %r", retcode, err, out)

            with open("input.out", 'r') as fh:
                blastoutput = fh.read()

            write_raw_clusterblastoutput(options.full_outputfolder_path, blastoutput)
            logging.info("   DIAMOND search finished. Parsing results...")
            # Thresholds passed to parse_blast()
            minseqcoverage = 10
            minpercidentity = 30
            blastdict, querylist, hitclusters = parse_blast(blastoutput, seq_record, minseqcoverage, minpercidentity)
            querylist = remove_queries_without_hits(querylist, blastdict)
            allcoregenes = [utils.get_gene_acc(cds) for cds in utils.get_secmet_cds_features(seq_record)]
            # The third return value (hit cluster dict) is not needed here
            rankedclusters, rankedclustervalues, _, hitclusterdata = score_clusterblast_output(blastdict, querylist, hitclusters, clusters, allcoregenes)

            # store all clusterblast related data in a utils.Storage object
            clusterblastStorage = utils.Storage()
            clusterblastStorage.clusternumber = clusternumber
            clusterblastStorage.queryclusterprots = queryclusterprots
            clusterblastStorage.clusters = clusters
            clusterblastStorage.hitclusterdata = hitclusterdata
            clusterblastStorage.rankedclusters = rankedclusters
            clusterblastStorage.rankedclustervalues = rankedclustervalues
            clusterblastStorage.proteintags = proteintags
            clusterblastStorage.proteinlocations = proteinlocations
            clusterblastStorage.proteinannotations = proteinannotations
            clusterblastStorage.proteinstrands = proteinstrands

            write_clusterblast_output(options, seq_record, clusterblastStorage)
Ejemplo n.º 12
0
def internal_homology_blast(seq_record):
    """BLAST the proteins of each gene cluster against themselves.

    Returns the dictionary that find_internal_orthologous_groups()
    accumulates over all clusters of ``seq_record`` (groups of internal
    homologs, including singles, per cluster).
    """
    # NOTE(review): the configuration is fetched but not used below; the call
    # is kept in case get_config() has side effects -- confirm and remove.
    options = config.get_config()
    with TemporaryDirectory(change=True):
        logging.info("Finding internal homologs in each gene cluster..")
        homology_groups = {}
        for genecluster in utils.get_sorted_cluster_features(seq_record):
            clusternumber = utils.get_cluster_number(genecluster)
            # Query sequences of this cluster go to a FASTA file that
            # run_internal_blastsearch() is expected to pick up
            query_names, query_seqs, _ = create_blast_inputs(genecluster,
                                                             seq_record)
            utils.writefasta(query_names, query_seqs, "internal_input.fasta")
            blastoutput = run_internal_blastsearch()
            # Only the per-query BLAST dict is needed from the parse result
            blast_hits, _, _ = parse_blast(blastoutput, seq_record, 25, 30)
            homology_groups = find_internal_orthologous_groups(
                homology_groups, blast_hits, query_names, clusternumber)
    return homology_groups
Ejemplo n.º 13
0
def write(seq_records, options):
    """Export antiSMASH information as TXT tables.

    Nothing is written for protein input.  For every record that has at
    least one cluster feature, one "<record id prefix>_<table>.txt" file per
    table is created in the "txt" subfolder of
    ``options.full_outputfolder_path`` and filled by the matching
    ``write_tables.write_<table>`` function.

    Args:
        seq_records: iterable of sequence records to export
        options: run options; ``input_type`` and ``full_outputfolder_path``
            are read here, the object is also passed to the table writers
    """
    logging.debug("Exporting antiSMASH information as txt tables")
    #Don't store TXT tables for protein input
    if options.input_type == 'prot':
        return
    #Localize output folder, create TXT subdirectory
    txt_outfolder = options.full_outputfolder_path + os.sep + "txt"
    if not os.path.exists(txt_outfolder):
        os.mkdir(txt_outfolder)
    #Define table names
    tables = "genome", "BGC", "signature_gene_info", "gene", "NRPS_PKS", "smCOG", "RiPP", "transltable"
    #For each gene cluster, write out info to TXT files
    for seq_record in seq_records:
        clusters = utils.get_cluster_features(seq_record)
        if len(clusters) == 0:
            continue

        txt_files = {}
        try:
            #Open up TXT files
            record_prefix = seq_record.id.partition(".")[0]
            for table in tables:
                txt_files[table] = open(
                    path.join(txt_outfolder,
                              "%s_%s.txt" % (record_prefix, table)), "w")
            #Gather all information
            info = utils.Storage()
            info.clustertypes, info.clustergenes, info.accessions, info.cdsmotifs, info.clusternrs = {}, {}, {}, {}, []
            for cluster in clusters:
                clusternr = utils.get_cluster_number(cluster)
                cluster_cds = utils.get_cluster_cds_features(cluster,
                                                             seq_record)
                info.clusternrs.append(clusternr)
                info.clustertypes[clusternr] = utils.get_cluster_type(cluster)
                info.clustergenes[clusternr] = [
                    utils.get_gene_id(cds) for cds in cluster_cds
                ]
                info.accessions[clusternr] = [
                    utils.get_gene_acc(cds) for cds in cluster_cds
                ]
                # Fetched per cluster so each entry keeps its own list
                info.cdsmotifs[clusternr] = utils.get_all_features_of_type(
                    seq_record, ["CDS_motif"])
            info.seq_record = seq_record
            #Write information to tables
            for table in tables:
                getattr(write_tables, 'write_' + table)(txt_files[table], info,
                                                        options)
        finally:
            # Close whatever was opened, even if opening, gathering or one
            # of the table writers raised
            for handle in txt_files.values():
                handle.close()
Ejemplo n.º 14
0
def generate_structure_images(seq_records, options):
    "Generate the structure images based on Monomers prediction in cluster feature"
    for seq_record in seq_records:
        # Temporary workaround: rebuild the part of the pksnrpsvars storage
        # that generate_chemical_structure_preds() needs from the cluster
        # features of the record
        pksnrpsvars = utils.Storage()
        pksnrpsvars.failedstructures = []
        # cluster number -> structure prediction stored on the feature
        pksnrpsvars.compound_pred_dict = dict(
            (utils.get_cluster_number(genecluster),
             utils.get_structure_pred(genecluster))
            for genecluster in utils.get_cluster_features(seq_record))

        # Records without cluster features have nothing to draw
        if pksnrpsvars.compound_pred_dict:
            generate_chemical_structure_preds(pksnrpsvars, seq_record, options)
Ejemplo n.º 15
0
def internal_homology_blast(seq_record):
    """BLAST the proteins of each gene cluster against themselves.

    Returns a dict mapping each cluster number to the groups of internal
    homologs (including singles) that find_internal_orthologous_groups()
    reports for that cluster.
    """
    with TemporaryDirectory(change=True):
        logging.debug("Finding internal homologs in each gene cluster..")
        groups_by_cluster = {}
        for genecluster in utils.get_sorted_cluster_features(seq_record):
            # Only names and sequences are needed from the BLAST inputs
            query_names, query_seqs, _ = create_blast_inputs(genecluster,
                                                             seq_record)
            utils.writefasta(query_names, query_seqs, "internal_input.fasta")
            # 25 and 30 are the two threshold arguments of blastparse()
            queries, _ = blastparse(run_internal_blastsearch(), 25, 30,
                                    seq_record)
            cluster_number = utils.get_cluster_number(genecluster)
            groups_by_cluster[cluster_number] = \
                find_internal_orthologous_groups(queries, query_names)
    return groups_by_cluster
Ejemplo n.º 16
0
def create_blast_inputs(genecluster, seq_record):
    """Create the query names and sequences for a BLAST search of one cluster.

    Returns a tuple of three parallel lists with one entry per CDS feature of
    the cluster: the "|"-separated query names, the amino acid sequences and
    the gene accessions.
    """
    names = []
    sequences = []
    accessions = []
    for cds in utils.get_cluster_cds_features(genecluster, seq_record):
        strand = "+" if cds.strand == 1 else "-"
        cluster_tag = "c" + str(utils.get_cluster_number(genecluster))
        span = (str(cds.location.nofuzzy_start) + "-" +
                str(cds.location.nofuzzy_end))
        # Name layout: input|c<cluster>|<start>-<end>|<strand>|<acc>|<annotation>
        fullname = "|".join([
            "input", cluster_tag, span, strand,
            utils.get_gene_acc(cds),
            utils.get_gene_annotation(cds)
        ])
        sequences.append(str(utils.get_aa_sequence(cds)))
        names.append(fullname)
        accessions.append(utils.get_gene_acc(cds))

    return names, sequences, accessions
Ejemplo n.º 17
0
def write(seq_records, options):
    """Write all cluster proteins to a file

    The output file is "<id of first record>_genecluster_proteins.fa" inside
    ``options.outputfoldername``.

    Args:
        seq_records (sequence): Bio.SeqRecords to export; the first element
            is accessed by index to derive the file name
        options (argparse.Namespace): The options passed to the program
    """
    first_id = seq_records[0].id
    output_name = path.join(options.outputfoldername,
                            "%s_genecluster_proteins.fa" % first_id)
    logging.debug("Writing seq_records to %r" % output_name)

    with open(output_name, 'w+') as fasta:
        for record in seq_records:
            for cluster in utils.get_cluster_features(record):
                kind = utils.get_cluster_type(cluster)
                number = utils.get_cluster_number(cluster)
                for cds in utils.get_cluster_cds_features(cluster, record):
                    qualifiers = cds.qualifiers
                    locus_tag = qualifiers['locus_tag'][0]
                    protein_id = qualifiers['protein_id'][0]
                    product = qualifiers['product'][0]
                    fasta.write('>%s:%s %s #%s - %s\n' %
                                (locus_tag, protein_id, kind, number, product))
                    # Protein sequence, wrapped at 60 characters per line
                    wrapped = textwrap.wrap(qualifiers['translation'][0], 60)
                    fasta.write('%s\n' % '\n'.join(wrapped))
Ejemplo n.º 18
0
def mibig_protein_homology(blastoutput, seq_record, geneclusters, clusters,
                           options):
    """Write one tab-separated MiBIG hit table per query protein.

    The BLAST output is parsed per cluster; for each cluster in
    ``geneclusters`` a "cluster<number>" folder is created below
    ``options.knownclusterblast_outputfolder`` and every query protein of
    that cluster gets a "<protein id>_mibig_hits.txt" file listing its
    subjects.  ``clusters`` is indexed by the subject's gene cluster name;
    element 1 of the entry is written as the MiBIG product.
    """
    # Thresholds passed to clusterblast.parse_all_clusters()
    minseqcoverage = 20
    minpercidentity = 20
    header = ('#Protein\tDescription\tMiBIG Cluster\tMiBIG Product'
              '\tPercent ID\tPercent Coverage\tBLAST Score\t Evalue\n')
    _, queries_by_cluster = clusterblast.parse_all_clusters(
        blastoutput, minseqcoverage, minpercidentity, seq_record)

    for genecluster in geneclusters:
        cluster_number = utils.get_cluster_number(genecluster)
        # Clusters without any parsed query simply produce an empty folder
        queries = queries_by_cluster.get(cluster_number, {})

        outputfolder = os.path.join(options.knownclusterblast_outputfolder,
                                    "cluster{}".format(cluster_number))
        if not os.path.exists(outputfolder):
            os.mkdir(outputfolder)

        # The BLAST query only covered proteins of the cluster, so every
        # parsed query gets its own file
        for cluster_protein in queries.values():
            hits_path = (outputfolder + os.sep + cluster_protein.id +
                         '_mibig_hits.txt')
            with open(hits_path, 'w') as outfile:
                outfile.write(header)
                for subject in cluster_protein.subjects.values():
                    mibig_cluster = subject.genecluster
                    row = [
                        subject.locus_tag,
                        subject.annotation,
                        mibig_cluster,
                        clusters[mibig_cluster][1],
                        str(subject.perc_ident),
                        str(subject.perc_coverage),
                        str(subject.blastscore),
                        str(subject.evalue),
                    ]
                    outfile.write('\t'.join(row) + '\n')
Ejemplo n.º 19
0
def write(seq_records, options):
    """Write the final GenBank/GenPept output for ``seq_records``.

    For nucleotide input (``options.input_type == 'nucl'``) every cluster
    feature is additionally cut out of its record and written to its own
    ``<basename>.cluster<NNN>.gbk`` file; the combined file is
    ``<basename>.final.gbk``.  For any other input type the records are first
    passed through ``seq_record_convert_nucl_to_prot`` and written to
    ``<basename>.final.gp``.  ``basename`` is the id of the first record.
    """
    basename = seq_records[0].id
    if options.input_type != 'nucl':
        seq_records = seq_record_convert_nucl_to_prot(seq_records, options)
        output_name = path.join(options.outputfoldername,
                                "%s.final.gp" % basename)
    else:
        output_name = path.join(options.outputfoldername,
                                "%s.final.gbk" % basename)
        for rec in seq_records:
            parent_annotations = rec.annotations
            for cluster in utils.get_cluster_features(rec):
                span = cluster.location
                # Slicing the record may emit warnings; they are silenced.
                with warnings.catch_warnings():
                    warnings.simplefilter("ignore")
                    cluster_rec = rec[span.start:span.end]

                # Annotations carried over from the parent record, with the
                # fallback used when the parent lacks the key.
                inherited = (
                    ("date", ''),
                    ("source", ''),
                    ("organism", ''),
                    ("taxonomy", []),
                    ("data_file_division", 'UNK'),
                )
                for key, fallback in inherited:
                    cluster_rec.annotations[key] = parent_annotations.get(
                        key, fallback)
                # our cut-out clusters are always linear
                cluster_rec.annotations["topology"] = "linear"

                file_name = "%s.cluster%03d.gbk" % (
                    basename, utils.get_cluster_number(cluster))
                cluster_name = path.join(options.outputfoldername, file_name)
                seqio.write([cluster_rec], cluster_name, 'genbank')

    logging.debug("Writing seq_records to %r" % output_name)
    seqio.write(seq_records, output_name, 'genbank')
Ejemplo n.º 20
0
def create_blast_inputs(genecluster, seq_record):
    """Build the FASTA headers, sequences and accessions for one cluster.

    Returns a 3-tuple of parallel lists for the CDS features of
    ``genecluster``: the ``|``-separated FASTA names
    (``input|c<cluster>|<start>-<end>|<strand>|<accession>|<annotation>``),
    the amino acid sequences, and the gene accessions.  When the configured
    taxon is "plants" the CDS list is passed through ``filter_overlap`` first.
    """
    options = config.get_config()
    plant_mode = options.taxon == "plants"
    cluster_cds = utils.get_cluster_cds_features(genecluster, seq_record)
    if plant_mode:
        cluster_cds = filter_overlap(cluster_cds)

    def _plain(position):
        # Drop the "<" / ">" markers from the position's string form.
        return str(position).replace(">", "").replace("<", "")

    names = []
    sequences = []
    accessions = []
    for cds in cluster_cds:
        strand = "+" if cds.strand == 1 else "-"
        fullname = "|".join([
            "input",
            "c" + str(utils.get_cluster_number(genecluster)),
            _plain(cds.location.start) + "-" + _plain(cds.location.end),
            strand,
            utils.get_gene_acc(cds),
            utils.get_gene_annotation(cds),
        ])
        sequences.append(str(utils.get_aa_sequence(cds)))
        names.append(fullname)
        accessions.append(utils.get_gene_acc(cds))

    return names, sequences, accessions
Ejemplo n.º 21
0
def perform_knownclusterblast(options, seq_record, clusters, proteins):
    """Run one DIAMOND known-cluster search for all clusters of a record.

    The proteins of every cluster in ``seq_record`` are collected into a
    single query.  If ``options.dbgclusterblast`` names a directory that
    already holds ``knownclusterblastoutput.txt``, that file is used instead
    of running DIAMOND; otherwise DIAMOND is run against
    ``<options.knownclusterblastdir>/knownclusterprots`` and the raw output is
    stored via ``clusterblast.write_raw_clusterblastoutput``.

    The output is parsed with 40% minimum coverage / 45% minimum identity,
    each cluster is scored and written with
    ``clusterblast.write_clusterblast_output(..., searchtype="knownclusters")``
    and finally ``mibig_protein_homology`` is run on the same BLAST output.

    Args:
        options: run configuration (``dbgclusterblast``,
            ``knownclusterblastdir``, ``full_outputfolder_path`` are read).
        seq_record: record whose cluster features are searched.
        clusters: reference cluster data handed to scoring and output.
        proteins: reference protein data stored on the output storage object.
    """
    logging.debug("Running DIAMOND knowncluster searches..")
    geneclusters = utils.get_sorted_cluster_features(seq_record)

    all_names, all_seqs = [], []
    prots_by_cluster = []
    for genecluster in geneclusters:
        names, seqs, prots = clusterblast.create_blast_inputs(
            genecluster, seq_record)
        all_names.extend(names)
        all_seqs.extend(seqs)
        prots_by_cluster.append(prots)

    # Build the debug path only when a debug directory is configured:
    # os.path.join() raises on None, which previously happened before the
    # truthiness check on options.dbgclusterblast was reached.
    debug_path = None
    if options.dbgclusterblast:
        debug_path = os.path.join(options.dbgclusterblast,
                                  "knownclusterblastoutput.txt")

    if debug_path is not None and os.path.exists(debug_path):
        logging.debug("Skipping DIAMOND calculations, using previous results")
        with open(debug_path, "r") as fh:
            blastoutput = fh.read()
    else:
        with TemporaryDirectory(change=True) as tempdir:
            utils.writefasta(
                [qcname.replace(" ", "_") for qcname in all_names], all_seqs,
                "input.fasta")
            out, err, retcode = clusterblast.run_diamond(
                "input.fasta",
                os.path.join(options.knownclusterblastdir,
                             'knownclusterprots'), tempdir, options)
            if retcode != 0:
                logging.debug("out: %r, err: %r, retcode: %s", out, err,
                              retcode)
            with open("input.out", 'r') as fh:
                blastoutput = fh.read()
            clusterblast.write_raw_clusterblastoutput(
                options.full_outputfolder_path,
                blastoutput,
                searchtype="knownclusters")

    minseqcoverage = 40
    minpercidentity = 45
    clusters_by_number, _ = clusterblast.parse_all_clusters(
        blastoutput, minseqcoverage, minpercidentity, seq_record)

    # One storage object is reused for all clusters; only the per-cluster
    # attributes are overwritten inside the loop.
    knownclusterblastStorage = utils.Storage()
    knownclusterblastStorage.clusters = clusters
    knownclusterblastStorage.proteins = proteins

    for genecluster, queryclusterprots in zip(geneclusters, prots_by_cluster):
        clusternumber = utils.get_cluster_number(genecluster)
        cluster_names_to_queries = clusters_by_number.get(clusternumber, {})
        allcoregenes = [
            utils.get_gene_id(cds)
            for cds in utils.get_secmet_cds_features(seq_record)
        ]
        ranking = clusterblast.score_clusterblast_output(
            clusters, allcoregenes, cluster_names_to_queries)

        # store all clusterblast related data in a utils.Storage object and serialize it
        knownclusterblastStorage.clusternumber = clusternumber
        knownclusterblastStorage.queryclusterprots = queryclusterprots
        knownclusterblastStorage.ranking = ranking
        clusterblast.write_clusterblast_output(options,
                                               seq_record,
                                               knownclusterblastStorage,
                                               searchtype="knownclusters")

    mibig_protein_homology(blastoutput, seq_record, geneclusters, clusters,
                           options)
Ejemplo n.º 22
0
def perform_knownclusterblast(options, seq_record, clusters, proteinlocations,
                              proteinstrands, proteinannotations, proteintags):
    """Run a DIAMOND known-cluster search for each cluster of a record.

    For every cluster in ``seq_record`` the cluster proteins are written to
    ``input.fasta`` inside a temporary directory, searched against
    ``<options.knownclusterblastdir>/knownclusterprots``, parsed with 40%
    minimum coverage / 45% minimum identity, scored, and written with
    ``write_clusterblast_output(..., searchtype="knownclusters")``.

    A cluster is skipped when ``options.debug`` is set and
    ``<options.dbgclusterblast>/knownclusterblast/cluster<N>.txt`` exists.

    Args:
        options: run configuration (``debug``, ``dbgclusterblast``,
            ``knownclusterblastdir``, ``full_outputfolder_path`` are read).
        seq_record: record whose cluster features are searched.
        clusters: reference cluster data handed to scoring and output.
        proteinlocations, proteinstrands, proteinannotations, proteintags:
            reference protein data stored unchanged on the output storage
            object.
    """
    logging.info("Running DIAMOND knowncluster searches..")
    geneclusters = utils.get_sorted_cluster_features(seq_record)

    # Resolve the debug directory before entering the temporary directory.
    # NOTE(review): TemporaryDirectory(change=True) appears to change the
    # working directory (relative file names are used below), which would
    # break a relative dbgclusterblast path - confirm.
    debug_dir = None
    if options.debug and options.dbgclusterblast:
        debug_dir = os.path.join(os.path.abspath(options.dbgclusterblast),
                                 "knownclusterblast")

    with TemporaryDirectory(change=True) as tempdir:
        for genecluster in geneclusters:
            clusternumber = utils.get_cluster_number(genecluster)

            if debug_dir is not None:
                # The same path is used for the check and the message; the
                # check used to look in a misspelled "knwonclusterblast"
                # directory and the message was garbled by %/+ precedence.
                debug_file = os.path.join(
                    debug_dir, "cluster" + str(clusternumber) + ".txt")
                if os.path.exists(debug_file):
                    logging.debug(
                        "Skipping KnownClusterBlast calculations, using "
                        "results from %s instead", debug_file)
                    continue

            logging.info("   Gene cluster " + str(clusternumber))
            queryclusternames, queryclusterseqs, queryclusterprots = create_blast_inputs(
                genecluster, seq_record)
            utils.writefasta(
                [qcname.replace(" ", "_") for qcname in queryclusternames],
                queryclusterseqs, "input.fasta")
            out, err, retcode = run_diamond(
                "input.fasta",
                path.join(options.knownclusterblastdir,
                          'knownclusterprots'), tempdir, options)
            if retcode != 0:
                logging.debug("out: %r, err: %r, retcode: %s", out, err,
                              retcode)
            convert_to_tabular(tempdir)
            with open("input.out", 'r') as fh:
                blastoutput = fh.read()
            write_raw_clusterblastoutput(options.full_outputfolder_path,
                                         blastoutput,
                                         searchtype="knownclusters")
            logging.info("   DIAMOND search finished. Parsing results...")
            minseqcoverage = 40
            minpercidentity = 45
            blastdict, querylist, hitclusters = parse_blast(
                blastoutput, seq_record, minseqcoverage, minpercidentity)
            querylist = remove_queries_without_hits(querylist, blastdict)
            allcoregenes = [
                utils.get_gene_id(cds)
                for cds in utils.get_secmet_cds_features(seq_record)
            ]
            rankedclusters, rankedclustervalues, hitclusterdict, hitclusterdata = score_clusterblast_output(
                blastdict, querylist, hitclusters, clusters, allcoregenes)

            # store all clusterblast related data in a utils.Storage object and serialize it
            knownclusterblastStorage = utils.Storage()
            knownclusterblastStorage.clusternumber = clusternumber
            knownclusterblastStorage.queryclusterprots = queryclusterprots
            knownclusterblastStorage.clusters = clusters
            knownclusterblastStorage.hitclusterdata = hitclusterdata
            knownclusterblastStorage.rankedclusters = rankedclusters
            knownclusterblastStorage.rankedclustervalues = rankedclustervalues
            knownclusterblastStorage.proteintags = proteintags
            knownclusterblastStorage.proteinlocations = proteinlocations
            knownclusterblastStorage.proteinannotations = proteinannotations
            knownclusterblastStorage.proteinstrands = proteinstrands

            write_clusterblast_output(options,
                                      seq_record,
                                      knownclusterblastStorage,
                                      searchtype="knownclusters")
Ejemplo n.º 23
0
def perform_clusterblast(options, seq_record, clusters, proteins):
    """Run one DIAMOND gene-cluster search for all clusters of a record.

    The proteins of every cluster in ``seq_record`` are collected into a
    single query.  If ``options.dbgclusterblast`` names a directory that
    already holds ``clusterblastoutput.txt``, that file is used instead of
    running DIAMOND; otherwise DIAMOND is run against
    ``<options.clusterblastdir>/geneclusterprots``.  In both cases the raw
    output is passed to ``write_raw_clusterblastoutput``, parsed with 10%
    minimum coverage / 30% minimum identity, and each cluster is scored and
    written with ``write_clusterblast_output``.

    Args:
        options: run configuration (``dbgclusterblast``, ``clusterblastdir``,
            ``full_outputfolder_path`` are read).
        seq_record: record whose cluster features are searched.
        clusters: reference cluster data handed to scoring and output.
        proteins: reference protein data stored on the output storage object.
    """
    geneclusters = utils.get_sorted_cluster_features(seq_record)

    # Build the (absolute) debug path only when a debug directory is
    # configured: os.path.join() raises on None, which previously happened
    # before the truthiness check on options.dbgclusterblast was reached.
    # It is made absolute here, before the temporary directory is entered.
    debug_path = None
    if options.dbgclusterblast:
        debug_path = os.path.abspath(
            os.path.join(options.dbgclusterblast, "clusterblastoutput.txt"))

    with TemporaryDirectory(change=True) as tempdir:
        all_names, all_seqs = [], []
        prots_by_cluster = []
        for genecluster in geneclusters:
            names, seqs, prots = create_blast_inputs(genecluster, seq_record)
            all_names.extend(names)
            all_seqs.extend(seqs)
            prots_by_cluster.append(prots)

        if debug_path is not None and os.path.exists(debug_path):
            logging.debug(
                "Skipping DIAMOND calculations, using results from %s instead",
                debug_path)
            with open(debug_path, "r") as fh:
                blastoutput = fh.read()
            logging.debug("    Parsing results from given file...")
        else:
            logging.debug("Running DIAMOND gene cluster search..")
            utils.writefasta(all_names, all_seqs, "input.fasta")
            out, err, retcode = run_diamond(
                "input.fasta",
                path.join(options.clusterblastdir, "geneclusterprots"),
                tempdir, options)
            if retcode != 0:
                logging.error(
                    "Running diamond failed: returned %s, stderr: %r, stdout: %r",
                    retcode, err, out)
            logging.debug("   DIAMOND search finished. Parsing results...")

            with open("input.out", 'r') as fh:
                blastoutput = fh.read()

        write_raw_clusterblastoutput(options.full_outputfolder_path,
                                     blastoutput)

        minseqcoverage = 10
        minpercidentity = 30
        clusters_by_number, _ = parse_all_clusters(blastoutput, minseqcoverage,
                                                   minpercidentity, seq_record)

        # One storage object is reused for all clusters; only the per-cluster
        # attributes are overwritten inside the loop.
        clusterblastStorage = utils.Storage()
        clusterblastStorage.clusters = clusters
        clusterblastStorage.proteins = proteins

        for genecluster, queryclusterprots in zip(geneclusters,
                                                  prots_by_cluster):
            clusternumber = utils.get_cluster_number(genecluster)
            cluster_names_to_queries = clusters_by_number.get(
                clusternumber, {})
            allcoregenes = [
                utils.get_gene_acc(cds)
                for cds in utils.get_secmet_cds_features(seq_record)
            ]
            ranking = score_clusterblast_output(clusters, allcoregenes,
                                                cluster_names_to_queries)

            # store all clusterblast related data in a utils.Storage object
            clusterblastStorage.clusternumber = clusternumber
            clusterblastStorage.queryclusterprots = queryclusterprots
            clusterblastStorage.ranking = ranking

            write_clusterblast_output(options, seq_record, clusterblastStorage)
Ejemplo n.º 24
0
def run_coexpress(seq_record, all_gene_expressions, geo):
    """Compute co-expression correlations and distances per cluster.

    For every cluster feature of ``seq_record`` the CDS genes that have an
    entry in ``all_gene_expressions[seq_record.id]`` are collected, pairwise
    correlation ("cor") and distance ("dist") values are stored on those
    entries, highly correlated genes outside the cluster ("remote genes") are
    added to the distance maps and then filtered, and the result is passed to
    ``update_features``.

    Args:
        seq_record: record whose cluster features are analysed.
        all_gene_expressions: mapping of record id to a mapping of gene id to
            expression entry.  The entries are used directly (not copied), so
            the "cor" and "dist" keys set here end up on the caller's data.
        geo: mapping; ``geo["info"]["id"]`` is read for log messages and is
            passed on to ``update_features``.
    """
    options = get_config()
    # 1-based counter, only used in the progress log message below
    cl_count = 1
    cl_list = utils.get_cluster_features(seq_record)

    gene_expressions = all_gene_expressions[seq_record.id]

    logging.info('Running CoExpress analysis on the clusters..')
    for cluster in cl_list:
        logging.debug(
            'Running CoExpress analysis on record "%s".. (Cluster %s of %s)' %
            (geo["info"]["id"], cl_count, len(cl_list)))
        features = utils.get_cluster_cds_features(cluster, seq_record)
        cl_count += 1
        # gene id -> expression entry, for cluster genes that have expression data
        cluster_genes = {}

        for feature in features:
            gene_id = utils.get_gene_id(feature)
            if gene_id in gene_expressions:
                cluster_genes[gene_id] = gene_expressions[gene_id]

        # Calculate the correlation value between each pair of cluster genes.
        # Genes without an "exp" key are skipped; each pair is computed once
        # and stored symmetrically.
        for gene_1 in cluster_genes:
            if "cor" not in cluster_genes[gene_1]:
                cluster_genes[gene_1]["cor"] = {}
            if "exp" not in cluster_genes[gene_1]:
                continue
            for gene_2 in cluster_genes:
                if "cor" not in cluster_genes[gene_2]:
                    cluster_genes[gene_2]["cor"] = {}
                if gene_2 == gene_1:
                    continue
                if "exp" not in cluster_genes[gene_2]:
                    continue
                # already computed from the other direction
                if gene_1 in cluster_genes[gene_2]["cor"]:
                    continue
                cor_val = calc_correlation_value(cluster_genes[gene_1],
                                                 cluster_genes[gene_2])
                cluster_genes[gene_1]["cor"][gene_2] = cor_val
                cluster_genes[gene_2]["cor"][gene_1] = cor_val

        # Calculate the distance value used for building the dendrogram:
        # 100 * (1 - correlation), with the correlation capped at 1.0.
        # Pairs without a stored correlation (including a gene with itself,
        # which the loop above never fills in) get the default of 100.0.
        for gene_1 in cluster_genes:
            if "dist" not in cluster_genes[gene_1]:
                cluster_genes[gene_1]["dist"] = {}
            for gene_2 in cluster_genes:
                if "dist" not in cluster_genes[gene_2]:
                    cluster_genes[gene_2]["dist"] = {}
                dist = 100.0
                if "cor" in cluster_genes[gene_1] and gene_2 in cluster_genes[
                        gene_1]["cor"]:
                    cor_val = min(1.00, cluster_genes[gene_1]["cor"][gene_2])
                    dist = 100.0 * (1.0 - cor_val)
                cluster_genes[gene_1]["dist"][gene_2] = dist
                cluster_genes[gene_2]["dist"][gene_1] = dist

        # Check for remote genes (genes of any record that are not in this
        # cluster); add them when 0.9 <= correlation < 1.0.  Note that the
        # value is capped at 1.0 first, so correlations of 1.0 or more are
        # NOT added.
        for gene_1 in cluster_genes:
            for seqid in all_gene_expressions:
                # hmm_results keys are "<record id with ':' replaced by '_'>:<gene id>"
                prefix = "%s:" % seqid.replace(":", "_")
                for gene_2 in all_gene_expressions[seqid]:
                    if (
                            prefix + gene_2
                    ) not in options.hmm_results:  # only add biosynthetic remote genes
                        continue
                    if gene_2 == gene_1:
                        continue
                    if gene_2 in cluster_genes:
                        continue
                    cor_val = min(
                        1.00,
                        calc_correlation_value(
                            cluster_genes[gene_1],
                            all_gene_expressions[seqid][gene_2]))
                    if 1.00 > cor_val >= 0.9:
                        # stored on the cluster gene only; the remote gene's
                        # own entry is not modified
                        cluster_genes[gene_1]["dist"][gene_2] = 100.0 * (
                            1.0 - cor_val)

        # Review the remote genes: discard those linked to fewer than 2
        # cluster genes.
        if True:
            # remote gene id -> number of cluster genes it is linked to
            edges_count = {}
            for gene_1 in cluster_genes:
                for gene_2 in cluster_genes[gene_1]["dist"]:
                    if gene_2 not in cluster_genes:
                        if gene_2 not in edges_count:
                            edges_count[gene_2] = 0
                        edges_count[gene_2] += 1
            for gene_1 in cluster_genes:
                new_dists = {}
                for gene_2 in cluster_genes[gene_1]["dist"]:
                    if (gene_2 in cluster_genes) or (edges_count[gene_2] >= 2):
                        new_dists[gene_2] = cluster_genes[gene_1]["dist"][
                            gene_2]
                cluster_genes[gene_1]["dist"] = new_dists

        # Review the remote genes: discard those without any connection to
        # one of this cluster's biosynthetic genes (cluster genes present in
        # options.hmm_results).
        if True:
            have_connections = []
            prefix = "%s:" % seq_record.id.replace(":", "_")
            for gene_1 in cluster_genes:
                if (prefix + gene_1) in options.hmm_results:
                    for gene_2 in cluster_genes[gene_1]["dist"]:
                        if (gene_2 not in cluster_genes) and (
                                gene_2 not in have_connections):
                            have_connections.append(gene_2)
            for gene_1 in cluster_genes:
                new_dists = {}
                for gene_2 in cluster_genes[gene_1]["dist"]:
                    if (gene_2 in cluster_genes) or (gene_2
                                                     in have_connections):
                        new_dists[gene_2] = cluster_genes[gene_1]["dist"][
                            gene_2]
                cluster_genes[gene_1]["dist"] = new_dists

        # Update seq_record with the computed values
        update_features(features, cluster_genes, geo)

    # NOTE(review): the section below is dead code.  It reads `overlaps`,
    # which is not assigned anywhere in this function, and `prefix`, which is
    # only bound inside the cluster loop above - both need attention before
    # this is re-enabled.
    if False:  #This feature is temporarily disabled, saved for next version #options.coexpress_signal_cluster_size < len(overlaps):
        logging.info('Running expression signal analysis on seq_record..')
        signals = []
        n = options.coexpress_signal_cluster_size - 1
        #build list of cluster locations (for annotating signal regions)
        clrefs = []
        for cluster in cl_list:
            clrefs.append(((cluster.location.start, cluster.location.end),
                           utils.get_cluster_number(cluster)))
        clrefs = sorted(clrefs, key=lambda cl: cl[0][0])
        #build signals
        for i in xrange(0, len(overlaps) - n):
            genes = []
            for overlap in overlaps[i:i + n]:
                # prefer the first feature with expression data, else the first feature
                gene = overlap[0]
                for feature in overlap:
                    if utils.get_gene_id(feature) in gene_expressions:
                        gene = feature
                        break
                genes.append(gene)
            cors = []
            checked = []
            hits = []
            for x in xrange(0, len(genes)):
                gene_x = utils.get_gene_id(genes[x])
                if prefix + gene_x in options.hmm_results:
                    hits.append(options.hmm_results[prefix +
                                                    gene_x][0].query_id)
                for y in xrange(0, len(genes)):
                    if ((x, y) in checked) or ((y, x) in checked):
                        continue
                    cor_val = 0
                    gene_y = utils.get_gene_id(genes[y])
                    if (gene_x in gene_expressions) and (gene_y
                                                         in gene_expressions):
                        cor_val = calc_correlation_value(
                            gene_expressions[gene_x], gene_expressions[gene_y])
                    cors.append(cor_val)
                    checked.append((x, y))
            # signal position: midpoint between first gene start and last gene end
            sloc = (genes[0].location.start + genes[-1].location.end) / 2
            cor_val = 0
            if len(cors) > 0 and len(list(set(hits))) > 1:
                cor_val = np.median(cors)
            # number of the cluster containing the signal position, -1 if none
            cl_idx = -1
            for clref in clrefs:
                if sloc < clref[0][0]:
                    continue
                if sloc <= clref[0][1]:
                    cl_idx = clref[1]
                    break
            signals.append((sloc, cor_val, cl_idx))
        if "coexpress_signal" not in options:
            options.coexpress_signal = {}
        if geo["info"]["id"] not in options.coexpress_signal:
            options.coexpress_signal[geo["info"]["id"]] = {}
        options.coexpress_signal[geo["info"]["id"]][seq_record.id] = signals
Ejemplo n.º 25
0
def write_data_to_seq_record(pksnrpsvars, seq_record, options):
    """Save NRPS/PKS domain predictions in ``seq_record``.

    For every gene in ``pksnrpsvars.pksnrpscoregenes`` the
    "NRPS/PKS Domain:" entries of its 'sec_met' qualifier are parsed.  Each
    entry is extended with the predictions available for its domain type
    (AMP-binding, PKS_AT, CAL_domain, PKS_KR) and a new domain feature of type
    ``options.FeatureTags.pksnrpsdomains_tag`` is appended to
    ``seq_record.features``.  Genes listed in
    ``pksnrpsvars.consensuspred_gene_dict`` get a product prediction
    qualifier, and clusters listed in ``pksnrpsvars.compound_pred_dict`` get
    monomer prediction and structure image notes.

    Args:
        pksnrpsvars: container holding the core genes and prediction dicts.
        seq_record: record that is modified in place.
        options: run configuration (``FeatureTags`` and ``QualifierTags`` are
            read).

    Exits the process with status 1 when a domain entry cannot be parsed.
    """
    # Workaround to extract positional information for CDS_motifs from the
    # sec_met qualifiers.
    # NOTE(review): this keeps the qualifiers of the LAST cluster feature
    # only, so the trans-AT check below uses that cluster's product for every
    # gene, and cluster_info is unbound when the record has no clusters -
    # confirm whether a per-gene cluster lookup is intended.
    for f in utils.get_cluster_features(seq_record):
        cluster_info = f.qualifiers

    # Extracts domain type, start, end, e-value and score from a qualifier.
    domain_pattern = re.compile(
        r"NRPS/PKS Domain: ([\w-]+) \((\d+)\-(\d+)\)\. E-value: ([\de\.-]+)\. Score: ([\de\.a-]+);")

    for feature in pksnrpsvars.pksnrpscoregenes:
        # Per-gene counters used to number the domains of each type.
        nrat = 0
        nra = 0
        nrcal = 0
        nrkr = 0
        nrXdom = 0
        secmetqualifiers = feature.qualifiers['sec_met']
        updated_secmetqualifiers = []
        # BiosynML: detailed substrate predictions.  The list is built but
        # not stored on the feature (see the disabled assignment below).
        updated_secmetqualifiers_predictions = []
        domainFeatures = []
        gene_id = utils.get_gene_id(feature)
        for qualifier in secmetqualifiers:
            if "NRPS/PKS Domain:" not in qualifier:
                updated_secmetqualifiers.append(qualifier)
                updated_secmetqualifiers_predictions.append(qualifier)
                continue

            match_pos_obj = domain_pattern.search(qualifier)
            if not match_pos_obj:
                # No exception is active here, so log a plain error.
                logging.error(
                    "Exception: could not extract domain string from qualifier %s:",
                    qualifier)
                sys.exit(1)
            domain_type = match_pos_obj.group(1)
            start_aa = int(match_pos_obj.group(2))
            end_aa = int(match_pos_obj.group(3))
            evalue = float(match_pos_obj.group(4))
            score = float(match_pos_obj.group(5))

            # Convert amino acid coordinates to nucleotide positions; on the
            # reverse strand they are counted back from the gene's end.
            if feature.location.strand == 1:
                start = feature.location.start + (3 * start_aa)
                end = feature.location.start + (3 * end_aa)
            else:
                end = feature.location.end - (3 * start_aa)
                start = feature.location.end - (3 * end_aa)
            loc = FeatureLocation(start, end, strand=feature.strand)

            # set up new CDS_motif feature
            domainFeature = SeqFeature(
                loc, type=options.FeatureTags.pksnrpsdomains_tag)
            domainFeature.qualifiers['domain'] = [domain_type]
            if 'locus_tag' in feature.qualifiers:
                domainFeature.qualifiers['locus_tag'] = feature.qualifiers['locus_tag']
            else:
                domainFeature.qualifiers['locus_tag'] = [gene_id]
            domainFeature.qualifiers['detection'] = ["hmmscan"]
            domainFeature.qualifiers['database'] = ["nrpspksdomains.hmm"]
            domainFeature.qualifiers['evalue'] = ["{:.2E}".format(evalue)]
            domainFeature.qualifiers['score'] = [score]
            if 'transl_table' in feature.qualifiers:
                [transl_table] = feature.qualifiers['transl_table']
            else:
                transl_table = 1
            domainFeature.qualifiers['translation'] = [
                str(domainFeature.extract(seq_record).seq.translate(
                    table=transl_table))
            ]

            domainFeature_specificity = []

            if domain_type == "AMP-binding":
                nra += 1
                domainname = gene_id + "_A" + str(nra)
                domainFeature.qualifiers['label'] = [domainname]
                domainFeature.qualifiers['asDomain_id'] = ["nrpspksdomains_" + domainname]
                svm_pred = pksnrpsvars.nrps_svm_preds[domainname]
                code_pred = pksnrpsvars.nrps_code_preds[domainname]
                minowa_pred = pksnrpsvars.minowa_nrps_preds[domainname]
                consensus = pksnrpsvars.consensuspreds[domainname]
                domainFeature_specificity.append("NRPSpredictor2 SVM: %s" % svm_pred)
                domainFeature_specificity.append("Stachelhaus code: %s" % code_pred)
                domainFeature_specificity.append("Minowa: %s" % minowa_pred)
                domainFeature_specificity.append("consensus: %s" % consensus)

                template = " NRPS/PKS Domain: %s; Substrate specificity predictions: %s (NRPSPredictor2 SVM), %s (Stachelhaus code), %s (Minowa), %s (consensus);"
                newqualifier = qualifier + template % (
                    domainname, svm_pred, code_pred, minowa_pred, consensus)
                # BiosynML: detailed predictions.  The SVM and Stachelhaus
                # details are passed in the order of their labels (they were
                # previously swapped).
                newqualifier_detailed = qualifier + template % (
                    domainname,
                    pksnrpsvars.nrps_svm_preds_details[domainname],
                    pksnrpsvars.nrps_code_preds_details[domainname],
                    pksnrpsvars.minowa_nrps_preds_details[domainname],
                    consensus)
                updated_secmetqualifiers.append(newqualifier)
                updated_secmetqualifiers_predictions.append(newqualifier_detailed)
            elif domain_type == "PKS_AT":
                nrat += 1
                domainname = gene_id + "_AT" + str(nrat)
                domainFeature.qualifiers['label'] = [domainname]
                domainFeature.qualifiers['asDomain_id'] = ["nrpspksdomains_" + domainname]
                code_pred = pksnrpsvars.pks_code_preds[domainname]
                minowa_pred = pksnrpsvars.minowa_pks_preds[domainname]
                domainFeature_specificity.append("PKS signature: %s" % code_pred)
                domainFeature_specificity.append("Minowa: %s" % minowa_pred)
                # trans-AT PKS clusters have their own consensus dict; all
                # other PKS types (t1pks, t2pks, t3pks) share the default one
                if 'transatpks' not in cluster_info['product'][0]:
                    consensus = pksnrpsvars.consensuspreds[domainname]
                else:
                    consensus = pksnrpsvars.consensuspreds_transat[domainname]
                domainFeature_specificity.append("consensus: %s" % consensus)

                template = " Substrate specificity predictions: %s (PKS signature), %s (Minowa), %s (consensus);"
                newqualifier = qualifier + template % (
                    code_pred, minowa_pred, consensus)
                # BiosynML: detailed predictions
                newqualifier_detailed = qualifier + template % (
                    pksnrpsvars.pks_code_preds_details[domainname],
                    pksnrpsvars.minowa_pks_preds_details[domainname],
                    consensus)
                updated_secmetqualifiers.append(newqualifier)
                updated_secmetqualifiers_predictions.append(newqualifier_detailed)
            elif domain_type == "CAL_domain":
                nrcal += 1
                domainname = gene_id + "_CAL" + str(nrcal)
                domainFeature.qualifiers['label'] = [domainname]
                domainFeature.qualifiers['asDomain_id'] = ["nrpspksdomains_" + domainname]
                minowa_pred = pksnrpsvars.minowa_cal_preds[domainname]
                domainFeature_specificity.append("Minowa: %s" % minowa_pred)
                template = " Substrate specificity predictions: %s (Minowa);"
                newqualifier = qualifier + template % (minowa_pred)
                # BiosynML: detailed predictions
                newqualifier_detailed = qualifier + template % (
                    pksnrpsvars.minowa_cal_preds_details[domainname])
                updated_secmetqualifiers.append(newqualifier)
                updated_secmetqualifiers_predictions.append(newqualifier_detailed)
            elif domain_type == "PKS_KR":
                nrkr += 1
                domainname = gene_id + "_KR" + str(nrkr)
                domainFeature.qualifiers['label'] = [domainname]
                domainFeature.qualifiers['asDomain_id'] = ["nrpspksdomains_" + domainname]
                kr_activity = pksnrpsvars.kr_activity_preds[domainname]
                kr_stereo = pksnrpsvars.kr_stereo_preds[domainname]
                domainFeature_specificity.append("KR activity: %s" % kr_activity)
                domainFeature_specificity.append("KR stereochemistry: %s" % kr_stereo)
                newqualifier = qualifier + " Predicted KR activity: %s; Predicted KR stereochemistry: %s;" % (
                    kr_activity, kr_stereo)
                # BiosynML: no separate detailed data for KR domains
                newqualifier_detailed = newqualifier
                updated_secmetqualifiers.append(newqualifier)
                updated_secmetqualifiers_predictions.append(newqualifier_detailed)
            else:
                # Any other domain type: numbered per gene, qualifier unchanged
                nrXdom += 1
                domainFeature.qualifiers['asDomain_id'] = [
                    "nrpspksdomains_" + gene_id.partition(".")[0] + "_Xdom" + '{:02d}'.format(nrXdom)
                ]
                updated_secmetqualifiers.append(qualifier)

            domainFeature.qualifiers['specificity'] = domainFeature_specificity
            mapped_type = _map_domaintype(domain_type)
            if mapped_type:
                domainFeature.qualifiers['domain_subtype'] = [domain_type]
                domainFeature.qualifiers['domain'] = [mapped_type]
            domainFeatures.append(domainFeature)

        feature.qualifiers['sec_met'] = updated_secmetqualifiers
        # BiosynML: the 'sec_met_predictions' qualifier is currently disabled
        #feature.qualifiers['sec_met_predictions'] = updated_secmetqualifiers_predictions
        seq_record.features.extend(domainFeatures)

        if gene_id in pksnrpsvars.consensuspred_gene_dict:
            feature.qualifiers[options.QualifierTags.product_prediction] = "-".join(
                pksnrpsvars.consensuspred_gene_dict[gene_id])

    #Save consensus structure + link to structure image to seq_record
    for cluster in utils.get_cluster_features(seq_record):
        clusternr = utils.get_cluster_number(cluster)
        if clusternr in pksnrpsvars.compound_pred_dict:
            structpred = pksnrpsvars.compound_pred_dict[clusternr]
            cluster.qualifiers['note'].append("Monomers prediction: " + structpred)
            cluster.qualifiers['note'].append("Structure image: structures/genecluster%s.png" % clusternr)
Ejemplo n.º 26
0
def _write_cell_with_fallback(sheet, row, col, value, fallback):
    """Write value into a worksheet cell, or fallback if the write raises.

    The worksheet write can raise when a value cannot be stored in a cell;
    in that case the short explanatory fallback text is written instead.
    """
    try:
        sheet.write(row, col, value)
    except Exception:
        # Only ordinary errors are handled here, so KeyboardInterrupt and
        # SystemExit still propagate (a bare "except:" would hide them).
        sheet.write(row, col, fallback)


def write(seq_records, options):
    """Write a TXT and an XLS overview of all gene clusters.

    Nothing is written when options.input_type is 'prot'.

    For every cluster of every record, one tab-separated line is appended to
    "geneclusters.txt" and one row is added to the first worksheet, holding
    the record id, record description, cluster type, cluster gene ids and
    gene accessions. If the record has a 'closestcompounddict' attribute
    containing the cluster number, that entry is written to column 5.

    Both files are created in options.full_outputfolder_path. The workbook is
    saved as "<id>.geneclusters.xls", where <id> is the id of the last record
    iterated; when seq_records is empty no workbook is saved.
    """
    if options.input_type == 'prot':
        return

    outfolder = options.full_outputfolder_path

    # Set up the workbook with a bold header row.
    wb = Workbook()
    font1 = Font()
    font1.bold = True
    style1 = XFStyle()
    style1.font = font1
    ws0 = wb.add_sheet('0')
    headers = [
        "Input accession number",
        "Input name",
        "Gene cluster type",
        "Gene cluster genes",
        "Gene cluster gene accessions",
    ]
    if options.knownclusterblast:
        headers.append("Compound with gene cluster of highest homology")
    for col, header in enumerate(headers):
        ws0.write(0, col, header, style1)

    too_many_genes = ("Too many genes to be contained in Excel cell; "
                      "see txt file in downloadable zip archive.")
    name_too_long = ("Name to long to be contained in Excel cell; "
                     "see txt file in downloadable zip archive.")

    # Remember the id explicitly instead of relying on the loop variable
    # still being bound after the loop (it is not when seq_records is empty).
    last_record_id = None
    row = 1  # row 0 holds the headers
    with open(path.join(outfolder, "geneclusters.txt"), "w") as txtfile:
        for seq_record in seq_records:
            last_record_id = seq_record.id
            closest_compounds = getattr(seq_record, 'closestcompounddict', None)
            for cluster in utils.get_cluster_features(seq_record):
                clustertype = utils.get_cluster_type(cluster)
                clusternr = utils.get_cluster_number(cluster)
                # Look the CDS features up once and reuse them for both lists.
                cluster_cds = utils.get_cluster_cds_features(cluster, seq_record)
                clustergenes = [utils.get_gene_id(cds) for cds in cluster_cds]
                accessions = [utils.get_gene_acc(cds) for cds in cluster_cds]
                genes_joined = ";".join(clustergenes)
                accessions_joined = ";".join(accessions)

                ws0.write(row, 0, seq_record.id)
                _write_cell_with_fallback(ws0, row, 1, seq_record.description,
                                          name_too_long)
                ws0.write(row, 2, clustertype)
                _write_cell_with_fallback(ws0, row, 3, genes_joined,
                                          too_many_genes)
                _write_cell_with_fallback(ws0, row, 4, accessions_joined,
                                          too_many_genes)
                if closest_compounds is not None and clusternr in closest_compounds:
                    ws0.write(row, 5, closest_compounds[clusternr])
                row += 1

                txtfile.write("\t".join([
                    seq_record.id, seq_record.description, clustertype,
                    genes_joined, accessions_joined
                ]) + "\n")

    if last_record_id is not None:
        wb.save(path.join(outfolder, "%s.geneclusters.xls" % last_record_id))
Ejemplo n.º 27
0
def _load_monomer_smiles(smiles_path):
    """Read a whitespace-separated monomer/SMILES table into a dict.

    The first line of the file is discarded (presumably a header - confirm
    against aaSMILES.txt). Every following line with at least two fields maps
    its first field to its second; shorter lines are ignored.
    """
    monomer_smiles = {}
    with open(smiles_path, 'r') as handle:
        next(handle, None)
        for line in handle:
            fields = line.split()
            if len(fields) > 1:
                monomer_smiles[fields[0]] = fields[1]
    return monomer_smiles


def _depict_cluster_smiles(geneclusternr, smiles_string, structuresfolder):
    """Write the SMILES to genecluster<nr>.smi and return depict_smile's status.

    The .smi file is created by relative name inside a TemporaryDirectory
    context opened with change=True (presumably this makes the temporary
    directory the working directory - confirm against TemporaryDirectory).
    """
    with TemporaryDirectory(change=True):
        with open("genecluster" + str(geneclusternr) + ".smi", "w") as smilesfile:
            smilesfile.write(smiles_string)
        depictstatus = depict_smile(geneclusternr, structuresfolder)
    return depictstatus


def generate_chemical_structure_preds(pksnrpsvars, seq_record, options):
    """Build a SMILES string per gene cluster and try to depict it.

    Sets options.structuresfolder to "<options.outputfoldername>/structures"
    (absolute path) and creates that directory if it does not exist.

    For each cluster feature of seq_record:
      * if pksnrpsvars.compound_pred_dict has an entry for the cluster number,
        the predicted monomer string is split into residues, polyketide
        start/end markers are applied, and (when there was more than one
        residue) the SMILES fragments of all residues found in aaSMILES.txt
        are concatenated and depicted;
      * otherwise, if the cluster type is "ectoine", a fixed SMILES string is
        depicted and compound_pred_dict[cluster number] is set to "ectoine".
    Cluster numbers whose depiction returns "failed" are appended to
    pksnrpsvars.failedstructures. Finally _update_sec_met_entry is called
    with the cluster feature and the SMILES string (empty if none was built).
    """
    #Create directory to store structures
    options.structuresfolder = path.abspath(path.join(options.outputfoldername, "structures"))
    if not os.path.exists(options.structuresfolder):
        os.mkdir(options.structuresfolder)

    # The monomer table is loaded lazily, at most once, the first time a
    # cluster actually needs it (it used to be re-read for every cluster).
    aa_smiles_dict = None

    #Combine predictions into a prediction of the final chemical structure and generate images
    for genecluster in utils.get_cluster_features(seq_record):
        geneclusternr = utils.get_cluster_number(genecluster)
        smiles_string = ""
        if geneclusternr in pksnrpsvars.compound_pred_dict:
            # Strip brackets and turn " + " and "-" separators into spaces so
            # the prediction can be split into individual residues.
            residues = pksnrpsvars.compound_pred_dict[geneclusternr].replace("(","").replace(")","").replace(" + "," ").replace("-"," ")

            #Now generates SMILES of predicted secondary metabolites without NP.searcher
            residuesList = residues.split(" ")

            #Counts the number of malonate and its derivatives in polyketides
            mal_count = sum(1 for residue in residuesList if "mal" in residue)

            # Taken before the start/end marker edits below change the list.
            nrresidues = len(residuesList)

            #Reflecting reduction states of ketide groups starting at beta carbon of type 1 polyketide
            if "pk" in residuesList and "mal" in residuesList[-1]:
                residuesList.pop(residuesList.index('pk')+1)
                residuesList.append('pks-end1')
            elif mal_count == nrresidues:
                if residuesList[0] == "mal":
                    residuesList[0] = "pks-start1"
                if residuesList[-1] == "ccmal":
                    residuesList.append('pks-end2')

            if nrresidues > 1:
                #Conventionally used aaSMILES was used;
                #chirality expressed with "@@" causes indigo error
                if aa_smiles_dict is None:
                    aa_smiles_dict = _load_monomer_smiles(
                        os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                     'aaSMILES.txt'))

                # Residues without an entry in the table are skipped.
                smiles_string = "".join(aa_smiles_dict[monomer]
                                        for monomer in residuesList
                                        if monomer in aa_smiles_dict)
                logging.debug("Cluster %s: smiles_string: %s", geneclusternr, smiles_string)
                depictstatus = _depict_cluster_smiles(geneclusternr, smiles_string,
                                                      options.structuresfolder)
                if depictstatus == "failed":
                    pksnrpsvars.failedstructures.append(geneclusternr)
        elif utils.get_cluster_type(genecluster) == "ectoine":
            smiles_string = "CC1=NCCC(N1)C(=O)O"
            depictstatus = _depict_cluster_smiles(geneclusternr, smiles_string,
                                                  options.structuresfolder)
            if depictstatus == "failed":
                pksnrpsvars.failedstructures.append(geneclusternr)
            elif geneclusternr in pksnrpsvars.failedstructures:
                # failedstructures holds cluster numbers, so membership must be
                # tested with the number; testing the feature object never matched.
                pksnrpsvars.failedstructures.remove(geneclusternr)
            pksnrpsvars.compound_pred_dict[geneclusternr] = "ectoine"
        _update_sec_met_entry(genecluster, smiles_string)