Code example #1
def perform_subclusterblast(options, seq_record, clusters, proteinlocations,
                            proteinstrands, proteinannotations, proteintags):
    #Run BLAST on gene cluster proteins of each cluster and parse output
    logging.info("Running NCBI BLAST+ subcluster searches..")
    geneclusters = utils.get_sorted_cluster_features(seq_record)
    with TemporaryDirectory(change=True):
        for genecluster in geneclusters:
            clusternumber = utils.get_cluster_number(genecluster)
            debug_path = os.path.join(options.dbgclusterblast, "subclusterblast",
                                      "cluster" + str(clusternumber) + ".txt")
            if options.debug and os.path.exists(debug_path):
                logging.debug(
                    "Skipping SubClusterblast calculations, using results from %s instead",
                    debug_path)
            else:
                logging.info("   Gene cluster " + str(clusternumber))
                queryclusternames, queryclusterseqs, queryclusterprots = create_blast_inputs(
                    genecluster, seq_record)
                write_clusterblast_inputfiles(options, queryclusternames,
                                              queryclusterseqs)
                run_clusterblast_processes(options, searchtype="subclusters")
                blastoutput = read_clusterblast_output(options)
                write_raw_clusterblastoutput(options.full_outputfolder_path,
                                             blastoutput,
                                             searchtype="subclusters")
                logging.info("   Blast search finished. Parsing results...")
                minseqcoverage = 40
                minpercidentity = 45
                blastdict, querylist, hitclusters = parse_blast(
                    blastoutput, seq_record, minseqcoverage, minpercidentity)
                querylist = remove_queries_without_hits(querylist, blastdict)
                allcoregenes = [
                    utils.get_gene_acc(cds)
                    for cds in utils.get_secmet_cds_features(seq_record)
                ]
                rankedclusters, rankedclustervalues, hitclusterdict, hitclusterdata = score_clusterblast_output(
                    blastdict, querylist, hitclusters, clusters, allcoregenes)

                # store all clusterblast related data in a utils.Storage object and serialize it
                subclusterblastStorage = utils.Storage()
                subclusterblastStorage.clusternumber = clusternumber
                subclusterblastStorage.queryclusterprots = queryclusterprots
                subclusterblastStorage.clusters = clusters
                subclusterblastStorage.hitclusterdata = hitclusterdata
                subclusterblastStorage.rankedclusters = rankedclusters
                subclusterblastStorage.rankedclustervalues = rankedclustervalues
                subclusterblastStorage.proteintags = proteintags
                subclusterblastStorage.proteinlocations = proteinlocations
                subclusterblastStorage.proteinannotations = proteinannotations
                subclusterblastStorage.proteinstrands = proteinstrands

                write_clusterblast_output(options,
                                          seq_record,
                                          subclusterblastStorage,
                                          searchtype="subclusters")
Code example #2
File: clusterblast.py Project: abner24/plantismash
def perform_clusterblast(options, seq_record, clusters, proteinlocations, proteinstrands, proteinannotations, proteintags):
    #Run BLAST on gene cluster proteins of each cluster and parse output
    logging.info("Running DIAMOND gene cluster searches..")
    geneclusters = utils.get_sorted_cluster_features(seq_record)
    with TemporaryDirectory(change=True) as tempdir:
        for genecluster in geneclusters:
            clusternumber = utils.get_cluster_number(genecluster)
            debug_path = os.path.join(options.dbgclusterblast, "clusterblast",
                                      "cluster" + str(clusternumber) + ".txt")
            if options.debug and os.path.exists(debug_path):
                logging.debug("Skipping Clusterblast calculations, using results from %s instead",
                              debug_path)
            else:

                logging.info("   Gene cluster " + str(clusternumber))
                queryclusternames, queryclusterseqs, queryclusterprots = create_blast_inputs(genecluster, seq_record)
                utils.writefasta(queryclusternames, queryclusterseqs, "input.fasta")
                if options.taxon == "plants":
                    out, err, retcode = run_diamond("input.fasta", path.join(options.clusterblastdir, "plantgeneclusterprots"), tempdir, options)
                else:
                    out, err, retcode = run_diamond("input.fasta", path.join(options.clusterblastdir, "geneclusterprots"), tempdir, options)
                if retcode != 0:
                    logging.error("Running diamond failed: returned %s, stderr: %r, stdout: %r", retcode, err, out)
                out, err, retcode = convert_to_tabular(tempdir)
                if retcode != 0:
                    logging.error("Converting daa failed: returned %s, stderr: %r, stdout: %r", retcode, err, out)

                with open("input.out", 'r') as fh:
                    blastoutput = fh.read()

                write_raw_clusterblastoutput(options.full_outputfolder_path, blastoutput)
                logging.info("   DIAMOND search finished. Parsing results...")
                minseqcoverage = 10
                minpercidentity = 30
                blastdict, querylist, hitclusters = parse_blast(blastoutput, seq_record, minseqcoverage, minpercidentity)
                querylist = remove_queries_without_hits(querylist, blastdict)
                allcoregenes = [utils.get_gene_acc(cds) for cds in utils.get_secmet_cds_features(seq_record)]
                rankedclusters, rankedclustervalues, hitclusterdict, hitclusterdata = score_clusterblast_output(blastdict, querylist, hitclusters, clusters, allcoregenes)

                # store all clusterblast related data in a utils.Storage object
                clusterblastStorage = utils.Storage()
                clusterblastStorage.clusternumber = clusternumber
                clusterblastStorage.queryclusterprots = queryclusterprots
                clusterblastStorage.clusters = clusters
                clusterblastStorage.hitclusterdata = hitclusterdata
                clusterblastStorage.rankedclusters = rankedclusters
                clusterblastStorage.rankedclustervalues = rankedclustervalues
                clusterblastStorage.proteintags = proteintags
                clusterblastStorage.proteinlocations = proteinlocations
                clusterblastStorage.proteinannotations = proteinannotations
                clusterblastStorage.proteinstrands = proteinstrands


                #write_clusterblast_output(options, seq_record, clusternumber, queryclusterprots, clusters, hitclusterdata, rankedclusters, rankedclustervalues, proteintags, proteinlocations, proteinannotations, proteinstrands)
                write_clusterblast_output(options, seq_record, clusterblastStorage)
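
run_diamond and convert_to_tabular in the example above each return an (out, err, retcode) triple from an external command. A generic sketch of that pattern with subprocess (the DIAMOND command line in the comment is illustrative only, not the exact invocation the project uses):

import subprocess

def run_command(args):
    """Run an external command and return (stdout, stderr, returncode)."""
    proc = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    out, err = proc.communicate()
    return out, err, proc.returncode


# Hypothetical DIAMOND-style call; flags and file names are illustrative only.
# out, err, retcode = run_command(["diamond", "blastp", "--query", "input.fasta",
#                                  "--db", "geneclusterprots", "--out", "input.out"])
# if retcode != 0:
#     logging.error("Running diamond failed: returned %s, stderr: %r, stdout: %r",
#                   retcode, err, out)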
Code example #3
def write(seq_records, options):
    logging.debug("Exporting antiSMASH information as txt tables")
    #Don't store TXT tables for protein input
    if options.input_type == 'prot':
        return
    #Localize output folder, create TXT subdirectory
    txt_outfolder = options.full_outputfolder_path + os.sep + "txt"
    if not os.path.exists(txt_outfolder):
        os.mkdir(txt_outfolder)
    #Define table names
    tables = "genome", "BGC", "signature_gene_info", "gene", "NRPS_PKS", "smCOG", "RiPP", "transltable"
    #For each gene cluster, write out info to TXT files
    for seq_record in seq_records:
        if len(utils.get_cluster_features(seq_record)) > 0:
            #Open up TXT files
            txt_files = {}
            for table in tables:
                txt_files[table] = open(
                    path.join(
                        txt_outfolder, "%s_%s.txt" %
                        (seq_record.id.partition(".")[0], table)), "w")
            #Gather all information
            info = utils.Storage()
            info.clustertypes, info.clustergenes, info.accessions, info.cdsmotifs, info.clusternrs = {}, {}, {}, {}, []
            clusters = utils.get_cluster_features(seq_record)
            for cluster in clusters:
                clusternr = utils.get_cluster_number(cluster)
                info.clusternrs.append(clusternr)
                info.clustertypes[clusternr] = utils.get_cluster_type(cluster)
                info.clustergenes[clusternr] = [
                    utils.get_gene_id(cds)
                    for cds in utils.get_cluster_cds_features(
                        cluster, seq_record)
                ]
                info.accessions[clusternr] = [
                    utils.get_gene_acc(cds)
                    for cds in utils.get_cluster_cds_features(
                        cluster, seq_record)
                ]
                info.cdsmotifs[clusternr] = utils.get_all_features_of_type(
                    seq_record, ["CDS_motif"])
            info.seq_record = seq_record
            #Write information to tables
            for table in tables:
                getattr(write_tables, 'write_' + table)(txt_files[table], info,
                                                        options)
            for table in tables:
                txt_files[table].close()
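
The per-table writers in the example above are selected by name with getattr(write_tables, 'write_' + table). A small self-contained sketch of that dispatch pattern (the writer functions here are hypothetical stand-ins for the write_tables module):

import sys

def write_genome(handle, info):
    handle.write("genome\t%s\n" % info["record_id"])

def write_gene(handle, info):
    handle.write("genes\t%d\n" % len(info["genes"]))

tables = ("genome", "gene")
info = {"record_id": "NC_003888", "genes": ["geneA", "geneB"]}

# Look up each writer by name, as write(seq_records, options) does above.
this_module = sys.modules[__name__]
for table in tables:
    getattr(this_module, "write_" + table)(sys.stdout, info)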
Code example #4
File: __init__.py Project: chevrm/transPACT
def run_knownclusterblast(seq_record, options):
    logging.info('Running known cluster search')
    knownclusterblastvars = utils.Storage()
    knownclusterblastvars.internalhomologygroupsdict = {}
    knownclusterblastvars.clusterblastpositiondata = {}
    knownclusterblastvars.queryclusterdata = {}
    clusters, proteins = load_clusterblast_database(seq_record,
                                                    searchtype="knownclusters")
    if not options.clusterblast:
        seq_record.internalhomologygroupsdict = internal_homology_blast(
            seq_record)
    perform_knownclusterblast(options, seq_record, clusters, proteins)
    prepare_data(seq_record, options, searchtype="knownclusters")
    generate_Storage_for_cb(options,
                            seq_record,
                            searchtype="KnownClusterBlastData")
Code example #5
def generate_structure_images(seq_records, options):
    "Generate the structure images based on Monomers prediction in cluster feature"
    
    for seq_record in seq_records:
        # Ugly temporary solution:
        # At first we have to regenerate the relevant information for the pksnrpsvars dictionary from the seq_record file
        pksnrpsvars = utils.Storage()
        pksnrpsvars.compound_pred_dict = {}
        pksnrpsvars.failedstructures = []
        
        geneclusters = utils.get_cluster_features(seq_record)
        
        for genecluster in geneclusters:
            geneclusternr = utils.get_cluster_number(genecluster)
            pksnrpsvars.compound_pred_dict[geneclusternr] = utils.get_structure_pred(genecluster)
        if len(pksnrpsvars.compound_pred_dict) > 0:
            generate_chemical_structure_preds(pksnrpsvars, seq_record, options)
Code example #6
File: __init__.py Project: abner24/plantismash
def run_subclusterblast(seq_record, options):
    logging.info('Running subcluster search')
    subclusterblastvars = utils.Storage()
    subclusterblastvars.internalhomologygroupsdict = {}
    subclusterblastvars.clusterblastpositiondata = {}
    subclusterblastvars.queryclusterdata = {}
    clusters, proteinlocations, proteinstrands, proteinannotations, proteintags = load_clusterblast_database(
        seq_record, searchtype="subclusters")
    if not options.clusterblast:
        seq_record.internalhomologygroupsdict = internal_homology_blast(
            seq_record)
    perform_subclusterblast(options, seq_record, clusters, proteinlocations,
                            proteinstrands, proteinannotations, proteintags)
    prepare_data(seq_record, options, searchtype="subclusters")
    generate_Storage_for_cb(options,
                            seq_record,
                            searchtype="SubClusterBlastData")
Code example #7
def create_pksnrpsvars_object():
    #Storage object for all NRPS/PKS data
    pksnrpsvars = utils.Storage()
    #Dictionary, key: gene cluster nr, value: gene cluster type
    pksnrpsvars.nrpspkstypedict = {}
    #Dictionary, key: gene cluster nr, value: monomers string
    pksnrpsvars.compound_pred_dict = {}
    #Dictionary, key: gene ID, value: lists of result lists which each contain [result.hit_id, result.query_start, result.query_end, result.evalue, result.bitscore]
    pksnrpsvars.consensuspred_gene_dict = {}
    #Dictionary, key: gene ID, value: lists of result lists which each contain [result.hit_id, result.query_start, result.query_end, result.evalue, result.bitscore]
    pksnrpsvars.domaindict = {}
    #List of gene cluster nrs with failed structure generation
    pksnrpsvars.failedstructures = []
    #List of gene cluster nrs for which to create docking domain analysis details HTML files
    pksnrpsvars.dockingdomainanalysis = []
    #List of gene IDs of PKS/NRPS core genes
    pksnrpsvars.pksnrpscoregenes = []
    return pksnrpsvars
Code example #8
def perform_knownclusterblast(options, seq_record, clusters, proteins):
    # Run BLAST on gene cluster proteins of each cluster and parse output
    logging.debug("Running DIAMOND knowncluster searches..")
    geneclusters = utils.get_sorted_cluster_features(seq_record)

    all_names, all_seqs, all_prots = [], [], []
    prots_by_cluster = []
    for genecluster in geneclusters:
        names, seqs, prots = clusterblast.create_blast_inputs(
            genecluster, seq_record)
        all_names.extend(names)
        all_seqs.extend(seqs)
        all_prots.extend(prots)
        prots_by_cluster.append(prots)

    debug_path = os.path.join(options.dbgclusterblast,
                              "knownclusterblastoutput.txt")
    if options.dbgclusterblast and os.path.exists(debug_path):
        logging.debug("Skipping DIAMOND calculations, using previous results")
        with open(debug_path, "r") as fh:
            blastoutput = fh.read()
    else:
        with TemporaryDirectory(change=True) as tempdir:
            utils.writefasta(
                [qcname.replace(" ", "_") for qcname in all_names], all_seqs,
                "input.fasta")
            out, err, retcode = clusterblast.run_diamond(
                "input.fasta",
                os.path.join(options.knownclusterblastdir,
                             'knownclusterprots'), tempdir, options)
            if retcode != 0:
                logging.debug("out: %r, err: %r, retcode: %s", out, err,
                              retcode)
            with open("input.out", 'r') as fh:
                blastoutput = fh.read()
            clusterblast.write_raw_clusterblastoutput(
                options.full_outputfolder_path,
                blastoutput,
                searchtype="knownclusters")

    minseqcoverage = 40
    minpercidentity = 45
    clusters_by_number, _ = clusterblast.parse_all_clusters(
        blastoutput, minseqcoverage, minpercidentity, seq_record)

    knownclusterblastStorage = utils.Storage()
    knownclusterblastStorage.clusters = clusters
    knownclusterblastStorage.proteins = proteins

    for genecluster, queryclusterprots in zip(geneclusters, prots_by_cluster):
        clusternumber = utils.get_cluster_number(genecluster)
        cluster_names_to_queries = clusters_by_number.get(clusternumber, {})
        allcoregenes = [
            utils.get_gene_id(cds)
            for cds in utils.get_secmet_cds_features(seq_record)
        ]
        ranking = clusterblast.score_clusterblast_output(
            clusters, allcoregenes, cluster_names_to_queries)

        # store all clusterblast related data in a utils.Storage object and serialize it
        knownclusterblastStorage.clusternumber = clusternumber
        knownclusterblastStorage.queryclusterprots = queryclusterprots
        knownclusterblastStorage.ranking = ranking
        clusterblast.write_clusterblast_output(options,
                                               seq_record,
                                               knownclusterblastStorage,
                                               searchtype="knownclusters")

    mibig_protein_homology(blastoutput, seq_record, geneclusters, clusters,
                           options)
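
Both knowncluster searches parse DIAMOND output with a 40% minimum sequence coverage and 45% minimum percent identity cutoff (the general cluster searches above use 10% and 30%). A simplified, hypothetical filter over BLAST/DIAMOND tabular output illustrating those cutoffs; the real parse_all_clusters / parse_blast functions do far more bookkeeping:

def filter_tabular_hits(blastoutput, min_coverage, min_identity):
    """Keep tabular (outfmt 6) hits that meet coverage and identity cutoffs.

    Hypothetical helper: purely for illustration it assumes the query length
    is appended to the query name after a final underscore; the real header
    layout differs, so adjust the coverage calculation accordingly.
    """
    kept = []
    for line in blastoutput.splitlines():
        fields = line.split("\t")
        if len(fields) < 12:
            continue
        query, subject = fields[0], fields[1]
        percent_identity = float(fields[2])
        alignment_length = float(fields[3])
        try:
            query_length = float(query.rpartition("_")[2])
        except ValueError:
            continue
        coverage = 100.0 * alignment_length / query_length
        if coverage >= min_coverage and percent_identity >= min_identity:
            kept.append((query, subject, percent_identity, coverage))
    return kept


# Hypothetical usage with the thresholds from the example above:
# hits = filter_tabular_hits(blastoutput, min_coverage=40, min_identity=45)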
Code example #9
File: clusterblast.py Project: chevrm/transPACT
def perform_clusterblast(options, seq_record, clusters, proteins):
    #Run BLAST on gene cluster proteins of each cluster and parse output
    geneclusters = utils.get_sorted_cluster_features(seq_record)
    debug_path = os.path.abspath(
        os.path.join(options.dbgclusterblast, "clusterblastoutput.txt"))
    with TemporaryDirectory(change=True) as tempdir:
        all_names, all_seqs, all_prots = [], [], []
        prots_by_cluster = []
        for genecluster in geneclusters:
            names, seqs, prots = create_blast_inputs(genecluster, seq_record)
            all_names.extend(names)
            all_seqs.extend(seqs)
            all_prots.extend(prots)
            prots_by_cluster.append(prots)
        if options.dbgclusterblast and os.path.exists(debug_path):
            logging.debug(
                "Skipping DIAMOND calculations, using results from %s instead",
                debug_path)
            with open(debug_path, "r") as fh:
                blastoutput = fh.read()
            logging.debug("    Parsing results from given file...")
        else:
            logging.debug("Running DIAMOND gene cluster search..")
            utils.writefasta(all_names, all_seqs, "input.fasta")
            out, err, retcode = run_diamond(
                "input.fasta",
                path.join(options.clusterblastdir, "geneclusterprots"),
                tempdir, options)
            if retcode != 0:
                logging.error(
                    "Running diamond failed: returned %s, stderr: %r, stdout: %r",
                    retcode, err, out)
            logging.debug("   DIAMOND search finished. Parsing results...")

            with open("input.out", 'r') as fh:
                blastoutput = fh.read()

        write_raw_clusterblastoutput(options.full_outputfolder_path,
                                     blastoutput)

        minseqcoverage = 10
        minpercidentity = 30
        clusters_by_number, _ = parse_all_clusters(blastoutput, minseqcoverage,
                                                   minpercidentity, seq_record)

        clusterblastStorage = utils.Storage()
        clusterblastStorage.clusters = clusters
        clusterblastStorage.proteins = proteins

        for genecluster, queryclusterprots in zip(geneclusters,
                                                  prots_by_cluster):
            clusternumber = utils.get_cluster_number(genecluster)
            cluster_names_to_queries = clusters_by_number.get(
                clusternumber, {})
            allcoregenes = [
                utils.get_gene_acc(cds)
                for cds in utils.get_secmet_cds_features(seq_record)
            ]
            ranking = score_clusterblast_output(clusters, allcoregenes,
                                                cluster_names_to_queries)

            # store all clusterblast related data in a utils.Storage object
            clusterblastStorage.clusternumber = clusternumber
            clusterblastStorage.queryclusterprots = queryclusterprots
            clusterblastStorage.ranking = ranking

            write_clusterblast_output(options, seq_record, clusterblastStorage)
Code example #10
File: __init__.py Project: abner24/plantismash
def run_smcog_analysis(seq_record, options):
    #run_smcog_analysis(opts, globalvars, geneclustervars, pksnrpscoregenes)
    logging.info('Running smCOG analysis')
    smcogvars = utils.Storage()
    smcogvars.smcogtreedict = {}
    smcogvars.smcogdict = {}
    geneclustergenes = utils.get_withincluster_cds_features(seq_record)
    pksnrpscoregenes = utils.get_pksnrps_cds_features(seq_record)
    logging.info("Performing smCOG analysis")
    smcogs_fasta = utils.get_specific_multifasta(geneclustergenes)
    smcogs_opts = ["-E", "1E-6"]
    smcogs_results = utils.run_hmmscan(utils.get_full_path(__file__, "smcogs.hmm"), smcogs_fasta, smcogs_opts)
    hmmlengthsdict = utils.hmmlengths(utils.get_full_path(__file__, "smcogs.hmm"))
    smcogvars.smcogdict = parse_hmmscan_results(smcogs_results, hmmlengthsdict)
    #Write output
    options.smcogsfolder = path.abspath(path.join(options.outputfoldername, "smcogs"))
    if not os.path.exists(options.smcogsfolder):
        os.mkdir(options.smcogsfolder)
    originaldir = os.getcwd()
    os.chdir(options.smcogsfolder)
    smcogfile = open("smcogs.txt","w")
    pksnrpscoregenenames = [utils.get_gene_id(feature) for feature in pksnrpscoregenes]
    for feature in geneclustergenes:
        k = utils.get_gene_id(feature)
        if k not in pksnrpscoregenenames:
            if k in smcogvars.smcogdict:
                l = smcogvars.smcogdict[k]
                smcogfile.write(">> " + k + "\n")
                smcogfile.write("name\tstart\tend\te-value\tscore\n")
                smcogfile.write("** smCOG hits **\n")
                for i in l:
                    smcogfile.write(str(i[0]) + "\t" + str(i[1]) + "\t" + str(i[2]) + "\t" + str(i[3]) + "\t" + str(i[4]) + "\n")
                smcogfile.write("\n\n")
    smcogfile.close()
    #smCOG phylogenetic tree construction
    logging.info("Calculating and drawing phylogenetic trees of cluster genes "
        "with smCOG members")
    with TemporaryDirectory(change=True):
        smcoganalysisgenes = []
        for feature in geneclustergenes:
            k = utils.get_gene_id(feature)
            if k not in pksnrpscoregenenames:
                smcoganalysisgenes.append(feature)
        smcogsets = []
        equalpartsizes = int(len(smcoganalysisgenes)/options.cpus)
        for i in range(options.cpus):
            if i == 0:
                geneslist = smcoganalysisgenes[:equalpartsizes]
            elif i == (options.cpus - 1):
                geneslist = smcoganalysisgenes[(i*equalpartsizes):]
            else:
                geneslist = smcoganalysisgenes[(i*equalpartsizes):((i+1)*equalpartsizes)]
            smcogsets.append(geneslist)
        processes = []
        z = 0
        for k in smcogsets:
            processes.append(Process(target=smcog_analysis,
                                     args=[k, z, seq_record,
                                        smcogvars.smcogdict, options.smcogsfolder]))
            z += 1
        for k in processes:
            k.start()
        time.sleep(1)
        # Poll until all worker processes have finished.
        while any(k.is_alive() for k in processes):
            time.sleep(5)
        for k in processes:
            k.join()
    os.chdir(options.smcogsfolder)
    dircontents = os.listdir(os.getcwd())
    for k in dircontents:
        if ".png" in k:
            tag = k.split(".png")[0]
            smcogvars.smcogtreedict[tag] = tag + ".png"
    os.chdir(originaldir)
    _annotate(geneclustergenes, smcogvars, options)
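
The tree-building step above divides the smCOG analysis genes into one roughly equal, contiguous chunk per CPU before starting worker processes. The same chunking idea as a standalone sketch (a hypothetical helper, not part of the project):

def split_into_chunks(items, num_chunks):
    """Split a list into num_chunks contiguous, roughly equal slices."""
    if num_chunks < 1:
        raise ValueError("num_chunks must be at least 1")
    chunk_size = max(1, len(items) // num_chunks)
    chunks = []
    for i in range(num_chunks):
        if i == num_chunks - 1:
            # The last chunk absorbs any remainder.
            chunks.append(items[i * chunk_size:])
        else:
            chunks.append(items[i * chunk_size:(i + 1) * chunk_size])
    return chunks


# Usage mirroring the example above (options.cpus is assumed to be set):
# smcogsets = split_into_chunks(smcoganalysisgenes, options.cpus)
print(split_into_chunks(list(range(10)), 3))  # [[0, 1, 2], [3, 4, 5], [6, 7, 8, 9]]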
Code example #11
def generate_Storage_for_cb(options,
                            seq_record,
                            searchtype="ClusterBlastData"):
    """
    This is a very ugly helper function to convert all data stored in "non_standard" lists/dictionaries within the seq_record object
    into a storage object which can be saved in a qualifier...
    
    THIS SHOULD BE REFACTORED so that the information is directly stored in this object instead of this ugly conversion...
    """
    clusterBlastResults = utils.Storage()
    try:
        clusterBlastResults.internalhomologygroupsdict = seq_record.internalhomologygroupsdict
    except AttributeError:
        logging.debug("seq_record.internalhomologygroupsdict does not exist")
    try:
        clusterBlastResults.known_compound_dict = seq_record.known_compound_dict
        del seq_record.known_compound_dict
    except AttributeError:
        logging.debug("seq_record.known_compound_dict does not exist.")
    try:
        clusterBlastResults.pubchem_dict = seq_record.pubchem_dict
        del seq_record.pubchem_dict
    except AttributeError:
        logging.debug("seq_record.pubchem_dict does not exist.")
    try:
        clusterBlastResults.pubmed_dict = seq_record.pubmed_dict
        del seq_record.pubmed_dict
    except AttributeError:
        logging.debug("seq_record.pubmed_dict does not exist.")

    if searchtype == "ClusterBlastData":
        try:
            clusterBlastResults.nrhitgeneclusters = seq_record.nrhitgeneclusters
            del seq_record.nrhitgeneclusters
        except AttributeError:
            logging.debug("seq_record.nrhitgeneclusters does not exist.")
        try:
            clusterBlastResults.qgeneclusterdata = seq_record.qgeneclusterdata
            del seq_record.qgeneclusterdata
        except AttributeError:
            logging.debug("qgeneclusterdata does not exist.")
        try:
            clusterBlastResults.queryclusterdata = seq_record.queryclusterdata
            del seq_record.queryclusterdata
        except AttributeError:
            logging.debug("seq_record.queryclusterdata does not exist.")

        if hasattr(seq_record, 'pubmed_dict'):
            clusterBlastResults.pubmed_dict = seq_record.pubmed_dict
        if hasattr(seq_record, 'pubchem_dict'):
            clusterBlastResults.pubchem_dict = seq_record.pubchem_dict
        if hasattr(seq_record, 'known_compound_dict'):
            clusterBlastResults.known_compound_dict = seq_record.known_compound_dict
        if hasattr(seq_record, 'closestcompounddict'):
            clusterBlastResults.closestcompounddict = seq_record.closestcompounddict

    if searchtype == "SubClusterBlastData":
        try:
            clusterBlastResults.sc_nrhitgeneclusters = seq_record.sc_nrhitgeneclusters
            del seq_record.sc_nrhitgeneclusters
        except AttributeError:
            logging.debug("seq_record.sc_nrhitgeneclusters does not exist.")
        try:
            clusterBlastResults.sc_qgeneclusterdata = seq_record.sc_qgeneclusterdata
            del seq_record.sc_qgeneclusterdata
        except AttributeError:
            logging.debug("seq_record.sc_qgeneclusterdata does not exist.")
        try:
            clusterBlastResults.sc_queryclusterdata = seq_record.sc_queryclusterdata
            del seq_record.sc_queryclusterdata
        except AttributeError:
            logging.debug("seq_record.sc_queryclusterdata does not exist.")

    if searchtype == "KnownClusterBlastData":
        try:
            clusterBlastResults.kc_nrhitgeneclusters = seq_record.kc_nrhitgeneclusters
            del seq_record.kc_nrhitgeneclusters
        except AttributeError:
            logging.debug("seq_record.kc_nrhitgeneclusters does not exist.")
        try:
            clusterBlastResults.kc_qgeneclusterdata = seq_record.kc_qgeneclusterdata
            del seq_record.kc_qgeneclusterdata
        except AttributeError:
            logging.debug("seq_record.kc_qgeneclusterdata does not exist.")
        try:
            clusterBlastResults.kc_queryclusterdata = seq_record.kc_queryclusterdata
            del seq_record.kc_queryclusterdata
        except AttributeError:
            logging.debug("seq_record.kc_queryclusterdata does not exist.")

    if 'extrarecord' not in options:
        options.extrarecord = {}
    if seq_record.id not in options.extrarecord:
        options.extrarecord[seq_record.id] = Namespace()

    if 'extradata' not in options.extrarecord[seq_record.id]:
        options.extrarecord[seq_record.id].extradata = {}
    logging.debug("Storing data for %s in storage object" % searchtype)
    options.extrarecord[
        seq_record.id].extradata[searchtype] = clusterBlastResults
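
The closing block above files the Storage object under options.extrarecord[record_id].extradata[searchtype]. A minimal illustration of that nesting, with argparse.Namespace and a plain dict standing in for the project's own option and storage types (all names here are hypothetical):

from argparse import Namespace

options = Namespace(extrarecord={})       # stand-in for the antiSMASH options object
record_id = "NC_000000"                   # hypothetical sequence record id

if record_id not in options.extrarecord:
    options.extrarecord[record_id] = Namespace()
if not hasattr(options.extrarecord[record_id], "extradata"):
    options.extrarecord[record_id].extradata = {}

# One entry per search type, e.g. "ClusterBlastData", "SubClusterBlastData",
# "KnownClusterBlastData".
options.extrarecord[record_id].extradata["SubClusterBlastData"] = {"sc_nrhitgeneclusters": {}}
print(sorted(options.extrarecord[record_id].extradata))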
Code example #12
File: automodel.py Project: stogqy/plantismash
def run_automodel(seq_records, options):


    #List of input (static) files as pickles
    ###################################################################
    #For model pruning phase
    #Choose "eco" or "sco"
    #root = os.path.dirname(utils.get_full_path('__file__', 'input1'))
    if cobra.__version__ != "0.2.1":
        logging.error("The modeling pipeline is only compatible with COBRApy version 0.2.1; your installed version is %s",
                      cobra.__version__)
        return False

    root = os.path.dirname(os.path.realpath(__file__)) + os.sep + 'input1'
    temp_fasta = options.metabolicmodeldir

    #Get template model-specific pickles
    logging.debug("[metabolicmodel] set up model dir as %s and output dir as %s", root, temp_fasta)

    model = pickle.load(open(root+os.sep+options.modeling+os.sep+'model.p','rb'))
    tempModel_biggRxnid_locusTag_dict = pickle.load(open(root+os.sep+options.modeling+os.sep+'tempModel_biggRxnid_locusTag_dict.p','rb'))
    tempModel_exrxnid_flux_dict = pickle.load(open(root+os.sep+options.modeling+os.sep+'tempModel_exrxnid_flux_dict.p','rb'))

    #Template model-independent pickles for model augmentation phase
    logging.debug("loading pickle files of the parsed template model and its relevant genbank data..")
    bigg_mnxr_dict = pickle.load(open(root+os.sep+'bigg_mnxr_dict.p','rb'))
    kegg_mnxr_dict = pickle.load(open(root+os.sep+'kegg_mnxr_dict.p','rb'))
    mnxr_kegg_dict = pickle.load(open(root+os.sep+'mnxr_kegg_dict.p','rb'))
    mnxr_rxn_dict = pickle.load(open(root+os.sep+'mnxr_rxn_dict.p','rb'))
    bigg_mnxm_compound_dict = pickle.load(open(root+os.sep+'bigg_mnxm_compound_dict.p','rb'))
    mnxm_bigg_compound_dict = pickle.load(open(root+os.sep+'mnxm_bigg_compound_dict.p','rb'))
    kegg_mnxm_compound_dict = pickle.load(open(root+os.sep+'kegg_mnxm_compound_dict.p','rb'))
    mnxm_kegg_compound_dict = pickle.load( open(root+os.sep+'mnxm_kegg_compound_dict.p','rb'))
    mnxm_compoundInfo_dict = pickle.load(open(root+os.sep+'mnxm_compoundInfo_dict.p','rb'))
    ###################################################################

    logging.debug("pruning phase starting..")
    ###################################################################

    logging.debug("reading genbank file of the target genome.."    )
    targetGenome_locusTag_ec_dict, targetGenome_locusTag_prod_dict, target_fasta = get_targetGenomeInfo(seq_records, options)

    if len(targetGenome_locusTag_ec_dict) == 0:
        logging.error("Error: no EC_number in sequence record; skipping modeling")
        return False

    logging.debug("generating a DB for the genes from the target genome..")
    make_blastDB(query_fasta=target_fasta, options=options)

    logging.debug("running BLASTP #1: genes in the target genome against genes in the template model..")
    run_blastp(target_fasta=options.metabolicmodeldir+os.sep+'targetGenome_locusTag_aaSeq.fa', \
               blastp_result=options.metabolicmodeldir+os.sep+'blastp_targetGenome_against_tempGenome.txt',\
               db_dir=root+os.sep+options.modeling+os.sep+'tempBlastDB', evalue=1e-30)

    logging.debug("running BLASTP #2: genes in the template model against genes in the target genome..")
    run_blastp(target_fasta=root+os.sep+options.modeling+os.sep+'tempModel_locusTag_aaSeq.fa', \
               blastp_result=options.metabolicmodeldir+os.sep+'blastp_tempGenome_against_targetGenome.txt',\
               db_dir = options.metabolicmodeldir+os.sep+'targetBlastDB', evalue=1e-30)

    logging.debug("parsing the results of BLASTP #1..")
    blastpResults_dict1 = parseBlaspResults(options.metabolicmodeldir+os.sep+'blastp_targetGenome_against_tempGenome.txt', \
                                            options.metabolicmodeldir+os.sep+'blastp_targetGenome_against_tempGenome_parsed.txt')

    logging.debug("parsing the results of BLASTP #2..")
    blastpResults_dict2 = parseBlaspResults(options.metabolicmodeldir+os.sep+'blastp_tempGenome_against_targetGenome.txt', \
                                            options.metabolicmodeldir+os.sep+'blastp_tempGenome_against_targetGenome_parsed.txt')

    logging.debug("selecting the best hits for BLASTP #1..")
    bestHits_dict1 = makeBestHits_dict(options.metabolicmodeldir+os.sep+'blastp_targetGenome_against_tempGenome_parsed.txt')

    logging.debug("selecting the best hits for BLASTP #2..")
    bestHits_dict2 = makeBestHits_dict(options.metabolicmodeldir+os.sep+'blastp_tempGenome_against_targetGenome_parsed.txt')

    logging.debug("selecting the bidirectional best hits..")
    targetBBH_list, temp_target_BBH_dict = getBBH(bestHits_dict1, bestHits_dict2)


    logging.debug("selecting genes that are not bidirectional best hits..")
    nonBBH_list = get_nonBBH(targetGenome_locusTag_ec_dict, targetBBH_list)
    ###################################################################

    ###################################################################
    logging.debug("labeling reactions with nonhomologous genes to remove from the template model..")
    rxnToRemove_dict = labelRxnToRemove(model, temp_target_BBH_dict, tempModel_biggRxnid_locusTag_dict)

    logging.debug("removing reactions with nonhomologous genes from the template model..")
    modelPruned, rxnToRemoveEssn_dict, rxnRemoved_dict, rxnRetained_dict = pruneModel(model, rxnToRemove_dict, options.automodel.solver)

    logging.debug("correcting GPR associations in the template model..")
    modelPrunedGPR = swap_locusTag_tempModel(modelPruned, temp_target_BBH_dict)
    ###################################################################


    logging.debug("augmentation phase starting..")
    ###################################################################
    logging.debug("creating various dictionary files for the nonBBH gene-associted reactions...")

    targetGenome_locusTag_ec_nonBBH_dict = get_targetGenome_locusTag_ec_nonBBH_dict(targetGenome_locusTag_ec_dict, nonBBH_list)

    rxnid_info_dict, rxnid_locusTag_dict = make_all_rxnInfo_fromRefSeq(targetGenome_locusTag_ec_nonBBH_dict, options)

    modelPrunedGPR_mnxr_list = get_mnxr_list_from_modelPrunedGPR(modelPrunedGPR, bigg_mnxr_dict)
    ###################################################################

    ###################################################################
    logging.debug("adding the nonBBH gene-associated reactions...")
    rxnid_to_add_list = check_existing_rxns(kegg_mnxr_dict, modelPrunedGPR_mnxr_list, rxnid_info_dict)

    mnxr_to_add_list = get_mnxr_using_kegg(rxnid_to_add_list, kegg_mnxr_dict)

    rxnid_mnxm_coeff_dict = extract_rxn_mnxm_coeff(mnxr_to_add_list, mnxr_rxn_dict, mnxm_bigg_compound_dict, mnxm_kegg_compound_dict, mnxr_kegg_dict)

    target_model = add_nonBBH_rxn(modelPrunedGPR, rxnid_info_dict, rxnid_mnxm_coeff_dict, rxnid_locusTag_dict, bigg_mnxm_compound_dict, kegg_mnxm_compound_dict, mnxm_compoundInfo_dict, targetGenome_locusTag_prod_dict, tempModel_exrxnid_flux_dict, options)
    ###################################################################

    #Output on screen
    model = pickle.load(open(root+os.sep+options.modeling+os.sep+'model.p','rb'))
    logging.debug("Number of genes in template and pruned models: %s / %s", len(model.genes), len(modelPruned.genes))
    logging.debug("Number of reactions in template and pruned models: %s / %s", len(model.reactions), len(modelPruned.reactions))
    logging.debug("Number of metabolites in template and pruned models: %s / %s", len(model.metabolites), len(modelPruned.metabolites))

    # Set up extrarecord data structure within options, if not already set
    if "extrarecord" not in options:
        options.extrarecord = {}

    # store model data in seq_records[0]
    seq_record = seq_records[0]
    if seq_record.id not in options.extrarecord:
        options.extrarecord[seq_record.id] = utils.Storage()
    if "extradata" not in options.extrarecord[seq_record.id]:
        options.extrarecord[seq_record.id].extradata = {}

    # as the cobra model object does not provide an own serialization, let's try with pickle...
    options.extrarecord[seq_record.id].extradata["MetabolicModelDataObj"] = pickle.dumps(target_model)

    if 'MetabolicModelDataObj' in options.extrarecord[seq_record.id].extradata:
        logging.debug("Generate options.extrarecord entry")
    else:
        logging.warning("Could not generate options.extrarecord for %s", seq_record.id)

    return True
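
Example #12 derives bidirectional best hits (BBH) from two one-directional best-hit tables via getBBH. As a rough sketch of the underlying idea (not the project's implementation), assuming each table maps a query gene ID to its single best BLASTP hit in the other gene set:

def bidirectional_best_hits(best_hits_a_vs_b, best_hits_b_vs_a):
    """Return gene pairs (a, b) where a's best hit is b and b's best hit is a.

    Hypothetical sketch: both arguments are dicts mapping a query gene ID
    to the ID of its single best hit in the other gene set.
    """
    pairs = []
    for gene_a, gene_b in best_hits_a_vs_b.items():
        if best_hits_b_vs_a.get(gene_b) == gene_a:
            pairs.append((gene_a, gene_b))
    return pairs


# Hypothetical usage mirroring the example above:
# targetBBH_pairs = bidirectional_best_hits(bestHits_dict1, bestHits_dict2)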
Code example #13
def perform_knownclusterblast(options, seq_record, clusters, proteinlocations,
                              proteinstrands, proteinannotations, proteintags):
    #Run BLAST on gene cluster proteins of each cluster and parse output
    logging.info("Running DIAMOND knowncluster searches..")
    geneclusters = utils.get_sorted_cluster_features(seq_record)
    with TemporaryDirectory(change=True) as tempdir:
        for genecluster in geneclusters:
            clusternumber = utils.get_cluster_number(genecluster)
            debug_path = os.path.join(options.dbgclusterblast, "knownclusterblast",
                                      "cluster" + str(clusternumber) + ".txt")
            if options.debug and os.path.exists(debug_path):
                logging.debug(
                    "Skipping Knownclusterblast calculations, using results from %s instead",
                    debug_path)
            else:

                logging.info("   Gene cluster " + str(clusternumber))
                queryclusternames, queryclusterseqs, queryclusterprots = create_blast_inputs(
                    genecluster, seq_record)
                utils.writefasta(
                    [qcname.replace(" ", "_") for qcname in queryclusternames],
                    queryclusterseqs, "input.fasta")
                out, err, retcode = run_diamond(
                    "input.fasta",
                    path.join(options.knownclusterblastdir,
                              'knownclusterprots'), tempdir, options)
                if retcode != 0:
                    logging.debug("out: %r, err: %r, retcode: %s", out, err,
                                  retcode)
                convert_to_tabular(tempdir)
                with open("input.out", 'r') as fh:
                    blastoutput = fh.read()
                write_raw_clusterblastoutput(options.full_outputfolder_path,
                                             blastoutput,
                                             searchtype="knownclusters")
                logging.info("   DIAMOND search finished. Parsing results...")
                minseqcoverage = 40
                minpercidentity = 45
                blastdict, querylist, hitclusters = parse_blast(
                    blastoutput, seq_record, minseqcoverage, minpercidentity)
                querylist = remove_queries_without_hits(querylist, blastdict)
                allcoregenes = [
                    utils.get_gene_id(cds)
                    for cds in utils.get_secmet_cds_features(seq_record)
                ]
                rankedclusters, rankedclustervalues, hitclusterdict, hitclusterdata = score_clusterblast_output(
                    blastdict, querylist, hitclusters, clusters, allcoregenes)

                # store all clusterblast related data in a utils.Storage object and serialize it
                knownclusterblastStorage = utils.Storage()
                knownclusterblastStorage.clusternumber = clusternumber
                knownclusterblastStorage.queryclusterprots = queryclusterprots
                knownclusterblastStorage.clusters = clusters
                knownclusterblastStorage.hitclusterdata = hitclusterdata
                knownclusterblastStorage.rankedclusters = rankedclusters
                knownclusterblastStorage.rankedclustervalues = rankedclustervalues
                knownclusterblastStorage.proteintags = proteintags
                knownclusterblastStorage.proteinlocations = proteinlocations
                knownclusterblastStorage.proteinannotations = proteinannotations
                knownclusterblastStorage.proteinstrands = proteinstrands

                write_clusterblast_output(options,
                                          seq_record,
                                          knownclusterblastStorage,
                                          searchtype="knownclusters")