Exemple #1
0
    def test_get_sorted_cluster_features(self):
        """Test utils.get_sorted_cluster_features().

        The fixture's first and last features are expected back, in that
        order; once both are removed the result must be an empty list.
        """
        found = utils.get_sorted_cluster_features(self.record)
        expected = [self.features[0], self.features[-1]]
        self.assertEqual(expected, found)

        # drop both cluster records (first and last fixture feature)
        del self.features[0]
        del self.features[-1]
        found = utils.get_sorted_cluster_features(self.record)
        self.assertEqual([], found)
Exemple #2
0
def load_genecluster_info(seq_record, options, searchtype="general"):
    """Gather data on each gene cluster and store it on the record.

    ``seq_record.qgeneclusterdata`` is reset to an empty dict and then gets
    one entry per cluster feature, keyed by cluster number. Each entry is a
    list in this fixed order: cluster type, cluster size, cluster genes,
    annotations, starts, ends, strands, PKS/NRPS proteins, PKS/NRPS protein
    names, PKS/NRPS domains, the four substrate-specificity dicts
    (nrpspredictor, minowa, pkssig, consensus), gtrs, transporters, colors,
    clusterblast hit data, structure prediction, KR predictions.

    Clusterblast hit data is only retrieved when ``options.clusterblast`` is
    truthy; otherwise an empty dict is stored in its place. ``searchtype`` is
    forwarded to ``retrieve_clusterblast_info``.
    """
    smcogs, _ = utils.get_smcog_annotations(seq_record)
    gtr_cogs = ['SMCOG1045', 'SMCOG1062', 'SMCOG1102']
    transporter_cogs = [
        'SMCOG1000', 'SMCOG1005', 'SMCOG1011', 'SMCOG1020',
        'SMCOG1029', 'SMCOG1033', 'SMCOG1035', 'SMCOG1044',
        'SMCOG1065', 'SMCOG1067', 'SMCOG1069', 'SMCOG1074',
        'SMCOG1085', 'SMCOG1096', 'SMCOG1106', 'SMCOG1118',
        'SMCOG1131', 'SMCOG1166', 'SMCOG1169', 'SMCOG1184',
        'SMCOG1202', 'SMCOG1205', 'SMCOG1214', 'SMCOG1234',
        'SMCOG1243', 'SMCOG1245', 'SMCOG1252', 'SMCOG1254',
        'SMCOG1288',
    ]
    seq_record.qgeneclusterdata = {}
    for cluster_feature in utils.get_sorted_cluster_features(seq_record):
        number = utils.get_cluster_number(cluster_feature)

        (genes, kind, annotations, colors, starts, ends, strands,
         pksnrps_prots, gtrs, transporters,
         size) = retrieve_gene_cluster_annotations(
             seq_record, smcogs, gtr_cogs, transporter_cogs, number)

        hit_data = {}
        if options.clusterblast:
            hit_data = retrieve_clusterblast_info(
                seq_record, number, searchtype=searchtype)

        # The third and fourth returned values are not stored in the entry.
        (prot_names, domains, _, _, spec_nrpspredictor, spec_minowa,
         spec_pkssig, spec_consensus, kr_predictions,
         structure) = retrieve_pksnrps_info(seq_record, number, pksnrps_prots)

        seq_record.qgeneclusterdata[number] = [
            kind, size, genes, annotations, starts, ends, strands,
            pksnrps_prots, prot_names, domains,
            spec_nrpspredictor, spec_minowa, spec_pkssig, spec_consensus,
            gtrs, transporters, colors, hit_data, structure, kr_predictions,
        ]
def perform_subclusterblast(options, seq_record, clusters, proteinlocations,
                            proteinstrands, proteinannotations, proteintags):
    """Run BLAST+ subcluster searches for each cluster and write the results.

    For every cluster feature of ``seq_record`` the cluster's proteins are
    written as BLAST input, searched with ``searchtype="subclusters"``, parsed
    with ``minseqcoverage=40`` / ``minpercidentity=45``, scored, bundled into
    a ``utils.Storage`` object and passed to ``write_clusterblast_output``.

    When ``options.debug`` is truthy and the file
    ``<options.dbgclusterblast>/subclusterblast/cluster<N>.txt`` exists, the
    search for cluster ``N`` is skipped.

    Args:
        options: configuration object; ``debug``, ``dbgclusterblast`` and
            ``full_outputfolder_path`` are read here, and the object is
            forwarded to the helper functions.
        seq_record: record whose cluster features are searched.
        clusters: passed to ``score_clusterblast_output`` and stored.
        proteinlocations, proteinstrands, proteinannotations, proteintags:
            stored unchanged on the ``utils.Storage`` object.
    """
    logging.info("Running NCBI BLAST+ subcluster searches..")
    geneclusters = utils.get_sorted_cluster_features(seq_record)
    with TemporaryDirectory(change=True):
        for genecluster in geneclusters:
            clusternumber = utils.get_cluster_number(genecluster)
            if options.debug:
                # NOTE(review): if change=True switches the working directory,
                # a relative dbgclusterblast is resolved from inside the
                # temporary directory -- confirm this is intended.
                debug_file = (options.dbgclusterblast + os.sep +
                              "subclusterblast" + os.sep + "cluster" +
                              str(clusternumber) + ".txt")
                if os.path.exists(debug_file):
                    # Pass the path as a logging argument: the previous
                    # '"%s" % dir + os.sep + ...' appended the path pieces
                    # after the formatted message.
                    logging.debug(
                        "Skipping SubClusterblast calculations, using results from %s instead",
                        debug_file)
                    continue

            logging.info("   Gene cluster %s", clusternumber)
            queryclusternames, queryclusterseqs, queryclusterprots = create_blast_inputs(
                genecluster, seq_record)
            write_clusterblast_inputfiles(options, queryclusternames,
                                          queryclusterseqs)
            run_clusterblast_processes(options, searchtype="subclusters")
            blastoutput = read_clusterblast_output(options)
            write_raw_clusterblastoutput(options.full_outputfolder_path,
                                         blastoutput,
                                         searchtype="subclusters")
            logging.info("   Blast search finished. Parsing results...")
            minseqcoverage = 40
            minpercidentity = 45
            blastdict, querylist, hitclusters = parse_blast(
                blastoutput, seq_record, minseqcoverage, minpercidentity)
            querylist = remove_queries_without_hits(querylist, blastdict)
            allcoregenes = [
                utils.get_gene_acc(cds)
                for cds in utils.get_secmet_cds_features(seq_record)
            ]
            # The third returned value (hit cluster dict) is not needed here.
            rankedclusters, rankedclustervalues, _, hitclusterdata = score_clusterblast_output(
                blastdict, querylist, hitclusters, clusters, allcoregenes)

            # store all clusterblast related data in a utils.Storage object
            # and hand it to the output writer
            subclusterblastStorage = utils.Storage()
            subclusterblastStorage.clusternumber = clusternumber
            subclusterblastStorage.queryclusterprots = queryclusterprots
            subclusterblastStorage.clusters = clusters
            subclusterblastStorage.hitclusterdata = hitclusterdata
            subclusterblastStorage.rankedclusters = rankedclusters
            subclusterblastStorage.rankedclustervalues = rankedclustervalues
            subclusterblastStorage.proteintags = proteintags
            subclusterblastStorage.proteinlocations = proteinlocations
            subclusterblastStorage.proteinannotations = proteinannotations
            subclusterblastStorage.proteinstrands = proteinstrands

            write_clusterblast_output(options,
                                      seq_record,
                                      subclusterblastStorage,
                                      searchtype="subclusters")
Exemple #4
0
def load_clusterblast_outputdata(seq_record, options):
    """Read in ClusterBlast data for every cluster of the record.

    Resets ``seq_record.queryclusterdata`` and ``seq_record.nrhitgeneclusters``
    to empty dicts (presumably filled by ``parse_clusterblast_details`` --
    verify against that helper), then for each cluster feature reads its
    clusterblast file, passes the parsed pieces on, and stores the top hit
    clusters in the feature's ``clusterblast`` qualifier.
    """
    seq_record.queryclusterdata = {}
    seq_record.nrhitgeneclusters = {}
    for cluster_feature in utils.get_sorted_cluster_features(seq_record):
        number = utils.get_cluster_number(cluster_feature)
        parsed = read_clusterblastfile(seq_record, options, number)
        (details, top_hits, nr_hits, query_genes, query_gene_details,
         accessions) = parsed
        parse_clusterblast_details(
            options, seq_record, number, details, top_hits, nr_hits,
            query_genes, query_gene_details, accessions)
        cluster_feature.qualifiers['clusterblast'] = top_hits
Exemple #5
0
def perform_clusterblast(options, seq_record, clusters, proteinlocations,
                         proteinstrands, proteinannotations, proteintags):
    """Run a DIAMOND search for each cluster of a record and write the results.

    For every cluster feature of ``seq_record`` the cluster's proteins are
    written to ``input.fasta``, searched with DIAMOND against
    ``plantgeneclusterprots`` (when ``options.taxon == "plants"``) or
    ``geneclusterprots``, converted to tabular form, parsed with
    ``minseqcoverage=10`` / ``minpercidentity=30``, scored, bundled into a
    ``utils.Storage`` object and passed to ``write_clusterblast_output``.

    When ``options.debug`` is truthy and the file
    ``<options.dbgclusterblast>/clusterblast/cluster<N>.txt`` exists, the
    search for cluster ``N`` is skipped.

    A non-zero return code from DIAMOND or from the conversion step is logged
    as an error; processing then continues as before.

    Args:
        options: configuration object; ``debug``, ``dbgclusterblast``,
            ``taxon``, ``clusterblastdir`` and ``full_outputfolder_path`` are
            read here, and the object is forwarded to the helper functions.
        seq_record: record whose cluster features are searched.
        clusters: passed to ``score_clusterblast_output`` and stored.
        proteinlocations, proteinstrands, proteinannotations, proteintags:
            stored unchanged on the ``utils.Storage`` object.
    """
    logging.info("Running DIAMOND gene cluster searches..")
    geneclusters = utils.get_sorted_cluster_features(seq_record)
    with TemporaryDirectory(change=True) as tempdir:
        for genecluster in geneclusters:
            clusternumber = utils.get_cluster_number(genecluster)
            if options.debug:
                # NOTE(review): if change=True switches the working directory,
                # a relative dbgclusterblast is resolved from inside the
                # temporary directory -- confirm this is intended.
                debug_file = (options.dbgclusterblast + os.sep +
                              "clusterblast" + os.sep + "cluster" +
                              str(clusternumber) + ".txt")
                if os.path.exists(debug_file):
                    # Pass the path as a logging argument: the previous
                    # '"%s" % dir + os.sep + ...' appended the path pieces
                    # after the formatted message.
                    logging.debug(
                        "Skipping Clusterblast calculations, using results from %s instead",
                        debug_file)
                    continue

            logging.info("   Gene cluster %s", clusternumber)
            queryclusternames, queryclusterseqs, queryclusterprots = create_blast_inputs(
                genecluster, seq_record)
            utils.writefasta(queryclusternames, queryclusterseqs,
                             "input.fasta")

            # Only the database name depends on the taxon.
            if options.taxon == "plants":
                database = "plantgeneclusterprots"
            else:
                database = "geneclusterprots"
            out, err, retcode = run_diamond(
                "input.fasta", path.join(options.clusterblastdir, database),
                tempdir, options)
            if retcode != 0:
                logging.error(
                    "Running diamond failed: returned %s, stderr: %r, stdout: %r",
                    retcode, err, out)
            out, err, retcode = convert_to_tabular(tempdir)
            if retcode != 0:
                logging.error(
                    "Converting daa failed: returned %s, stderr: %r, stdout: %r",
                    retcode, err, out)

            with open("input.out", 'r') as fh:
                blastoutput = fh.read()

            write_raw_clusterblastoutput(options.full_outputfolder_path,
                                         blastoutput)
            logging.info("   DIAMOND search finished. Parsing results...")
            minseqcoverage = 10
            minpercidentity = 30
            blastdict, querylist, hitclusters = parse_blast(
                blastoutput, seq_record, minseqcoverage, minpercidentity)
            querylist = remove_queries_without_hits(querylist, blastdict)
            allcoregenes = [
                utils.get_gene_acc(cds)
                for cds in utils.get_secmet_cds_features(seq_record)
            ]
            # The third returned value (hit cluster dict) is not needed here.
            rankedclusters, rankedclustervalues, _, hitclusterdata = score_clusterblast_output(
                blastdict, querylist, hitclusters, clusters, allcoregenes)

            # store all clusterblast related data in a utils.Storage object
            clusterblastStorage = utils.Storage()
            clusterblastStorage.clusternumber = clusternumber
            clusterblastStorage.queryclusterprots = queryclusterprots
            clusterblastStorage.clusters = clusters
            clusterblastStorage.hitclusterdata = hitclusterdata
            clusterblastStorage.rankedclusters = rankedclusters
            clusterblastStorage.rankedclustervalues = rankedclustervalues
            clusterblastStorage.proteintags = proteintags
            clusterblastStorage.proteinlocations = proteinlocations
            clusterblastStorage.proteinannotations = proteinannotations
            clusterblastStorage.proteinstrands = proteinstrands

            write_clusterblast_output(options, seq_record,
                                      clusterblastStorage)
Exemple #6
0
def internal_homology_blast(seq_record):
    """Find internal homologs within each gene cluster of a record.

    Each cluster's proteins are BLASTed against themselves; the parsed hits
    (thresholds 25 and 30) are passed to ``find_internal_orthologous_groups``
    together with the dict accumulated so far. Per the original note, the
    groups of homologs -- including singles -- are kept in a dictionary as a
    list of lists.

    Returns:
        The dict returned by the last ``find_internal_orthologous_groups``
        call, or an empty dict when the record has no cluster features.
    """
    options = config.get_config()  # not used below; call retained unchanged
    with TemporaryDirectory(change=True):
        logging.info("Finding internal homologs in each gene cluster..")
        groups_by_cluster = {}
        for cluster in utils.get_sorted_cluster_features(seq_record):
            number = utils.get_cluster_number(cluster)
            names, seqs, _ = create_blast_inputs(cluster, seq_record)
            utils.writefasta(names, seqs, "internal_input.fasta")
            raw_hits = run_internal_blastsearch()
            # Only the first of the three parsed values is needed here.
            blast_hits, _, _ = parse_blast(raw_hits, seq_record, 25, 30)
            groups_by_cluster = find_internal_orthologous_groups(
                groups_by_cluster, blast_hits, names, number)
    return groups_by_cluster
Exemple #7
0
def internal_homology_blast(seq_record):
    """Find internal homologs within each gene cluster of a record.

    Each cluster's proteins are BLASTed against themselves, the output is
    parsed with ``blastparse`` (thresholds 25 and 30) and grouped by
    ``find_internal_orthologous_groups``. Per the original note, groups of
    homologs -- including singles -- are stored as a list of lists.

    Returns:
        A dict mapping each cluster number to its groups.
    """
    with TemporaryDirectory(change=True):
        logging.debug("Finding internal homologs in each gene cluster..")
        groups_by_cluster = {}
        for cluster in utils.get_sorted_cluster_features(seq_record):
            number = utils.get_cluster_number(cluster)
            names, seqs, _ = create_blast_inputs(cluster, seq_record)
            utils.writefasta(names, seqs, "internal_input.fasta")
            raw_hits = run_internal_blastsearch()
            # Only the first of the two parsed values is needed here.
            queries, _ = blastparse(raw_hits, 25, 30, seq_record)
            groups_by_cluster[number] = find_internal_orthologous_groups(
                queries, names)
    return groups_by_cluster
Exemple #8
0
def perform_knownclusterblast(options, seq_record, clusters, proteins):
    """Run one DIAMOND knowncluster search for all clusters and write results.

    The proteins of every cluster feature are pooled into a single query.
    If ``options.dbgclusterblast`` is set and contains
    ``knownclusterblastoutput.txt``, that file is used instead of running
    DIAMOND. The output is parsed with ``minseqcoverage=40`` /
    ``minpercidentity=45``; each cluster is then scored and written with
    ``searchtype="knownclusters"``. Finally ``mibig_protein_homology`` is
    called with the raw output.

    Args:
        options: configuration object; ``dbgclusterblast``,
            ``knownclusterblastdir`` and ``full_outputfolder_path`` are read
            here, and the object is forwarded to the helper functions.
        seq_record: record whose cluster features are searched.
        clusters: passed to the scoring function and stored.
        proteins: stored unchanged on the ``utils.Storage`` object.
    """
    logging.debug("Running DIAMOND knowncluster searches..")
    geneclusters = utils.get_sorted_cluster_features(seq_record)

    all_names, all_seqs = [], []
    prots_by_cluster = []
    for genecluster in geneclusters:
        names, seqs, prots = clusterblast.create_blast_inputs(
            genecluster, seq_record)
        all_names.extend(names)
        all_seqs.extend(seqs)
        prots_by_cluster.append(prots)

    # Build the debug path only when the option is set; joining an unset
    # (None) value would raise before the guard below could reject it.
    debug_path = None
    if options.dbgclusterblast:
        debug_path = os.path.join(options.dbgclusterblast,
                                  "knownclusterblastoutput.txt")

    if debug_path is not None and os.path.exists(debug_path):
        logging.debug("Skipping DIAMOND calculations, using previous results")
        with open(debug_path, "r") as fh:
            blastoutput = fh.read()
    else:
        with TemporaryDirectory(change=True) as tempdir:
            utils.writefasta(
                [qcname.replace(" ", "_") for qcname in all_names], all_seqs,
                "input.fasta")
            out, err, retcode = clusterblast.run_diamond(
                "input.fasta",
                os.path.join(options.knownclusterblastdir,
                             'knownclusterprots'), tempdir, options)
            if retcode != 0:
                logging.debug("out: %r, err: %r, retcode: %s", out, err,
                              retcode)
            with open("input.out", 'r') as fh:
                blastoutput = fh.read()
            clusterblast.write_raw_clusterblastoutput(
                options.full_outputfolder_path,
                blastoutput,
                searchtype="knownclusters")

    minseqcoverage = 40
    minpercidentity = 45
    clusters_by_number, _ = clusterblast.parse_all_clusters(
        blastoutput, minseqcoverage, minpercidentity, seq_record)

    knownclusterblastStorage = utils.Storage()
    knownclusterblastStorage.clusters = clusters
    knownclusterblastStorage.proteins = proteins

    # Depends only on the record, so compute it once for all clusters.
    allcoregenes = [
        utils.get_gene_id(cds)
        for cds in utils.get_secmet_cds_features(seq_record)
    ]

    for genecluster, queryclusterprots in zip(geneclusters, prots_by_cluster):
        clusternumber = utils.get_cluster_number(genecluster)
        cluster_names_to_queries = clusters_by_number.get(clusternumber, {})
        ranking = clusterblast.score_clusterblast_output(
            clusters, allcoregenes, cluster_names_to_queries)

        # The same Storage object is reused; its per-cluster fields are
        # overwritten right before each write.
        knownclusterblastStorage.clusternumber = clusternumber
        knownclusterblastStorage.queryclusterprots = queryclusterprots
        knownclusterblastStorage.ranking = ranking
        clusterblast.write_clusterblast_output(options,
                                               seq_record,
                                               knownclusterblastStorage,
                                               searchtype="knownclusters")

    mibig_protein_homology(blastoutput, seq_record, geneclusters, clusters,
                           options)
Exemple #9
0
def perform_clusterblast(options, seq_record, clusters, proteins):
    """Run one DIAMOND gene cluster search for all clusters and write results.

    The proteins of every cluster feature are pooled into a single query.
    If ``options.dbgclusterblast`` is set and contains
    ``clusterblastoutput.txt``, that file is used instead of running DIAMOND.
    The raw output is written out in either case, parsed with
    ``minseqcoverage=10`` / ``minpercidentity=30``, and each cluster is then
    scored and written via ``write_clusterblast_output``.

    Args:
        options: configuration object; ``dbgclusterblast``,
            ``clusterblastdir`` and ``full_outputfolder_path`` are read here,
            and the object is forwarded to the helper functions.
        seq_record: record whose cluster features are searched.
        clusters: passed to the scoring function and stored.
        proteins: stored unchanged on the ``utils.Storage`` object.
    """
    geneclusters = utils.get_sorted_cluster_features(seq_record)

    # Resolve the debug file to an absolute path before entering the temporary
    # directory, and only when the option is set: joining an unset (None)
    # value would raise before the guard below could reject it.
    debug_path = None
    if options.dbgclusterblast:
        debug_path = os.path.abspath(
            os.path.join(options.dbgclusterblast, "clusterblastoutput.txt"))

    with TemporaryDirectory(change=True) as tempdir:
        all_names, all_seqs = [], []
        prots_by_cluster = []
        for genecluster in geneclusters:
            names, seqs, prots = create_blast_inputs(genecluster, seq_record)
            all_names.extend(names)
            all_seqs.extend(seqs)
            prots_by_cluster.append(prots)

        if debug_path is not None and os.path.exists(debug_path):
            logging.debug(
                "Skipping DIAMOND calculations, using results from %s instead",
                debug_path)
            with open(debug_path, "r") as fh:
                blastoutput = fh.read()
            logging.debug("    Parsing results from given file...")
        else:
            logging.debug("Running DIAMOND gene cluster search..")
            utils.writefasta(all_names, all_seqs, "input.fasta")
            out, err, retcode = run_diamond(
                "input.fasta",
                path.join(options.clusterblastdir, "geneclusterprots"),
                tempdir, options)
            if retcode != 0:
                logging.error(
                    "Running diamond failed: returned %s, stderr: %r, stdout: %r",
                    retcode, err, out)
            logging.debug("   DIAMOND search finished. Parsing results...")

            with open("input.out", 'r') as fh:
                blastoutput = fh.read()

        write_raw_clusterblastoutput(options.full_outputfolder_path,
                                     blastoutput)

        minseqcoverage = 10
        minpercidentity = 30
        clusters_by_number, _ = parse_all_clusters(blastoutput, minseqcoverage,
                                                   minpercidentity, seq_record)

        clusterblastStorage = utils.Storage()
        clusterblastStorage.clusters = clusters
        clusterblastStorage.proteins = proteins

        # Depends only on the record, so compute it once for all clusters.
        allcoregenes = [
            utils.get_gene_acc(cds)
            for cds in utils.get_secmet_cds_features(seq_record)
        ]

        for genecluster, queryclusterprots in zip(geneclusters,
                                                  prots_by_cluster):
            clusternumber = utils.get_cluster_number(genecluster)
            cluster_names_to_queries = clusters_by_number.get(
                clusternumber, {})
            ranking = score_clusterblast_output(clusters, allcoregenes,
                                                cluster_names_to_queries)

            # The same Storage object is reused; its per-cluster fields are
            # overwritten right before each write.
            clusterblastStorage.clusternumber = clusternumber
            clusterblastStorage.queryclusterprots = queryclusterprots
            clusterblastStorage.ranking = ranking

            write_clusterblast_output(options, seq_record, clusterblastStorage)
def perform_knownclusterblast(options, seq_record, clusters, proteinlocations,
                              proteinstrands, proteinannotations, proteintags):
    """Run a DIAMOND knowncluster search for each cluster and write results.

    For every cluster feature of ``seq_record`` the cluster's proteins are
    written to ``input.fasta`` (spaces in names replaced by underscores),
    searched with DIAMOND against ``knownclusterprots``, converted to tabular
    form, parsed with ``minseqcoverage=40`` / ``minpercidentity=45``, scored,
    bundled into a ``utils.Storage`` object and passed to
    ``write_clusterblast_output`` with ``searchtype="knownclusters"``.

    When ``options.debug`` is truthy and the file
    ``<options.dbgclusterblast>/knownclusterblast/cluster<N>.txt`` exists, the
    search for cluster ``N`` is skipped.

    Args:
        options: configuration object; ``debug``, ``dbgclusterblast``,
            ``knownclusterblastdir`` and ``full_outputfolder_path`` are read
            here, and the object is forwarded to the helper functions.
        seq_record: record whose cluster features are searched.
        clusters: passed to ``score_clusterblast_output`` and stored.
        proteinlocations, proteinstrands, proteinannotations, proteintags:
            stored unchanged on the ``utils.Storage`` object.
    """
    logging.info("Running DIAMOND knowncluster searches..")
    geneclusters = utils.get_sorted_cluster_features(seq_record)
    with TemporaryDirectory(change=True) as tempdir:
        for genecluster in geneclusters:
            clusternumber = utils.get_cluster_number(genecluster)
            if options.debug:
                # The directory name was previously misspelled
                # ("knwonclusterblast") in the existence check, so the check
                # and the logged path disagreed.
                # NOTE(review): if change=True switches the working directory,
                # a relative dbgclusterblast is resolved from inside the
                # temporary directory -- confirm this is intended.
                debug_file = (options.dbgclusterblast + os.sep +
                              "knownclusterblast" + os.sep + "cluster" +
                              str(clusternumber) + ".txt")
                if os.path.exists(debug_file):
                    # Pass the path as a logging argument: the previous
                    # '"%s" % dir + os.sep + ...' appended the path pieces
                    # after the formatted message.
                    logging.debug(
                        "Skipping KnownClusterblast calculations, using results from %s instead",
                        debug_file)
                    continue

            logging.info("   Gene cluster %s", clusternumber)
            queryclusternames, queryclusterseqs, queryclusterprots = create_blast_inputs(
                genecluster, seq_record)
            utils.writefasta(
                [qcname.replace(" ", "_") for qcname in queryclusternames],
                queryclusterseqs, "input.fasta")
            out, err, retcode = run_diamond(
                "input.fasta",
                path.join(options.knownclusterblastdir,
                          'knownclusterprots'), tempdir, options)
            if retcode != 0:
                logging.debug("out: %r, err: %r, retcode: %s", out, err,
                              retcode)
            convert_to_tabular(tempdir)
            with open("input.out", 'r') as fh:
                blastoutput = fh.read()
            write_raw_clusterblastoutput(options.full_outputfolder_path,
                                         blastoutput,
                                         searchtype="knownclusters")
            logging.info("   DIAMOND search finished. Parsing results...")
            minseqcoverage = 40
            minpercidentity = 45
            blastdict, querylist, hitclusters = parse_blast(
                blastoutput, seq_record, minseqcoverage, minpercidentity)
            querylist = remove_queries_without_hits(querylist, blastdict)
            allcoregenes = [
                utils.get_gene_id(cds)
                for cds in utils.get_secmet_cds_features(seq_record)
            ]
            # The third returned value (hit cluster dict) is not needed here.
            rankedclusters, rankedclustervalues, _, hitclusterdata = score_clusterblast_output(
                blastdict, querylist, hitclusters, clusters, allcoregenes)

            # store all clusterblast related data in a utils.Storage object
            # and hand it to the output writer
            knownclusterblastStorage = utils.Storage()
            knownclusterblastStorage.clusternumber = clusternumber
            knownclusterblastStorage.queryclusterprots = queryclusterprots
            knownclusterblastStorage.clusters = clusters
            knownclusterblastStorage.hitclusterdata = hitclusterdata
            knownclusterblastStorage.rankedclusters = rankedclusters
            knownclusterblastStorage.rankedclustervalues = rankedclustervalues
            knownclusterblastStorage.proteintags = proteintags
            knownclusterblastStorage.proteinlocations = proteinlocations
            knownclusterblastStorage.proteinannotations = proteinannotations
            knownclusterblastStorage.proteinstrands = proteinstrands

            write_clusterblast_output(options,
                                      seq_record,
                                      knownclusterblastStorage,
                                      searchtype="knownclusters")