コード例 #1
0
ファイル: test_utils.py プロジェクト: chevrm/transPACT
    def test_get_gene_annotation(self):
        "Test utils.get_gene_annotation()"
        # A freshly built fake CDS feature is expected to yield the
        # placeholder annotation.
        cds = FakeFeature("CDS")
        annotation = utils.get_gene_annotation(cds)
        self.assertEqual('unannotated orf', annotation)

        # After a 'product' qualifier is set, that product value is expected.
        cds.qualifiers['product'] = ['fake']
        annotation = utils.get_gene_annotation(cds)
        self.assertEqual('fake', annotation)
コード例 #2
0
def write_gene(txt, info, options):
    """Write gene table to TXT.

    Writes a tab-separated header line to ``txt`` and then one line per CDS
    feature of every cluster number in ``info.clusternrs``.  The columns
    are: gene ID, gene start, gene end, gene strand, smCOG, locus_tag and
    annotation.

    ``options`` is accepted for interface compatibility; this function does
    not read it.
    """
    header = ("gene ID", "gene start", "gene end", "gene strand", "smCOG",
              "locus_tag", "annotation")
    txt.write("\t".join(header) + "\n")

    for cluster_nr in info.clusternrs:
        # Look up the cluster feature and the CDS features belonging to it.
        cluster = utils.get_cluster_by_nr(info.seq_record, cluster_nr)
        features = utils.get_cluster_cds_features(cluster, info.seq_record)
        for feature in features:
            # Only the text before the first "." of the accession is kept.
            accession = utils.get_gene_acc(feature).partition(".")[0]
            start = str(feature.location.start)
            end = str(feature.location.end)
            strand_symbol = "+" if feature.strand == 1 else "-"
            smcog = ""  # not used for now; the column is written empty
            # Same truncation at the first "." for the gene id.
            locus_tag = utils.get_gene_id(feature).partition(".")[0]
            annotation = utils.get_gene_annotation(feature)
            row = (accession, start, end, strand_symbol, smcog, locus_tag,
                   annotation)
            txt.write("\t".join(row) + "\n")
コード例 #3
0
ファイル: clusterblast.py プロジェクト: chevrm/transPACT
def create_blast_inputs(genecluster, seq_record):
    """Collect the BLAST query data for the CDS features of a gene cluster.

    Returns a tuple of three parallel lists with one entry per CDS feature
    returned by ``utils.get_cluster_cds_features``:

    * the query names, "|"-joined from: "input", "c<cluster number>",
      "<start>-<end>", strand symbol, gene accession and gene annotation;
    * the amino-acid sequences converted to ``str``;
    * the gene accessions.
    """
    names = []
    sequences = []
    accessions = []
    for feature in utils.get_cluster_cds_features(genecluster, seq_record):
        strand_symbol = "+" if feature.strand == 1 else "-"
        cluster_tag = "c" + str(utils.get_cluster_number(genecluster))
        span = "%s-%s" % (feature.location.nofuzzy_start,
                          feature.location.nofuzzy_end)
        name_fields = [
            "input",
            cluster_tag,
            span,
            strand_symbol,
            utils.get_gene_acc(feature),
            utils.get_gene_annotation(feature),
        ]
        sequences.append(str(utils.get_aa_sequence(feature)))
        names.append("|".join(name_fields))
        accessions.append(utils.get_gene_acc(feature))

    return names, sequences, accessions
コード例 #4
0
ファイル: clusterblast.py プロジェクト: abner24/plantismash
def create_blast_inputs(genecluster, seq_record):
    """Collect the BLAST query data for the CDS features of a gene cluster.

    When the configured taxon is "plants" the cluster's CDS features are
    first passed through ``filter_overlap``; otherwise they are used as is.

    Returns a tuple of three parallel lists with one entry per CDS feature:

    * the query names, "|"-joined from: "input", "c<cluster number>",
      "<start>-<end>", strand symbol, gene accession and gene annotation;
    * the amino-acid sequences converted to ``str``;
    * the gene accessions.
    """
    def _plain_position(position):
        # String form of the position with any ">" and "<" characters removed.
        return str(position).replace(">", "").replace("<", "")

    options = config.get_config()
    filter_needed = options.taxon == "plants"
    cluster_features = utils.get_cluster_cds_features(genecluster, seq_record)
    if filter_needed:
        cluster_features = filter_overlap(cluster_features)

    names = []
    sequences = []
    accessions = []
    for feature in cluster_features:
        strand_symbol = "+" if feature.strand == 1 else "-"
        cluster_tag = "c" + str(utils.get_cluster_number(genecluster))
        span = (_plain_position(feature.location.start) + "-" +
                _plain_position(feature.location.end))
        name_fields = [
            "input",
            cluster_tag,
            span,
            strand_symbol,
            utils.get_gene_acc(feature),
            utils.get_gene_annotation(feature),
        ]
        sequences.append(str(utils.get_aa_sequence(feature)))
        names.append("|".join(name_fields))
        accessions.append(utils.get_gene_acc(feature))

    return names, sequences, accessions
コード例 #5
0
ファイル: clusterblast.py プロジェクト: chevrm/transPACT
def write_clusterblast_output(options,
                              seq_record,
                              clusterblastStorage,
                              searchtype="general"):
    """Write the ClusterBlast result file for one query cluster.

    The file "cluster<clusternumber>.txt" is created inside the directory
    returned by ``get_output_dir(options, searchtype)``.  It contains:

    * a table of the query cluster's genes (id, start, end, strand,
      annotation);
    * a numbered list of the first 100 entries of the ranking;
    * for each of those entries a detail section: source, type, number of
      hits, cumulative BLAST score, the subject cluster's proteins and the
      table of scored query/subject pairings.

    The process working directory is changed while writing and is always
    restored afterwards, and the file is always closed, even when writing
    fails part-way through.
    """
    clusternumber = clusterblastStorage.clusternumber
    queryclusterprots = clusterblastStorage.queryclusterprots
    clusters = clusterblastStorage.clusters
    ranking = clusterblastStorage.ranking
    proteins = clusterblastStorage.proteins

    #Output for each hit: table of genes and locations of input cluster, table of genes and locations of hit cluster, table of hits between the clusters
    currentdir = os.getcwd()
    os.chdir(get_output_dir(options, searchtype))
    try:
        with open("cluster" + str(clusternumber) + ".txt", "w") as out_file:
            out_file.write("ClusterBlast scores for " + seq_record.id + "\n")
            out_file.write(
                "\nTable of genes, locations, strands and annotations of query cluster:\n"
            )
            feature_by_id = utils.get_feature_dict_protein_id(seq_record)
            for protein_id in queryclusterprots:
                cds = feature_by_id[protein_id]
                strand = "+" if cds.strand == 1 else "-"
                out_file.write("\t".join([
                    protein_id,
                    str(cds.location.nofuzzy_start),
                    str(cds.location.nofuzzy_end), strand,
                    utils.get_gene_annotation(cds)
                ]) + "\t\n")

            out_file.write("\n\nSignificant hits: \n")
            top_hits = ranking[:100]
            for position, entry in enumerate(top_hits, 1):
                cluster = entry[0]
                out_file.write("{}. {}\t{}\n".format(position, cluster,
                                                     clusters[cluster][1]))

            out_file.write("\n\nDetails:")
            for position, (cluster, result) in enumerate(top_hits, 1):
                # TODO: change to just result.hits during next minor version bump
                nrhits = result.hits + result.synteny_score + result.core_bonus
                out_file.write("\n\n>>\n")
                out_file.write("{}. {}\n".format(position, cluster))
                out_file.write("Source: {}\n".format(clusters[cluster][1]))
                out_file.write("Type: {}\n".format(clusters[cluster][2]))
                out_file.write(
                    "Number of proteins with BLAST hits to this cluster: %d\n" %
                    nrhits)
                out_file.write(
                    "Cumulative BLAST score: %d\n\n" % result.blast_score)
                out_file.write(
                    "Table of genes, locations, strands and annotations of subject cluster:\n"
                )
                # Proteins missing from the lookup (or falsy) are skipped.
                for protein_name in clusters[cluster][0]:
                    protein = proteins.get(protein_name)
                    if protein:
                        out_file.write(str(protein))
                out_file.write(
                    "\nTable of Blast hits (query gene, subject gene, %identity, blast score, %coverage, e-value):\n"
                )
                if result.scored_pairings:
                    for query, subject in result.scored_pairings:
                        # TODO : check the trailing \t is meaningful
                        out_file.write("{}\t{}\t\n".format(
                            query.id, subject.get_table_string()))
                else:
                    out_file.write("data not found\n")
                out_file.write("\n")
    finally:
        # Restore the caller's working directory even if writing failed.
        os.chdir(currentdir)
コード例 #6
0
ファイル: clusterblast.py プロジェクト: abner24/plantismash
def write_clusterblast_output(options, seq_record, clusterblastStorage, searchtype="general"):
    """Write the ClusterBlast result file for one query cluster.

    Depending on ``searchtype`` ("general", "subclusters" or
    "knownclusters") the output folder "clusterblast", "subclusterblast" or
    "knownclusterblast" below ``options.full_outputfolder_path`` is recorded
    on ``options`` (as ``clusterblast_outputfolder``,
    ``subclusterblast_outputfolder`` or ``knownclusterblast_outputfolder``),
    created if missing, and the file "cluster<clusternumber>.txt" is
    written into it.

    The file holds the query cluster's gene table, a numbered list of the
    first 100 ranked clusters and, for each of those with at least one hit,
    a detail section with the subject cluster's proteins and the table of
    BLAST hits.

    Each ranked cluster value is decoded from its "%.8f" rendering: the
    integer part is the number of hits, and the fractional digits after the
    first two are the cumulative BLAST score.

    The working directory is changed while writing and always restored; the
    file is always closed.

    Raises:
        ValueError: if ``searchtype`` is not one of the three known values.
    """
    clusternumber = clusterblastStorage.clusternumber
    queryclusterprots = clusterblastStorage.queryclusterprots
    clusters = clusterblastStorage.clusters
    hitclusterdata = clusterblastStorage.hitclusterdata
    rankedclusters = clusterblastStorage.rankedclusters
    rankedclustervalues = clusterblastStorage.rankedclustervalues
    proteintags = clusterblastStorage.proteintags
    proteinlocations = clusterblastStorage.proteinlocations
    proteinannotations = clusterblastStorage.proteinannotations
    proteinstrands = clusterblastStorage.proteinstrands

    #Output for each hit: table of genes and locations of input cluster, table of genes and locations of hit cluster, table of hits between the clusters
    logging.info("   Writing output file...")

    # searchtype -> (attribute recorded on options, folder name)
    folder_by_searchtype = {
        "general": ("clusterblast_outputfolder", "clusterblast"),
        "subclusters": ("subclusterblast_outputfolder", "subclusterblast"),
        "knownclusters": ("knownclusterblast_outputfolder", "knownclusterblast"),
    }
    if searchtype not in folder_by_searchtype:
        # Previously this surfaced as an UnboundLocalError further down.
        raise ValueError("unknown searchtype: %r" % (searchtype,))
    attribute_name, folder_name = folder_by_searchtype[searchtype]
    outputfolder = options.full_outputfolder_path + os.sep + folder_name
    setattr(options, attribute_name, outputfolder)
    if not os.path.exists(outputfolder):
        os.mkdir(outputfolder)

    currentdir = os.getcwd()
    os.chdir(outputfolder)
    try:
        with open("cluster" + str(clusternumber) + ".txt", "w") as out_file:
            out_file.write("ClusterBlast scores for " + seq_record.id + "\n")
            out_file.write("\nTable of genes, locations, strands and annotations of query cluster:\n")
            feature_by_id = utils.get_feature_dict_protein_id(seq_record)
            for protein_id in queryclusterprots:
                cds = feature_by_id[protein_id]
                strand = "+" if cds.strand == 1 else "-"
                start = str(cds.location.start).replace(">", "").replace("<", "")
                end = str(cds.location.end).replace(">", "").replace("<", "")
                out_file.write("\t".join([protein_id, start, end, strand, utils.get_gene_annotation(cds)]) + "\t\n")

            out_file.write("\n\nSignificant hits: \n")
            top_clusters = rankedclusters[:100]
            for rank, name in enumerate(top_clusters, 1):
                out_file.write(str(rank) + ". " + name + "\t" + clusters[name][1] + "\n")

            out_file.write("\n\nDetails:")
            # enumerate keeps the index into rankedclustervalues aligned with
            # the cluster even when a cluster is skipped below.
            for index, name in enumerate(top_clusters):
                value = "%.8f" % rankedclustervalues[index]
                nrhits, _, fraction = value.partition(".")
                # Compare numerically: nrhits is a string, so the original
                # "nrhits > 0" was always true on Python 2 and a TypeError
                # on Python 3.
                if int(nrhits) <= 0:
                    continue
                out_file.write("\n\n>>\n")
                cumblastscore = str(int(float(fraction[2:])))
                out_file.write("\n".join([
                    str(index + 1) + ". " + name,
                    "Source: " + clusters[name][1],
                    "Type: " + clusters[name][2],
                    "Number of proteins with BLAST hits to this cluster: " + nrhits,
                    "Cumulative BLAST score: " + cumblastscore + "\n",
                    "Table of genes, locations, strands and annotations of subject cluster:\n",
                ]))
                for protein in clusters[name][0]:
                    # Only proteins with location, annotation and strand
                    # information are listed.
                    if (protein in proteinlocations
                            and protein in proteinannotations
                            and protein in proteinstrands):
                        tag = proteintags[protein]
                        if tag == "no_locus_tag":
                            # Fall back to the protein name as first column.
                            tag = protein
                        location = proteinlocations[protein].split("-")
                        out_file.write("\t".join([
                            tag, protein, location[0], location[1],
                            proteinstrands[protein], proteinannotations[protein],
                        ]) + "\n")
                out_file.write("\nTable of Blast hits (query gene, subject gene, %identity, blast score, %coverage, e-value):\n")
                if name in hitclusterdata:
                    for row in hitclusterdata[name]:
                        for column, cell in enumerate(row):
                            if column == 0:
                                # First column: keep only the fifth
                                # "|"-separated field of the entry.
                                out_file.write(str(cell).split("|")[4] + "\t")
                            else:
                                out_file.write(str(cell) + "\t")
                        out_file.write("\n")
                else:
                    out_file.write("data not found\n")
                out_file.write("\n")
    finally:
        # Restore the caller's working directory even if writing failed.
        os.chdir(currentdir)