def test_get_gene_annotation(self):
    """Test utils.get_gene_annotation() before and after a product is set."""
    cds_feature = FakeFeature("CDS")

    # Before this test assigns a 'product' qualifier, the placeholder
    # annotation is expected.
    annotation = utils.get_gene_annotation(cds_feature)
    self.assertEqual('unannotated orf', annotation)

    # Once a product is assigned, its first entry is expected back.
    cds_feature.qualifiers['product'] = ['fake']
    annotation = utils.get_gene_annotation(cds_feature)
    self.assertEqual('fake', annotation)
def write_gene(txt, info, options):
    """Write the gene table to TXT.

    One header row is written, followed by one tab-separated row per CDS
    feature of every cluster listed in ``info.clusternrs``.

    Columns: gene ID, gene start, gene end, gene strand, smCOG,
    locus_tag/geneID, annotation.

    ``txt`` only needs a ``write`` method; ``options`` is accepted but not
    read by this function.
    """
    header = (
        "gene ID",
        "gene start",
        "gene end",
        "gene strand",
        "smCOG",
        "locus_tag",
        "annotation",
    )
    txt.write("\t".join(header) + "\n")

    for cluster_number in info.clusternrs:
        cluster = utils.get_cluster_by_nr(info.seq_record, cluster_number)
        for cds in utils.get_cluster_cds_features(cluster, info.seq_record):
            row = [
                # Only the part before the first "." of the accession is kept
                utils.get_gene_acc(cds).partition(".")[0],
                str(cds.location.start),
                str(cds.location.end),
                # Anything other than strand == 1 is reported as "-"
                "+" if cds.strand == 1 else "-",
                "",  # smCOG column: not used for now
                utils.get_gene_id(cds).partition(".")[0],
                utils.get_gene_annotation(cds),
            ]
            txt.write("\t".join(row) + "\n")
def create_blast_inputs(genecluster, seq_record):
    """Create the query names and sequences for a BLAST search.

    Returns a tuple of three parallel lists, one entry per CDS feature of
    the cluster:

    * the "|"-separated query name
      (``input|c<cluster number>|<start>-<end>|<strand>|<accession>|<annotation>``),
    * the amino acid sequence as a string,
    * the gene accession.
    """
    names = []
    sequences = []
    accessions = []

    for cds in utils.get_cluster_cds_features(genecluster, seq_record):
        # Anything other than strand == 1 is reported as "-"
        strand = "+" if cds.strand == 1 else "-"

        cluster_label = "c" + str(utils.get_cluster_number(genecluster))
        span = str(cds.location.nofuzzy_start) + "-" + str(cds.location.nofuzzy_end)
        name_fields = [
            "input",
            cluster_label,
            span,
            strand,
            utils.get_gene_acc(cds),
            utils.get_gene_annotation(cds),
        ]

        sequences.append(str(utils.get_aa_sequence(cds)))
        names.append("|".join(name_fields))
        accessions.append(utils.get_gene_acc(cds))

    return names, sequences, accessions
def create_blast_inputs(genecluster, seq_record):
    """Create the query names and sequences for a BLAST search.

    When the configured taxon is "plants", the cluster's CDS features are
    first passed through ``filter_overlap``; otherwise they are used as-is.

    Returns a tuple of three parallel lists, one entry per CDS feature used:

    * the "|"-separated query name
      (``input|c<cluster number>|<start>-<end>|<strand>|<accession>|<annotation>``),
    * the amino acid sequence as a string,
    * the gene accession.
    """
    def plain_position(position):
        # Drop the ">" / "<" markers from the position's string form
        return str(position).replace(">", "").replace("<", "")

    options = config.get_config()
    use_overlap_filter = options.taxon == "plants"
    cluster_cds = utils.get_cluster_cds_features(genecluster, seq_record)
    if use_overlap_filter:
        cluster_cds = filter_overlap(cluster_cds)

    names = []
    sequences = []
    accessions = []

    for cds in cluster_cds:
        # Anything other than strand == 1 is reported as "-"
        strand = "+" if cds.strand == 1 else "-"

        cluster_label = "c" + str(utils.get_cluster_number(genecluster))
        span = plain_position(cds.location.start) + "-" + plain_position(cds.location.end)
        name_fields = [
            "input",
            cluster_label,
            span,
            strand,
            utils.get_gene_acc(cds),
            utils.get_gene_annotation(cds),
        ]

        sequences.append(str(utils.get_aa_sequence(cds)))
        names.append("|".join(name_fields))
        accessions.append(utils.get_gene_acc(cds))

    return names, sequences, accessions
def write_clusterblast_output(options, seq_record, clusterblastStorage, searchtype="general"):
    """Write the ClusterBlast text report for one query cluster.

    The report is written to ``cluster<clusternumber>.txt`` inside the
    directory returned by ``get_output_dir(options, searchtype)``. It
    contains, in order:

    * a table of the query cluster's genes (id, start, end, strand,
      annotation),
    * a numbered list of the top 100 ranked hit clusters,
    * for each of those hits: source, type, number of proteins with hits,
      cumulative BLAST score, the subject cluster's gene table and the
      table of pairwise BLAST hits.

    Arguments:
        options: passed through to ``get_output_dir``.
        seq_record: record whose ``id`` is written in the header and whose
            features are looked up by protein id.
        clusterblastStorage: object providing ``clusternumber``,
            ``queryclusterprots``, ``clusters``, ``ranking`` (a sequence of
            ``(cluster, result)`` pairs) and ``proteins``.
        searchtype: passed through to ``get_output_dir``.

    The working directory is changed while writing and is always restored,
    and the output file is always closed, even if writing fails part-way.
    """
    clusternumber = clusterblastStorage.clusternumber
    queryclusterprots = clusterblastStorage.queryclusterprots
    clusters = clusterblastStorage.clusters
    ranking = clusterblastStorage.ranking
    proteins = clusterblastStorage.proteins

    # Output for each hit: table of genes and locations of input cluster,
    # table of genes and locations of hit cluster, table of hits between
    # the clusters
    currentdir = os.getcwd()
    os.chdir(get_output_dir(options, searchtype))
    try:
        # The context manager guarantees the handle is closed on any error
        with open("cluster" + str(clusternumber) + ".txt", "w") as out_file:
            out_file.write("ClusterBlast scores for " + seq_record.id + "\n")
            out_file.write(
                "\nTable of genes, locations, strands and annotations of query cluster:\n"
            )
            feature_by_id = utils.get_feature_dict_protein_id(seq_record)
            for i in queryclusterprots:
                cds = feature_by_id[i]
                # Anything other than strand == 1 is reported as "-"
                strand = "+" if cds.strand == 1 else "-"
                out_file.write("\t".join([
                    i,
                    str(cds.location.nofuzzy_start),
                    str(cds.location.nofuzzy_end),
                    strand,
                    utils.get_gene_annotation(cds),
                ]) + "\t\n")

            out_file.write("\n\nSignificant hits: \n")
            top_hits = ranking[:100]
            for n, cluster_and_result in enumerate(top_hits):
                cluster = cluster_and_result[0]
                out_file.write("{}. {}\t{}\n".format(n + 1, cluster, clusters[cluster][1]))

            out_file.write("\n\nDetails:")
            for n, cluster_and_result in enumerate(top_hits):
                cluster, result = cluster_and_result
                # TODO: change to just result.hits during next minor version bump
                nrhits = result.hits + result.synteny_score + result.core_bonus
                out_file.write("\n\n>>\n")
                out_file.write("{}. {}\n".format(n + 1, cluster))
                out_file.write("Source: {}\n".format(clusters[cluster][1]))
                out_file.write("Type: {}\n".format(clusters[cluster][2]))
                out_file.write(
                    "Number of proteins with BLAST hits to this cluster: %d\n" % nrhits)
                out_file.write("Cumulative BLAST score: %d\n\n" % result.blast_score)
                out_file.write(
                    "Table of genes, locations, strands and annotations of subject cluster:\n"
                )
                clusterproteins = clusters[cluster][0]
                for protein_name in clusterproteins:
                    protein = proteins.get(protein_name)
                    # Proteins that are missing (or falsy) are silently skipped
                    if protein:
                        out_file.write(str(protein))
                out_file.write(
                    "\nTable of Blast hits (query gene, subject gene, %identity, blast score, %coverage, e-value):\n"
                )
                if result.scored_pairings:
                    for query, subject in result.scored_pairings:
                        # TODO : check the trailing \t is meaningful
                        out_file.write("{}\t{}\t\n".format(query.id, subject.get_table_string()))
                else:
                    out_file.write("data not found\n")
                out_file.write("\n")
    finally:
        # Restore the caller's working directory even when writing failed
        os.chdir(currentdir)
def write_clusterblast_output(options, seq_record, clusterblastStorage, searchtype="general"):
    """Write the ClusterBlast text report for one query cluster.

    The report is written to ``cluster<clusternumber>.txt`` in a
    subdirectory of ``options.full_outputfolder_path`` selected by
    ``searchtype`` ("general" -> ``clusterblast``, "subclusters" ->
    ``subclusterblast``, "knownclusters" -> ``knownclusterblast``). The
    directory is created if missing and its path is stored on ``options``
    as ``<name>_outputfolder``.

    The report contains the query cluster's gene table, a numbered list of
    the top 100 ranked hit clusters and, for every one of those with at
    least one hit, its details: source, type, hit count, cumulative BLAST
    score, subject gene table and the table of BLAST hits.

    Each entry of ``rankedclustervalues`` is decoded from its ``%.8f``
    rendering: the integer part is the number of proteins with hits, and
    the decimals after the first two encode the cumulative BLAST score.

    Raises:
        ValueError: if ``searchtype`` is not one of the three known values.

    The working directory is changed while writing and is always restored,
    and the output file is always closed, even if writing fails part-way.
    """
    clusternumber = clusterblastStorage.clusternumber
    queryclusterprots = clusterblastStorage.queryclusterprots
    clusters = clusterblastStorage.clusters
    hitclusterdata = clusterblastStorage.hitclusterdata
    rankedclusters = clusterblastStorage.rankedclusters
    rankedclustervalues = clusterblastStorage.rankedclustervalues
    proteintags = clusterblastStorage.proteintags
    proteinlocations = clusterblastStorage.proteinlocations
    proteinannotations = clusterblastStorage.proteinannotations
    proteinstrands = clusterblastStorage.proteinstrands

    # Output for each hit: table of genes and locations of input cluster,
    # table of genes and locations of hit cluster, table of hits between
    # the clusters
    logging.info(" Writing output file...")
    currentdir = os.getcwd()

    folder_names = {
        "general": "clusterblast",
        "subclusters": "subclusterblast",
        "knownclusters": "knownclusterblast",
    }
    if searchtype not in folder_names:
        # Previously this fell through to an unbound local variable
        raise ValueError("unknown searchtype: %r" % (searchtype,))
    folder_name = folder_names[searchtype]
    outputfolder = options.full_outputfolder_path + os.sep + folder_name
    # Keep publishing the folder on options, e.g. options.clusterblast_outputfolder
    setattr(options, folder_name + "_outputfolder", outputfolder)
    if not os.path.exists(outputfolder):
        os.mkdir(outputfolder)

    os.chdir(outputfolder)
    try:
        # The context manager guarantees the handle is closed on any error
        with open("cluster" + str(clusternumber) + ".txt", "w") as out_file:
            out_file.write("ClusterBlast scores for " + seq_record.id + "\n")
            out_file.write("\nTable of genes, locations, strands and annotations of query cluster:\n")
            feature_by_id = utils.get_feature_dict_protein_id(seq_record)
            for i in queryclusterprots:
                cds = feature_by_id[i]
                # Anything other than strand == 1 is reported as "-"
                strand = "+" if cds.strand == 1 else "-"
                out_file.write("\t".join([
                    i,
                    str(cds.location.start).replace(">", "").replace("<", ""),
                    str(cds.location.end).replace(">", "").replace("<", ""),
                    strand,
                    utils.get_gene_annotation(cds),
                ]) + "\t\n")

            top_clusters = rankedclusters[:100]

            out_file.write("\n\nSignificant hits: \n")
            for z, i in enumerate(top_clusters):
                out_file.write(str(z + 1) + ". " + i + "\t" + clusters[i][1] + "\n")

            out_file.write("\n\nDetails:")
            for z, i in enumerate(top_clusters):
                value = "%.8f" % rankedclustervalues[z]
                nrhits = value.split(".")[0]
                # nrhits is a string; compare numerically. The bare str/int
                # comparison was always true on Python 2 and raises
                # TypeError on Python 3.
                if int(nrhits) <= 0:
                    continue

                out_file.write("\n\n>>\n")
                cumblastscore = str(int(float(value.split(".")[1][2:])))
                out_file.write("\n".join([
                    str(z + 1) + ". " + i,
                    "Source: " + clusters[i][1],
                    "Type: " + clusters[i][2],
                    "Number of proteins with BLAST hits to this cluster: " + nrhits,
                    "Cumulative BLAST score: " + cumblastscore + "\n",
                    "Table of genes, locations, strands and annotations of subject cluster:\n",
                ]))

                clusterproteins = clusters[i][0]
                for j in clusterproteins:
                    # Only proteins with complete location/annotation/strand data are listed
                    if j in proteinlocations and j in proteinannotations and j in proteinstrands:
                        if proteintags[j] == "no_locus_tag":
                            out_file.write(j + "\t")
                        else:
                            out_file.write(proteintags[j] + "\t")
                        location_parts = proteinlocations[j].split("-")
                        out_file.write("\t".join([
                            j,
                            location_parts[0],
                            location_parts[1],
                            proteinstrands[j],
                            proteinannotations[j],
                        ]) + "\n")

                out_file.write("\nTable of Blast hits (query gene, subject gene, %identity, blast score, %coverage, e-value):\n")
                if i in hitclusterdata:
                    for row in hitclusterdata[i]:
                        for column, cell in enumerate(row):
                            if column == 0:
                                # First column is a "|"-separated query name;
                                # only its fifth field is written
                                out_file.write(str(cell).split("|")[4] + "\t")
                            else:
                                out_file.write(str(cell) + "\t")
                        out_file.write("\n")
                else:
                    out_file.write("data not found\n")
                out_file.write("\n")
    finally:
        # Restore the caller's working directory even when writing failed
        os.chdir(currentdir)