def __init__(self, query, database, protein=False, formatdb=False, best_hit_only=True):
    """Record the query/database locations and the search options.

    ``query`` and ``database`` may each be either a path (``str``) or
    sequence data (a ``SeqRecord`` or a list).  Sequence data is written
    as FASTA to a temporary file created with ``delete=False`` (so it
    stays on disk) and the path of that file is stored instead.

    Args:
        query: FASTA path, ``SeqRecord`` or list of records.
        database: FASTA path, ``SeqRecord`` or list of records.
        protein: stored as ``self.protein``.
        formatdb: stored as ``self.formatdb``.
        best_hit_only: stored as ``self.best_hit_only``.

    Raises:
        TypeError: if ``query`` or ``database`` is neither a string nor
            sequence data.
    """
    import os
    from tempfile import NamedTemporaryFile

    from Bio import SeqRecord, SeqIO
    from chlamdb.biosqldb import shell_command

    def as_fasta_path(source):
        # Sequence data: dump it to a persistent temporary FASTA file and
        # close the handle so the content is fully flushed to disk.
        if isinstance(source, (list, SeqRecord.SeqRecord)):
            with NamedTemporaryFile(delete=False, mode="w") as handle:
                SeqIO.write(source, handle, 'fasta')
            return handle.name
        # Already a path.  (The former ``unicode`` check raised NameError on
        # Python 3; ``str`` covers all text there.)
        if isinstance(source, str):
            return source
        raise TypeError('wrong input format: either SeqRecord or string')

    self.query = as_fasta_path(query)
    self.database = as_fasta_path(database)

    self.protein = protein
    self.best_hit_only = best_hit_only
    self.formatdb = formatdb
    self.working_dir = os.getcwd()

    # NOTE(review): if shell_command runs this string in a child shell the
    # exported variable will not outlive that shell -- confirm this has the
    # intended effect.
    self.blast_path_var = 'export BLASTDB=/tmp/'
    shell_command.shell_command(self.blast_path_var)
def format_database(self):
    """Run ``formatdb`` on ``self.database`` and set ``self.database_path``.

    ``-p T`` is used when ``self.protein`` is truthy, ``-p F`` otherwise.
    ``self.database_path`` is set to ``/tmp/<id>.temp`` where ``<id>`` comes
    from ``self.id_generator(8)``.

    NOTE(review): the formatdb command used here does not mention that id
    (an older variant that passed ``-n /tmp/<id>.temp`` is no longer used),
    so confirm that ``self.database_path`` points at something that exists.
    """
    from chlamdb.biosqldb import shell_command

    random_name = self.id_generator(8)

    if self.protein:
        command_template = 'formatdb -i %s -p T'
    else:
        command_template = 'formatdb -i %s -p F'
    shell_command.shell_command(command_template % (self.database))

    self.database_path = '/tmp/%s.temp' % random_name
def run_prodigal(fasta_seq, output_name='temp.faa'):
    """Run Prodigal on a FASTA file and return the protein output path.

    The captured standard output and standard error of Prodigal are
    printed.  Afterwards every ``*`` character is removed from
    ``output_name`` in place with ``sed``.

    Args:
        fasta_seq: path passed to ``prodigal -i``.
        output_name: path passed to ``prodigal -a`` (protein translations).

    Returns:
        ``output_name``, unchanged.
    """
    from chlamdb.biosqldb import shell_command

    # -q quiet
    # -a write protein translations to the selected file
    # -i input file
    # -c (closed ends, no genes running off edges) is not activated
    cmd = "prodigal -q -a %s -i %s" % (output_name, fasta_seq)
    std_out, std_err, _code = shell_command.shell_command(cmd)
    print(std_out)
    print(std_err)

    # Drop the '*' characters from the translations
    # (presumably the stop-codon markers -- TODO confirm).
    shell_command.shell_command('sed -i "s/*//g" %s' % output_name)
    return output_name
def _first_qualifier(feature, key):
    """Return the first value of qualifier ``key``, or None when unavailable."""
    try:
        return feature.qualifiers[key][0]
    except (KeyError, IndexError, TypeError):
        return None


def _iter_orthologous_cds_pairs(features_x, features_y):
    """Yield ``(cds_x, cds_y, orthogroup)`` for CDS pairs sharing an orthogroup.

    Pairs come out in the order of a nested ``for x: for y:`` scan, but the
    CDS of ``features_y`` are indexed by orthogroup first so that the scan is
    not quadratic.  CDS without an ``orthogroup`` qualifier never pair.
    """
    by_group = {}
    for feature in features_y:
        if feature.type != "CDS":
            continue
        group = _first_qualifier(feature, "orthogroup")
        if group is not None:
            by_group.setdefault(group, []).append(feature)

    for feature in features_x:
        if feature.type != "CDS":
            continue
        group = _first_qualifier(feature, "orthogroup")
        if group is None:
            continue
        for partner in by_group.get(group, ()):
            yield feature, partner, group


def _orient_records_like_first(region_record_list):
    """Reverse-complement records (in place) to follow their predecessor.

    Each record is compared with the previous one: when more shared
    orthogroup CDS pairs lie on different strands than on the same strand,
    the record is replaced in the list by its reverse complement.  Every
    record ends up with ``name`` set to its ``description``.
    """
    region_record_list[0].name = region_record_list[0].description
    for x in range(len(region_record_list) - 1):
        same_strand_count = 0
        different_strand_count = 0
        pairs = _iter_orthologous_cds_pairs(region_record_list[x].features,
                                            region_record_list[x + 1].features)
        for cds_x, cds_y, _group in pairs:
            if cds_x.location.strand == cds_y.location.strand:
                same_strand_count += 1
            else:
                different_strand_count += 1

        neighbour = region_record_list[x + 1]
        if different_strand_count > same_strand_count:
            region_record_list[x + 1] = neighbour.reverse_complement(
                id=neighbour.id, name=neighbour.description)
        else:
            neighbour.name = neighbour.description


def _add_cross_links(gd_diagram, cursor, identity_to_color, set_x, set_y,
                     features_x, features_y):
    """Add one cross-link per orthologous CDS pair between two tracks.

    The link colour is derived from the identity returned by
    ``orthogroup_identity_db.check_identity``; when that lookup fails the
    identity falls back to 0 and a message is printed.
    """
    for cds_x, cds_y, group in _iter_orthologous_cds_pairs(features_x, features_y):
        try:
            identity = orthogroup_identity_db.check_identity(
                cursor, group,
                cds_x.qualifiers["locus_tag"][0],
                cds_y.qualifiers["locus_tag"][0])
        except Exception:
            identity = 0
            # Report BOTH loci of the pair (the second one used to be a
            # copy of the first).
            print("problem with identity table %s and locus %s %s" %
                  (group,
                   _first_qualifier(cds_x, "locus_tag"),
                   _first_qualifier(cds_y, "locus_tag")))

        link_color = identity_to_color(identity)
        link_border = identity_to_color(identity)

        # Strand-less light grey boxes serve as anchors for the link.
        anchor_x = set_x.add_feature(
            SeqFeature(FeatureLocation(cds_x.location.start,
                                       cds_x.location.end,
                                       strand=0)),
            color=colors.lightgrey,
            border=colors.lightgrey,
            set_id=cds_x.qualifiers["locus_tag"])
        anchor_y = set_y.add_feature(
            SeqFeature(FeatureLocation(cds_y.location.start,
                                       cds_y.location.end,
                                       strand=0)),
            color=colors.lightgrey,
            border=colors.lightgrey)
        gd_diagram.cross_track_links.append(
            CrossLink(anchor_x, anchor_y, link_color, link_border))


def _draw_record_features(gd_feature_set, record, base_colors,
                          target_protein_list, color_locus_list,
                          color_orthogroup_list):
    """Draw the features of ``record`` and return the locus tags drawn.

    ``base_colors`` is the pair of alternating colours used for ordinary
    CDS.  Locus tags are collected for rRNA, tRNA and CDS features.
    """
    highlight_colors = (colors.HexColor('#ca4700'), colors.HexColor('#fd7a32'))
    one_row_locus = []

    for feature in record.features:
        if feature.type == "tblast_target":
            feature.name = 'match'
            gd_feature_set.add_feature(feature, sigil="BOX",
                                       color="#ff4a0c86", label=False,
                                       label_position="middle",
                                       label_size=25, label_angle=0)
        if feature.type == "assembly_gap":
            feature.location.strand = None
            gd_feature_set.add_feature(feature, sigil="BOX", color="red",
                                       label=True, label_position="middle",
                                       label_strand=1, label_size=14,
                                       label_angle=40)
        if feature.type in ("rRNA", "tRNA"):
            gd_feature_set.add_feature(feature, sigil="ARROW", color="orange",
                                       label=True, label_position="middle",
                                       label_strand=1, label_size=10,
                                       label_angle=40)
            locus = _first_qualifier(feature, "locus_tag")
            if locus is not None:
                one_row_locus.append(locus)
            elif feature.type == "tRNA":
                # only tRNA without locus tag are reported
                print('no locus tag for:')
                print(feature)
        if feature.type == "repeat_region":
            gd_feature_set.add_feature(feature, sigil="BOX", color="blue",
                                       label=True, label_position="middle",
                                       label_strand=1, label_size=14,
                                       label_angle=40)

        if 'pseudo' in feature.qualifiers:
            gd_feature_set.add_feature(feature, sigil="OCTO", color="#6E6E6E",
                                       label=True, label_position="middle",
                                       label_strand=1, label_size=10,
                                       label_angle=40)
            continue
        if feature.type != "CDS":
            continue

        locus = _first_qualifier(feature, "locus_tag")
        group = _first_qualifier(feature, "orthogroup")
        if locus is None or group is None:
            # case of pseudogenes that are CDS but lack these identifiers
            continue

        # Highlight when EITHER the locus or its orthogroup was requested.
        # (The locus-based choice used to be overwritten by the orthogroup
        # test, so color_locus_list had no effect.)
        if locus in color_locus_list or group in color_orthogroup_list:
            palette = highlight_colors
        else:
            palette = base_colors
        # alternate the two colours of the palette along the track
        color = palette[len(gd_feature_set) % 2]

        if any(target in feature.qualifiers["locus_tag"]
               for target in target_protein_list):
            color = colors.red

        gd_feature_set.add_feature(feature, sigil="ARROW", color=color,
                                   label=True, label_position="middle",
                                   label_strand=1, label_size=10,
                                   label_angle=40)
        one_row_locus.append(locus)

    return one_row_locus


def plot_multiple_regions_crosslink(target_protein_list, region_record_list, plasmid_list, out_name, biodb_name="chlamydia_03_15", color_locus_list=None, flip_record_based_on_first=True, color_orthogroup_list=None):
    """Draw several genomic regions as stacked tracks with orthology links.

    One track is created per record.  CDS of consecutive records that share
    an ``orthogroup`` qualifier are joined by a cross-link whose colour
    encodes the identity looked up through
    ``orthogroup_identity_db.check_identity`` (``Blues`` colour map,
    normalised between -30 and 100).  The SVG is post-processed by
    ``edit_svg.edit_svg``, written to ``out_name`` and made read-only
    (``chmod 444``).

    Side effects: ``region_record_list`` is modified in place when
    ``flip_record_based_on_first`` is true (names replaced by descriptions,
    records possibly replaced by their reverse complement).  The MySQL
    password is read from the ``SQLPSW`` environment variable.

    Args:
        target_protein_list: locus tags drawn in red.
        region_record_list: records to draw, first record on the top track.
        plasmid_list: one entry per record; a truthy entry selects the
            plasmid colour pair for that record.
        out_name: path of the SVG file to write.
        biodb_name: suffix of the ``orth_<biodb_name>`` database.
        color_locus_list: locus tags to highlight (default: none).
        flip_record_based_on_first: orient each record like the previous one.
        color_orthogroup_list: orthogroups to highlight (default: none).

    Returns:
        list of the locus tags drawn; the loci of each record are prepended,
        so those of the last record come first.
    """
    import io
    import os

    import matplotlib as mpl
    import matplotlib.cm as cm
    from matplotlib.colors import rgb2hex
    import MySQLdb

    from chlamdb.biosqldb import shell_command
    from chlamdb.plots import edit_svg

    if color_locus_list is None:
        color_locus_list = []
    if color_orthogroup_list is None:
        color_orthogroup_list = []

    sqlpsw = os.environ['SQLPSW']

    identity_scale = cm.ScalarMappable(
        norm=mpl.colors.Normalize(vmin=-30, vmax=100), cmap=cm.Blues)

    def identity_to_color(identity):
        return colors.HexColor(rgb2hex(identity_scale.to_rgba(float(identity))))

    # NOTE(review): the user name below looks like a redacted placeholder.
    conn = MySQLdb.connect(
        host="127.0.0.1",  # your host, usually localhost
        user="******",  # your username
        passwd=sqlpsw,  # your password
        db="orth_%s" % biodb_name)  # name of the data base
    try:
        cursor = conn.cursor()

        gd_diagram = GenomeDiagram.Diagram("geomic_region")
        n_records = len(region_record_list)
        record_length = [len(record) for record in region_record_list]

        if flip_record_based_on_first:
            _orient_records_like_first(region_record_list)

        # One track (with a single feature set) per record; the first
        # record gets the highest track number.
        feature_sets = []
        max_len = 0
        for i, record in enumerate(region_record_list):
            max_len = max(max_len, len(record))
            track = gd_diagram.new_track(n_records - 1 - i,
                                         name=record.name,
                                         greytrack=True,
                                         height=0.4,
                                         start=0,
                                         end=len(record))
            feature_sets.append(track.new_set())

        for x in range(n_records - 1):
            _add_cross_links(gd_diagram, cursor, identity_to_color,
                             feature_sets[x], feature_sets[x + 1],
                             region_record_list[x].features,
                             region_record_list[x + 1].features)
    finally:
        # the database is only needed for the identity lookups above
        conn.close()

    all_locus = []
    for n, record in enumerate(region_record_list):
        if plasmid_list[n]:
            base_colors = (colors.HexColor('#2837B7'), colors.blue)
        else:
            base_colors = (colors.HexColor('#40F13A'),
                           colors.HexColor('#0F600C'))
        one_row_locus = _draw_record_features(feature_sets[n], record,
                                              base_colors,
                                              target_protein_list,
                                              color_locus_list,
                                              color_orthogroup_list)
        all_locus = one_row_locus + all_locus

    # Page size: 150 units of height per record (300 for exactly two
    # records), width proportional to the longest record.
    if n_records == 2:
        height = 300
    else:
        height = 150 * n_records
    width = max(record_length) / 30
    orientation = 'portrait' if height > width else 'landscape'
    gd_diagram.draw(format="linear",
                    pagesize=(height, width),
                    orientation=orientation,
                    fragments=1,
                    start=0,
                    end=max_len)

    svg_diagram = io.StringIO()
    gd_diagram.write(svg_diagram, "SVG")
    with_links = edit_svg.edit_svg(svg_diagram.getvalue(), all_locus,
                                   biodb_name)
    with_links.write(out_name)

    shell_command.shell_command('chmod 444 %s' % out_name)
    return all_locus
def setup_blastdb(biodb, static_dir_path):
    """Export every accession of ``biodb`` and build BLAST databases.

    Under ``<static_dir_path>/<biodb>`` the sub-directories ``faa``, ``fna``,
    ``ffn``, ``gbk`` and ``tab`` are created and filled with one file per
    accession listed in ``orthology_detail_<biodb>``.  The faa/ffn/fna files
    are then concatenated into ``all.<ext>`` and ``makeblastdb`` is run on
    every file of those three directories.

    Args:
        biodb: name of the biosql database.
        static_dir_path: directory that receives the ``<biodb>`` folder.

    Raises:
        FileExistsError: if one of the five sub-directories already exists
            (only the ``<biodb>`` folder itself may pre-exist).
    """
    import os
    import shlex

    from Bio import SeqIO
    from chlamdb.biosqldb import manipulate_biosqldb
    from chlamdb.biosqldb import gbk2fna
    from chlamdb.biosqldb import gbk2faa
    from chlamdb.biosqldb import gbk2ffn
    from chlamdb.biosqldb import gbk2table
    from chlamdb.biosqldb import shell_command

    server, db = manipulate_biosqldb.load_db(biodb)

    sql1 = 'select distinct accession from orthology_detail_%s' % biodb
    accession_list = [
        row[0] for row in server.adaptor.execute_and_fetchall(sql1, )
    ]

    db_static_path = os.path.join(static_dir_path, biodb)
    try:
        os.mkdir(db_static_path)
    except FileExistsError:
        # Only a pre-existing folder is tolerated; other errors (missing
        # parent, permissions) are no longer hidden.
        pass

    faa_path = os.path.join(db_static_path, 'faa')
    print(faa_path)
    os.mkdir(faa_path)
    fna_path = os.path.join(db_static_path, 'fna')
    os.mkdir(fna_path)
    ffn_path = os.path.join(db_static_path, 'ffn')
    os.mkdir(ffn_path)
    gbk_path = os.path.join(db_static_path, 'gbk')
    os.mkdir(gbk_path)
    tab_path = os.path.join(db_static_path, 'tab')
    os.mkdir(tab_path)

    for n, accession in enumerate(accession_list):
        print(n, accession)
        record = db.lookup(accession=accession)

        out_name_faa = os.path.join(faa_path, accession + '.faa')
        out_name_ffn = os.path.join(ffn_path, accession + '.ffn')
        out_name_fna = os.path.join(fna_path, accession + '.fna')
        out_name_tab = os.path.join(tab_path, accession + '.tab')
        out_name_gbk = os.path.join(gbk_path, accession + '.gbk')

        # faa
        gbk2faa.gbk2faa(record, lformat=True, outname=out_name_faa)
        # fna
        gbk2fna.gbk2fna(record, outname=out_name_fna)
        # ffn
        gbk2ffn.gbk2ffn(record, outname=out_name_ffn, locus_tag=True)
        # gbk
        with open(out_name_gbk, 'w') as f:
            SeqIO.write(record, f, 'genbank')
        # tab
        gbk2table.gbk2table(record, out_name_tab)

    # (directory, extension, makeblastdb -dbtype), in processing order
    blast_targets = (
        (faa_path, 'faa', 'prot'),
        (ffn_path, 'ffn', 'nucl'),
        (fna_path, 'fna', 'nucl'),
    )

    # Paths are shell-quoted, and '&&' keeps the command from running in the
    # current directory if the 'cd' fails.

    # merging faa, ffn and fna
    for path, ext, _dbtype in blast_targets:
        shell_command.shell_command(
            "cd %s && cat *%s> all.%s" % (shlex.quote(path), ext, ext))

    # makeblastdb -in <file> -dbtype prot|nucl, for every file
    for path, ext, dbtype in blast_targets:
        shell_command.shell_command(
            "cd %s && for i in `ls *%s`;do makeblastdb -in $i -dbtype %s; done"
            % (shlex.quote(path), ext, dbtype))
def map2highlighted_map(map_id, ko_list, ko2freq, biodb, outpath='test.pdf', taxon_id=False, n_species=60):
    """Draw a KEGG pathway map with orthologs coloured by frequency.

    The KGML of the pathway is downloaded from ``rest.kegg.jp``.  Every
    ortholog entry that contains at least one KO present in ``ko2freq`` gets
    a background colour proportional to the summed frequency of those KO
    (colour scale normalised between 0 and ``n_species``): ``Greens`` when
    one of the entry's KO is in ``ko_list``, ``OrRd`` otherwise.  The map is
    drawn to ``outpath``, converted to SVG with ``inkscape`` and the SVG is
    rewritten in place by ``edit_svg_map``.

    Args:
        map_id: pathway id; ``map`` is replaced by ``ko`` to build the URL.
        ko_list: KO identifiers to highlight with the second colour map.
        ko2freq: mapping KO identifier -> frequency (convertible to int).
        biodb: passed through to ``edit_svg_map``.
        outpath: path of the PDF to draw.
        taxon_id: passed through to ``edit_svg_map``.
        n_species: upper bound of the colour scale.
    """
    import re
    import urllib.request

    import matplotlib as mpl
    import matplotlib.cm as cm
    from matplotlib.colors import rgb2hex
    from Bio.Graphics.KGML_vis import KGMLCanvas
    from Bio.KEGG.KGML import KGML_parser
    from chlamdb.biosqldb import shell_command

    norm = mpl.colors.Normalize(vmin=0, vmax=n_species)
    default_scale = cm.ScalarMappable(norm=norm, cmap=cm.OrRd)
    match_scale = cm.ScalarMappable(norm=norm, cmap=cm.Greens)

    url_template = 'http://rest.kegg.jp/get/%s/kgml' % re.sub(
        'map', 'ko', map_id)
    print(url_template)
    # close the HTTP response as soon as the document is read
    with urllib.request.urlopen(url_template) as response:
        kgml_text = response.read().decode('UTF-8')

    pathway = KGML_parser.read(kgml_text)
    kgml_map = KGMLCanvas(pathway, show_maps=True)

    known_kos = set(ko2freq)

    # Loop over the orthologs in the pathway, and change the
    # background colour
    for o in list(pathway.orthologs):
        ko_ids = {part.rstrip() for part in o.name.split('ko:')}
        shared = ko_ids & known_kos
        if not shared:
            continue

        match = any(ko in ko_list for ko in ko_ids)
        o.name = 'ko:' + ' ko:'.join(sorted(shared))
        total = sum(int(ko2freq[ko]) for ko in shared)

        scale = match_scale if match else default_scale
        for g in o.graphics:
            g.bgcolor = rgb2hex(scale.to_rgba(float(total)))

        # NOTE(review): only the text before the first 'ko:' is kept in
        # front of the total -- confirm this is the intended label.
        o.name = "%s (%s)" % (o.name.split('ko:')[0], total)

    # Default settings are for the KGML elements only
    # We need to use the image map, and turn off the KGML elements, to see
    # only the .png base map. We could have set these values on canvas
    # instantiation
    kgml_map.import_imagemap = True
    kgml_map.show_maps = True
    kgml_map.show_orthologs = True
    kgml_map.draw_relations = False
    kgml_map.show_compounds = False
    kgml_map.show_genes = False
    kgml_map.draw(outpath)

    # NOTE(review): every occurrence of 'pdf' in the path is replaced, not
    # only the extension; kept as is because callers may derive the SVG
    # path the same way.
    svg_path = re.sub('pdf', 'svg', outpath)
    shell_command.shell_command('inkscape %s --export-plain-svg=%s' %
                                (outpath, svg_path))
    t = edit_svg_map(svg_path,
                     ko2freq.keys(),
                     biodb,
                     map_id,
                     taxon_id=taxon_id)
    t.write(svg_path)
def run_hmmer(self, profiles=False):
    """Run the HMMER command for each profile and collect the hits.

    For every profile a temporary output file is created, its name is
    appended to ``self.hmmer_output_list`` and ``self.hmmer_cmd`` is run.
    When the command fails, its output is written to stdout and the
    interpreter exits with status 1.

    When ``self.database`` is a list, the command is run once per database
    and ``self.biodb2best_hits`` is reset, but nothing is added to the
    result (the parsed output is currently unused in that mode).

    Args:
        profiles: profiles to search; ``self.hmm_profiles`` when falsy.

    Returns:
        list of rows (lists of strings).  A row holds the values of
        profile_id, profile_length, best_hit_id, bias, bitscore, evalue,
        query_start, query_end, query_coverage, hit_start and hit_end for
        one parsed entry.  When the first parsed item is not a dict, a
        single row made of that item followed by nine ``'-'`` is added.
    """
    import sys
    from tempfile import NamedTemporaryFile

    from chlamdb.biosqldb import shell_command

    if not profiles:
        profiles = self.hmm_profiles

    header = [
        "profile_id", "profile_length", "best_hit_id", "bias", "bitscore",
        "evalue", "query_start", "query_end", "query_coverage", "hit_start",
        "hit_end"
    ]
    results = []

    def search(output_path, profile, database):
        # Run one search and parse its output; abort on a non-zero code.
        stout, sterr, code = shell_command.shell_command(
            self.hmmer_cmd % (output_path, profile, database))
        if code != 0:
            sys.stdout.write("\n%s\n%s\n" % (stout, sterr))
            # non-zero status so that the failure is visible to the caller
            sys.exit(1)
        return self._parse_hmmsearch(output_path)

    for profile in profiles:
        # NOTE(review): NamedTemporaryFile() removes the file once the
        # object is closed or collected, so the names stored in
        # self.hmmer_output_list may not exist any more afterwards -- confirm.
        temp_file = NamedTemporaryFile()
        self.hmmer_output_list.append(temp_file.name)

        if isinstance(self.database, list):
            # multiple databases: performing bitscore filtering
            self.biodb2best_hits = {}
            for database in self.database:
                # Search each database in turn (the whole list used to be
                # formatted into the command).  The command gets the same
                # three values as in the single-database case.
                search(temp_file.name, profile, database)
            continue

        parsed_data = search(temp_file.name, profile, self.database)
        if not parsed_data:
            print('No domains respecting score threshold for %s, continue...'
                  % profile)
            continue

        if not isinstance(parsed_data[0], dict):
            results.append(['%s' % parsed_data[0]] + ['-'] * 9)
        else:
            results.extend([str(hsp[column]) for column in header]
                           for hsp in parsed_data)

    return results