else: start = int(sp_line[2]) end = int(sp_line[3]) strand = 1 if series in series_features: series_features[series].append((gene_name, strand, start, end)) else: series_features[series] = [(gene_name, strand, start, end)] series_indexes[series] = series_index series_index = series_index + 1 start = sys.maxsize end = -1 gdd = GenomeDiagram.Diagram("diagram", tracklines=False, y=0.4) gd_track_for_features = gdd.new_track(1, scale=True, height=1, scale_smallticks=0) gds_features = gd_track_for_features.new_set() seed(int(arg.random_seed)) for series in series_features.keys(): if series_indexes[series] < len(colors_list): current_color = colors_list[series_indexes[series]] else: current_color = colors.Color(random(), random(), random()) for i in range(0, len(series_features[series])):
''' first try, not using Diagram_class '''
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio.Graphics import GenomeDiagram
from Bio.Graphics.GenomeDiagram import CrossLink
from reportlab.lib.units import cm
from reportlab.lib import colors

# One diagram with two feature tracks, each holding a single feature set.
gdd = GenomeDiagram.Diagram('Diagram')
gdt1_features = gdd.new_track(1, greytrack=False)
gds1_features = gdt1_features.new_set()
gdt2_features = gdd.new_track(1, greytrack=False)
gds2_features = gdt2_features.new_set()

NC_len = 5894
num = 0  # stays 0 when the input file is empty
startend = []

# The file is read as tab-separated text; columns 7-10 (0-based) are parsed
# as integer query/subject start and end coordinates.
# The context manager guarantees the handle is closed even if a row is
# malformed (the original left the file open).
with open('ABL1_NC') as inFile:
    for num, line in enumerate(inFile, start=1):
        # Colour fades from white towards firebrick with the row number.
        color = colors.linearlyInterpolatedColor(colors.white,
                                                 colors.firebrick, 0, 10, num)
        line = line.strip()
        fields = line.split('\t')
        q_start = int(fields[7])
        q_end = int(fields[8])
        s_start = int(fields[9])
        s_end = int(fields[10])
#!/home/pjsola/env/bin/python import os import csv from Bio.SeqFeature import SeqFeature, FeatureLocation from reportlab.lib import colors from reportlab.lib.units import cm from Bio.Graphics import GenomeDiagram from Bio.Graphics.GenomeDiagram import CrossLink diagram_name = 'TEST_CL' gd_diagram = GenomeDiagram.Diagram(diagram_name) dict_records = {'NC_016838':122799, 'NC_016839':105974, 'NC_016846':111195} #NC_016838.1 vs NC_016839.1 made up reltions A_vs_B = [ (99, "mcpQ", "tetR"), (33, "ligA", "rhsC") ] B_vs_C = [ (99, "tetA", "pld"), (33, "rhsC", "traC") ] i = 0 for record,record_length in dict_records.items(): # Allocate tracks 5 (top), 3, 1 (bottom) for A, B, C # (empty tracks 2 and 4 add useful white space to emphasise the cross links # and also serve to make the tracks vertically more compressed)
def _first_qualifier(feature, key):
    """Return the first value of qualifier *key* on *feature*, or None if absent."""
    try:
        return feature.qualifiers[key][0]
    except (KeyError, IndexError, TypeError, AttributeError):
        return None


def plot_multiple_regions_crosslink(target_protein_list,
                                    region_record_list,
                                    plasmid_list,
                                    out_name,
                                    biodb_name="chlamydia_03_15",
                                    color_locus_list=[],
                                    flip_record_based_on_first=True,
                                    color_orthogroup_list=[]):
    """Draw several genomic regions as stacked tracks joined by orthogroup links.

    Each record gets one track. CDS features of neighbouring records that
    share an ``orthogroup`` qualifier are joined by a cross-link whose colour
    encodes the identity returned by ``orthogroup_identity_db.check_identity``.
    The diagram is rendered to SVG, passed through ``edit_svg.edit_svg`` and
    written to *out_name*, which is then made read-only.

    NOTE(review): this function was reconstructed from whitespace-collapsed
    source; the nesting of the final CDS drawing statements was inferred.

    Args:
        target_protein_list: locus tags; matching CDS are drawn in red.
        region_record_list: records to draw, top track first. Modified in
            place: names are set to the record description and, when
            *flip_record_based_on_first* is true, entries may be replaced by
            their reverse complement.
        plasmid_list: one truthy/falsy flag per record selecting the blue
            (truthy) or green (falsy) CDS palette.
        out_name: path of the SVG file to write.
        biodb_name: suffix of the ``orth_<biodb_name>`` MySQL database.
        color_locus_list: locus tags to highlight in orange (read only).
        flip_record_based_on_first: orient each record like its predecessor,
            based on the strands of shared orthogroups.
        color_orthogroup_list: orthogroups to highlight in orange (read only).

    Returns:
        The list of locus tags that were drawn, last record first.
    """
    import io
    import os

    import matplotlib as mpl
    import matplotlib.cm as cm
    import MySQLdb
    from matplotlib.colors import rgb2hex

    # Read the password first so a missing variable fails before any work.
    sqlpsw = os.environ['SQLPSW']

    # Maps an identity value onto the "Blues" colour map.
    identity_to_rgba = cm.ScalarMappable(
        norm=mpl.colors.Normalize(vmin=-30, vmax=100), cmap=cm.Blues)

    gd_diagram = GenomeDiagram.Diagram("geomic_region")
    feature_sets = []
    max_len = 0
    n_records = len(region_record_list)
    record_length = [len(record) for record in region_record_list]

    if flip_record_based_on_first:
        region_record_list[0].name = region_record_list[0].description
        for x in range(len(region_record_list) - 1):
            same_strand_count = 0
            different_strand_count = 0
            cds_x = [f for f in region_record_list[x].features
                     if f.type == "CDS"]
            cds_y = [f for f in region_record_list[x + 1].features
                     if f.type == "CDS"]
            for feature_1 in cds_x:
                group1 = _first_qualifier(feature_1, "orthogroup")
                if group1 is None:
                    continue
                for feature_2 in cds_y:
                    if _first_qualifier(feature_2, "orthogroup") != group1:
                        continue
                    if feature_1.location.strand == feature_2.location.strand:
                        same_strand_count += 1
                    else:
                        different_strand_count += 1
            if different_strand_count > same_strand_count:
                # Most shared orthogroups are on opposite strands: flip.
                region_record_list[x + 1] = region_record_list[
                    x + 1].reverse_complement(
                        id=region_record_list[x + 1].id,
                        name=region_record_list[x + 1].description)
            else:
                region_record_list[x + 1].name = region_record_list[
                    x + 1].description

    # One track per record, first record on the highest level. The original
    # "already in feature_sets" check compared a name against FeatureSet
    # objects and could never trigger, so a set is always created.
    for i, record in enumerate(region_record_list):
        max_len = max(max_len, len(record))
        gd_track_for_features = gd_diagram.new_track(n_records - 1 - i,
                                                     name=record.name,
                                                     greytrack=True,
                                                     height=0.4,
                                                     start=0,
                                                     end=len(record))
        feature_sets.append(gd_track_for_features.new_set())

    # Cross-links between neighbouring records. The connection is only needed
    # here and is closed even if drawing a link fails.
    conn = MySQLdb.connect(
        host="127.0.0.1",  # your host, usually localhost
        user="******",  # your username
        passwd=sqlpsw,  # your password
        db="orth_%s" % biodb_name)  # name of the data base
    try:
        cursor = conn.cursor()
        for x in range(len(region_record_list) - 1):
            set_X = feature_sets[x]
            set_Y = feature_sets[x + 1]
            cds_y = [f for f in region_record_list[x + 1].features
                     if f.type == "CDS"]
            for feature_1 in region_record_list[x].features:
                if feature_1.type != "CDS":
                    continue
                group1 = _first_qualifier(feature_1, "orthogroup")
                if group1 is None:
                    continue
                for feature_2 in cds_y:
                    if _first_qualifier(feature_2, "orthogroup") != group1:
                        continue
                    try:
                        identity = orthogroup_identity_db.check_identity(
                            cursor, group1,
                            feature_1.qualifiers["locus_tag"][0],
                            feature_2.qualifiers["locus_tag"][0])
                    except Exception:
                        # Best effort: draw the link with identity 0.
                        identity = 0
                        print("problem with identity table %s and locus %s %s"
                              % (group1,
                                 _first_qualifier(feature_1, "locus_tag"),
                                 _first_qualifier(feature_2, "locus_tag")))
                    link_color = colors.HexColor(
                        rgb2hex(identity_to_rgba.to_rgba(float(identity))))
                    F_x = set_X.add_feature(
                        SeqFeature(
                            FeatureLocation(feature_1.location.start,
                                            feature_1.location.end,
                                            strand=0)),
                        color=colors.lightgrey,
                        border=colors.lightgrey,
                        set_id=feature_1.qualifiers["locus_tag"])
                    F_y = set_Y.add_feature(
                        SeqFeature(
                            FeatureLocation(feature_2.location.start,
                                            feature_2.location.end,
                                            strand=0)),
                        color=colors.lightgrey,
                        border=colors.lightgrey)
                    gd_diagram.cross_track_links.append(
                        CrossLink(F_x, F_y, link_color, link_color))
    finally:
        conn.close()

    all_locus = []
    for n, record in enumerate(region_record_list):
        gd_feature_set = feature_sets[n]
        if plasmid_list[n]:
            color1 = colors.HexColor('#2837B7')
            color2 = colors.blue
        else:
            color1 = colors.HexColor('#40F13A')
            color2 = colors.HexColor('#0F600C')

        one_row_locus = []
        for feature in record.features:
            if feature.type == "tblast_target":
                feature.name = 'match'
                gd_feature_set.add_feature(feature,
                                           sigil="BOX",
                                           color="#ff4a0c86",
                                           label=False,
                                           label_position="middle",
                                           label_size=25,
                                           label_angle=0)
            if feature.type == "assembly_gap":
                feature.location.strand = None
                gd_feature_set.add_feature(feature,
                                           sigil="BOX",
                                           color="red",
                                           label=True,
                                           label_position="middle",
                                           label_strand=1,
                                           label_size=14,
                                           label_angle=40)
            if feature.type in ("rRNA", "tRNA"):
                gd_feature_set.add_feature(feature,
                                           sigil="ARROW",
                                           color="orange",
                                           label=True,
                                           label_position="middle",
                                           label_strand=1,
                                           label_size=10,
                                           label_angle=40)
                locus = _first_qualifier(feature, "locus_tag")
                if locus is not None:
                    one_row_locus.append(locus)
                elif feature.type == "tRNA":
                    # Only tRNAs report a missing locus tag, as before.
                    print('no locus tag for:')
                    print(feature)
            if feature.type == "repeat_region":
                gd_feature_set.add_feature(feature,
                                           sigil="BOX",
                                           color="blue",
                                           label=True,
                                           label_position="middle",
                                           label_strand=1,
                                           label_size=14,
                                           label_angle=40)

            if 'pseudo' in feature.qualifiers:
                gd_feature_set.add_feature(feature,
                                           sigil="OCTO",
                                           color="#6E6E6E",
                                           label=True,
                                           label_position="middle",
                                           label_strand=1,
                                           label_size=10,
                                           label_angle=40)
                continue
            if feature.type != "CDS":
                continue

            # Pseudogenes can be CDS without a locus tag / orthogroup: skip.
            a = _first_qualifier(feature, "locus_tag")
            g = _first_qualifier(feature, "orthogroup")
            if a is None or g is None:
                continue

            # Alternate two shades so adjacent genes stay distinguishable.
            # A gene is highlighted if EITHER its locus tag or its orthogroup
            # was requested (the original let the orthogroup test overwrite
            # the locus-tag highlight).
            even = len(gd_feature_set) % 2 == 0
            if a in color_locus_list or g in color_orthogroup_list:
                color = colors.HexColor('#ca4700' if even else '#fd7a32')
            else:
                color = color1 if even else color2

            if any(target_protein in feature.qualifiers["locus_tag"]
                   for target_protein in target_protein_list):
                color = colors.red

            # NOTE(review): assumed to belong to the CDS branch only.
            gd_feature_set.add_feature(feature,
                                       sigil="ARROW",
                                       color=color,
                                       label=True,
                                       label_position="middle",
                                       label_strand=1,
                                       label_size=10,
                                       label_angle=40)
            one_row_locus.append(a)

        all_locus = one_row_locus + all_locus

    # Page size: fixed height per record, width proportional to the longest.
    if len(region_record_list) == 2:
        hauteur = 300
    else:
        hauteur = 150 * len(region_record_list)
    largeur = max(record_length) / 30

    orientation = 'portrait' if hauteur > largeur else 'landscape'
    gd_diagram.draw(format="linear",
                    pagesize=(hauteur, largeur),
                    orientation=orientation,
                    fragments=1,
                    start=0,
                    end=max_len)

    from chlamdb.plots import edit_svg
    svg_diagram = io.StringIO()
    gd_diagram.write(svg_diagram, "SVG")
    with_links = edit_svg.edit_svg(svg_diagram.getvalue(), all_locus,
                                   biodb_name)
    with_links.write(out_name)

    # Make the written file read-only.
    from chlamdb.biosqldb import shell_command
    cmd = 'chmod 444 %s' % out_name
    shell_command.shell_command(cmd)

    return all_locus
from reportlab.lib import colors from reportlab.lib.units import cm from Bio.Graphics import GenomeDiagram from Bio import SeqIO from Bio.SeqFeature import SeqFeature, FeatureLocation import sys record = next(SeqIO.parse(sys.argv[1], "genbank")) print(record) gd_diagram = GenomeDiagram.Diagram(record.id) gd_track_for_features = gd_diagram.new_track(1, name="Annotated Features") gd_feature_set = gd_track_for_features.new_set() contig = SeqFeature(FeatureLocation(0, len(record.seq))) gd_feature_set.add_feature(contig, sigil="ARROW", color="black", label=True, name="1", arrowshaft_height=1.0, label_size=14, label_angle=0) gd_feature_set.add_feature(contig, sigil="ARROW", color="black", label=True, name="1", arrowshaft_height=1.0, label_size=14, label_angle=0) for feature in record.features: if feature.type != "gene":
# Sequence-file parser and the genome diagram toolkit
from Bio import SeqIO
from Bio.Graphics import GenomeDiagram
# Colour palette and page units from ReportLab
from reportlab.lib import colors
from reportlab.lib.units import cm

# Read the genome file
record = SeqIO.read("Genome.gb", "genbank")

# Build a diagram with one track holding one feature set
gd_diagram = GenomeDiagram.Diagram("DNA sequence visualization")
gd_track_for_features = gd_diagram.new_track(1, name="Annotated Features")
gd_feature_set = gd_track_for_features.new_set()

# Add every gene to the diagram, alternating between two colours
for feature in record.features:
    if feature.type == "gene":
        color = colors.darkcyan if len(gd_feature_set) % 2 else colors.green
        gd_feature_set.add_feature(feature,
                                   color=color,
                                   label=True,
                                   label_size=20,
                                   label_color=color)
line = line.split() rev_data.append((int(line[1]), float(line[2]))) if float(line[2]) > scale_height: scale_height = float(line[2]) rev_data.append((len(record.seq) + 5, scale_height)) print(filename) print("Max height: " + str(scale_height)) if "ICP1" in filename: scale_height = 10 elif "chromosome" in filename: scale_height = 1000 else: scale_height = 50000 gd_diagram = GenomeDiagram.Diagram("temp") gd_track_for_features = gd_diagram.new_track( 5, name="Annotated Features", height=1, scale_ticks=0, greytrack=1, greytrack_labels=0, scale=0) gd_feature_set = gd_track_for_features.new_set(type='feature') gd_track_for_feature_names = gd_diagram.new_track( 2, name="Annotated names", height=1, scale_ticks=1,
ret_list.append((i, x)) i = i + 100 return ret_list graphdata2 = gcSkewData(SeqUtils.GC_skew(seq_string)) gdgs2 = GenomeDiagram.GraphSet('GC Skew') gdgs2.new_graph(graphdata, 'GC Skew', style='line', linewidth=2) gdt3 = GenomeDiagram.Track('GC Skew', greytrack=1, greytrack_labels=4) gdt3.add_set(gdgs2) track_list.append(gdt3) gd_diagram = GenomeDiagram.Diagram("Tomato Curly Stunt Virus, complete genome", track_size=0.7) i = 1 for track in track_list: gd_diagram.add_track(track, i) i = i + 1 gd_diagram.draw(format="circular", circular=True, pagesize=(50 * cm, 50 * cm), start=0, end=len(record), circle_core=0.3) gd_diagram.write("circularGenomeTCSV.pdf", "PDF", dpi=72)
def record2graph(fnames, beds, r, minlog, window, verbose):
    """Build a linear diagram of genes, GC content and per-file coverage.

    Track 1 shows the CDS features of *r* and its GC content. Every entry of
    *fnames* gets its own track (levels 2, 3, ...) holding a feature set and
    a coverage graph.

    Args:
        fnames: track names, one per coverage track.
        beds: nested sequence; ``beds[k][0]`` and ``beds[k][1]`` are read in
            parallel for k = 0..2 (NOTE(review): presumably bed data and
            expected counts per file -- confirm against the caller).
        r: record whose ``features`` are drawn; also passed to
            ``seq2gcgraph``.
        minlog: passed through to ``bed2graph``.
        window: passed through to ``bed2SeqFeature`` and ``bed2graph``.
        verbose: unused.

    Returns:
        The drawn ``GenomeDiagram.Diagram``.
    """
    track_names = list(fnames)

    # create diagram
    gdd = GenomeDiagram.Diagram()

    # annotation track: genes and GC content
    gdt1 = gdd.new_track(1,
                         greytrack=1,
                         name="Genes and GC",
                         height=1.5,
                         scale_smalltick_interval=5 * 10**4,
                         scale_smallticks=0.15,
                         scale_largeticks=1.0,
                         scale_largetick_labels=1,
                         scale_largetick_interval=250 * 10**3,
                         scale_fontangle=0)
    gdt1.greytrack_fontcolor = colors.black
    gdfs = gdt1.new_set("feature")
    for feature in r.features:
        if feature.type == "CDS":
            gdfs.add_feature(feature, color=colors.grey)
    gdgs = gdt1.new_set("graph")
    gcgraph = seq2gcgraph(r, beds[0][0][0])
    gdgs.new_graph(gcgraph,
                   "GC content",
                   style="line",
                   color=colors.blue,
                   center=50)

    # one coverage track per file; the label font shrinks as tracks are
    # added but never drops below 1
    fsize = max(8 - len(track_names) / 5, 1)
    ggraphs, gtracks = [], []
    for i, fn in enumerate(track_names):
        gdt = gdd.new_track(i + 2,
                            greytrack=1,
                            name=fn,
                            height=1,
                            scale_smalltick_interval=5 * 10**4,
                            scale_smallticks=0.15,
                            scale_largetick_labels=0,
                            scale_fontangle=0)
        gdt.greytrack_fontcolor = colors.black
        gdt.greytrack_fontsize = fsize
        gtracks.append(gdt.new_set("feature"))
        ggraphs.append(gdt.new_set("graph"))

    rows = zip(beds[0][0], beds[0][1], beds[1][0], beds[1][1], beds[2][0],
               beds[2][1])
    for i, (bed, expcount, bed1, expcount1, bed2, expcount2) in enumerate(rows):
        # SNP-coloured features
        fdata = bed2SeqFeature(bed1, expcount1, bed2, expcount2, window)
        for feature, color in fdata:
            gtracks[i].add_feature(feature, colour=color)
        # coverage graph, named after its own file (the original reused the
        # stale loop variable, i.e. always the last file name)
        gdata = bed2graph(bed, window, expcount, minlog)
        ggraphs[i].new_graph(gdata,
                             track_names[i],
                             style="line",
                             linewidth=0.5,
                             center=0,
                             color="blue")

    # page geometry: A4 landscape in points, 5% margins left and right
    xl = xr = 0.05
    width = 841.8897637795275
    height = 595.275590551181
    gdd.draw(format="linear",
             pagesize=(width, height),
             xl=xl,
             xr=xr,
             orientation="landscape",
             tracklines=0,
             fragments=1,
             circular=0,
             track_size=0.75)
    return gdd
def cre(project_data_dir="",
        global_data_dir="",
        feature_name="",
        construct_name="CRE",
        extract_from="",
        main_record_file="",
        align_with=[],
        primer_list=[],
        restriction_interval=[],
        rb=[],
        write=False,
        pagesize="A4",
        scale_fontsize=3,
        label_size=2,
        greytrack_fontsize=7,
        x=0.05,
        y=0.01,
        track_size=0.3):
    """Draw a construct with optional restriction, primer and alignment tracks.

    The main record is either extracted from *extract_from* via
    ``extract_feature`` or read from *main_record_file* (FASTA or GenBank).

    NOTE(review): this function was reconstructed from whitespace-collapsed
    source; the position of the first debug print was inferred.

    Args:
        project_data_dir: prefix for the *align_with* FASTA files and
            directory passed to ``write_seq``.
        global_data_dir: prefix for ``primers/<name>.fasta`` and data
            directory for ``extract_feature``.
        feature_name: passed to ``extract_feature`` as ``feature_names``.
        construct_name: used in the diagram title and track names.
        extract_from: sequence id to extract the main record from; takes
            precedence over *main_record_file*.
        main_record_file: path to a ``.fasta``/``.gb``/``.gbk``/``.genbank``
            file, used when *extract_from* is empty.
        align_with: entry names; each is aligned against the main record.
        primer_list: pairs of primer names, one colour per pair.
        restriction_interval, rb: a restriction track is drawn when either is
            truthy (see the note at the ``enzyme_selector`` call).
        write: also write the diagram to a PDF and print its path.
        pagesize, scale_fontsize, label_size, greytrack_fontsize, x, y,
        track_size: layout options forwarded to the diagram and its tracks.

    Returns:
        The drawn ``GenomeDiagram.Diagram``.
    """
    # define color palettes:
    primer_colors = [
        colors.orchid, colors.cornflower, colors.lightseagreen, colors.salmon
    ]

    if extract_from:
        main_record, main_record_file = extract_feature(
            sequence_id=extract_from,
            data_dir=global_data_dir,
            feature_names=feature_name,
            write_file=True)
    elif main_record_file:
        stem, extension = splitext(basename(main_record_file))
        extract_from = stem
        if extension == ".fasta":
            main_record = SeqIO.read(main_record_file, "fasta")
        elif extension in [".gb", ".gbk", ".genbank"]:
            main_record = SeqIO.read(main_record_file, "gb")
            main_record_file = convert_seq(main_record_file, "genbank",
                                           "fasta")
    # NOTE(review): may have belonged to the GenBank branch above -- confirm.
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print(main_record_file)

    gdd = GenomeDiagram.Diagram(construct_name + ' Construct Diagram',
                                x=x,
                                y=y,
                                track_size=track_size)
    genbank_track, genbank_features = new_track(
        gdd,
        construct_name + " features",
        smalltick=10,
        scale_fontsize=scale_fontsize,
        greytrack_fontsize=greytrack_fontsize)

    for feature in main_record.features:
        if "Cre" in str(feature.qualifiers):
            color = colors.lavender
        else:
            color = colors.grey
        genbank_features.add_feature(feature,
                                     sigil="ARROW",
                                     color=color,
                                     label_color=color,
                                     label=True,
                                     label_size=label_size,
                                     label_angle=30,
                                     arrowshaft_height=1)

    if restriction_interval or rb:
        # NOTE(review): the values below are hard-coded and ignore the
        # restriction_interval / rb arguments -- kept as-is, confirm intent.
        restriction_dict = enzyme_selector(sequence=main_record,
                                           restriction_interval=[0, 690],
                                           genome_frequency=[700, 2000],
                                           deterministic_overhangs=True,
                                           rb=["XceI", "PsuI"])
        restriction_track, restriction_features = new_track(
            gdd,
            construct_name + " restriction sites",
            smalltick=10,
            scale_fontsize=scale_fontsize,
            greytrack_fontsize=greytrack_fontsize)
        draw_digest(restriction_features, restriction_dict)

    # plotting primers
    if primer_list:
        primer_colors = cycle(primer_colors)
        primer_track, primer_features = new_track(
            gdd,
            construct_name + " primers",
            smalltick=10,
            scale_fontsize=scale_fontsize,
            greytrack_fontsize=greytrack_fontsize)
        for primer_entry in primer_list:
            # next(it) works on Python 2.6+ and 3; it.next() is Python 2 only.
            primer_color = next(primer_colors)
            # Both primers of a pair share one colour.
            for primer_name in (primer_entry[0], primer_entry[1]):
                add_to_track(primer_features,
                             global_data_dir + "primers/" + primer_name +
                             ".fasta",
                             main_record_file,
                             annotation=primer_name,
                             feature_color=primer_color,
                             label_angle=30,
                             label_size=label_size)

    # turn entry names into actual file paths
    align_with = [project_data_dir + entry + ".fasta" for entry in align_with]
    for i in align_with:
        hit_track_back, hit_features_back = new_track(
            gdd,
            splitext(i)[0][-6:],
            smalltick=10,
            end=len(SeqIO.read(i, 'fasta')),
            scale_fontsize=scale_fontsize,
            greytrack_fontsize=greytrack_fontsize)
        add_to_track(hit_features_back,
                     main_record_file,
                     i,
                     annotation=" " + construct_name,
                     feature_color=colors.red,
                     label_angle=30,
                     forceone=True,
                     label_size=label_size)
        hit_track, hit_features = new_track(
            gdd,
            construct_name + " alignment hits",
            smalltick=10,
            scale_fontsize=scale_fontsize,
            greytrack_fontsize=greytrack_fontsize)
        hsp_list = add_to_track(hit_features,
                                i,
                                main_record_file,
                                annotation=" " + splitext(i)[0][-6:],
                                feature_color=colors.red,
                                label_angle=30,
                                forceone=True,
                                label_size=label_size)

        record = SeqIO.read(i, "fasta")
        # for loop only takes the first alignment, then breaks
        for hsp in hsp_list:
            truncated_record = SeqRecord(
                record.seq[hsp.query_end:],
                id="Region downstream of Cre in ePet-cre mice, i=" + i)
            write_seq(sequence_write_path=project_data_dir,
                      record=truncated_record,
                      ID=splitext(basename(i))[0] + "_3-unmatched")
            break

    if align_with:
        # NOTE(review): uses `record` and `hsp` left over from the LAST loop
        # iteration; `hsp` is undefined if no alignment produced a hit.
        record = SeqRecord(main_record.seq + record.seq[hsp.query_end:],
                           id="Cre and following bases in ePet-cre construct.")
        write_seq(sequence_write_path=project_data_dir,
                  record=record,
                  ID="cre-ff-current")

    gdd.draw(format="linear",
             pagesize=pagesize,
             fragments=1,
             start=0,
             end=len(main_record))
    if write:
        output_path = ("/home/chymera/src/AutoTransGeno/output/" +
                       construct_name + "_from_" + extract_from + ".pdf")
        gdd.write(output_path, "PDF")
        print(output_path)

    return gdd
def geneclusterview(gene_cluster, cutoff):
    """Draw a gene cluster and its hit clusters as stacked linear tracks.

    Records come from ``seq_builder(gene_cluster, cutoff, save=False)``; the
    first record is treated as the query cluster. For every other record the
    table ``t_antismash2blast_reduced`` is queried for percent identity and
    for the position ("rank") of the matching query gene. Genes are coloured
    by identity and labelled with their rank. The diagram is written as PDF
    and SVG under a hard-coded directory.

    NOTE(review): this is Python 2 code reconstructed from
    whitespace-collapsed source; nesting marked NOTE(review) below was
    inferred. Both SQL statements are built by string interpolation.

    Args:
        gene_cluster: cluster id; its trailing one or two digits (after an
            underscore) are parsed as a number.
        cutoff: passed through to ``seq_builder``.

    Returns:
        None.
    """
    cursor = asp_con(path='192.38.13.196', user='******', pw='1234')
    #recA = SeqIO.read("GC_11_382281_6.gb", "gb")
    #recB = SeqIO.read("GC_28_10278_8.gb", "gb")
    #recC = SeqIO.read("GC_28_3654_7.gb", "gb")
    red = PCMYKColor(0, 100.0, 0.0, 0.0)
    blue = PCMYKColor(91.0, 43.0, 0.0, 0.0)
    black = PCMYKColor(0, 0, 20.0, 0)
    pal = fade(blue,[100,60,40,20,5,0]) # TODO better coloring!
    #records = [recA, recB, recC] # for testing
    # TODO if isfile
    records = seq_builder(gene_cluster, cutoff, save = False)
    # records = [recA, recB, recC]

    ##########################################
    # Doing some black magic on rank numbers #
    ##########################################
    hits = []      # per hit record: {tag: [percent identity, ...]}
    tagcol = []    # per hit record: list of tags
    #all_h_tags=[]
    rank_col = []  # per hit record: {tag: [rank, ...]}
    for i in records[1:]: # Leaving query cluster out # Exchange hardcoded query cluster later!
        tags = []
        for n in i.features:
            # A slice is never None, so the first branch cannot run.
            if n.qualifiers['locus_tag'][2:] is None:
                print "cannot get rank for this protein"
                print n
            else:
                tags.append(n.qualifiers['locus_tag'][2:]) # Had to put a 0 here before... don't know why it doesn't need that now...
        # str(list) without its brackets, wrapped in parentheses for SQL IN.
        tags_formatted = "("+str(tags)[1:-1]+")"
        #all_h_tags.append(str(tags)[1:-1])
        query ="SELECT h_seqkey,CAST(pident AS UNSIGNED) from t_antismash2blast_reduced where clust_id = '%s' and h_seqkey IN %s;" % (gene_cluster,tags_formatted)
        cursor.execute(query)
        result = list(cursor.fetchall())
        query =" SELECT ta.q_seqkey, tx.h_seqkey from t_antismash2blast_reduced as ta left join (SELECT * from t_antismash2blast_reduced as tb where tb.clust_id = '%s' and tb.h_seqkey IN %s ) tx on ta.q_seqkey = tx.q_seqkey where ta.clust_id = '%s' and ta.h_seqkey = ta.q_seqkey; "% (gene_cluster, tags_formatted, gene_cluster)
        cursor.execute(query)
        ranks_raw = cursor.fetchall()
        ranks = {}
        rank_counter = 1
        for line in ranks_raw:
            a,b = line
            # NOTE(review): rank_counter is assumed to advance once per row,
            # whether or not b is 'NULL' -- confirm against the original.
            if b in ranks:
                if b!='NULL':
                    ranks[b].append(rank_counter)
                rank_counter+=1
            else:
                if b!='NULL':
                    ranks[b] = list()
                    ranks[b].append(rank_counter)
                rank_counter+=1
        rank_col.append(ranks)
        tag_pid = {}
        for line in result:
            a,b = line
            if a in tag_pid:
                tag_pid[a].append(int(b))
            else:
                tag_pid[a] = list()
                tag_pid[a].append(int(b))
        hits.append(tag_pid)
        tagcol.append(tags)
    #print rank_col
    #f_all_h_tags= ','.join(all_h_tags)
    #formatted = "('"+str(f_all_h_tags)[1:-1]+"')"
    #print tagcol
    #print '\n'
    #print hits

    # Ranks for the query track: 1..N, N parsed from the cluster id suffix.
    if gene_cluster[-2]=='_':
        fix = range(int(gene_cluster[-1])+1)
    else:
        fix = range(int(gene_cluster[-2:])+1)
    print fix
    rankA=fix[1:] #HERE!!!!
    print rankA
    # Let's see if [[] for i in range(len(records))] works because some records from org 3 show empty features
    ranklist = [[] for i in range(len(records))]
    #print range(len(records))
    cols=[[] for i in range(len(records))]
    counter= -1
    for i in tagcol:
        counter+=1
        for n in i:
            #print counter
            # Rank of the tag (first one found), or 0 when it has none.
            if n in rank_col[counter]:
                ranklist[counter].append(rank_col[counter][n][0])
            else:
                ranklist[counter].append(0)
            # Colour by the first identity value; black when there is no hit.
            if n in hits[counter]:
                calc = round(hits[counter][n][0])
                if calc > 80:
                    index=0
                elif calc >60:
                    index = 1
                elif calc > 40:
                    index = 2
                elif calc > 20:
                    index = 3
                else:
                    index = 4
                cols[counter].append(pal[index])
            else:
                cols[counter].append(black)
    #print ranklist

    # Colours for the query track: all blue, N + 1 entries.
    if gene_cluster[-2]=='_':
        dummy_int = int(gene_cluster[-1])+1
    else:
        dummy_int = int(gene_cluster[-2:])+1
    colA=[blue]*dummy_int

    def set_color():
        pass

    name = "/home/seth/300asp/Scripts/genecluster_plots/gc_view_%s" % gene_cluster
    gd_diagram = GenomeDiagram.Diagram(name)
    max_len = 0
    #print type([rankA]+ranklist)
    #print [rankA]+ranklist
    # Query colours/ranks are prepended so they pair with records[0].
    for record, gene_colors, rank in zip(records, [colA]+cols, [rankA]+ranklist):
        max_len = max(max_len, len(record))
        gd_track_for_features = gd_diagram.new_track(1, name=record.description + record.id, greytrack=True, greytrack_labels = 1, greytrack_font_rotation = 0, greytrack_font_colour = Color(1,0,0), greytrack_fontsize = 5, axis_labels = True, scale_smallticks = 0.6, start=0, end=len(record))
        gd_feature_set = gd_track_for_features.new_set()
        i = 0
        #print len(rank)
        #print len(record.features)
        #print rank
        for feature, single_rank in zip(record.features, rank):
            #if feature.type != "gene": # Doesn't work with my records because they don't have genes, only cds.... should work but doesn't....
                #Exclude this feature
            #    continue
            if feature.strand == -1:
                temp_angle = 180
            else:
                temp_angle = 1
            # Rank 0 means "no rank": leave the label empty.
            if single_rank == 0:
                temp_name = ''
            else:
                temp_name = str(single_rank) # str(feature.qualifiers['locus_tag'])+'M'+str(single_rank)
            gd_feature_set.add_feature(feature, sigil="BIGARROW", color=gene_colors[i], label=True, name = temp_name, # this is too much, maybe implement if other label is found: str(feature.qualifiers['locus_tag'])+'M'+str(single_rank),
                                       label_position="middle", label_size = 6, label_angle= temp_angle)
            i+=1
    gd_diagram.draw(format="linear", pagesize='A4', fragments=1, orientation = 'portrait', track_size = 0.6, xl = 0.15, start=0, end=max_len)
    gd_diagram.write(name + ".pdf", "PDF")
    #gd_diagram.write(name + ".eps", "EPS")
    gd_diagram.write(name + ".svg", "SVG")
def plot_motif_sites(self, cluster_num, motif_num):
    """Plot the sites of one motif, one track per site, and show the image.

    THIS NEEDS MORE WORK but has the beginnings of something...

    TODO: multiple motifs on same tracks, include ALL genes (i.e. in operons
    that were not included), do reverse-complement positioning correctly
    (based on gene strand), use MAST scan output (from
    b.tables['motif_annotations'])

    To get this to work: download
    http://www.reportlab.com/ftp/fonts/pfbfer.zip and unzip it into
    /usr/lib/python2.7/dist-packages/reportlab/fonts/

    Args:
        cluster_num: cluster number passed to ``self.get_motif_sites``.
        motif_num: motif number passed to ``self.get_motif_sites``.

    Returns:
        The drawn ``GenomeDiagram.Diagram``. As a side effect the diagram is
        rendered to PNG and displayed on the current matplotlib axes.
    """
    import io

    from Bio.SeqFeature import SeqFeature, FeatureLocation
    from Bio.Graphics import GenomeDiagram
    from reportlab.lib.units import cm
    from reportlab.lib import colors

    motif_sites = self.get_motif_sites(cluster_num, motif_num)
    neg_log_pvals = -np.log10(motif_sites.pvalue.values)
    # divide (-log10(pval) - 4) by this to get the alpha to use
    pv_range = np.max(neg_log_pvals) - 4
    len_range = np.max(motif_sites.start.values) + 10

    gdd = GenomeDiagram.Diagram('Motif sites: %d, %d' %
                                (cluster_num, motif_num))

    for i in range(motif_sites.shape[0]):
        gdt_features = gdd.new_track(1,
                                     start=0,
                                     end=len_range,
                                     greytrack=True,
                                     greytrack_labels=1,
                                     name=motif_sites.names.values[i],
                                     scale=True,
                                     greytrack_fontsize=4)
        gds_features = gdt_features.new_set()
        # More significant sites are drawn more opaque.
        col = colors.red.clone()
        col.alpha = (neg_log_pvals[i] - 4) / pv_range
        m_start = motif_sites.start.values[i]
        m_len = len(motif_sites.seq.values[i])
        m_strand = motif_sites.reverse.values[i]
        if m_strand == 0:
            m_strand = -1
        # The strand belongs to the location; the SeqFeature(strand=...)
        # argument is deprecated/removed in current Biopython.
        feature = SeqFeature(
            FeatureLocation(m_start, m_start + m_len - 1, strand=m_strand))
        gds_features.add_feature(feature,
                                 name=str(i + 1),
                                 label=False,
                                 color=col)

    gdd.draw(format='linear',
             pagesize=(15 * cm, motif_sites.shape[0] * cm / 2),
             fragments=1,
             start=0,
             end=len_range + 10)

    # GenomeDiagram can only render to a file or a string, so go through an
    # in-memory buffer. The PNG payload is bytes: io.BytesIO works on both
    # Python 2 and 3, whereas cStringIO exists on Python 2 only.
    png_data = gdd.write_to_string(output='png', dpi=300)
    output = io.BytesIO(png_data)
    img = mpimg.imread(output)
    plt.axis('off')
    imgplot = plt.imshow(img, interpolation='bicubic')
    output.close()
    return gdd
def __init__(self):
    """Start with no tracks and a fresh, empty GenomeDiagram diagram."""
    self.tracks = list()
    # Is this name useful ?
    self.gdd = GenomeDiagram.Diagram('Diagram')
max_len += SPACER + len(record) max_len -= SPACER if os.path.isfile(reference_genbank): reference_parser = SeqIO.parse(reference_genbank, "genbank") else: reference_parser = SeqIO.parse(reference_fasta, "fasta") if output_fasta: sys.stderr.write( "WARNING - Consider using order_assembly.py instead for FASTA output\n" ) fasta_handle = open(output_fasta, "w") fasta_saved_count = 0 fasta_short_dropped = 0 gd_diagram = GenomeDiagram.Diagram("Comparison") gd_track_for_features = gd_diagram.new_track(1, name="reference", greytrack=False, height=0.5, start=0, end=max_len) gd_feature_set = gd_track_for_features.new_set() # Add a dark grey background gd_feature_set.add_feature(SeqFeature(FeatureLocation(0, len(record))), sigil="BOX", color="grey", label=False), offset = 0 ref_offsets = dict()
def record2graph(fnames, beds, r, minlog, window, verbose):
    """Build a linear diagram of genes, GC content and layered coverage.

    Track 1 shows the CDS features of *r* and its GC content. Every entry of
    *fnames* gets its own graph track (levels 2, 3, ...); each entry of
    *beds* contributes one line per track, drawn last-to-first so that
    ``beds[0]`` ends up on top.

    Args:
        fnames: track names, one per coverage track.
        beds: list of layers; ``layer[0]`` and ``layer[1]`` are read in
            parallel, one item per track (NOTE(review): presumably bed data
            and expected counts -- confirm against the caller). The list is
            no longer reversed in place.
        r: record whose ``features`` and ``id`` are used; also passed to
            ``seq2gcgraph``.
        minlog: passed through to ``bed2graph``.
        window: passed through to ``bed2graph``.
        verbose: unused.

    Returns:
        The drawn ``GenomeDiagram.Diagram``.
    """
    track_names = list(fnames)

    # create diagram
    gdd = GenomeDiagram.Diagram()

    # annotation track: genes and GC content
    gdt1 = gdd.new_track(1,
                         greytrack=1,
                         name="[%s] Genes and GC" % r.id,
                         height=1.5,
                         scale_smalltick_interval=5 * 10**4,
                         scale_smallticks=0.15,
                         scale_largeticks=1.0,
                         scale_largetick_labels=1,
                         scale_largetick_interval=250 * 10**3,
                         scale_fontangle=0)
    gdt1.greytrack_fontcolor = colors.black
    gdfs = gdt1.new_set("feature")
    for feature in r.features:
        if feature.type == "CDS":
            gdfs.add_feature(feature, color=colors.grey)
    gdgs = gdt1.new_set("graph")
    gcgraph = seq2gcgraph(r, beds[0][0][0])
    gdgs.new_graph(gcgraph,
                   "GC content",
                   style="line",
                   color=colors.blue,
                   center=50)

    # one graph track per file
    gdgslist = []
    for i, fn in enumerate(track_names):
        gdt = gdd.new_track(i + 2,
                            greytrack=1,
                            name=fn,
                            height=2,
                            scale_smalltick_interval=5 * 10**4,
                            scale_smallticks=0.15,
                            scale_largetick_labels=0,
                            scale_fontangle=0)
        gdt.greytrack_fontcolor = colors.black
        gdgslist.append(gdt.new_set("graph"))

    # Layers are drawn in reverse order without mutating the caller's list
    # (the original called beds.reverse()); the colours are listed in the
    # same reversed order, so beds[0] is still blue.
    layer_colors = ["orange", "darkgrey", "blue"]
    for j, bedexps in enumerate(reversed(beds)):
        for i, (bed, expcount) in enumerate(zip(bedexps[0], bedexps[1])):
            if not bed:
                continue
            gdata = bed2graph(bed, window, expcount, minlog)
            # the layer at index 2 gets a thicker line
            linewidth = 1.0 if j == 2 else 0.3
            # each graph is named after its own track (the original reused
            # the stale loop variable, i.e. always the last file name)
            gdgslist[i].new_graph(gdata,
                                  track_names[i],
                                  style="line",
                                  linewidth=linewidth,
                                  center=0,
                                  color=layer_colors[j])

    # page geometry: A4 landscape in points, 5% margins left and right
    xl = xr = 0.05
    width = 841.8897637795275
    height = 595.275590551181
    gdd.draw(format="linear",
             pagesize=(width, height),
             xl=xl,
             xr=xr,
             orientation="landscape",
             tracklines=0,
             fragments=1,
             circular=0,
             track_size=0.75)
    return gdd
#!/home/pjsola/env/bin/python import os import csv from Bio.SeqFeature import SeqFeature, FeatureLocation from reportlab.lib import colors from reportlab.lib.units import cm from Bio.Graphics import GenomeDiagram diagram_name = 'TEST_3' gdd = GenomeDiagram.Diagram(diagram_name) dict_records = { 'NC_016838.1': 122799, 'NC_016839.1': 105974, 'NC_016846.1': 111195 } for record, record_length in dict_records.items(): gd_track_for_features = gdd.new_track(1, name=record, greytrack=True, start=0, end=record_length) gd_set_features = gd_track_for_features.new_set() with open('KPN.gff.forward.coordinates', 'r') as bed_forward_file: bed_readed = csv.reader(bed_forward_file, delimiter="\t") #record = None
(28, "orf54", "lin2566"), ]


def get_feature(features, id, tags=("locus_tag", "gene", "old_locus_tag")):
    """Search list of SeqFeature objects for an identifier under the given tags.

    Args:
        features: iterable of objects with a ``qualifiers`` mapping.
        id: identifier to look for among the qualifier values.
        tags: qualifier keys to inspect, in order.

    Returns:
        The first feature with a value equal to *id* under one of *tags*.

    Raises:
        KeyError: if no feature matches.
    """
    for f in features:
        for key in tags:
            # tag may not be present in this feature
            for x in f.qualifiers.get(key, []):
                if x == id:
                    return f
    raise KeyError(id)


gd_diagram = GenomeDiagram.Diagram(name)
feature_sets = {}  # record name -> feature set of that record's track
max_len = 0  # length of the longest record seen so far
for i, record in enumerate([A_rec, B_rec, C_rec]):
    max_len = max(max_len, len(record))
    # Allocate tracks 5 (top), 3, 1 (bottom) for A, B, C
    # (empty tracks 2 and 4 add useful white space to emphasise the cross links
    # and also serve to make the tracks vertically more compressed)
    gd_track_for_features = gd_diagram.new_track(5 - 2 * i, name=record.name, greytrack=True, height=0.5, start=0, end=len(record))
    # Record names are used as keys, so they must be unique.
    assert record.name not in feature_sets
    feature_sets[record.name] = gd_track_for_features.new_set()
def write_schemadelica_plot(self):
    """Draw the primer scheme with a GC-content graph; save it as PDF and SVG.

    Track 1 carries the GC graph obtained from ``self.apply_to_window`` over
    ``self.primary_ref.seq`` (window argument 50); track 2 carries, for every
    entry of ``self.regions``, a labelled region feature plus its two primer
    features.  Output files are ``<prefix>.plot.pdf`` and
    ``<prefix>.plot.svg`` inside ``self.outpath``.
    """
    diagram = GenomeDiagram.Diagram("Primer Scheme", track_size=0.15)

    # --- GC content track ---------------------------------------------------
    gc_window = 50
    gc_values = self.apply_to_window(self.primary_ref.seq, gc_window,
                                     self.calc_gc)
    gc_graphs = GenomeDiagram.GraphSet("GC content")
    gc_graphs.new_graph(
        gc_values,
        "GC content",
        style="line",
        color=colors.violet,
        altcolor=colors.purple,
    )
    gc_track = GenomeDiagram.Track("GC content",
                                   height=1.5,
                                   greytrack=0,
                                   scale_largetick_interval=1e3)
    gc_track.add_set(gc_graphs)

    # --- primer track -------------------------------------------------------
    primer_features = GenomeDiagram.FeatureSet()
    for r in self.regions:
        label = str(r.region_num)
        # odd region numbers go on the forward strand, even ones on the reverse
        if r.region_num % 2:
            strand, angle = 1, 0
        else:
            strand, angle = -1, -180

        # all three features are built before any of them is added
        whole, left_primer, right_primer = [
            SeqFeature(FeatureLocation(begin, end, strand=strand))
            for begin, end in (
                (r.left.start, r.right.start),
                (r.left.start, r.left.end),
                (r.right.end, r.right.start),
            )
        ]

        primer_features.add_feature(
            whole,
            color=colors.palevioletred,
            name=label,
            label=True,
            label_position="middle",
            label_angle=angle,
        )
        for primer in (left_primer, right_primer):
            primer_features.add_feature(primer, color=colors.red, name=label)

    primer_track = GenomeDiagram.Track(name="Annotated Features", height=1)
    primer_track.add_set(primer_features)

    diagram.add_track(primer_track, 2)
    diagram.add_track(gc_track, 1)

    # one fragment (row) per 10 kb of reference, never fewer than two
    reference_length = len(self.primary_ref)
    rows = max(2, int(round(reference_length / 10000.0)))
    diagram.draw(
        format="linear",
        pagesize=(300 * rows, 200 * rows),
        fragments=rows,
        start=0,
        end=reference_length,
    )

    pdf_filepath = self.outpath / f"{self.prefix}.plot.pdf"
    svg_filepath = self.outpath / f"{self.prefix}.plot.svg"
    targets = ((pdf_filepath, "PDF"), (svg_filepath, "SVG"))

    # announce both files first, then write them
    for target_path, _ in targets:
        logger.info(f"Writing {target_path}")
    for target_path, output_format in targets:
        diagram.write(str(target_path), output_format, dpi=300)
# from a specified genbank file input from reportlab.lib import colors from reportlab.lib.units import cm from Bio.Graphics import GenomeDiagram from Bio.SeqFeature import SeqFeature, FeatureLocation from Bio import SeqIO record1 = input("What is your genbank filename? Ex: ""sequence.gb"" \n") record = SeqIO.read(record1, "genbank") pgene_start = int(input("What is the start location of your putative gene?\n")) pgene_end = int(input("What is the end location of your putative gene?\n")) pgene_ori = int(input("Is putative gene forward (""1"") or reverse (""-1"")?\n")) pgene = str(input("What would you like to name your putative gene?\n")) # create an empty diagram, then add an empty track & empty feature set gd_diagram = GenomeDiagram.Diagram("S. cerevisiae Chromosome IX") gd_track_for_features = gd_diagram.new_track(1, name="Annotated Features") gd_feature_set = gd_track_for_features.new_set() #Take each gene SeqFeature object in our SeqRecord, and use it to # generate a feature on the diagram. for feature in record.features: if feature.type != "gene": #Exclude this feature continue if len(gd_feature_set) % 2 == 0: color = colors.lightblue else: color = colors.blue gd_feature_set.add_feature(feature, sigil="ARROW", color=color, arrowshaft_height=1.0, label=True, label_size=8, label_angle=30)
seqCount = 0 longest = 0 for record in SeqIO.parse(handle, "fasta"): seqId = record.id seqLen = len(record) seqLenMap[seqId] = seqLen longest = max(longest, seqLen) seqCount += 1 handle.close() colorMap = [color_code(i, len(motifMap))for i in range(len(motifMap))] seqColor = colors.grey.clone(alpha=0.2) posBED_FH = open(os.path.join(outDir, "sequences_cluster_match_position.bed"), "r") gdd = GenomeDiagram.Diagram() prevSeqId = "" trackId = 0 for line in posBED_FH: f = line.rstrip('\n').split('\t') seqId = f[0] seqLen = int(seqLenMap[seqId]) padLen = max(0, int((longest - seqLen) / 2)) start = int(f[1]) + padLen end = int(f[2]) + padLen cluster_id = int(f[3]) if prevSeqId != seqId: gd_track = gdd.new_track(2 * trackId, greytrack=True, start=0,
# to parse the data from Bio import SeqIO from Bio.Graphics import GenomeDiagram #to present the data from reportlab.lib import colors from reportlab.lib.units import cm color_set = [ colors.green, colors.orange, colors.red, colors.purple, colors.cyan ] record = SeqIO.read("Genome.gb", "genbank") gd_diagram = GenomeDiagram.Diagram("Tomato Curly Stunt Virus") gd_track_for_features = gd_diagram.new_track(1, name="Annotated Features") gd_feature_set = gd_track_for_features.new_set() for feature in record.features: if feature.type != "gene": # dont consider the feature since not a gene. continue color = color_set[(len(gd_feature_set))] # the parameters for representation of each feature. gd_feature_set.add_feature(feature, sigil="ARROW", arrowshaft_height=0.5, color=color, label=True, label_size=25,
def hyperdraw(genome, dataset, label, intensity=False,
              outdir='C:\\Users\\Cass\\Desktop\\test2\\'):
    """Draw every record of ``genome`` in 50 kb windows, coloured by ``dataset``.

    For each record one diagram is populated: rRNA/tRNA features become grey
    octagons and CDS features become arrows whose colour, border and shaft
    height are derived from ``dataset[locus_tag]``.  The diagram is then drawn
    window by window and each window is written as PDF, EPS, SVG and PNG.

    Args:
        genome: iterable of records exposing ``id``, ``features``, ``len()``
            and slicing.
        dataset: mapping ``locus_tag -> sequence``; index 1 is read as a float
            (intensity or change), index 2 as a float significance and index 3
            as the arrow shaft height (clamped to 0..1).
        label: text inserted into every output file name.
        intensity: if true, colour by ``dataset[locus][1]`` only; otherwise
            use the change/significance colouring.
        outdir: prefix prepended verbatim to each output file name, so it must
            end with a path separator.  Defaults to the previously hard-coded
            location.

    Returns:
        None.
    """
    step = 50000
    # products containing one of these words are labelled by locus tag instead
    badwords = ('hypothetical', 'putative', 'probable', 'family', 'domain',
                'unknown', 'possible', 'partial')

    for genoslice in genome:
        print(len(genoslice), ' in ', step, ' is ', len(genoslice) / step)
        gd_diagram = GenomeDiagram.Diagram(genoslice.id)
        gd_track_for_features = gd_diagram.new_track(
            1,
            name="Annotated Features",
            scale_ticks=1,
            scale_largetick_interval=1000,
            scale_smalltick_interval=100,
            scale_smallticks=0.05,
            scale_largeticks=0.2,
            scale_smalltick_labels=0)
        gd_feature_set = gd_track_for_features.new_set()

        for feature in genoslice.features:
            if feature.type == "rRNA" or feature.type == "tRNA":
                gd_feature_set.add_feature(feature,
                                           sigil="OCTO",
                                           color=colors.grey)
            elif feature.type == "CDS":
                locus = feature.qualifiers['locus_tag'][0]
                product = feature.qualifiers['product'][0]

                # defaults for genes absent from the dataset: white, unlabelled
                red, green, blue = (1, 1, 1)
                border = colors.gainsboro
                ah = 0
                labellable = False

                if locus in dataset:
                    labellable = True
                    if intensity:
                        intense = float(dataset[locus][1])
                        red, green = (1 - intense / 10, 1 - intense / 10)
                    else:
                        ah = dataset[locus][3]
                        if ah < 0:
                            ah = 0
                        elif ah > 1:
                            ah = 1
                        change = float(dataset[locus][1])
                        sig = float(dataset[locus][2])
                        if change > 0:
                            # positive change fades red and blue
                            red, blue = (1 - change / 10, 1 - change / 10)
                        else:
                            # zero/negative change fades green and blue
                            green, blue = (1 - abs(change / 10),
                                           1 - abs(change / 10))
                        border = colors.black if sig < 0.05 else colors.gray

                # features shorter than 200 are too small to carry a label
                if len(feature) < 200:
                    labellable = False
                color = colors.Color(red, green, blue)

                lowered = product.lower()
                if any(badword in lowered for badword in badwords):
                    shortened = locus
                else:
                    shortened = product
                # label budget: 30 characters per 1000 positions of feature
                allocated = math.floor(len(feature) / 1000 * 30)
                if len(shortened) > allocated:
                    shortened = shortened[0:allocated] + '...'

                gd_feature_set.add_feature(
                    feature,
                    sigil="ARROW",
                    arrowshaft_height=ah,
                    color=color,
                    label=labellable,
                    height=0.8,
                    name=shortened,
                    label_position="start",
                    border=border,
                    label_size=8,
                    label_angle=0)

        # draw overlapping windows of ``step`` positions (+100 overlap)
        for x in range(step + 1, len(genoslice) - 100, step):
            print(x - step, x + 100)
            gs = genoslice[x - step:x + 100]
            gd_diagram.draw(format="linear",
                            pagesize='A4',
                            orientation='landscape',
                            fragments=10,
                            start=x - step,
                            end=x + 100)
            first_tag = gs.features[1].qualifiers['locus_tag'][0]
            last_tag = gs.features[-1].qualifiers['locus_tag'][0]
            print(first_tag, last_tag)
            op = outdir + label + first_tag + '-' + last_tag
            gd_diagram.write(op + ".pdf", "PDF")
            gd_diagram.write(op + ".eps", "EPS")
            gd_diagram.write(op + ".svg", "SVG")
            gd_diagram.write(op + ".png", "PNG")
def plot_multiple_regions_crosslink2(target_protein_list, region_record_list,
                                     plasmid_list, out_name):
    """Draw several genomic regions stacked, with cross-links between orthologs.

    Every record gets its own track (odd track numbers, first record on top).
    CDS pairs of neighbouring records that share the same ``orthogroup``
    qualifier are joined by a light-grey cross-link.  CDS features with a
    ``locus_tag`` are then drawn as arrows in two alternating colours (blue
    tones when the matching ``plasmid_list`` entry is truthy, green tones
    otherwise); features whose locus tag is in ``target_protein_list`` are red.
    The result is written to ``out_name`` as SVG.

    Args:
        target_protein_list: locus tags to highlight in red.
        region_record_list: records exposing ``name``, ``features``, ``len()``.
        plasmid_list: one truthy/falsy flag per record.
        out_name: output SVG path.

    Raises:
        ValueError: if ``region_record_list`` is empty (raised by ``max``).
    """
    gd_diagram = GenomeDiagram.Diagram("geomic_region")
    feature_sets = []
    seen_names = set()
    max_len = 0
    n_records = len(region_record_list)
    record_length = [len(record) for record in region_record_list]

    for i, record in enumerate(region_record_list):
        max_len = max(max_len, len(record))
        # Odd track numbers only: the empty even tracks in between leave
        # white space for the cross-links.
        gd_track_for_features = gd_diagram.new_track(
            (2 * n_records - 1) - 2 * i,
            name=record.name,
            greytrack=True,
            height=0.5,
            start=0,
            end=len(record))
        # Duplicate names are reported but still drawn, so feature_sets stays
        # index-aligned with region_record_list.  (The former check compared
        # the name against FeatureSet objects and could never trigger.)
        if record.name in seen_names:
            print("already in feature_sets!")
            print(record)
        seen_names.add(record.name)
        feature_sets.append(gd_track_for_features.new_set())

    # cross-links between each record and the next one
    link_color = colors.lightgrey
    for x in range(n_records - 1):
        set_X = feature_sets[x]
        set_Y = feature_sets[x + 1]
        features_Y = region_record_list[x + 1].features
        for feature_1 in region_record_list[x].features:
            if feature_1.type != "CDS":
                continue
            for feature_2 in features_Y:
                if feature_2.type != "CDS":
                    continue
                try:
                    group1 = feature_1.qualifiers["orthogroup"][0]
                    group2 = feature_2.qualifiers["orthogroup"][0]
                except (KeyError, IndexError):
                    # at least one side has no orthogroup: never a match
                    continue
                if group1 != group2:
                    continue
                F_x = set_X.add_feature(SeqFeature(
                    FeatureLocation(feature_1.location.start,
                                    feature_1.location.end,
                                    strand=0)),
                                        color=link_color,
                                        border=link_color)
                F_y = set_Y.add_feature(SeqFeature(
                    FeatureLocation(feature_2.location.start,
                                    feature_2.location.end,
                                    strand=0)),
                                        color=link_color,
                                        border=link_color)
                gd_diagram.cross_track_links.append(
                    CrossLink(F_x, F_y, link_color, link_color))

    # the genes themselves, drawn on top of the link anchors
    for n, record in enumerate(region_record_list):
        gd_feature_set = feature_sets[n]
        if plasmid_list[n]:
            color1 = colors.HexColor('#2837B7')
            color2 = colors.blue
        else:
            color1 = colors.HexColor('#40F13A')
            color2 = colors.HexColor('#0F600C')

        for feature in record.features:
            if feature.type != "CDS":
                continue
            # pseudogenes can be annotated as CDS without a locus tag: skip
            if "locus_tag" not in feature.qualifiers:
                continue
            locus_tags = feature.qualifiers["locus_tag"]

            if any(target in locus_tags for target in target_protein_list):
                color = colors.red
            elif len(gd_feature_set) % 2 == 0:
                color = color1
            else:
                color = color2

            gd_feature_set.add_feature(feature,
                                       sigil="ARROW",
                                       color=color,
                                       label=True,
                                       label_position="middle",
                                       label_strand=1,
                                       label_size=12,
                                       label_angle=45)

    # page size: fixed height for two records, 250 per record otherwise;
    # width proportional to the longest record
    if n_records == 2:
        height = 700
    else:
        height = 250 * n_records
    width = max(record_length) / 30

    orientation = 'portrait' if height > width else 'landscape'
    gd_diagram.draw(format="linear",
                    pagesize=(height, width),
                    orientation=orientation,
                    fragments=1,
                    start=0,
                    end=max_len)
    gd_diagram.write(out_name, "SVG")
def run_module(self):
    """Design primers for circRNAs, optionally BLAST them and render results.

    Steps, in order:

    1. Collect the flanking exon sequences of every selected circRNA (from
       ``self.input_circRNA`` FASTA, or from ``self.dcc_file`` intersected
       with the exons of ``self.gtf_file``) into a temporary file.
    2. Run ``circtools_primex_wrapper.R`` on that file.
    3. Unless disabled or too many circRNAs were selected (50 or more), send
       the primers to BLAST via ``self.call_blast`` and collect the hits.
    4. Run ``circtools_primex_formatter.R`` and write its output as HTML.
    5. Write one circular SVG per primer pair into ``self.output_dir``.

    The process is terminated with ``exit(-1)`` when a directory is not
    writable, no circRNA matches, or the BLAST call fails.
    """
    if self.id_list and os.access(self.id_list[0], os.R_OK):
        print("Detected supplied circRNA ID file.")
        with open(self.id_list[0]) as f:
            lines = f.read().splitlines()
        self.id_list = lines

    # let's first check if the temporary directory is writable
    if not (os.access(self.temp_dir, os.W_OK)):
        print("Temporary directory %s not writable." % self.temp_dir)
        # exit with -1 error if we can't use it
        exit(-1)

    # same check for the output directory
    if not (os.access(self.output_dir, os.W_OK)):
        print("Output directory %s not writable." % self.output_dir)
        # exit with -1 error if we can't use it
        exit(-1)

    circ_rna_number = 0

    # define temporary files
    exon_storage_tmp = self.temp_dir + "circtools_flanking_exons.tmp"
    blast_storage_tmp = self.temp_dir + "circtools_blast_results.tmp"
    blast_xml_tmp = self.temp_dir + "circtools_blast_results.xml"

    output_html_file = self.output_dir + self.experiment_title.replace(
        " ", "_") + ".html"

    # erase old contents
    open(exon_storage_tmp, 'w').close()

    # define cache dicts
    exon_cache = {}
    flanking_exon_cache = {}
    primer_to_circ_cache = {}

    if self.input_circRNA:
        from Bio import SeqIO
        with open(exon_storage_tmp, 'a') as data_store:
            for record in SeqIO.parse(self.input_circRNA, "fasta"):
                # from the FASTA file we cannot tell the coordinates of the circRNA
                name = str(record.id) + "_0_0_" + str(len(
                    record.seq)) + "_0"
                data_store.write("\t".join(
                    [name, str(record.seq), "", "\n"]))
                exon_cache[name] = {1: str(record.seq), 2: ""}
    else:
        exons = self.read_annotation_file(self.gtf_file, entity="exon")

        with open(self.dcc_file) as fp:
            for line in fp:
                # make sure we remove the header
                if line.startswith('Chr\t'):
                    continue

                line = line.rstrip()
                current_line = line.split('\t')

                if current_line[3] == "not_annotated":
                    continue

                if self.gene_list and not self.id_list and current_line[
                        3] not in self.gene_list:
                    continue

                sep = "_"
                name = sep.join([
                    current_line[3], current_line[0], current_line[1],
                    current_line[2], current_line[5]
                ])

                if self.id_list and not self.gene_list and name not in self.id_list:
                    continue

                flanking_exon_cache[name] = {}

                sep = "\t"
                bed_string = sep.join([
                    current_line[0], current_line[1], current_line[2],
                    current_line[3],
                    str(0), current_line[5]
                ])

                virtual_bed_file = pybedtools.BedTool(bed_string,
                                                      from_string=True)
                result = exons.intersect(virtual_bed_file, s=True)

                fasta_bed_line_start = ""
                fasta_bed_line_stop = ""

                start = 0
                stop = 0

                for result_line in str(result).splitlines():
                    bed_feature = result_line.split('\t')

                    # this is a single-exon circRNA
                    if bed_feature[1] == current_line[1] and bed_feature[
                            2] == current_line[2]:
                        fasta_bed_line_start += result_line + "\n"
                        start = 1
                        stop = 1

                    if bed_feature[1] == current_line[1] and start == 0:
                        fasta_bed_line_start += result_line + "\n"
                        start = 1

                    if bed_feature[2] == current_line[2] and stop == 0:
                        fasta_bed_line_stop += result_line + "\n"
                        stop = 1

                    # these exons are kept for correctly drawing the circRNAs later
                    # not used for primer design
                    # (coordinates are compared as integers; comparing the raw
                    # strings would order them lexicographically)
                    if int(bed_feature[1]) > int(current_line[1]) and int(
                            bed_feature[2]) < int(current_line[2]):
                        flanking_exon_cache[name][bed_feature[1] + "_" +
                                                  bed_feature[2]] = 1

                virtual_bed_file_start = pybedtools.BedTool(
                    fasta_bed_line_start, from_string=True)
                virtual_bed_file_stop = pybedtools.BedTool(
                    fasta_bed_line_stop, from_string=True)

                virtual_bed_file_start = virtual_bed_file_start.sequence(
                    fi=self.fasta_file)
                virtual_bed_file_stop = virtual_bed_file_stop.sequence(
                    fi=self.fasta_file)

                if stop == 0 or start == 0:
                    print(
                        "Could not identify the exact exon-border of the circRNA."
                    )
                    print(
                        "Will continue with non-annotated, manually extracted sequence."
                    )
                    # we have to manually reset the start position
                    fasta_bed_line = "\t".join([
                        current_line[0], current_line[1], current_line[2],
                        current_line[5]
                    ])
                    virtual_bed_file_start = pybedtools.BedTool(
                        fasta_bed_line, from_string=True)
                    virtual_bed_file_start = virtual_bed_file_start.sequence(
                        fi=self.fasta_file)
                    virtual_bed_file_stop = ""

                exon1 = ""
                exon2 = ""

                # the sequence files start with a FASTA header line: drop it
                if virtual_bed_file_start:
                    with open(virtual_bed_file_start.seqfn) as seq_handle:
                        exon1 = seq_handle.read().split("\n", 1)[1].rstrip()

                if virtual_bed_file_stop:
                    with open(virtual_bed_file_stop.seqfn) as seq_handle:
                        exon2 = seq_handle.read().split("\n", 1)[1].rstrip()

                circ_rna_number += 1
                print("extracting flanking exons for circRNA #",
                      circ_rna_number,
                      name,
                      end="\n",
                      flush=True)

                if exon2 and not exon1:
                    exon1 = exon2
                    exon2 = ""

                exon_cache[name] = {1: exon1, 2: exon2}

                with open(exon_storage_tmp, 'a') as data_store:
                    data_store.write("\t".join([name, exon1, exon2, "\n"]))

    if not exon_cache:
        print(
            "Could not find any circRNAs matching your criteria, exiting.")
        exit(-1)

    # need to define path top R wrapper
    primer_script = 'circtools_primex_wrapper.R'

    # ------------------------------------ run script and check output -----------------------
    # NOTE(review): the command is assembled as a shell string; the
    # interpolated values are not escaped.
    script_result = os.popen(primer_script + " " + exon_storage_tmp + " " +
                             str(self.product_range[0]) + "," +
                             str(self.product_range[1]) + " " +
                             self.junction).read()

    # this is the first time we look through the input file
    # we collect the primer sequences and unify everything in one blast query
    blast_object_cache = {}
    blast_result_cache = {}

    blast_input_file = ""

    if circ_rna_number < 50:
        for line in script_result.splitlines():
            entry = line.split('\t')
            circular_rna_id = entry[0].split('_')

            if entry[1] == "NA":
                continue

            # only blast 1
            elif entry[2] in blast_object_cache and not entry[
                    1] in blast_object_cache:
                blast_input_file += "\n>" + entry[1] + "\n" + entry[1]
                blast_object_cache[entry[1]] = 1
                primer_to_circ_cache[entry[1]] = circular_rna_id[0]

            # only blast 2
            elif entry[1] in blast_object_cache and not entry[
                    2] in blast_object_cache:
                blast_input_file += "\n>" + entry[2] + "\n" + entry[2]
                blast_object_cache[entry[2]] = 1
                primer_to_circ_cache[entry[2]] = circular_rna_id[0]

            # seen both already, skip
            elif entry[1] in blast_object_cache and entry[
                    2] in blast_object_cache:
                continue

            # nothing seen yet, blast both
            else:
                blast_input_file += "\n>" + entry[1] + "\n" + entry[
                    1] + "\n>" + entry[2] + "\n" + entry[2]
                blast_object_cache[entry[1]] = 1
                blast_object_cache[entry[2]] = 1
                primer_to_circ_cache[entry[1]] = circular_rna_id[0]
                primer_to_circ_cache[entry[2]] = circular_rna_id[0]
    else:
        print("Too many circRNAs selected, skipping BLAST step.")

    if self.no_blast:
        print("User disabled BLAST search, skipping.")

    run_blast = 0

    # check if we have to blast
    if not self.no_blast and blast_input_file:

        try:
            print("Sending " + str(len(blast_object_cache)) +
                  " primers to BLAST")
            print("This may take a few minutes, please be patient.")
            result_handle = self.call_blast(blast_input_file, self.organism)
            run_blast = 1
        except Exception as exc:
            print(exc)
            exit(-1)

        # keep a copy of the raw XML, then parse it from disk
        with open(blast_xml_tmp, "w") as out_handle:
            out_handle.write(result_handle.read())

        result_handle.close()

        with open(blast_xml_tmp) as result_handle:
            blast_records = NCBIXML.parse(result_handle)

            for blast_record in blast_records:

                if blast_record.query not in blast_result_cache:
                    blast_result_cache[blast_record.query] = []

                for description in blast_record.descriptions:
                    # filter out the host gene we're in now
                    # also filter out all "PREDICTED" stuff
                    if description.title.find(primer_to_circ_cache[blast_record.query]) == -1 and\
                            description.title.find("PREDICTED") == -1:
                        blast_result_cache[blast_record.query].append(
                            description.title)

    # if we encounter NAs nothing has been blasted, we manually set the values now
    blast_result_cache["NA"] = ["Not blasted, no primer pair found"]

    primex_data_with_blast_results = ""

    for line in script_result.splitlines():
        entry = line.split('\t')

        # split up the identifier for final plotting
        line = line.replace("_", "\t")

        if run_blast == 1:
            left_result = "No hits"
            right_result = "No hits"
        else:
            left_result = "Not blasted, no primer pair found"
            right_result = left_result

        if entry[1] in blast_result_cache:
            left_result = ";".join(blast_result_cache[entry[1]])

        if entry[2] in blast_result_cache:
            right_result = ";".join(blast_result_cache[entry[2]])

        # update line
        primex_data_with_blast_results += line + "\t" + left_result + "\t" + right_result + "\n"

    with open(blast_storage_tmp, 'w') as data_store:
        data_store.write(primex_data_with_blast_results)

    # need to define path top R wrapper
    primer_script = 'circtools_primex_formatter.R'

    # ------------------------------------ run script and check output -----------------------
    # NOTE(review): self.experiment_title is placed inside a shell string
    # with plain double quotes only; a title containing quotes or shell
    # metacharacters would alter the command.
    primex_data_formatted = os.popen(primer_script + " " +
                                     blast_storage_tmp + " " + "\"" +
                                     self.experiment_title + "\"").read()

    with open(output_html_file, 'w') as data_store:
        data_store.write(primex_data_formatted)

    print("Writing results to " + output_html_file)

    # keyword arguments shared by every primer arrow
    primer_style = dict(label=False,
                        sigil="BIGARROW",
                        color="#75ff68",
                        arrowshaft_height=0.3,
                        arrowhead_length=0.1,
                        label_size=22)

    # here we create the circular graphics for primer visualisation
    for line in primex_data_with_blast_results.splitlines():
        entry = line.split('\t')

        # no primers, no graphics
        if entry[6] == "NA":
            continue

        circular_rna_id = "_".join(
            [entry[0], entry[1], entry[2], entry[3], entry[4]])

        if circular_rna_id in exon_cache:

            circular_rna_id_isoform = circular_rna_id + "_" + entry[5]

            circrna_length = int(entry[3]) - int(entry[2])

            exon1_length = len(exon_cache[circular_rna_id][1])
            exon2_length = len(exon_cache[circular_rna_id][2])

            exon2_colour = "#ffac68"

            # single-exon circRNA: split the one exon into two halves and
            # give both halves the same colour
            if exon2_length == 0:
                exon1_length = int(
                    len(exon_cache[circular_rna_id][1]) / 2) + 1
                exon2_length = int(len(exon_cache[circular_rna_id][1]) / 2)
                exon2_colour = "#ff6877"

            forward_primer_start = int(
                entry[8].split(',')[0]) + circrna_length - exon2_length
            forward_primer_length = int(entry[8].split(',')[1])

            reverse_primer_start = int(
                entry[9].split(',')[0]) - exon2_length
            reverse_primer_length = int(entry[9].split(',')[1])

            product_size = entry[14]

            gdd = GenomeDiagram.Diagram('circRNA primer diagram')
            gdt_features = gdd.new_track(
                1,
                greytrack=True,
                name="",
            )
            gds_features = gdt_features.new_set()

            # The strand is set on the FeatureLocation: the ``strand``
            # argument of SeqFeature itself is no longer accepted by current
            # Biopython releases.
            feature = SeqFeature(
                FeatureLocation(0, exon1_length, strand=+1))
            gds_features.add_feature(feature,
                                     name="Exon 1",
                                     label=False,
                                     color="#ff6877",
                                     label_size=22)

            feature = SeqFeature(
                FeatureLocation(circrna_length - exon2_length,
                                circrna_length,
                                strand=+1))
            gds_features.add_feature(feature,
                                     name="Exon 2",
                                     label=False,
                                     color=exon2_colour,
                                     label_size=22)

            feature = SeqFeature(
                FeatureLocation(forward_primer_start,
                                circrna_length,
                                strand=-1))
            gds_features.add_feature(feature,
                                     name="Product",
                                     label=False,
                                     color="#6881ff")

            feature = SeqFeature(
                FeatureLocation(0, reverse_primer_start, strand=-1))
            gds_features.add_feature(feature,
                                     name="Product: " + product_size + "bp",
                                     label=False,
                                     color="#6881ff",
                                     label_size=22,
                                     label_position="middle")

            if self.junction == "f":

                feature = SeqFeature(
                    FeatureLocation(
                        reverse_primer_start - reverse_primer_length,
                        reverse_primer_start,
                        strand=-1))
                gds_features.add_feature(feature,
                                         name="Reverse",
                                         **primer_style)

                # the primer spans the BSJ, therefore we have to draw it in two pieces:
                # piece 1: primer start to circRNA end
                # piece 2: remaining primer portion beginning from 0

                # piece 1:
                feature = SeqFeature(
                    FeatureLocation(forward_primer_start, circrna_length))
                gds_features.add_feature(feature,
                                         name="Forward",
                                         **primer_style)

                # piece 2:
                feature = SeqFeature(
                    FeatureLocation(
                        0, forward_primer_length -
                        (circrna_length - forward_primer_start)))
                gds_features.add_feature(feature,
                                         name="Forward",
                                         **primer_style)

            elif self.junction == "r":

                # the primer spans the BSJ, therefore we have to draw it in two pieces:
                # piece 1: primer start of circRNA to circRNA end
                # piece 2: remaining primer portion beginning from 0

                # piece 1:
                feature = SeqFeature(
                    FeatureLocation(circrna_length - reverse_primer_start,
                                    circrna_length,
                                    strand=-1))
                gds_features.add_feature(feature,
                                         name="Reverse",
                                         **primer_style)

                # piece 2:
                feature = SeqFeature(
                    FeatureLocation(0, reverse_primer_start, strand=-1))
                gds_features.add_feature(feature,
                                         name="Reverse",
                                         **primer_style)

                feature = SeqFeature(
                    FeatureLocation(
                        forward_primer_start,
                        forward_primer_start + forward_primer_length))
                gds_features.add_feature(feature,
                                         name="Forward",
                                         **primer_style)
            else:
                feature = SeqFeature(
                    FeatureLocation(
                        reverse_primer_start - reverse_primer_length,
                        reverse_primer_start,
                        strand=-1))
                gds_features.add_feature(feature,
                                         name="Reverse",
                                         **primer_style)

                feature = SeqFeature(
                    FeatureLocation(
                        forward_primer_start,
                        forward_primer_start + forward_primer_length))
                gds_features.add_feature(feature,
                                         name="Forward",
                                         **primer_style)

            # one-base marker for the back-splice junction at position 0
            feature = SeqFeature(FeatureLocation(0, 1))
            gds_features.add_feature(feature,
                                     name="BSJ",
                                     label=True,
                                     color="white",
                                     label_size=22)

            if circular_rna_id in flanking_exon_cache:
                for exon in flanking_exon_cache[circular_rna_id]:
                    exon_start, exon_stop = exon.split('_')

                    # shift genomic coordinates so the circRNA starts at 0
                    exon_start = int(exon_start) - int(entry[2])
                    exon_stop = int(exon_stop) - int(entry[2])

                    feature = SeqFeature(
                        FeatureLocation(exon_start, exon_stop, strand=+1))
                    gds_features.add_feature(feature,
                                             name="Exon",
                                             label=False,
                                             color="grey",
                                             label_size=22)

            gdd.draw(format='circular',
                     pagesize=(600, 600),
                     circle_core=0.6,
                     track_size=0.3,
                     tracklines=0,
                     x=0.00,
                     y=0.00,
                     start=0,
                     end=circrna_length - 1)

            gdd.write(
                self.output_dir + "/" + circular_rna_id_isoform + ".svg",
                "svg")
def write_schemadelica_plot(self, path='./'):
    """Draw the primer scheme and write it as PNG, PDF and SVG.

    A scale track is placed on track 2.  Regions with an even region number
    are drawn on track 4, the others on track 6; each region contributes a
    labelled region feature plus its left and right primer features.

    Args:
        path: directory receiving ``<prefix>.png``, ``<prefix>.pdf`` and
            ``<prefix>.svg``.
    """
    logger.info('Writing plot')
    gd_diagram = GenomeDiagram.Diagram("Primer Scheme", track_size=1)
    scale_track = GenomeDiagram.Track(name='scale',
                                      scale=True,
                                      scale_fontsize=10,
                                      scale_largetick_interval=1000,
                                      height=0.1)
    gd_diagram.add_track(scale_track, 2)

    even_regions = GenomeDiagram.FeatureSet()
    odd_regions = GenomeDiagram.FeatureSet()

    for r in self.regions:
        region = str(r.region_num)
        left = r.top_pair.left
        right = r.top_pair.right

        fwd_feature = SeqFeature(
            FeatureLocation(int(left.start), int(left.end), strand=0))
        # the right primer is given as (end, start)
        rev_feature = SeqFeature(
            FeatureLocation(int(right.end), int(right.start), strand=0))
        region_feature = SeqFeature(
            FeatureLocation(int(left.start), int(right.start), strand=0))

        # alternate regions between the two tracks so neighbours don't overlap
        feature_set = even_regions if int(region) % 2 == 0 else odd_regions
        feature_set.add_feature(region_feature,
                                color=colors.palevioletred,
                                name=region,
                                label=True,
                                label_size=10,
                                label_position="middle",
                                label_angle=0)
        for primer_feature in (fwd_feature, rev_feature):
            feature_set.add_feature(primer_feature,
                                    color=colors.red,
                                    name=region,
                                    label=False)

    for feature_set, track_number in ((even_regions, 4), (odd_regions, 6)):
        primer_track = GenomeDiagram.Track(name="Annotated Features",
                                           height=0.1)
        primer_track.add_set(feature_set)
        gd_diagram.add_track(primer_track, track_number)

    # one fragment (row) per 10 kb of reference, never fewer than two
    rows = max(2, int(round(len(self.primary_reference) / 10000.0)))
    gd_diagram.draw(format='linear',
                    pagesize=(300 * rows, 200 * rows),
                    fragments=rows,
                    start=0,
                    end=len(self.primary_reference))

    outputs = [(os.path.join(path, '{}.{}'.format(self.prefix, extension)),
                output_format)
               for extension, output_format in (('png', 'PNG'),
                                                ('pdf', 'PDF'),
                                                ('svg', 'SVG'))]
    for filepath, output_format in outputs:
        gd_diagram.write(filepath, output_format, dpi=300)
def export_plasmidmap(gbfile, filename=None):
    """
    Render a GenBank record as a linear and as a circular plasmid map (PNG).

    Every feature except those of type ``primer`` and ``misc_feature`` is
    drawn as an arrow, alternating between light blue and blue.

    :param gbfile: GenBank (.gb) file, or the path to it
    :type gbfile: str
    :param filename: ``(linear_png, circular_png)`` output names/paths;
        defaults to ``plasmid_linear.png`` and ``plasmid_circular.png``
    :type filename: tuple, optional
    :return: the ``id`` of the record read from the GenBank file
    :rtype: str
    """
    record = SeqIO.read(gbfile, "genbank")
    record_length = len(record)

    diagram = GenomeDiagram.Diagram(record.id)
    track = diagram.new_track(1, name="Annotated Features")
    arrows = track.new_set()

    excluded_types = ("primer", "misc_feature")
    shades = (colors.lightblue, colors.blue)
    for feature in record.features:
        if feature.type in excluded_types:
            continue
        # the shade depends on how many arrows are already on the map
        shade = shades[len(arrows) % 2]
        arrows.add_feature(
            feature,
            sigil="ARROW",
            color=shade,
            label_size=12,
            label_angle=0,
            label=True,
        )

    # Linear map
    diagram.draw(
        format="linear",
        orientation="landscape",
        pagesize="A4",
        fragments=4,
        start=0,
        end=record_length,
    )

    if filename is not None:
        linfile = filename[0]
        circfile = filename[1]
    else:
        linfile = "plasmid_linear.png"
        circfile = "plasmid_circular.png"
    diagram.write(linfile, "PNG")

    # Circular map
    diagram.draw(
        format="circular",
        circular=True,
        pagesize=(25 * cm, 20 * cm),
        start=0,
        end=record_length,
        circle_core=0.5,
    )
    diagram.write(circfile, "PNG")

    return record.id
scaffold2orf2coord[ scaffold ][orf] = (int(start),int(end),int(strand)) print(scaffold2orf2coord[ scaffold ][orf]) file.close() from reportlab.lib import colors from reportlab.lib.units import cm from Bio.Graphics import GenomeDiagram from Bio import SeqIO for scaffold,orf2coord in scaffold2orf2coord.items() : print(scaffold) scaffold_filename = scaffold+'_genomicDiagram.pdf' gd_diagram = GenomeDiagram.Diagram("a new type of nitrogenase") gd_track_for_features = gd_diagram.new_track(1, name="Annotated Features") gd_feature_set = gd_track_for_features.new_set() # print('\t'+str(scaffold2max[scaffold])) # print('\t'+str(scaffold2min[scaffold])) for orf,coord in orf2coord.items() : start = coord[0] end = coord[1] strand = coord[2] feature = SeqFeature( FeatureLocation( int( int(start) ) , int( int(end) ) ) , strand=int(strand), type = 'CDS' ) if orf in orf2color : color = orf2color[orf] else:
from Bio.Graphics import GenomeDiagram
from reportlab.lib import colors
from reportlab.lib.units import cm
from Bio import SeqIO, SeqFeature
# NOTE: the import below rebinds ``SeqFeature`` from the module to the class.
from Bio.SeqFeature import SeqFeature, FeatureLocation

# This code is from
# https://biopython-tutorial.readthedocs.io/en/latest/notebooks/17%20-%20Graphics%20including%20GenomeDiagram.html
# I used it, and that web page, to learn how to use GenomeDiagram from Biopython.

# Define the parameters of the drawing: one diagram, one track, one feature set.
gdd = GenomeDiagram.Diagram('Test Diagram')
gdt_features = gdd.new_track(1, greytrack=False)
gds_features = gdt_features.new_set()

# Add three features to show the strand options.
# The strand is set on the FeatureLocation rather than passed to the
# SeqFeature constructor: the constructor's ``strand`` argument is deprecated
# in recent Biopython releases, while the location form works on old and new
# versions alike.
for feature_start, feature_end, feature_strand, feature_name in (
    (25, 125, +1, "Forward"),
    (150, 250, None, "Strandless"),
    (275, 375, -1, "Reverse"),
):
    feature = SeqFeature(
        FeatureLocation(feature_start, feature_end, strand=feature_strand)
    )
    gds_features.add_feature(
        feature, name=feature_name, label=True, sigil="ARROW"
    )

# Draw and save the diagram.
gdd.draw(format='linear', pagesize=(15 * cm, 4 * cm), fragments=1,
         start=0, end=400)
gdd.write("GD_labels_default.png", "png")
# BLAST SeqIO.convert(A, "genbank", A + ".fasta", "fasta") SeqIO.convert(B, "genbank", B + ".fasta", "fasta") comando_blastn = NcbiblastnCommandline( \ query=A+".fasta", subject=B+".fasta", \ outfmt='6 qstart qend sstart send pident',\ out="blast_"+A+"_"+B+".txt") stdout, stderr = comando_blastn() blast = open("blast_" + A + "_" + B + ".txt") # Iniciando a figura name = A + "_" + B gd = GenomeDiagram.Diagram(name) gA = gd.new_track(1,name="A",height=0.5, \ start=0,end=len(A1)) gA1 = gA.new_set() gB = gd.new_track(3,name="B",height=0.5, \ start=0,end=len(B1)) gB1 = gB.new_set() # Cores CDSs - intercalado c1 = "#79B134" c2 = "#8DE91D" # Colore um quadrado para cada CDS do arquivo A cont = 1 for i in A1.features:
# Compare every (p1, p2) interval with every (p3, p4) interval and record a
# start (p5) and an end (p6) for each pair that interleaves.
# NOTE(review): presumably p1/p2 and p3/p4 hold start/end coordinates of two
# feature classes defined earlier in the file - confirm.
# NOTE(review): the recorded end is taken from one interval without clamping
# to the smaller of the two ends, and equal starts are not matched - confirm
# that this is intended.
p5 = []
p6 = []
for first_start, first_end in zip(p1, p2):
    for second_start, second_end in zip(p3, p4):
        if second_start < first_start < second_end:
            p5.append(first_start)
            p6.append(second_end)
        elif first_start < second_start < first_end:
            p5.append(second_start)
            p6.append(first_end)

# Register each recorded span on the record as an 'overlap' feature with
# ids o1, o2, ...
for overlap_number, (overlap_start, overlap_end) in enumerate(zip(p5, p6), start=1):
    record.features.append(
        SeqFeature(
            location=FeatureLocation(overlap_start, overlap_end, strand=None),
            type='overlap',
            id='o{}'.format(overlap_number),
        )
    )

gd_diagram = GenomeDiagram.Diagram("Promoter region with GRE and TF binding")
gd_track_for_features = gd_diagram.new_track(1, name="GREs and TFBS")
gd_feature_set = gd_track_for_features.new_set()

# Only GRE and TF features are drawn; features of any other type (including
# the 'overlap' ones appended above) are left off the diagram.
color_by_feature_type = {"GRE": colors.green, "TF": colors.red}
for feature in record.features:
    if feature.type in color_by_feature_type:
        color = color_by_feature_type[feature.type]
        gd_feature_set.add_feature(
            feature,
            color=color,
            label=feature.id,
            label_size=12,
            label_angle=20,
            sigil="OCTO",
        )

gd_diagram.draw(
    format="linear",
    orientation="landscape",
    pagesize=(300, 80),
    fragments=1,
    start=0,
    end=len(record),
)