def find_cds():
    """Check the CDS annotation embedded in the current record's description.

    Scans the '|'-separated description fields of record_dict[keys] for a
    field starting with "CDS", extracts its start-end coordinates, and
    translates that slice of the sequence.

    Returns:
        0 if no CDS field is found, or if the translation contains no '*';
        1 if the translation contains a '*' (stop codon present).

    Relies on module-level names: record_dict, keys, re, FeatureLocation.
    """
    seq_des = str(record_dict[keys].description).split("|")
    for field in seq_des:
        if re.match("CDS", field):
            # Field looks like "CDS:start-end"; split on ':' or '-'.
            _, cds_start, cds_end = re.split(":|-", field)
            # Annotation is 1-based inclusive; FeatureLocation is 0-based.
            loc = FeatureLocation(int(cds_start) - 1, int(cds_end))
            cds_sequence = loc.extract(record_dict[keys].seq)
            protein_sequence = cds_sequence.translate()
            # Preserve the original return mapping: no stop -> 0, stop -> 1.
            return 0 if "*" not in protein_sequence else 1
    # Fixed: the original returned 0 after inspecting only the FIRST field
    # (a misplaced else); now every field is scanned before giving up.
    return 0
def find_cds():
    """Check the CDS annotation embedded in the current record's description.

    Scans the '|'-separated description fields of record_dict[keys] for a
    field starting with "CDS:", prints the record id when found, and
    translates the annotated slice of the sequence.

    Returns:
        3 if the translation contains no '*' (no stop codon);
        1 if it does contain a '*';
        0 if no "CDS:" field is present.

    Relies on module-level names: record_dict, keys, re, FeatureLocation.
    """
    seq_des = str(record_dict[keys].description).split("|")
    for des in seq_des:
        if re.match("CDS:", des):
            # Fixed: Python 2 print statement -> print() function.
            print(record_dict[keys].id)
            _, cds_start, cds_end = re.split(":|-", des)
            # Annotation is 1-based inclusive; FeatureLocation is 0-based.
            loc = FeatureLocation(int(cds_start) - 1, int(cds_end))
            cds_sequence = loc.extract(record_dict[keys].seq)
            protein_sequence = cds_sequence.translate()
            return 3 if "*" not in protein_sequence else 1
    # Fixed: the original guarded with any("CDS:" in s ...) but matched with
    # re.match (anchored at the start), so a mid-field "CDS:" made the
    # function fall off the loop and return None; now it returns 0.
    return 0
def find_gaps(ppr, mingap=30, maxgap=None, skip_introns=True):
    """Find all the gaps between same-frame PPR motifs whose length lies
    within [mingap, maxgap].

    A falsy bound (None or 0) disables that limit. Each returned
    FeatureLocation is annotated with .prev/.next, the flanking features.
    (skip_introns is currently unused; kept for interface compatibility.)
    """
    loc = []
    # Fixed: "lambda(p):" is Python 2 tuple-parameter syntax and a
    # SyntaxError under Python 3.
    feats = sorted(ppr.features, key=lambda p: int(p.location.start))
    for a, b in pairwise(feats):
        # Ignore any gaps if a and b aren't in the same frame.
        if a.qualifiers['frame'] != b.qualifiers['frame']:
            continue
        # Gap length between the end of a and the start of b.
        gap_len = int(b.location.start) - int(a.location.end)
        if gap_len >= (mingap or -float('inf')) and gap_len <= (maxgap or float('inf')):
            # We've found a gap.
            g = FeatureLocation(int(a.location.end), int(b.location.start), strand=1)
            # Remember which features flank this gap.
            g.prev = a
            g.next = b
            loc.append(g)
    return loc
def __init__(self, logger, sequences, reference, dateFormat):
    """Build the sequence set from parsed JSON data.

    logger     -- logger object kept on the instance for later use
    sequences  -- dict of name -> {"seq": ..., "attributes": {...}}
    reference  -- dict with "strain", "seq", "included" and optionally
                  "genes" (name -> {"start", "end", "strand"})
    dateFormat -- format string handed to parse_date() for "raw_date"
    """
    super(sequence_set, self).__init__()
    self.log = logger

    # load sequences from the (parsed) JSON - don't forget to sort out dates
    self.seqs = {}
    for name, data in sequences.items():
        self.seqs[name] = SeqRecord(Seq(data["seq"], generic_dna),
                                    id=name, name=name, description=name)
        self.seqs[name].attributes = data["attributes"]
        # tidy up dates: parse_date returns a structure whose [1] and [2]
        # slots are used as numeric and calendar dates respectively
        date_struc = parse_date(self.seqs[name].attributes["raw_date"], dateFormat)
        self.seqs[name].attributes["num_date"] = date_struc[1]
        self.seqs[name].attributes["date"] = date_struc[2]

    # if the reference is to be analysed it'll already be in the (filtered & subsampled)
    # sequences, so no need to add it here, and no need to care about attributes etc
    # we do, however, need it for alignment
    self.reference_in_dataset = reference["included"]
    name = reference["strain"]
    self.reference_seq = SeqRecord(Seq(reference["seq"], generic_dna),
                                   id=name, name=name, description=name)
    if "genes" in reference and len(reference["genes"]):
        self.proteins = {}
        for k, v in reference["genes"].items():
            feature = FeatureLocation(start=v["start"], end=v["end"], strand=v["strand"])
            # Translate sequences to identify any proteins ending with a stop codon.
            # (Unbound call: Seq.translate(instance) is equivalent to instance.translate().)
            translation = Seq.translate(Seq(feature.extract(str(self.reference_seq.seq))))
            if translation.endswith("*"):
                # Truncate the last codon of the protein to omit the stop codon.
                # NOTE(review): trimming the *end* coordinate assumes a
                # forward-strand gene; for strand == -1 the stop codon sits at
                # the start coordinate -- confirm against the input data.
                feature = FeatureLocation(start=v["start"], end=v["end"] - 3, strand=v["strand"])
            self.proteins[k] = feature
    else:
        self.proteins = None

    # other things: unique scratch directory name for this run
    self.run_dir = '_'.join(['temp', time.strftime('%Y%m%d-%H%M%S', time.gmtime()),
                             str(random.randint(0, 1000000))])
    self.nthreads = 2  # should load from config file
def test_diagram_via_methods_pdf(self):
    """Construct and draw PDF using method approach."""
    genbank_entry = self.record
    gdd = Diagram("Test Diagram")
    # Add a track of features,
    gdt_features = gdd.new_track(1, greytrack=True, name="CDS Features",
                                 greytrack_labels=0, height=0.5)
    # We'll just use one feature set for the genes and misc_features,
    gds_features = gdt_features.new_set()
    for feature in genbank_entry.features:
        if feature.type == "gene":
            # Alternate the colour so neighbouring genes are distinguishable.
            if len(gds_features) % 2 == 0:
                color = "blue"
            else:
                color = "lightblue"
            gds_features.add_feature(
                feature,
                color=color,
                # label_position="middle",
                # label_position="end",
                label_position="start",
                label_size=11,
                # label_angle=90,
                sigil="ARROW",
                label=True,
            )
    # I want to include some strandless features, so for an example
    # will use EcoRI recognition sites etc.
    for site, name, color in [
        ("GAATTC", "EcoRI", "green"),
        ("CCCGGG", "SmaI", "orange"),
        ("AAGCTT", "HindIII", "red"),
        ("GGATCC", "BamHI", "purple"),
    ]:
        index = 0
        while True:
            # Search for the next occurrence of the recognition site.
            index = genbank_entry.seq.find(site, start=index)
            if index == -1:
                break
            feature = SeqFeature(FeatureLocation(index, index + 6), strand=None)
            # This URL should work in SVG output from recent versions
            # of ReportLab. You need ReportLab 2.4 or later
            # NOTE(review): the SeqFeature above is freshly constructed with
            # empty qualifiers, so this lookup always raises KeyError and url
            # stays None -- confirm whether that is intended.
            try:
                url = ("http://www.ncbi.nlm.nih.gov/entrez/viewer.fcgi" +
                       "?db=protein&id=%s" % feature.qualifiers["protein_id"][0])
            except KeyError:
                url = None
            gds_features.add_feature(
                feature,
                color=color,
                url=url,
                # label_position="middle",
                label_size=10,
                label_color=color,
                # label_angle=90,
                name=name,
                label=True,
            )
            # Continue the search after this hit.
            index += len(site)
        del index
    # Now add a graph track...
    gdt_at_gc = gdd.new_track(2, greytrack=True, name="AT and GC content",
                              greytrack_labels=True)
    gds_at_gc = gdt_at_gc.new_set(type="graph")
    # One data point per ~0.5% of the sequence keeps the plot light.
    step = len(genbank_entry) // 200
    gds_at_gc.new_graph(
        apply_to_window(genbank_entry.seq, step, calc_gc_content, step),
        "GC content",
        style="line",
        color=colors.lightgreen,
        altcolor=colors.darkseagreen,
    )
    gds_at_gc.new_graph(
        apply_to_window(genbank_entry.seq, step, calc_at_content, step),
        "AT content",
        style="line",
        color=colors.orange,
        altcolor=colors.red,
    )
    # Finally draw it in both formats,
    gdd.draw(
        format="linear",
        orientation="landscape",
        tracklines=0,
        pagesize="A4",
        fragments=3,
    )
    output_filename = os.path.join("Graphics", "GD_by_meth_linear.pdf")
    gdd.write(output_filename, "PDF")
    gdd.draw(
        format="circular",
        tracklines=False,
        circle_core=0.8,
        pagesize=(20 * cm, 20 * cm),
        circular=True,
    )
    output_filename = os.path.join("Graphics", "GD_by_meth_circular.pdf")
    gdd.write(output_filename, "PDF")
def test_diagram_via_object_pdf(self):
    """Construct and draw PDF using object approach."""
    genbank_entry = self.record
    gdd = Diagram("Test Diagram")
    gdt1 = Track(
        "CDS features",
        greytrack=True,
        scale_largetick_interval=1e4,
        scale_smalltick_interval=1e3,
        greytrack_labels=10,
        greytrack_font_color="red",
        scale_format="SInt",
    )
    gdt2 = Track("gene features", greytrack=1, scale_largetick_interval=1e4)
    # First add some feature sets:
    gdfsA = FeatureSet(name="CDS backgrounds")
    gdfsB = FeatureSet(name="gene background")
    gdfs1 = FeatureSet(name="CDS features")
    gdfs2 = FeatureSet(name="gene features")
    gdfs3 = FeatureSet(name="misc_features")
    gdfs4 = FeatureSet(name="repeat regions")
    # Pair each CDS with the most recently seen gene and cross-link them.
    prev_gene = None
    cds_count = 0
    for feature in genbank_entry.features:
        if feature.type == "CDS":
            cds_count += 1
            if prev_gene:
                # Assuming it goes with this CDS!
                if cds_count % 2 == 0:
                    dark, light = colors.peru, colors.tan
                else:
                    dark, light = colors.burlywood, colors.bisque
                # Background for CDS,
                a = gdfsA.add_feature(
                    SeqFeature(
                        FeatureLocation(feature.location.start,
                                        feature.location.end, strand=0)),
                    color=dark,
                )
                # Background for gene,
                b = gdfsB.add_feature(
                    SeqFeature(
                        FeatureLocation(
                            prev_gene.location.start,
                            prev_gene.location.end,
                            strand=0,
                        )),
                    color=dark,
                )
                # Cross link,
                gdd.cross_track_links.append(CrossLink(a, b, light, dark))
                prev_gene = None
        if feature.type == "gene":
            prev_gene = feature
    # Some cross links on the same linear diagram fragment,
    f, c = fill_and_border(colors.red)
    a = gdfsA.add_feature(SeqFeature(FeatureLocation(2220, 2230)), color=f, border=c)
    b = gdfsB.add_feature(SeqFeature(FeatureLocation(2200, 2210)), color=f, border=c)
    gdd.cross_track_links.append(CrossLink(a, b, f, c))
    f, c = fill_and_border(colors.blue)
    a = gdfsA.add_feature(SeqFeature(FeatureLocation(2150, 2200)), color=f, border=c)
    b = gdfsB.add_feature(SeqFeature(FeatureLocation(2220, 2290)), color=f, border=c)
    gdd.cross_track_links.append(CrossLink(a, b, f, c, flip=True))
    f, c = fill_and_border(colors.green)
    a = gdfsA.add_feature(SeqFeature(FeatureLocation(2250, 2560)), color=f, border=c)
    b = gdfsB.add_feature(SeqFeature(FeatureLocation(2300, 2860)), color=f, border=c)
    gdd.cross_track_links.append(CrossLink(a, b, f, c))
    # Some cross links where both parts are saddling the linear diagram fragment boundary,
    f, c = fill_and_border(colors.red)
    a = gdfsA.add_feature(SeqFeature(FeatureLocation(3155, 3250)), color=f, border=c)
    b = gdfsB.add_feature(SeqFeature(FeatureLocation(3130, 3300)), color=f, border=c)
    gdd.cross_track_links.append(CrossLink(a, b, f, c))
    # Nestled within that (drawn on top),
    f, c = fill_and_border(colors.blue)
    a = gdfsA.add_feature(SeqFeature(FeatureLocation(3160, 3275)), color=f, border=c)
    b = gdfsB.add_feature(SeqFeature(FeatureLocation(3180, 3225)), color=f, border=c)
    gdd.cross_track_links.append(CrossLink(a, b, f, c, flip=True))
    # Some cross links where two features are on either side of the linear diagram fragment boundary,
    f, c = fill_and_border(colors.green)
    a = gdfsA.add_feature(SeqFeature(FeatureLocation(6450, 6550)), color=f, border=c)
    b = gdfsB.add_feature(SeqFeature(FeatureLocation(6265, 6365)), color=f, border=c)
    gdd.cross_track_links.append(CrossLink(a, b, color=f, border=c))
    f, c = fill_and_border(colors.gold)
    a = gdfsA.add_feature(SeqFeature(FeatureLocation(6265, 6365)), color=f, border=c)
    b = gdfsB.add_feature(SeqFeature(FeatureLocation(6450, 6550)), color=f, border=c)
    gdd.cross_track_links.append(CrossLink(a, b, color=f, border=c))
    f, c = fill_and_border(colors.red)
    a = gdfsA.add_feature(SeqFeature(FeatureLocation(6275, 6375)), color=f, border=c)
    b = gdfsB.add_feature(SeqFeature(FeatureLocation(6430, 6530)), color=f, border=c)
    gdd.cross_track_links.append(
        CrossLink(a, b, color=f, border=c, flip=True))
    f, c = fill_and_border(colors.blue)
    a = gdfsA.add_feature(SeqFeature(FeatureLocation(6430, 6530)), color=f, border=c)
    b = gdfsB.add_feature(SeqFeature(FeatureLocation(6275, 6375)), color=f, border=c)
    gdd.cross_track_links.append(
        CrossLink(a, b, color=f, border=c, flip=True))
    # Second pass: add the features themselves to the visible sets.
    cds_count = 0
    for feature in genbank_entry.features:
        if feature.type == "CDS":
            cds_count += 1
            if cds_count % 2 == 0:
                gdfs1.add_feature(feature, color=colors.pink, sigil="ARROW")
            else:
                gdfs1.add_feature(feature, color=colors.red, sigil="ARROW")
        if feature.type == "gene":
            # Note we set the colour of ALL the genes later on as a test,
            gdfs2.add_feature(feature, sigil="ARROW")
        if feature.type == "misc_feature":
            gdfs3.add_feature(feature, color=colors.orange)
        if feature.type == "repeat_region":
            gdfs4.add_feature(feature, color=colors.purple)
    # gdd.cross_track_links = gdd.cross_track_links[:1]
    gdfs1.set_all_features("label", 1)
    gdfs2.set_all_features("label", 1)
    gdfs3.set_all_features("label", 1)
    gdfs4.set_all_features("label", 1)
    gdfs3.set_all_features("hide", 0)
    gdfs4.set_all_features("hide", 0)
    # gdfs1.set_all_features('color', colors.red)
    gdfs2.set_all_features("color", colors.blue)
    gdt1.add_set(gdfsA)  # Before CDS so under them!
    gdt1.add_set(gdfs1)
    gdt2.add_set(gdfsB)  # Before genes so under them!
    gdt2.add_set(gdfs2)
    gdt3 = Track("misc features and repeats", greytrack=1, scale_largetick_interval=1e4)
    gdt3.add_set(gdfs3)
    gdt3.add_set(gdfs4)
    # Now add some graph sets:
    # Use a fairly large step so we can easily tell the difference
    # between the bar and line graphs.
    step = len(genbank_entry) // 200
    gdgs1 = GraphSet("GC skew")
    graphdata1 = apply_to_window(genbank_entry.seq, step, calc_gc_skew, step)
    gdgs1.new_graph(
        graphdata1,
        "GC Skew",
        style="bar",
        color=colors.violet,
        altcolor=colors.purple,
    )
    gdt4 = Track("GC Skew (bar)", height=1.94, greytrack=1, scale_largetick_interval=1e4)
    gdt4.add_set(gdgs1)
    gdgs2 = GraphSet("GC and AT Content")
    gdgs2.new_graph(
        apply_to_window(genbank_entry.seq, step, calc_gc_content, step),
        "GC content",
        style="line",
        color=colors.lightgreen,
        altcolor=colors.darkseagreen,
    )
    gdgs2.new_graph(
        apply_to_window(genbank_entry.seq, step, calc_at_content, step),
        "AT content",
        style="line",
        color=colors.orange,
        altcolor=colors.red,
    )
    gdt5 = Track(
        "GC Content(green line), AT Content(red line)",
        height=1.94,
        greytrack=1,
        scale_largetick_interval=1e4,
    )
    gdt5.add_set(gdgs2)
    gdgs3 = GraphSet("Di-nucleotide count")
    step = len(genbank_entry) // 400  # smaller step
    gdgs3.new_graph(
        apply_to_window(genbank_entry.seq, step, calc_dinucleotide_counts, step),
        "Di-nucleotide count",
        style="heat",
        color=colors.red,
        altcolor=colors.orange,
    )
    gdt6 = Track("Di-nucleotide count", height=0.5, greytrack=False, scale=False)
    gdt6.add_set(gdgs3)
    # Add the tracks (from both features and graphs)
    # Leave some white space in the middle/bottom
    gdd.add_track(gdt4, 3)  # GC skew
    gdd.add_track(gdt5, 4)  # GC and AT content
    gdd.add_track(gdt1, 5)  # CDS features
    gdd.add_track(gdt2, 6)  # Gene features
    gdd.add_track(gdt3, 7)  # Misc features and repeat feature
    gdd.add_track(gdt6, 8)  # Feature depth
    # Finally draw it in both formats, and full view and partial
    gdd.draw(format="circular", orientation="landscape", tracklines=0, pagesize="A0")
    output_filename = os.path.join("Graphics", "GD_by_obj_circular.pdf")
    gdd.write(output_filename, "PDF")
    gdd.circular = False
    gdd.draw(
        format="circular",
        orientation="landscape",
        tracklines=0,
        pagesize="A0",
        start=3000,
        end=6300,
    )
    output_filename = os.path.join("Graphics", "GD_by_obj_frag_circular.pdf")
    gdd.write(output_filename, "PDF")
    gdd.draw(
        format="linear",
        orientation="landscape",
        tracklines=0,
        pagesize="A0",
        fragments=3,
    )
    output_filename = os.path.join("Graphics", "GD_by_obj_linear.pdf")
    gdd.write(output_filename, "PDF")
    gdd.set_all_tracks("greytrack_labels", 2)
    gdd.draw(
        format="linear",
        orientation="landscape",
        tracklines=0,
        pagesize=(30 * cm, 10 * cm),
        fragments=1,
        start=3000,
        end=6300,
    )
    output_filename = os.path.join("Graphics", "GD_by_obj_frag_linear.pdf")
    gdd.write(output_filename, "PDF")
import os

# Extract every annotated mobile_element from the EMBL genomes, padded with
# 100 bp of flanking sequence, and write one FASTA file per element into
# 'results/'. Duplicate filenames get a numeric suffix.
in_files = glob.glob('genomes/*.embl')
flanking_region = 100
try:
    os.mkdir("results")
except OSError:
    # Fixed: Python 2 print statements -> print() function.
    print("a 'results' dir already exists")
    print("Overwriting")

# Tracks how many times each output filename has been produced.
stored = {}
for f in in_files:
    cur_genome = SeqIO.parse(f, "embl")
    for record in cur_genome:
        for feat in record.features:
            if feat.type == 'mobile_element':
                s, e, strand = feat.location.start, feat.location.end, feat.location.strand
                # e.g. "IS26" from a qualifier like "insertion sequence:IS26"
                me_type = feat.qualifiers['mobile_element_type'][0].split(':')[-1]
                header = ('>' + me_type + "," + me_type + ".." + str(s + 1) + ".." + str(e)
                          + "(" + str(strand) + "),100bp flanked,[EC958 IS]")
                flanked = FeatureLocation(s - flanking_region, e + flanking_region, strand)
                out_seq = flanked.extract(record.seq)
                # Sanitise the element name into a safe filename.
                fname = (header[1:].split(',')[0]
                         .replace('unclassified', 'unc').replace('family', 'fam')
                         .replace('(', '').replace('partial', 'p').replace(')', '')
                         .replace(' ', '_').replace('/', '-').strip() + '.fna')
                # Fixed: "in stored.keys()" -> direct dict membership.
                if fname in stored:
                    old = fname
                    fname = fname.replace(".fna", "_" + str(stored[fname]) + ".fna")
                    stored[old] = stored[old] + 1
                else:
                    stored[fname] = 1
                with open(os.path.join('results', fname), 'w') as out:
                    out.write(header + '\n')
                    out.write(str(out_seq) + '\n')
def lipoP_gff(lipoIn, gff3In, jBrowseOut):
    """Merge LipoP cleavage-site predictions into GFF3 records and write
    the result to stdout.

    lipoIn     -- iterable of lines from LipoP's tab-separated text output
    gff3In     -- handle to the GFF3 annotation parsed with GFF.parse
    jBrowseOut -- when truthy, keep only records with cleavage sites and
                  replace their sub-features with cleavage_site features;
                  when falsy, records without sites are kept unchanged
    """
    orgIDs = {}
    orgID = ""
    # Take and parse the txt output into a sequence of records
    # Dict of X records, with the ID as key and an array Y of each cleavage site as the value,
    for row in lipoIn:
        if row.startswith("#"):
            # Comment lines separate records; reset the current ID.
            orgID = ""
            continue
        rowElem = row.split("\t")
        orgID = rowElem[0]
        if rowElem[2] == "CleavII":
            if not (orgID in orgIDs.keys()):
                orgIDs[orgID] = []
            # Residue position of the predicted cleavage site.
            orgIDs[orgID].append(int(rowElem[3]))  # , int(rowElem[4])))
    # Rebase
    for gff in GFF.parse(gff3In):
        keepSeq = []
        for xRec in gff.features:
            cdss = list(
                feature_lambda(
                    xRec.sub_features,
                    feature_test_type,
                    {"type": "CDS"},
                    subfeatures=False,
                ))
            # Find the first CDS that has a LipoP prediction; cdsOff is its
            # index into cdss.
            findCleave = ""
            cdsOff = 0
            for cds in cdss:
                if cds.id in orgIDs:
                    findCleave = cds.id
                    break
                cdsOff += 1
            if findCleave == "":
                # No prediction for this record: keep it only outside
                # jBrowse mode.
                if not jBrowseOut:
                    keepSeq.append(xRec)
                continue
            if jBrowseOut:
                xRec.sub_features = []
            i = 0
            for cleaveBase in orgIDs[findCleave]:
                tempQuals = xRec.qualifiers.copy()
                i += 1
                tempQuals["ID"] = xRec.id + "_cleavage_" + str(i)
                # Convert the residue number to genomic coordinates
                # (3 nt per residue) relative to the CDS start.
                xRec.sub_features.append(
                    SeqFeature(
                        FeatureLocation(
                            cdss[cdsOff].location.start + (cleaveBase * 3) - 1,
                            cdss[cdsOff].location.start + (cleaveBase * 3) + 1,
                        ),
                        type="cleavage_site",
                        strand=xRec.location.strand,
                        qualifiers=tempQuals,
                    ))
            keepSeq.append(xRec)
        gff.features = keepSeq
        GFF.write([gff], sys.stdout)
def create_regions(self, superclusters: List[SuperCluster] = None,
                   subregions: List[SubRegion] = None) -> int:
    """ Creates Region features based on contained SuperClusters and SubRegions
        and returns the number of regions created. Regions will not overlap.

        If supplied, parameters will override the Records own superclusters
        and subregions.
    """
    if superclusters is None:
        superclusters = self._superclusters
    if subregions is None:
        subregions = self._subregions

    if not superclusters and not subregions:
        return 0

    def _classify(area, cluster_list, region_list):
        # Route an area into the matching list by its concrete type.
        if isinstance(area, SuperCluster):
            cluster_list.append(area)
        else:
            assert isinstance(area, SubRegion), type(area)
            region_list.append(area)

    candidates = sorted(list(superclusters) + list(subregions))  # type: List[CDSCollection]

    first = candidates[0]
    current_location = FeatureLocation(max(0, first.location.start),
                                       min(first.location.end, len(self)))
    supers = []  # type: list
    subs = []  # type: list
    _classify(first, supers, subs)

    regions_added = 0
    for area in candidates[1:]:
        if area.overlaps_with(current_location):
            # Grow the current region to cover the overlapping area.
            current_location = combine_locations(area.location, current_location)
            _classify(area, supers, subs)
            continue
        # No overlap: close off the region being built and start a new one.
        self.add_region(Region(supers, subs))
        regions_added += 1
        current_location = area.location
        supers = []
        subs = []
        _classify(area, supers, subs)

    # Close off the final region being built.
    self.add_region(Region(supers, subs))
    regions_added += 1
    return regions_added
def test_get_cluster_type(self):
    "Test utils.get_cluster_type()"
    fake_cluster = FakeFeature('cluster', FeatureLocation(23, 42),
                               {'product': ['fake']})
    # The type should come straight from the 'product' qualifier.
    result = utils.get_cluster_type(fake_cluster)
    self.assertEqual('fake', result)
def create_gff(self, nested_element, dirpath, output_fasta_offset, format='default'):
    """Write a GFF file (plus per-TE FASTA files) describing a nested element.

    nested_element      -- element with .id, .sequence, .nested_list, each
                           nested item exposing .location, .features,
                           .ltr_left_location/.ltr_right_location and
                           .tsr_left/.tsr_right
    dirpath             -- output root; files go under dirpath/<element id>/
    output_fasta_offset -- number of flanking bases added around each TE in
                           the FASTA output
    format              -- key into format_dict selecting feature type names;
                           unknown values fall back to 'default'

    Returns the path of the written GFF file.
    """
    if format not in format_dict:
        format = 'default'
    self._create_dirs(nested_element.id, dirpath)
    #TODO move to separate method
    # find closest parent: for each element, the nearest later element whose
    # interval contains it
    nl = nested_element.nested_list
    parents = [-1] * len(nl)
    for i in range(len(parents) - 1):
        for j in range(i + 1, len(parents)):
            if intervals.contains(nl[j].location, nl[i].location):
                parents[i] = j
                break
    # append direct children
    direct_children = [[] for i in range(len(nl))]
    for i in reversed(range(len(direct_children))):
        parent = parents[i]
        if parent != -1:
            direct_children[parent].append(nl[i].location)
    # GFF
    rec = SeqRecord(nested_element.sequence, nested_element.id)
    features = []
    for i in range(len(nl)):
        #insert baseline
        base_type = format_dict[format]['te_base'] if format != 'default' else 'te_base'
        features.append(SeqFeature(
            FeatureLocation((nl[i].location[0]-1), nl[i].location[1]),
            type=base_type,
            strand=0,
            qualifiers={
                'name': 'TE_BASE {}'.format(i),
                'ID': 'TE_BASE {}'.format(i)
            }
        ))
        #insert element cropped by its children: one 'te' feature (and one
        # sequence chunk) per remaining sub-interval
        subseq = Seq('')
        children = direct_children[i]
        cropped = intervals.crop(nl[i].location, children)
        for subinterval in cropped:
            subseq += nested_element.sequence[subinterval[0] : subinterval[1]]
            te_type = format_dict[format]['te'] if format != 'default' else 'te'
            features.append(SeqFeature(
                FeatureLocation((subinterval[0]-1), subinterval[1]),
                type=te_type,
                strand=0,
                qualifiers={
                    'ID': 'TE {}'.format(i),
                    'name': 'TE {}'.format(i),
                    'Parent': 'TE_BASE {}'.format(i)
                }
            ))
        # save transposon fasta (cropped sequence plus flanking offsets)
        subseq = (
            nested_element.sequence[(nl[i].location[0] - output_fasta_offset) : nl[i].location[0]]
            + subseq
            + nested_element.sequence[nl[i].location[1] : (nl[i].location[1] + output_fasta_offset)])
        with open('{}/{}/TE/{}.fa'.format(dirpath, nested_element.id, i), 'w') as fasta_out:
            SeqIO.write(
                SeqRecord(subseq,
                          id='{}|TE-{}'.format(nested_element.id, i),
                          description='Cropped nested retrotransposon'),
                fasta_out,
                'fasta'
            )
        if len(cropped) > 1:
            # Element was split by children: also write the uncropped span.
            subseq = nested_element.sequence[(nl[i].location[0] - output_fasta_offset) : (nl[i].location[1] + output_fasta_offset)]
            with open('{}/{}/TE/{}_full.fa'.format(dirpath, nested_element.id, i), 'w') as fasta_out:
                SeqIO.write(
                    SeqRecord(subseq,
                              id='{}|TE-{}'.format(nested_element.id, i),
                              description='Cropped nested retrotransposon'),
                    fasta_out,
                    'fasta'
                )
        # insert domains
        if 'domains' in nl[i].features:
            j = 0
            for domain in nl[i].features['domains']:
                domain_location = domain.location
                # Sign of the reading frame: -1, 0 or +1.
                sign = (lambda x: x and (1, -1)[x < 0])(domain.frame[0])
                if sign < 0:
                    # Negative frame: coordinates are stored reversed.
                    domain_location = [domain_location[1], domain_location[0]]
                overlap = [x for x in children if intervals.contains(domain_location, x)]
                cropped_domain = intervals.crop(domain_location, overlap)
                for part in cropped_domain:
                    domain_type = format_dict[format]['domain'] if format != 'default' else domain.type
                    features.append(SeqFeature(
                        FeatureLocation(part[0] - 1, part[1]),
                        type=domain_type,
                        strand=sign,
                        qualifiers={
                            'ID': 'DOMAIN {}-{}'.format(i, j),
                            'name': domain.type,
                            'Parent': 'TE_BASE {}'.format(i)
                        }
                    ))
                j += 1
        #insert pbs,ppt (NaN start marks "not present")
        if 'pbs' in nl[i].features and not math.isnan(nl[i].features['pbs'][0]):
            pbs_tybe = format_dict[format]['pbs'] if format != 'default' else 'pbs'
            features.append(SeqFeature(
                FeatureLocation(nl[i].features['pbs'][0] - 1, nl[i].features['pbs'][1]),
                type=pbs_tybe,
                strand=0,
                qualifiers={
                    'ID': 'PBS {}'.format(i),
                    'name': 'pbs',
                    'Parent': 'TE_BASE {}'.format(i)
                }
            ))
        if 'ppt' in nl[i].features and not math.isnan(nl[i].features['ppt'][0]):
            ppt_type = format_dict[format]['ppt'] if format != 'default' else 'ppt'
            features.append(SeqFeature(
                FeatureLocation(nl[i].features['ppt'][0] - 1, nl[i].features['ppt'][1]),
                type=ppt_type,
                strand=0,
                qualifiers={
                    'ID': 'PPT {}'.format(i),
                    'name': 'ppt',
                    'Parent': 'TE_BASE {}'.format(i)
                }
            ))
        #insert ltrs
        ltr_type = format_dict[format]['ltr'] if format != 'default' else 'ltr'
        features.append(SeqFeature(
            FeatureLocation(nl[i].ltr_right_location[0] - 1,
                            nl[i].ltr_right_location[1]),
            type=ltr_type,
            strand=0,
            qualifiers={
                'ID': 'LTR RIGHT {}'.format(i),
                'name': 'ltr right',
                'Parent': 'TE_BASE {}'.format(i)
            }
        ))
        features.append(SeqFeature(
            FeatureLocation(nl[i].ltr_left_location[0] - 1,
                            nl[i].ltr_left_location[1]),
            type=ltr_type,
            strand=0,
            qualifiers={
                'ID': 'LTR LEFT {}'.format(i),
                'name': 'ltr left',
                'Parent': 'TE_BASE {}'.format(i)
            }
        ))
        #insert tsrs (NaN start marks "not present")
        if not math.isnan(nl[i].tsr_left[0]):
            tsr_type = format_dict[format]['tsr'] if format != 'default' else 'tsr'
            features.append(SeqFeature(
                FeatureLocation(nl[i].tsr_left[0] - 1, nl[i].tsr_left[1]),
                type=tsr_type,
                strand=0,
                qualifiers={
                    'ID': 'TSR LEFT {}'.format(i),
                    'name': 'tsr left',
                    'Parent': 'TE_BASE {}'.format(i)
                }
            ))
            features.append(SeqFeature(
                FeatureLocation(nl[i].tsr_right[0] - 1, nl[i].tsr_right[1]),
                type=tsr_type,
                strand=0,
                qualifiers={
                    'ID': 'TSR RIGHT {}'.format(i),
                    'name': 'tsr right',
                    'Parent': 'TE_BASE {}'.format(i)
                }
            ))
    # FOR END
    rec.features = features
    #create GFF
    filename = '{}/{}/{}'.format(dirpath, nested_element.id, nested_element.id)
    if format != 'default':
        filename += '_{}'.format(format)
    gff_filepath = '{}.gff'.format(filename)
    with open(gff_filepath, 'w+') as gff_out:
        GFF.write([rec], gff_out)
    return gff_filepath
def create_clusters_from_borders(
        self, borders: Optional[List[ClusterBorder]] = None) -> int:
    """ Takes all ClusterBorder instances and constructs Clusters that cover
        each ClusterBorder. If a cluster would overlap with another, the
        clusters are merged.

        Returns:
            the number of clusters created
    """
    if borders is None:
        borders = self._cluster_borders
    if not borders:
        return 0

    borders = sorted(borders)
    # Footprint of the first border, padded by its extent and clamped to
    # the record bounds.
    cluster_location = FeatureLocation(
        max(0, borders[0].location.start - borders[0].extent),
        min(borders[0].location.end + borders[0].extent, len(self)))
    # create without products initially, add based on the border naming attributes
    borders_within_cluster = [borders[0]]
    cluster = Cluster(cluster_location, borders[0].cutoff, borders[0].extent, [])
    if borders[0].rule:
        cluster.detection_rules.append(borders[0].rule)
    clusters_added = 0
    for border in borders[1:]:
        # Padded candidate footprint for this border.
        dummy_border_location = FeatureLocation(
            max(0, border.location.start - border.extent),
            min(border.location.end + border.extent, len(self)))
        if cluster.overlaps_with(dummy_border_location):
            # Merge: widen the growing cluster so it also covers this border,
            # keeping the larger extent/cutoff of the two.
            cluster.extent = max(cluster.extent, border.extent)
            cluster.cutoff = max(cluster.cutoff, border.cutoff)
            start = min(cluster.location.start, border.location.start - border.extent)
            if start < 0:
                start = 0
            end = max(cluster.location.end, border.location.end + border.extent)
            if end > len(self):
                end = len(self)
            cluster.location = FeatureLocation(start, end)
            borders_within_cluster.append(border)
            if border.rule:
                cluster.detection_rules.append(border.rule)
        else:
            # No overlap: finalise the cluster built so far, then start a
            # fresh one seeded by this border.
            cluster.contig_edge = cluster.location.start == 0 or cluster.location.end == len(
                self.seq)
            for product in _build_products_from_borders(
                    borders_within_cluster):
                cluster.add_product(product)
            self.add_cluster(cluster)
            borders_within_cluster.clear()
            clusters_added += 1
            cluster_location = FeatureLocation(
                max(0, border.location.start - border.extent),
                min(border.location.end + border.extent, len(self)))
            cluster = Cluster(cluster_location, border.cutoff, border.extent, [])
            borders_within_cluster.append(border)
            if border.rule:
                cluster.detection_rules.append(border.rule)
    # add the final cluster being built if it wasn't added already
    cluster.contig_edge = cluster.location.start == 0 or cluster.location.end == len(
        self.seq)
    for product in _build_products_from_borders(borders_within_cluster):
        cluster.add_product(product)
    self.add_cluster(cluster)
    clusters_added += 1
    return clusters_added
def test_eq_not_identical(self):
    """Test two different locations are not equal."""
    # A compound location with an extra part is not equal.
    base = FeatureLocation(12, 17, 1) + FeatureLocation(23, 42, 1)
    extended = (FeatureLocation(12, 17, 1) + FeatureLocation(23, 42, 1)
                + FeatureLocation(50, 60, 1))
    self.assertNotEqual(base, extended)

    # Same coordinates on the opposite strand are not equal.
    forward = FeatureLocation(12, 17, 1) + FeatureLocation(23, 42, 1)
    reverse = FeatureLocation(12, 17, -1) + FeatureLocation(23, 42, -1)
    self.assertNotEqual(forward, reverse)

    # Same parts but a different compound operator ("join" vs "order")
    # are not equal.
    joined = CompoundLocation(
        [FeatureLocation(12, 17, 1), FeatureLocation(23, 42, 1)])
    ordered = CompoundLocation(
        [FeatureLocation(12, 17, 1), FeatureLocation(23, 42, 1)], "order")
    self.assertNotEqual(joined, ordered)

    # A location never equals an unrelated object.
    compound = FeatureLocation(12, 17, 1) + FeatureLocation(23, 42, 1)
    self.assertNotEqual(compound, 5)
def convert_xmfa_to_gff3(xmfa_file, relative_to='1', sequences=None, window_size=1000):
    """Convert progressiveMauve XMFA alignments into GFF3 'match' features.

    xmfa_file   -- handle/path consumed by parse_xmfa
    relative_to -- id of the genome all coordinates are expressed against
    sequences   -- passed to _id_tn_dict to build an id -> label mapping
    window_size -- alignment-column width of each match_part sub-feature

    Returns a single-record list; the record carries one 'match' feature per
    aligned partner, each with windowed 'match_part' children scored by
    percent identity.
    """
    label_convert = _id_tn_dict(sequences)
    lcbs = parse_xmfa(xmfa_file)

    # Placeholder record ("A") that only serves as a feature container.
    records = [
        SeqRecord(Seq("A"), id=label_convert.get(relative_to, relative_to))
    ]
    for lcb in lcbs:
        ids = [seq['id'] for seq in lcb]

        # Doesn't match part of our sequence
        if relative_to not in ids:
            continue

        # Skip sequences that are JUST our "relative_to" genome
        if len(ids) == 1:
            continue

        parent = [seq for seq in lcb if seq['id'] == relative_to][0]
        others = [seq for seq in lcb if seq['id'] != relative_to]

        for other in others:
            other['feature'] = SeqFeature(
                FeatureLocation(parent['start'], parent['end'] + 1),
                type="match",
                strand=parent['strand'],
                qualifiers={
                    "source": "progressiveMauve",
                    "target": label_convert.get(other['id'], other['id']),
                    "ID": label_convert.get(other['id'], 'xmfa_' + other['rid'])
                })

        # Walk the alignment in fixed-size column windows.
        for i in range(0, len(lcb[0]['seq']), window_size):
            block_seq = parent['seq'][i:i + window_size]
            real_window_size = len(block_seq)

            # Map alignment columns back to ungapped sequence coordinates by
            # discounting the gap characters seen so far.
            real_start = abs(
                parent['start']) - parent['seq'][0:i].count('-') + i
            real_end = real_start + real_window_size - block_seq.count('-')

            # Skip windows shorter than 10 ungapped bases.
            if (real_end - real_start) < 10:
                continue

            # NOTE(review): a negative start appears to encode the reverse
            # strand here -- confirm against parse_xmfa's output format.
            if parent['start'] < 0:
                strand = -1
            else:
                strand = 1

            for other in others:
                pid = _percent_identity(block_seq,
                                        other['seq'][i:i + real_window_size])
                # Ignore 0% identity sequences
                if pid == 0:
                    continue
                other['feature'].sub_features.append(
                    SeqFeature(FeatureLocation(real_start, real_end),
                               type="match_part",
                               strand=strand,
                               qualifiers={
                                   "source": "progressiveMauve",
                                   'score': pid
                               }))

        for other in others:
            records[0].features.append(other['feature'])
    return records
def new_track(plot, record, track_num=1, name=None, start=None, end=None, links=None):
    """
    Shorthand for adding a new track to a plot.

    Keyword arguments:
    plot -- A GenomeDiagram.Diagram object
    record -- A SeqRecord object with features
    track_num -- Vertical position of the track in the plot (default 1)
    name -- The label for the track (default record.id)
    start -- Where to start from in the record (default 0)
    end -- Where to end in the record (default len(record))
    links -- Iterable of (link, psim) pairs drawn as translucent red blocks
             under the features, where link has 'start'/'end' keys and psim
             scales the opacity (default: no links)

    Returns:
    The new track; the plot object itself is also updated in place.
    """
    # Fixed: was a mutable default argument (links=list()).
    if links is None:
        links = []

    type_colours = {
        "gene": colors.Color(red=1/255, green=108/255, blue=154/255),
        "mnh120_REPET_SSRs": colors.Color(red=0, green=160/255, blue=138/255),
        "mnh120_REPET_TEs": colors.Color(red=249/255, green=132/255, blue=0)
    }

    if start is None:
        start = 0
    if end is None:
        end = len(record)
    if name is None:
        name = record.id

    # Choose a large-tick interval appropriate to the span being drawn.
    if end - start > 200000:
        interval = 100000
    elif 80000 < end - start <= 200000:
        interval = 50000
    elif 20000 < end - start <= 80000:
        interval = 10000
    else:
        interval = 5000

    track = plot.new_track(
        track_num,
        name=name,
        greytrack=False,
        greytrack_labels=1,
        start=start,
        end=end,
        height=0.5,
        scale_fontsize=10,
        scale_largeticks=1.2,
        scale_largetick_interval=interval,
    )
    track_features = track.new_set()

    # Draw similarity links first so features are rendered on top of them.
    for link, psim in links:
        colour = colors.Color(red=1, alpha=psim/1000)
        track_features.add_feature(
            SeqFeature(
                id='',
                location=FeatureLocation(
                    link['start'],
                    link['end'],
                    strand=0
                )
            ),
            color=colour,
            border=colors.Color(red=1, alpha=0.)
        )

    for feature in record.features:
        # NOTE(review): raises KeyError for feature types not listed in
        # type_colours -- confirm callers only pass the three known types.
        colour = type_colours[feature.type]
        if feature.type == 'gene':
            label_args = {
                'label': True,
                'label_position': 'start',
                'label_angle': 90,
                'label_size': 10,
                'name': feature.id
            }
        else:
            label_args = dict()
        track_features.add_feature(
            feature,
            sigil="BIGARROW",
            arrowshaft_height=1.0,
            color=colour,
            **label_args
        )

    # plot.draw(format="linear", pagesize='A5', fragments=1)
    # plot.write("filename", "PNG")
    return track
# Swap two equally-sized pieces of the recoded genome sequence.
# (piece_1, swap_idx_1/2, swap_size, genome_length are defined earlier.)
piece_2 = rec_genome_seq[swap_idx_2:swap_idx_2+swap_size]
rec_genome_seq[swap_idx_1:swap_idx_1+swap_size] = piece_2
rec_genome_seq[swap_idx_2:swap_idx_2+swap_size] = piece_1
rec_genome = SeqRecord(rec_genome_seq, id='001', name='recoded_genome')

# Annotate synthesis segments and their constituent fragments.
SYNTH_SEG_SIZE = 49000
SYNTH_FRAG_SIZE = 2500
# Fixed: "/" -> "//" so range() receives an int under Python 3
# (identical behaviour for ints under Python 2).
for seg_num in range(genome_length // SYNTH_SEG_SIZE):
    start_idx = seg_num * SYNTH_SEG_SIZE
    end_idx = start_idx + SYNTH_SEG_SIZE
    if seg_num == (genome_length // SYNTH_SEG_SIZE - 1):
        # Last segment absorbs the remainder of the genome.
        end_idx += genome_length % SYNTH_SEG_SIZE
    s_feat = SeqFeature(FeatureLocation(start_idx, end_idx),
                        type='synth_segment',
                        qualifiers={'label': 'seg%02d' % (seg_num)})
    rec_genome.features.append(s_feat)
    for frag_num in range(SYNTH_SEG_SIZE // SYNTH_FRAG_SIZE):
        f_start_idx = start_idx + SYNTH_FRAG_SIZE * frag_num
        f_end_idx = f_start_idx + SYNTH_FRAG_SIZE
        if frag_num == (SYNTH_SEG_SIZE // SYNTH_FRAG_SIZE - 1):
            # Last fragment absorbs the remainder of the segment.
            f_end_idx += SYNTH_SEG_SIZE % SYNTH_FRAG_SIZE
        else:
            f_end_idx += 100  # simulated overlap for assembly
        f_feat = SeqFeature(FeatureLocation(f_start_idx, f_end_idx),
                            type='synth_fragment',
                            qualifiers={'label': 'seg%02d_%03d' % (seg_num, frag_num)})
        rec_genome.features.append(f_feat)
def predict_primer_set(self):
    """Run Primer3 on every sub-sequence of every input FASTA record.

    Records are split on '//' into sub-sequences; markup characters
    define excluded regions (<...>) and target regions ([...]).
    Predefined primer pairs (if a predefined handle was given) are used
    in place of prediction for matching record ids.

    Returns:
        list of Gene objects, each holding its Amplicons with primer sets.

    Raises:
        Exception: when Primer3 reports PRIMER_ERROR, or when no primer
        at all is found for a gene.
    """
    predefined_sets = dict()
    if self.predefined_handle is not None:
        self.parse_predefined_pairs(predefined_sets)

    out_genes = []
    for record in SeqIO.parse(self.input_handle, "fasta"):
        gene = Gene(record.id)
        sequence = str(record.seq)
        for i, sel_sequence in enumerate(re.split(r"//", sequence)):
            # Strip all markup to get the plain template sequence.
            s = re.sub(r"[\[\]<>]", "", sel_sequence)
            amplicon = Amplicon(s)

            # A predefined pair overrides prediction for this record.
            if record.id in predefined_sets:
                amplicon.primer_set = predefined_sets[record.id]
                gene.append(amplicon)
                del predefined_sets[record.id]
                continue

            input_string = ""
            input_string += "SEQUENCE_ID=" + record.id + "\n"
            input_string += "SEQUENCE_TEMPLATE=" + s + "\n"

            # <...> markup -> SEQUENCE_EXCLUDED_REGION entries.
            if sel_sequence.find("<") >= 0 and sel_sequence.find(">") >= 0:
                input_string += "SEQUENCE_EXCLUDED_REGION="
                spl_sequence = re.split(
                    r"[<>]",
                    sel_sequence.replace("[", "").replace("]", ""))
                # BUGFIX: this loop variable used to be `i`, shadowing the
                # enumerate index above and corrupting the sub-sequence
                # number reported in the no-primer warning below. Renamed
                # to i_ to match the target-region loop.
                for i_ in range(0, len(spl_sequence) - 1, 2):
                    start = 0
                    for j in range(0, i_ + 1):
                        start += len(spl_sequence[j])
                    input_string += (str(start + 1) + "," +
                                     str(len(spl_sequence[i_ + 1])) + " ")
                    amplicon.add_feature(
                        ExcludedRegion(
                            FeatureLocation(
                                start + 1,
                                start + len(spl_sequence[i_ + 1]))))
                input_string += "\n"
                sel_sequence = sel_sequence.replace("<", "")
                sel_sequence = sel_sequence.replace(">", "")

            # [...] markup -> SEQUENCE_TARGET entries.
            if sel_sequence.find("[") >= 0 and sel_sequence.find("]") >= 0:
                input_string += "SEQUENCE_TARGET="
                spl_sequence = re.split(r"[\[\]]", sel_sequence)
                for i_ in range(0, len(spl_sequence) - 1, 2):
                    start = 0
                    for j in range(0, i_ + 1):
                        start += len(spl_sequence[j])
                    input_string += (str(start + 1) + "," +
                                     str(len(spl_sequence[i_ + 1])) + " ")
                    amplicon.add_feature(
                        TargetRegion(
                            FeatureLocation(
                                start + 1,
                                start + len(spl_sequence[i_ + 1]))))
                input_string += "\n"

            input_string += "P3_FILE_FLAG=0\n"
            # This is badly programmed and NEEDS that trailing slash
            input_string += f"PRIMER_THERMODYNAMIC_PARAMETERS_PATH={self.config.p3_thermo_path}/\n="
            log.info(input_string)

            p = run_and_feed(
                self.config.p3_path,
                p3_settings_file=self.config.p3_config_path,
                _input_str=input_string,
                _long_arg_prefix="-",
            )
            p3_output = p.stdout.strip()
            log.info("P3 output: %s", p3_output)

            m = re.search(r"(?<=PRIMER_ERROR=)\w+", p3_output)
            if m is not None:
                raise Exception(
                    "Error for sequence (Probably no primer found in region): "
                    f"{record.id}: {m.group(0)}\n Start NEW Primerprediction."
                )

            primer_set = PrimerPairSet(record.id)
            parse_p3_information(primer_set, p3_output)
            if len(primer_set) == 0:
                log.warning("WARNING: No primer found for %s sequence %s.",
                            record.id, i + 1)
                continue

            amplicon.primer_set = primer_set
            gene.append(amplicon)

        if len(gene) == 0:
            raise Exception(
                f"No primer found for {gene.name}. Consider less restrictive Primer3 settings."
            )
        out_genes.append(gene)

    # Any predefined pair left over never matched an input record.
    for key in predefined_sets:
        log.info(
            "WARNING: No input sequence could be found for the predefined primer %s",
            key,
        )
    return out_genes
def table_annotations(gff3In, tabularIn, fastaIn, out_gff3, out_changelog):
    """Apply edits from a tabular annotation file onto a GFF3 file.

    Arguments:
        gff3In -- readable handle of the input GFF3
        tabularIn -- readable handle of the tab-separated edits table
        fastaIn -- readable handle of the matching FASTA (for GFF parsing)
        out_gff3 -- writable handle for the updated GFF3
        out_changelog -- writable handle for a TSV changelog

    Writes one changelog line per table row describing what (if anything)
    changed; writes the updated GFF3 only when at least one change was made.
    """
    # CSV parse tabular: normalise the header names we act on.
    # The first "Boundary" column becomes bound_s, the second bound_e.
    header = csv.DictReader(tabularIn, delimiter="\t")
    for i in header.fieldnames:
        if i == "Boundary" and not ("bound_s" in header.fieldnames):
            header.fieldnames[header.fieldnames.index(i)] = "bound_s"
        elif i == "Boundary":
            header.fieldnames[header.fieldnames.index(i)] = "bound_e"
        # Else error
        elif i == "# Organism ID":
            header.fieldnames[header.fieldnames.index(i)] = "org_id"
        elif i == "User entered Notes":
            header.fieldnames[header.fieldnames.index(i)] = "Note"
    idDict = csv.DictReader(tabularIn, delimiter="\t",
                            fieldnames=header.fieldnames)

    # BioPython parse GFF, then flatten features (three levels deep) into
    # recG so each table row can be matched against any feature by ID.
    sourceG = list(
        GFF.parse(gff3In, SeqIO.to_dict(SeqIO.parse(fastaIn, "fasta"))))
    recG = []
    sumFeatures = 0
    for org in sourceG:
        # Should be directly editable
        for top in org.features:
            recG.append(top)
            sumFeatures += 1
            for sub1 in top.sub_features:
                recG.append(sub1)
                sumFeatures += 1
                for sub2 in sub1.sub_features:
                    recG.append(sub2)
                    sumFeatures += 1

    # Get a changelog ready
    out_changelog.write("ID\tChanges\tStatus\n")
    anyChange = False

    for row in idDict:
        if row["ID"] == "ID":
            continue  # Skip header
        Found = False
        for i in recG:
            if row["ID"] == i.id:
                # Per-field change flags for the changelog line.
                strandC = False
                startC = False
                endC = False
                nameC = False
                noteC = False
                qualC = False
                aliasC = False
                parentC = False
                Found = True
                for qual in row:
                    if qual == "ID" or qual == "":
                        continue
                    if qual == "Strand":
                        # Normalise +/- notation to +1/-1.
                        if row["Strand"] == "+":
                            row["Strand"] = +1
                        else:
                            row["Strand"] = -1
                    if qual == "Name" and row["Name"] != i.qualifiers["Name"][0]:
                        i.qualifiers["Name"][0] = row["Name"]
                        nameC = True
                    elif qual == "Alias" and row["Alias"] != i.qualifiers["Alias"][0]:
                        i.qualifiers["Alias"][0] = row["Alias"]
                        aliasC = True
                    elif (qual == "Parent"
                          and row["Parent"] != i.qualifiers["Parent"][0]):
                        i.qualifiers["Parent"][0] = row["Parent"]
                        # BUGFIX: was `parentC = true` (NameError at runtime).
                        parentC = True
                    # Location objects need to be rebuilt, can't
                    # individually set start/end/strand.
                    elif qual == "Strand" and i.strand != row["Strand"]:
                        strandC = True
                        i.location = FeatureLocation(i.location.start,
                                                     i.location.end,
                                                     row["Strand"])
                    elif qual == "bound_s" and i.location.start != int(row["bound_s"]):
                        startC = True
                        i.location = FeatureLocation(int(row["bound_s"]),
                                                     i.location.end,
                                                     i.location.strand)
                    elif qual == "bound_e" and i.location.end != int(row["bound_e"]):
                        endC = True
                        i.location = FeatureLocation(i.location.start,
                                                     int(row["bound_e"]),
                                                     i.location.strand)
                    elif qual == "Note":
                        temp = [str(row["Note"])]
                        if ("Note" in i.qualifiers) and i.qualifiers["Note"] != temp:
                            if temp:
                                i.qualifiers["Note"] = temp
                            else:
                                i.qualifiers.pop("Note", None)
                            noteC = True
                        elif temp != [""] and not ("Note" in i.qualifiers):
                            i.qualifiers["Note"] = temp
                            noteC = True
                    elif not (qual in [
                            "Target",
                            "Gap",
                            "Dbxref",
                            "Ontology_term",
                            "Is_circular",
                            "Derives_from",
                            "bound_s",
                            "bound_e",
                            "org_id",
                            "Strand",
                            "Name",
                            "Note",
                    ]):
                        # Any other column is a free-form qualifier:
                        # edit, create, or delete (empty cell) it.
                        temp = qual.lower().replace(" ", "_")
                        if temp in i.qualifiers:  # Edit
                            if row[qual] is None:
                                i.qualifiers.pop(temp, None)
                                qualC = True
                            elif i.qualifiers[temp] != [str(row[qual])]:
                                i.qualifiers[temp] = [str(row[qual])]
                                qualC = True
                        elif row[qual] is not None:  # Create
                            i.qualifiers[temp] = [str(row[qual])]
                            qualC = True

                # Build the comma-separated list of changed fields.
                change_flags = [
                    (nameC, "Name"),
                    (aliasC, "Alias"),
                    (parentC, "Parent"),
                    (startC, "Start"),
                    (endC, "End"),
                    (strandC, "Strand"),
                    (noteC, "Notes"),
                    (qualC, "Other Qualifiers"),
                ]
                changeList = ", ".join(
                    label for flagged, label in change_flags if flagged)

                if changeList != "":
                    # On success, write out replaced attributes and success
                    out_changelog.write("%s\t%s\tSuccess\n" % (i.id, changeList))
                    anyChange = True
                else:
                    # No changes detected
                    out_changelog.write("%s\tNone\tNo Change\n" % i.id)
                break

        if not Found:  # No such ID
            out_changelog.write("%s\tNone\tID not Found\n" % row["ID"])

    if anyChange:
        # Strip annotations/remarks before writing the updated GFF3.
        sourceG[0].annotations = {}
        sourceG[0].features = [
            x for x in sourceG[0].features if x.type != "remark"
        ]
        GFF.write(sourceG, out_gff3)
    else:
        out_changelog.write("GFF3\tNone\tGFF3 already equals Table\n")
        out_gff3 = gff3In

    out_changelog.close()
    out_gff3.close()
def draw_gene_diagram(self, inv_pair, genes, save_path):
    """Draw a linear diagram of *genes* with inversion boundaries marked.

    Arguments:
        inv_pair -- (start, end) coordinates of the inversion region
        genes -- non-empty, position-ordered list of ORF SeqFeatures with
                 'locus_tag' and 'product' qualifiers
        save_path -- path the PDF diagram is written to
    """
    # define the tick interval based on the start and end of the genes
    # (should already be ordered)
    track_start = genes[0].location.start
    track_end = genes[-1].location.end
    s_tick_int = int((track_end - track_start) / 5)

    # create an empty genome diagram
    gdd = GenomeDiagram.Diagram(self.accession_num)
    gdt_features = gdd.new_track(1,
                                 greytrack=True,
                                 scale_smalltick_interval=s_tick_int,
                                 scale_smalltick_labels=True,
                                 scale_smallticks=0.1,
                                 scale_fontangle=0,
                                 scale_fontsize=4,
                                 name=self.accession_num)
    gds_features = gdt_features.new_set()

    # for each loci, annotate the diagram
    for orf in genes:
        # describe the orf
        loctag = orf.qualifiers['locus_tag'][0]
        product = orf.qualifiers['product'][0]

        # Label orientation follows the strand.
        # BUGFIX: angle/pos used to be left unset for unstranded features
        # (UnboundLocalError on the first gene, or silent reuse of the
        # previous gene's values); default to the forward-strand style.
        if orf.strand == -1:
            angle = -195
            pos = 'right'
        else:
            angle = 15
            pos = 'left'

        # draw the orf
        gds_features.add_feature(orf,
                                 name=loctag + ": " + product,
                                 label=True,
                                 sigil="BIGARROW",
                                 label_size=4,
                                 arrowhead_length=0.2,
                                 label_angle=angle,
                                 label_position=pos,
                                 arrowshaft_height=0.3)

    # for the cluster, annotate inversion positions as 1 bp purple boxes
    for coord, marker in zip(inv_pair, (' START', ' END')):
        feature = SeqFeature(FeatureLocation(int(coord), int(coord) + 1),
                             strand=0)
        gds_features.add_feature(feature,
                                 name=marker,
                                 label=True,
                                 color="purple",
                                 label_position="left",
                                 label_angle=45,
                                 sigil='BOX',
                                 label_color='purple',
                                 label_size=6)

    # draw and save the graph with 500 bp padding on either side
    gdd.draw(format='linear',
             pagesize=(16 * cm, 10 * cm),
             fragments=1,
             start=track_start - 500,
             end=track_end + 500)
    gdd.write(save_path, "pdf")
    return
def test_eq_not_identical(self):
    """Test two different locations are not equal."""
    unequal_pairs = [
        # start differs
        (FeatureLocation(22, 42, 1), FeatureLocation(23, 42, 1)),
        # end differs
        (FeatureLocation(23, 42, 1), FeatureLocation(23, 43, 1)),
        # strand differs
        (FeatureLocation(23, 42, 1), FeatureLocation(23, 42, -1)),
        # not a FeatureLocation at all
        (FeatureLocation(23, 42, 1), (23, 42, 1)),
        # reference differs
        (FeatureLocation(23, 42, 1, "foo"), FeatureLocation(23, 42, 1, "bar")),
        # reference db differs
        (FeatureLocation(23, 42, 1, "foo", "bar"),
         FeatureLocation(23, 42, 1, "foo", "baz")),
    ]
    for first, second in unequal_pairs:
        self.assertNotEqual(first, second)
def setUp(self):
    """Build a fake record holding two clusters plus assorted features."""
    def make(kind, start, end, qualifiers=None):
        # Helper: wrap a (type, span[, qualifiers]) triple in a FakeFeature.
        if qualifiers is None:
            return FakeFeature(kind, FeatureLocation(start, end))
        return FakeFeature(kind, FeatureLocation(start, end), qualifiers)

    self.features = [
        make('cluster', 25, 50, {'note': ['Cluster number: 1']}),
        make('CDS', 15, 20),
        make('PFAM_domain', 15, 17),
        make('CDS', 23, 42),
        make('CDS', 45, 47),
        make('CDS', 48, 55),
        make('aSDomain', 4730, 4740),
        make('CDS', 4700, 4710),
        make('CDS', 4750, 4760),
        make('CDS', 4790, 4812),
        make('cluster', 4711, 4800, {'note': ['Cluster number: 2']}),
    ]
    self.record = FakeRecord(self.features)
def find_repeat(self, contig, fn, st, ppno, extra_dna):
    """
    Find repeats in the DNA sequence

    :param self: The data object
    :param contig: the name of the contig we are searching on
    :param fn: the nuclotide sequence to search
    :param st: the start to find repeats at (added to every reported coordinate)
    :param ppno: the prophage number
    :param extra_dna: the extra dna that flanks the sequence
    :return: a dict of repeat regions, keyed by an integer index, each with
             s1/e1 (first copy) and s2/e2 (second copy) coordinates
    """
    if len(fn) == 0:
        log_and_message("Len sequence is 0 so ignoring\n", c="RED",
                        stderr=True, loglevel="WARNING")
        return {}

    rep = {}
    index = 0

    # with open(os.path.join(output_dir, "repeat_finding"), 'a') as rptout:
    #     rptout.write(f">pp{ppno} {st}\n{fn}\n")

    try:
        # set the False parameter to True to enable debugging of repeat finder
        repeats = PhiSpyRepeatFinder.repeatFinder(fn, 3, self.min_repeat_len,
                                                  ppno, False)
    except Exception as e:
        # best-effort: a repeat-finder failure is logged, not fatal
        log_and_message(
            f"There was an error running repeatfinder for {fn}:{e}\n",
            c="RED", stderr=True, loglevel="WARNING")
        return {}

    for r in repeats:
        # Keep only repeat pairs whose copies flank the core region:
        # first copy near the start of fn, second copy near the end.
        if (r['first_start'] < (3 * extra_dna)) and \
                (r['second_start'] > (len(fn) - (3 * extra_dna))):
            # check that start is always less than end
            # This always causes an off by one error, so we have to
            # increment our ends
            if r['first_end'] < r['first_start']:
                [r['first_start'], r['first_end']] = \
                    [r['first_end'] + 1, r['first_start'] + 1]
            if r['second_end'] < r['second_start']:
                [r['second_start'], r['second_end']] = \
                    [r['second_end'] + 1, r['second_start'] + 1]

            # Shift from fn-local coordinates to contig coordinates via st.
            rep[index] = {}
            rep[index]['s1'] = r['first_start'] + st
            rep[index]['e1'] = r['first_end'] + st
            rep[index]['s2'] = r['second_start'] + st
            rep[index]['e2'] = r['second_end'] + st

            if self.include_all_repeats:
                # Record the repeat pair as a single compound-location
                # repeat_region feature on the contig.
                replen = max(rep[index]['e1'] - rep[index]['s1'],
                             rep[index]['e2'] - rep[index]['s2'])
                r1loc = FeatureLocation(rep[index]['s1'], rep[index]['e1'],
                                        strand=+1)
                r2loc = FeatureLocation(rep[index]['s2'], rep[index]['e2'],
                                        strand=+1)
                rptloc = CompoundLocation([r1loc, r2loc])
                rptsf = SeqFeature(
                    rptloc, type="repeat_region",
                    qualifiers={
                        'note': f"{replen}bp repeat identified by PhiSpy v{version.__version__}"
                    })
                self.record.get_entry(contig).features.append(rptsf)

            index += 1

    return rep
start = ali_from - (enter.max_length - hmm_to) else: start = ali_from elif enter.length is not False and \ enter.min_length is not False: if hmm_from > 1: end = (hmm_from - 1) + ali_to else: end = ali_to if enter.min_length < hmm_to < enter.max_length: start = ali_from - (enter.max_length - hmm_to) elif hmm_to <= enter.min_length: start = ali_from - (enter.min_length - hmm_to) start_pos = SeqFeature.ExactPosition(start - 1) end_pos = SeqFeature.ExactPosition(end) feature_location = FeatureLocation(start_pos, end_pos) feature_type = enter.feature from Bio.SeqFeature import SeqFeature note_qualifier = dict() note_qualifier['note'] = str( '%s score %s E-value %s' % (prog[2].replace('\n', ''), score, e_value)) my_feature = MySeqFeature(location=feature_location, type=feature_type, strand=strnd, qualifiers=dict( list(qualifier.items()) + list(note_qualifier.items()))) if (hmm_diff - ali_diff == 0 or hmm_diff - ali_diff == 1 or
## Generate list of contig,length tuples then sort in decending order genome = {} entries = [] lengths = [] for i in SeqIO.parse("/Volumes/MP_HD/repDNA_data/Pm_all_genome.fasta", "fasta"): genome[i.id] = i entries.append((i.id,len(i.seq))) lengths.append(len(i.seq)) entries = sorted(entries,key=lambda x: x[1],reverse=True) ## Create gene SeqFeatures as subsets of contig seqRecord objects for i in genome: for j in gene: if gene[j][0] == genome[i].id: direc = int(gene[j][3] + '1') genome[i].features.append(SeqFeature(FeatureLocation(gene[j][1], gene[j][2], strand = direc),type='gene',id=j,qualifiers={'locus_tag':[gene[j][4]]})) for j in centromeres: if centromeres[j][0] == genome[i].id: # print j,gene[j][0],gene[j][3],genome[i].id # sl(0.5) direc = None#int(gene[j][3] + '1') genome[i].features.append(SeqFeature(FeatureLocation(centromeres[j][1], centromeres[j][2], strand = direc),type='gene',id=j,qualifiers={'locus_tag':[centromeres[j][4]]})) ## telomere length - rounded ends of chromosome size max_len = max(lengths) telomere_length = 40000 chr_diagram = BasicChromosome.Organism() chr_diagram.page_size = (60*cm, 21*cm)
def to_seqfeature(self):
    """Return this record as a SeqFeature spanning [start, end) with
    self.value as its id; a confidence attribute, when present, is
    copied into the feature's qualifiers."""
    feature = SeqFeature(location=FeatureLocation(self.start, self.end),
                         id=self.value)
    # EAFP: only objects that define a confidence get the qualifier.
    try:
        feature.qualifiers['confidence'] = self.confidence
    except AttributeError:
        pass
    return feature
def write_all_outputs(**kwargs):
    """Write all requested PhiSpy output files.

    kwargs are wrapped in a Namespace and must provide at least: pp,
    droppedpp, record, output_choice, keep_dropped_predictions, quiet.

    First appends a feature per predicted prophage (and a repeat_region
    feature for attachment sites when present) to the record, then decodes
    output_choice as a bit mask to decide which writers to run.
    """
    self = Namespace(**kwargs)
    # make all predicted pp list
    log_and_message("Creating output files", c="GREEN", stderr=True,
                    quiet=self.quiet)
    prophage_feature_type = 'misc_feature'  # / prophage_region

    for i in self.pp:
        self.record.get_entry(self.pp[i]['contig']).append_feature(
            SeqFeature(
                location=FeatureLocation(self.pp[i]['start'] - 1,
                                         self.pp[i]['stop'] - 1),
                type=prophage_feature_type,
                strand=1,
                qualifiers=OrderedDict({
                    'note': f'prophage region pp{i} identified with PhiSpy v{version.__version__}'
                })))
        if 'atts' in self.pp[i]:
            # attachment sites are stored as a joined pair of locations
            self.record.get_entry(self.pp[i]['contig']).append_feature(
                SeqFeature(
                    location=FeatureLocation(int(self.pp[i]['att'][0]),
                                             int(self.pp[i]['att'][1])) +
                             FeatureLocation(int(self.pp[i]['att'][2]),
                                             int(self.pp[i]['att'][3])),
                    type='repeat_region',
                    strand=1,
                    qualifiers=OrderedDict({
                        'note': f'prophage region pp{i} potential attachment sites'
                    })))

    if self.keep_dropped_predictions:
        for i in self.droppedpp:
            self.record.get_entry(self.droppedpp[i]['contig']).append_feature(
                SeqFeature(
                    location=FeatureLocation(self.droppedpp[i]['start'] - 1,
                                             self.droppedpp[i]['stop'] - 1),
                    type=prophage_feature_type,
                    strand=-1,
                    qualifiers=OrderedDict({
                        # BUGFIX: the two halves used to concatenate as
                        # "...but notkept because..." — space added.
                        'note': f'Putative prophage region identified with PhiSpy v{version.__version__} but not ' +
                                f'kept because: {self.droppedpp[i]["dropped_reason"]}'
                    })))

    """
    now we need to decide which files to keep
    It is based on this code:

    Code | File
    --- | ---
    1 | prophage_coordinates.tsv
    2 | GenBank format output
    4 | prophage and bacterial sequences
    8 | prophage_information.tsv
    16 | prophage.tsv
    32 | GFF3 format of the prophages
    64 | prophage.tbl
    128 | test data used in the random forest
    256 | GFF3 format of the genomes

    As explained in the README.
    """

    # Decode the bit mask from the highest bit down; each matched bit is
    # subtracted so the remaining value tests the lower bits correctly.
    oc = self.output_choice
    if oc >= 256:
        # write the genomic GFF3 format
        genome_gff3(self)
        oc -= 256
    if oc >= 128:
        # write the calculated data
        write_test_data(self)
        oc -= 128
    if oc >= 64:
        # write the prophage location table
        write_prophage_tbl(self)
        oc -= 64
    if oc >= 32:
        # write the prophage in GFF3 format
        write_gff3(self)
        oc -= 32
    if oc >= 16:
        # write a tsv file of this data
        write_prophage_tsv(self)
        oc -= 16
    if oc >= 8:
        # write prophage_information.tsv
        write_prophage_information(self)
        oc -= 8
    if oc >= 4:
        # separate out the bacteria and phage as fasta files
        write_phage_and_bact(self)
        oc -= 4
    if oc >= 2:
        # update input GenBank file and incorporate prophage regions
        write_genbank(self)
        oc -= 2
    if oc >= 1:
        # print the prophage coordinates:
        write_prophage_coordinates(self)
def scan_orfs(seq: str, direction: int, offset: int = 0) -> List[FeatureLocation]:
    """ Scan for open reading frames on a given sequence.
        Skips all ORFs with a size less than 60 bases.

        The sequence is scanned in all three frames. `direction` only sets
        the strand recorded on the results and whether coordinates are
        converted back to forward-strand positions; the scan itself always
        walks `seq` left to right.

        Arguments:
            seq: the sequence to examine
            direction: the search direction to use (all ORFs will use this as the strand)
            offset: an offset to add to any location discovered

        Returns:
            a list of FeatureLocations for each ORF, ordered by ascending position
    """
    seq = seq.upper()
    start_codons = ('ATG', 'GTG', 'TTG')
    stop_codons = ('TAA', 'TAG', 'TGA')
    matches = []
    # cache the sequence length
    seq_len = len(seq)
    for frame in [0, 1, 2]:
        i = frame
        # last_stop doubles as a flag: 0 means "no stop seen yet in this frame"
        last_stop = 0
        while i < seq_len - 2:
            if seq[i:i + 3] in stop_codons and last_stop == 0:
                # special case for unstarted stops: an ORF running in from
                # before the sequence start, hence the BeforePosition
                last_stop = i
                new_orf = FeatureLocation(BeforePosition(offset),
                                          offset + i + 2 + 1, direction)
                if direction == -1:
                    # mirror the coordinates back onto the forward strand
                    start = AfterPosition(seq_len + offset - new_orf.start)
                    end = seq_len + offset - new_orf.end
                    new_orf = FeatureLocation(end, start, strand=direction)
                matches.append(new_orf)
            if seq[i:i + 3] not in start_codons:
                i += 3
                continue
            # Look for the next stop codon in this frame
            for j in range(i, seq_len - 2, 3):
                if seq[j:j + 3] in stop_codons:
                    last_stop = j
                    # Skip Orfs that are shorter than 20 AA / 60 bases
                    if j - i <= 60:
                        break  # since no ORFs will be bigger before the stop
                    start = i
                    end = j + 2 + 1
                    if direction == 1:
                        new_orf = FeatureLocation(offset + start,
                                                  offset + end, direction)
                    else:
                        # reversed, so convert back to the forward positions
                        new_orf = FeatureLocation(seq_len + offset - end,
                                                  seq_len + offset - start,
                                                  direction)
                    matches.append(new_orf)
                    # This was a good hit, update the last_stop cache.
                    break
            # if we found a matching stop, carry on looking for starts after this stop
            if last_stop > i:
                i = last_stop
                continue
            # Save orfs ending at the end of the sequence without stop codon
            if direction == 1:
                new_orf = FeatureLocation(i + offset,
                                          AfterPosition(seq_len + offset),
                                          direction)
            else:
                # reversed, so convert back to the forward positions
                new_orf = FeatureLocation(BeforePosition(offset),
                                          offset + seq_len - i, direction)
            matches.append(new_orf)
            # since there are no stop codons, just stop here
            break
    # order by the lower coordinate regardless of strand
    return sorted(matches, key=lambda x: min(x.start, x.end))
def check_genomewide(refseq, VERBOSE=0):
    '''Check the integrity of all genes in the genomewide consensus

    Python 2 code. Returns True when every gene passes (possibly with
    printed warnings about shifted gene ends), False on the first failure.
    Checks single-exon genes, then vif (which may run long), then the
    two-exon genes tat and rev.
    '''
    # Check single-exon genes; each gene has its own length tolerance
    # (in nucleotides) versus the HXB2 reference.
    length_tolerance = {'gag': 30, 'pol': 30, 'env': 70, 'vpr': 15, 'vpu': 15}
    for genename, tol in length_tolerance.iteritems():
        (start, end, start_found, end_found) = locate_gene(refseq, genename,
                                                           VERBOSE=VERBOSE)
        if (not start_found) or (not end_found):
            print 'ERROR: '+genename+' not found in genomewide!'
            return False
        elif VERBOSE >= 3:
            print 'OK: start and end of '+genename+' found'

        gene_HXB2 = get_gene_HXB2(genename)
        check = check_has_similar_length(end - start, len(gene_HXB2),
                                         genename, VERBOSE=VERBOSE,
                                         maxdiff=tol)
        if not check:
            return False

        geneseq = refseq[start: end]
        gene = geneseq.seq
        check = check_has_complete_codons(gene, genename, VERBOSE=VERBOSE)
        if not check:
            # sometimes the gene ends a few nucleotides upstream, and there is a
            # frameshift mutation that screws up
            # Re-translate from the gene start to the first stop and accept
            # an end up to 90 nt upstream of the located end.
            gene_new = refseq.seq[start:]
            gene_new = gene_new[:len(gene_new) - (len(gene_new) % 3)]
            prot_new = gene_new.translate()
            end_new = prot_new.find('*')
            end_diff = start + (3 * end_new + 3) - end
            if -90 < end_diff < 0:
                print genename.upper()+' ENDS '+str((end - start) // 3 - end_new - 1)+' AMINO ACIDS UPSTREAM!'
                gene = gene_new[:3 * (end_new + 1)]
            else:
                return False

        prot = gene.translate()
        check = check_start_aminoacid(prot, genename, VERBOSE=VERBOSE)
        if (not check):
            # vpu is allowed to have a defective start codon
            if genename != 'vpu':
                return False
            else:
                print 'ERROR IN VPU STARTING CODON, CONTINUING!'

        check = check_has_end(prot, genename, VERBOSE=VERBOSE)
        if not check:
            # sometimes a gene is a bit longer; accept an end up to 90 nt
            # away in either direction
            gene_new = refseq.seq[start:]
            gene_new = gene_new[:len(gene_new) - (len(gene_new) % 3)]
            prot_new = gene_new.translate()
            end_new = prot_new.find('*')
            end_diff = start + (3 * end_new + 3) - end
            if -90 < end_diff < 0:
                print genename.upper()+' ENDS '+str((end - start) // 3 - end_new - 1)+' AMINO ACIDS UPSTREAM!'
                gene = gene_new[:3 * (end_new + 1)]
                prot = gene.translate()
            elif 0 < end_diff < 90:
                print genename.upper()+' ENDS '+str(end_new + 1 - (end - start) // 3)+' AMINO ACIDS DOWNSTREAM!'
                gene = gene_new[:3 * (end_new + 1)]
                prot = gene.translate()
            else:
                return False

        check = check_has_premature_stops(prot, genename, VERBOSE=VERBOSE)
        if not check:
            return False

    # Vif is special because it can be longer than in HXB2
    genename = 'vif'
    (start, end, start_found, end_found) = locate_gene(refseq, genename,
                                                       VERBOSE=VERBOSE)
    if (not start_found) or (not end_found):
        print 'ERROR: '+genename+' not found in genomewide!'
        return False
    elif VERBOSE >= 3:
        print 'OK: start and end of '+genename+' found'

    gene_HXB2 = get_gene_HXB2(genename)
    check = check_has_similar_length(end - start, len(gene_HXB2), genename,
                                     VERBOSE=VERBOSE, maxdiff=15)
    if not check:
        return False

    geneseq = refseq[start: end]
    gene = geneseq.seq
    check = check_has_complete_codons(gene, genename, VERBOSE=VERBOSE)
    if not check:
        return False

    prot = gene.translate()
    check = check_start_aminoacid(prot, genename, VERBOSE=VERBOSE)
    if not check:
        return False

    check = check_has_end(prot, genename, VERBOSE=0)
    if not check:
        # Vif tends to be a bit longer than in HXB2: try extending by
        # 1-3 codons until a stop appears.
        for nc in xrange(1, 4):
            gene_ext = refseq[start: end + 3 * nc].seq
            prot_ext = gene_ext.translate()
            check = check_has_end(prot_ext, genename, VERBOSE=0)
            if check:
                gene = gene_ext
                prot = prot_ext
                if VERBOSE:
                    print 'WARNING: '+genename+' actually ends '+str(nc)+' codons downstream'
                break
        else:
            # for/else: no extension produced a stop codon
            print 'ERROR: '+genename+' does not end, not even slightly downstream'
            return False

    check = check_has_premature_stops(prot, genename, VERBOSE=VERBOSE)
    if not check:
        return False

    # Check 2-exon genes
    for genename_whole in ('tat', 'rev'):
        # exon 1
        genename = genename_whole+'1'
        (start, end, start_found, end_found) = locate_gene(refseq, genename,
                                                           VERBOSE=VERBOSE)
        if (not start_found) or (not end_found):
            print 'ERROR: '+genename+' not found in genomewide!'
            return False
        elif VERBOSE >= 3:
            print 'OK: start and end of '+genename+' found'

        gene_HXB2 = get_gene_HXB2(genename)
        check = check_has_similar_length(end - start, len(gene_HXB2),
                                         genename, VERBOSE=VERBOSE,
                                         maxdiff=15)
        if not check:
            return False

        geneseq = refseq[start: end]
        geneseq = geneseq[:len(geneseq) - len(geneseq) % 3]
        gene = geneseq.seq
        prot = gene.translate()
        check = check_start_aminoacid(prot, genename, VERBOSE=VERBOSE)
        if not check:
            return False

        start_exon1 = start
        end_exon1 = end

        # exon 2: search downstream of exon 1 (skip the 2 kb intron region),
        # then shift the hit back into genomewide coordinates
        genename = genename_whole+'2'
        (start, end, start_found, end_found) = locate_gene(
            refseq[end_exon1 + 2000:], genename, VERBOSE=VERBOSE)
        if (not start_found) or (not end_found):
            print 'ERROR: '+genename+' not found in genomewide!'
            return False
        elif VERBOSE >= 3:
            print 'OK: start and end of '+genename+' found'

        start += end_exon1 + 2000
        end += end_exon1 + 2000

        # NOTE: rev2 overlaps with env gp41 and can have insertions or deletions
        if genename == 'rev2':
            tol = 45
        else:
            tol = 15
        gene_HXB2 = get_gene_HXB2(genename)
        check = check_has_similar_length(end - start, len(gene_HXB2),
                                         genename, VERBOSE=VERBOSE,
                                         maxdiff=tol)
        if not check:
            return False

        geneseq = refseq[start: end]
        frame = get_frame(geneseq, gene_HXB2, genename, VERBOSE=VERBOSE)
        geneseq = geneseq[frame:]
        gene = geneseq.seq
        prot = gene.translate()

        check = check_has_end(prot, genename, VERBOSE=VERBOSE)
        if not check:
            if genename != 'rev2':
                return False
            else:
                # rev2 can end a bit early
                end_new = prot.rfind('*')
                if end_new != -1:
                    if len(prot) - 1 - end_new < 20:
                        print 'REV2 ENDS '+str(len(prot) - end_new - 1)+' AMINO ACIDS UPSTREAM!'
                        prot = prot[:end_new + 1]
                        end = start + frame + 3 * (end_new + 1)
                    else:
                        return False
                else:
                    # rev2 can also end quite a bit late
                    gene_new = refseq.seq[start:]
                    gene_new = gene_new[(end - start) % 3:]
                    gene_new = gene_new[:len(gene_new) - (len(gene_new) % 3)]
                    prot_new = gene_new.translate()
                    end_new = prot_new.find('*')
                    if (start + 3 * end_new) - end < 200:
                        print 'REV2 ENDS '+str(end_new - len(prot) + 1)+' AMINO ACIDS DOWNSTREAM!'
                        prot = prot_new[:end_new + 1]
                        end = start + ((end - start) % 3) + 3 * (end_new + 1)
                    else:
                        return False

        check = check_has_premature_stops(prot, genename, VERBOSE=VERBOSE)
        if not check:
            return False

        start_exon2 = start
        end_exon2 = end

        # whole gene: splice the two exons together and recheck
        genename = genename_whole
        gene_HXB2 = get_gene_HXB2(genename)

        from Bio.SeqFeature import FeatureLocation
        gene_loc = FeatureLocation(start_exon1, end_exon1, strand=+1) + \
            FeatureLocation(start_exon2, end_exon2, strand=+1)
        geneseq = gene_loc.extract(refseq)
        gene = geneseq.seq
        check = check_has_complete_codons(gene, genename, VERBOSE=VERBOSE)
        if not check:
            return False
        prot = gene.translate()
        check = check_start_aminoacid(prot, genename, VERBOSE=VERBOSE)
        if not check:
            return False
        check = check_has_end(prot, genename, VERBOSE=VERBOSE)
        if not check:
            return False
        check = check_has_premature_stops(prot, genename, VERBOSE=VERBOSE)
        if not check:
            return False

    return True
def kbaseGenomeToGenbank(genome_object, taxid=None):
    '''Convert a KBase genome object into a Genbank file incorporating as much
    info as we can as found in the NCBI genbank files.

    The Genbank output is written to stdout via SeqIO.write; the function
    returns None.

    Note - the genome object (not to be confused with a ModelSEED
    "annotation" object) has both annotations / translations AND the DNA
    sequence. It's obtained by calling annotate_genome on an object that only
    has the DNA sequence. Hopefully they won't change this otherwise I'll
    have to do more cross-referencing and ask for two files. Sigh...'''

    organism_name = genome_object["scientific_name"]
    # NOTE(review): organism_domain and organism_genetic_code are read but
    # never used below.
    organism_domain = genome_object["domain"]
    organism_id = genome_object["id"]
    organism_genetic_code = genome_object["genetic_code"]

    # Get the TaxID
    # If none is specified the user has to provide one (or at least some
    # unique integer, not necessarily a tax ID) for this system to work right.
    if taxid is None:
        # CDMI.py is from the KBase - we need it to get the Taxon ID
        # Download it at http://kbase.science.energy.gov/developer-zone/downloads/
        try:
            from CDMI import CDMI_EntityAPI
        except ImportError:
            sys.stderr.write(
                "ERROR: If no TaxID is provided, the CDMI.py file is necessary (http://kbase.science.energy.gov/developer-zone/downloads/) to attempt to guess it.\n"
            )
            exit(2)
        URL = "https://www.kbase.us/services/cdmi_api/"
        cdmi_entity = CDMI_EntityAPI(URL)
        reldict = cdmi_entity.get_relationship_IsInTaxa(
            organism_id, [], [], ["id"])
        if reldict is None:
            sys.stderr.write(
                "ERROR: TaxID for Organism ID %s not found in the KBase CDMI. You will need to specify it manually if you want it\n"
                % (organism_id))
            exit(2)
        else:
            taxidlist = getFieldFromRelationship(reldict, "id", "to")
            taxid = taxidlist[0]

    annotations = {'source': organism_name, 'organism': organism_name}

    # Specify contig data and "source" features for each contig (required by
    # the genbank standard)
    contig_to_sequence = {}
    contig_to_feature_data = {}
    for contig in genome_object["contigs"]:
        contig_to_sequence[contig["id"]] = contig["dna"]
        qualifiers = {}
        qualifiers["organism"] = organism_name
        qualifiers["mol_type"] = "Genomic DNA"
        if taxid is not None:
            qualifiers["db_xref"] = "taxon:%s" % (taxid)
        feature = SeqFeature(FeatureLocation(0, len(contig["dna"])),
                             strand=1,
                             type="source",
                             qualifiers=qualifiers)
        contig_to_feature_data[contig["id"]] = [feature]

    # The contig references are inside the feature definitions in the Genome
    # object file, but in a genbank file the features in a contig must all be
    # separated. Therefore I have to keep track of them in one step and then
    # create the SeqRecord objects in a separate step.
    for feature in genome_object["features"]:
        # FIXME - What do I do with things that have more than one location?
        assert (len(feature["location"]) == 1)

        # First lets Deal with start and stop locations...
        # I verified against Pubseed that these semantics and calcualtions
        # are correct, at least for the proteins I checked that are the same
        # between pubseed and KBase...
        # KBase locations are (contig, start, strand, length), with start on
        # the strand's 5' end, 1-based.
        loc = feature["location"][0]
        contig = loc[0]
        start = int(loc[1])
        strandstr = loc[2]
        if strandstr == "-":
            strand = -1
        else:
            strand = 1
        featurelen = loc[3]
        if strand == -1:
            stop = start - featurelen + 1
        else:
            stop = start + featurelen - 1
        # Now I need to convert these into Python slicing indexes...because
        # that is what FeatureLocation wants. This includes making the start
        # always less than stop and offsetting the stop by 1 because slide
        # [a,b] only goes up to position b-1
        seqstart = min(start, stop) - 1
        seqstop = max(start, stop)

        feature_id = feature["id"]
        feature_type = feature["type"]

        qualifiers = {}
        # Unfortunately there are features including proteins in the genome
        # objects that have no function (not even "hypothetical protein")
        # Thankfully this isn't a required field in the Genbank file
        if "function" in feature:
            qualifiers["product"] = strip_control_characters(
                feature["function"])

        if feature_type == "CDS" or feature_type == "peg":
            qualifiers["protein_id"] = feature_id
            qualifiers["translation"] = feature["protein_translation"]

        # "RNA" is not an official type in a GENBANK file.
        # We attempt to figure out based on the annotation whether it is a
        # tRNA, rRNA, or other (misc_RNA) RNA. These are the offiial RNA
        # types (aside from mRNA but those don't have special fields in the
        # Genome object)
        if feature_type == "rna":
            rRNA_finders = [
                "rRNA", "ribosomal", "5S", "16S", "23S", "5.8S", "28S", "18S"
            ]
            tRNA_finders = ["tRNA", "transfer"]
            for finder in rRNA_finders:
                if finder in feature["function"]:
                    feature_type = "rRNA"
            for finder in tRNA_finders:
                if finder in feature["function"]:
                    feature_type = "tRNA"
            if feature_type == "rna":
                feature_type = "misc_RNA"

        # I checked that the above formulas give the correct positions in the
        # genbank file (or at least, the same as the PubSEED genabnk files).
        feature = SeqFeature(FeatureLocation(seqstart, seqstop),
                             strand=strand,
                             type=feature_type,
                             id=feature_id,
                             qualifiers=qualifiers)

        # Attach the new features to the appropriate contig...
        if contig in contig_to_feature_data:
            contig_to_feature_data[contig].append(feature)
        else:
            contig_to_feature_data[contig] = [feature]

    # Create one record for each contig
    records = []
    for contig in contig_to_feature_data:
        seq = Seq(contig_to_sequence[contig], IUPAC.ambiguous_dna)
        record = SeqRecord(seq,
                           id=sanitizeString(contig, False),
                           description="%s contig %s" % (organism_name, contig),
                           name=contig,
                           features=contig_to_feature_data[contig],
                           annotations=annotations)
        records.append(record)
    SeqIO.write(records, sys.stdout, "genbank")

    return
def write_data_to_seq_record(pksnrpsvars, seq_record, options):
    """Write NRPS/PKS substrate-specificity predictions back into seq_record.

    For every NRPS/PKS core gene, each 'sec_met' qualifier describing a domain is
    parsed, a new domain SeqFeature is created with prediction qualifiers, and the
    gene's 'sec_met' list is rewritten with the predictions appended.  Finally a
    monomers/structure note is attached to each cluster feature.

    Parameters:
        pksnrpsvars -- container holding core genes and all per-domain prediction
                       dicts (sandpuma_*, minowa_*, consensuspreds, ...)
        seq_record  -- Biopython SeqRecord being annotated (mutated in place)
        options     -- antiSMASH options (feature/qualifier tag names)
    """
    #Save substrate specificity predictions in NRPS/PKS domain sec_met info of seq_record
    #
    # Workaround to extract positional information for CDS_motifs from the sec_met qualifiers
    # NOTE(review): this loop keeps only the LAST cluster's qualifiers in
    # cluster_info — presumably fine for single-cluster records; verify for
    # multi-cluster inputs.
    for f in utils.get_cluster_features(seq_record):
        cluster_info = f.qualifiers

    for feature in pksnrpsvars.pksnrpscoregenes:
        # Per-gene counters for numbering each domain type (A1, A2, AT1, ...).
        nrat = 0
        nra = 0
        nrcal = 0
        nrkr = 0
        nrXdom = 0
        secmetqualifiers = feature.qualifiers['sec_met']
        updated_secmetqualifiers = []
        # Parallel list carrying the more detailed prediction strings (BiosynML).
        updated_secmetqualifiers_predictions = []
        domainFeatures = []
        gene_id = utils.get_gene_id(feature)
        for qualifier in secmetqualifiers:
            if "NRPS/PKS Domain:" not in qualifier:
                # Non-domain qualifiers are passed through unchanged.
                updated_secmetqualifiers.append(qualifier)
                updated_secmetqualifiers_predictions.append(qualifier)
            else:
                # extract domain type, start and end position from qualifier string
                match_pos_obj = re.search("NRPS/PKS Domain: ([\w-]+) \((\d+)\-(\d+)\)\. E-value: ([\de\.-]+)\. Score: ([\de\.a-]+);", qualifier)
                if not match_pos_obj:
                    logging.exception("Exception: could not extract domain string from qualifier %s:" % qualifier)
                    sys.exit(1)
                domain_type = match_pos_obj.group(1)
                start_aa = int(match_pos_obj.group(2))
                end_aa = int(match_pos_obj.group(3))
                evalue = float(match_pos_obj.group(4))
                score = float(match_pos_obj.group(5))
                #calculate respective positions based on aa coordinates
                # (amino-acid offsets * 3 -> nucleotide coordinates, strand-aware)
                if feature.location.strand == 1:
                    start = feature.location.start + (3 * start_aa)
                    end = feature.location.start + (3 * end_aa)
                else:
                    end = feature.location.end - (3 * start_aa)
                    start = feature.location.end - (3 * end_aa)
                loc = FeatureLocation(start, end, strand=feature.strand)
                # set up new CDS_motif feature
                domainFeature = SeqFeature(loc, type=options.FeatureTags.pksnrpsdomains_tag)
                domainFeature.qualifiers['domain'] = [domain_type]
                if feature.qualifiers.has_key('locus_tag'):
                    domainFeature.qualifiers['locus_tag'] = feature.qualifiers['locus_tag']
                else:
                    domainFeature.qualifiers['locus_tag'] = [gene_id]
                domainFeature.qualifiers['detection'] = ["hmmscan"]
                domainFeature.qualifiers['database'] = ["nrpspksdomains.hmm"]
                domainFeature.qualifiers['evalue'] = [str("{:.2E}".format(float(evalue)))]
                domainFeature.qualifiers['score'] = [score]
                # Translate the domain's nucleotide span with the gene's table
                # (default table 1 when no transl_table qualifier is present).
                if feature.qualifiers.has_key('transl_table'):
                    [transl_table] = feature.qualifiers['transl_table']
                else:
                    transl_table = 1
                domainFeature.qualifiers['translation'] = [str(domainFeature.extract(seq_record).seq.translate(table=transl_table))]

                domainFeature_specificity = []
                if domain_type == "AMP-binding":
                    # Adenylation domain: collect all SANDPUMA method calls.
                    nra += 1
                    domainname = gene_id + "_A" + str(nra)
                    domainFeature.qualifiers['label'] = [domainname]
                    domainFeature.qualifiers['asDomain_id'] = ["nrpspksdomains_"+domainname]
                    predicat_prediction = "{}-{}".format(pksnrpsvars.sandpuma_allmethods[domainname]['prediCAT_MP'], resolve_predicat_domain_specificity(pksnrpsvars.sandpuma_allmethods[domainname]['prediCAT_MP']))
                    domainFeature_specificity.append("Stachelhaus code: %s" % pksnrpsvars.sandpuma_allmethods[domainname]['ASM'])
                    domainFeature_specificity.append("NRPSpredictor3 SVM: %s" % pksnrpsvars.sandpuma_allmethods[domainname]['SVM'])
                    domainFeature_specificity.append("pHMM: %s" % pksnrpsvars.sandpuma_allmethods[domainname]['pHMM'])
                    domainFeature_specificity.append("PrediCAT %s" % predicat_prediction)
                    domainFeature_specificity.append("SANDPUMA ensemble: %s" % pksnrpsvars.sandpuma_res[domainname])
                    domainFeature_specificity.append("PID to NN: %s" % pksnrpsvars.sandpuma_pid[domainname])
                    domainFeature_specificity.append("SNN score: %s" % pksnrpsvars.sandpuma_snn[domainname])
                    newqualifier = qualifier + " NRPS/PKS Domain: %s; Substrate specificity predictions: "\
                                   "%s (Stachelhaus code), %s (NRPSPredictor3 SVM), %s (pHMM), "\
                                   "%s (PrediCAT), %s (SANDPUMA ensemble); PID to NN: %s; "\
                                   "SNN score: %s ." % \
                                   (domainname, pksnrpsvars.sandpuma_allmethods[domainname]['ASM'], pksnrpsvars.sandpuma_allmethods[domainname]['SVM'], pksnrpsvars.sandpuma_allmethods[domainname]['pHMM'], predicat_prediction, pksnrpsvars.sandpuma_res[domainname], pksnrpsvars.sandpuma_pid[domainname], pksnrpsvars.sandpuma_snn[domainname])
                    # BiosynML: appending substrate prediction data into 'newqualifier_detailed'
                    # NOTE(review): "SANPDPUMA" below looks like a typo for
                    # "SANDPUMA" in an output string; left unchanged here.
                    newqualifier_detailed = qualifier + " NRPS/PKS Domain: %s; Substrate specificity predictions: %s (NRPSPredictor3 SVM), %s (Stachelhaus code), %s (PrediCAT), %s (SANPDPUMA);" % \
                                            (domainname, pksnrpsvars.sandpuma_allmethods[domainname]['SVM'], pksnrpsvars.sandpuma_allmethods[domainname]['ASM'], predicat_prediction, pksnrpsvars.sandpuma_res[domainname])
                    updated_secmetqualifiers.append(newqualifier)
                    updated_secmetqualifiers_predictions.append(newqualifier_detailed)
                elif domain_type == "PKS_AT":
                    # Acyltransferase domain: PKS signature + Minowa + consensus.
                    nrat += 1
                    domainname = gene_id + "_AT" + str(nrat)
                    domainFeature.qualifiers['label'] = [domainname]
                    domainFeature.qualifiers['asDomain_id'] = ["nrpspksdomains_"+domainname]
                    domainFeature_specificity.append("PKS signature: %s" % pksnrpsvars.pks_code_preds[domainname])
                    domainFeature_specificity.append("Minowa: %s" % pksnrpsvars.minowa_pks_preds[domainname])
                    #For t1pks, t2pks and t3pks
                    if 'transatpks' not in cluster_info['product'][0]:
                        domainFeature_specificity.append("consensus: %s" % pksnrpsvars.consensuspreds[domainname])
                        newqualifier = qualifier + " Substrate specificity predictions: %s (PKS signature), %s (Minowa), %s (consensus);" %(pksnrpsvars.pks_code_preds[domainname], pksnrpsvars.minowa_pks_preds[domainname], pksnrpsvars.consensuspreds[domainname])
                        # BiosynML: appending substrate prediction data into 'newqualifier_detailed'
                        newqualifier_detailed = qualifier + " Substrate specificity predictions: %s (PKS signature), %s (Minowa), %s (consensus);" %(pksnrpsvars.pks_code_preds_details[domainname], pksnrpsvars.minowa_pks_preds_details[domainname], pksnrpsvars.consensuspreds[domainname])
                        updated_secmetqualifiers.append(newqualifier)
                        updated_secmetqualifiers_predictions.append(newqualifier_detailed)
                    #For transatpks
                    elif 'transatpks' in cluster_info['product'][0]:
                        domainFeature_specificity.append("consensus: %s" % pksnrpsvars.consensuspreds_transat[domainname])
                        newqualifier = qualifier + " Substrate specificity predictions: %s (PKS signature), %s (Minowa), %s (consensus);" %(pksnrpsvars.pks_code_preds[domainname], pksnrpsvars.minowa_pks_preds[domainname], pksnrpsvars.consensuspreds_transat[domainname])
                        # BiosynML: appending substrate prediction data into 'newqualifier_detailed'
                        newqualifier_detailed = qualifier + " Substrate specificity predictions: %s (PKS signature), %s (Minowa), %s (consensus);" %(pksnrpsvars.pks_code_preds_details[domainname], pksnrpsvars.minowa_pks_preds_details[domainname], pksnrpsvars.consensuspreds_transat[domainname])
                        updated_secmetqualifiers.append(newqualifier)
                        updated_secmetqualifiers_predictions.append(newqualifier_detailed)
                elif domain_type == "CAL_domain":
                    # Co-enzyme A ligase domain: Minowa prediction only.
                    nrcal += 1
                    domainname = gene_id + "_CAL" + str(nrcal)
                    domainFeature.qualifiers['label'] = [domainname]
                    domainFeature.qualifiers['asDomain_id'] = ["nrpspksdomains_"+domainname]
                    domainFeature_specificity.append("Minowa: %s" % pksnrpsvars.minowa_cal_preds[domainname])
                    newqualifier = qualifier + " Substrate specificity predictions: %s (Minowa);" %(pksnrpsvars.minowa_cal_preds[domainname])
                    # BiosynML: appending substrate prediction data into 'newqualifier_detailed'
                    newqualifier_detailed = qualifier + " Substrate specificity predictions: %s (Minowa);" %(pksnrpsvars.minowa_cal_preds_details[domainname])
                    updated_secmetqualifiers.append(newqualifier)
                    updated_secmetqualifiers_predictions.append(newqualifier_detailed)
                elif domain_type == "PKS_KR":
                    # Ketoreductase domain: activity + stereochemistry predictions.
                    nrkr += 1
                    domainname = gene_id + "_KR" + str(nrkr)
                    domainFeature.qualifiers['label'] = [domainname]
                    domainFeature.qualifiers['asDomain_id'] = ["nrpspksdomains_"+domainname]
                    domainFeature_specificity.append("KR activity: %s" % pksnrpsvars.kr_activity_preds[domainname])
                    domainFeature_specificity.append("KR stereochemistry: %s" % pksnrpsvars.kr_stereo_preds[domainname])
                    newqualifier = qualifier + " Predicted KR activity: %s; Predicted KR stereochemistry: %s;" %(pksnrpsvars.kr_activity_preds[domainname], pksnrpsvars.kr_stereo_preds[domainname])
                    # BiosynML: appending substrate prediction data into 'newqualifier_detailed'
                    newqualifier_detailed = qualifier + " Predicted KR activity: %s; Predicted KR stereochemistry: %s;" %(pksnrpsvars.kr_activity_preds[domainname], pksnrpsvars.kr_stereo_preds[domainname])
                    updated_secmetqualifiers.append(newqualifier)
                    updated_secmetqualifiers_predictions.append(newqualifier_detailed)
                else:
                    # Any other domain type: numbered generic "Xdom" feature,
                    # qualifier passed through without predictions.
                    nrXdom += 1
                    domainFeature.qualifiers['asDomain_id'] = ["nrpspksdomains_" + gene_id.partition(".")[0] + "_Xdom"+'{:02d}'.format(nrXdom)]
                    updated_secmetqualifiers.append(qualifier)
                domainFeature.qualifiers['specificity'] = domainFeature_specificity
                # Subtype mapping (e.g. specialised domain names -> canonical one).
                if _map_domaintype(domain_type):
                    domainFeature.qualifiers['domain_subtype'] = [domain_type]
                    domainFeature.qualifiers['domain'] = [_map_domaintype(domain_type)]
                domainFeatures.append(domainFeature)

        feature.qualifiers['sec_met'] = updated_secmetqualifiers
        # BiosynML: creating new 'sec_met_predictions' qualifier
        seq_record.features.extend(domainFeatures)

        if pksnrpsvars.consensuspred_gene_dict.has_key(gene_id):
            feature.qualifiers[options.QualifierTags.product_prediction] = "-".join(pksnrpsvars.consensuspred_gene_dict[gene_id])

    #Save consensus structure + link to structure image to seq_record
    clusters = utils.get_cluster_features(seq_record)
    for cluster in clusters:
        clusternr = utils.get_cluster_number(cluster)
        if pksnrpsvars.compound_pred_dict.has_key(clusternr):
            structpred = pksnrpsvars.compound_pred_dict[clusternr]
            cluster.qualifiers['note'].append("Monomers prediction: " + structpred)
            cluster.qualifiers['note'].append("Structure image: structures/genecluster%s.png" % clusternr)
# Build one annotated SeqRecord per input sequence, then emit them all to a
# single GenBank file.
for sequence in sequences:
    seq = Seq(sequence, IUPAC.unambiguous_dna)  # unambiguous_dna = only ACGT
    seqRecord = SeqRecord(
        seq,
        id='123',  # sequence ID
        name='EXAMPLE',  # Unique name (no blank spaces)
        description='Annotation for the sequence',  # annotation of the enzyme
        annotations={
            "molecule_type": "cDNA",
            "date": "25-MAR-2019"
        })

    qualifiers = {
        "EC_number": "9.9.9.9",  # enzyme EC
        "info": "value"  # other data
    }

    # One whole-sequence feature carries the qualifiers for this record.
    feature = SeqFeature(FeatureLocation(start=0, end=len(sequence)),
                         type='cDNA',
                         location_operator='',
                         strand=0,
                         id=None,
                         qualifiers=qualifiers,
                         sub_features=None,
                         ref=None,
                         ref_db=None)
    seqRecord.features.append(feature)
    listOfseqrecords.append(seqRecord)

# FIX: the output handle was opened but never closed or flushed, so the file
# could end up truncated.  Use a context manager and one SeqIO.write call over
# the whole record list (SeqIO.write accepts an iterable of SeqRecords), which
# produces the same GenBank output as writing records one at a time.
with open('example.gb', 'w') as outputFile:
    SeqIO.write(listOfseqrecords, outputFile, 'genbank')
def run_module(self):
    """Design and visualise qPCR primers for circRNAs.

    Pipeline: read circRNA IDs / DCC detection output, extract the exons
    flanking each back-splice junction (BSJ), run the primer-design R wrapper,
    optionally BLAST the primer candidates, write an HTML report, and draw one
    circular SVG diagram per primer pair.

    Side effects: writes temp files into self.temp_dir, an HTML report and SVG
    files into self.output_dir; calls exit(-1) on unusable input/directories.
    """
    # An ID *file* may be passed as the single element of id_list; inline it.
    if self.id_list and os.access(self.id_list[0], os.R_OK):
        print("Detected supplied circRNA ID file.")
        with open(self.id_list[0]) as f:
            lines = f.read().splitlines()
        self.id_list = lines

    # let's first check if the temporary directory exists
    if not (os.access(self.temp_dir, os.W_OK)):
        print("Temporary directory %s not writable." % self.temp_dir)
        # exit with -1 error if we can't use it
        exit(-1)

    # same check for the output directory
    if not (os.access(self.output_dir, os.W_OK)):
        print("Output directory %s not writable." % self.output_dir)
        # exit with -1 error if we can't use it
        exit(-1)

    # validate the qPCR product range
    if self.product_range and len(self.product_range) != 2:
        print(
            "Please specify a qPCR product range as range, e.g. \"-p 140 150\"."
        )
        # exit with -1 error if we can't use it
        exit(-1)

    # NOTE(review): this indexes product_range unconditionally — if
    # product_range can be None/empty this raises; confirm the CLI always
    # supplies two values.
    if self.product_range[1] < self.product_range[0]:
        print("qPCR product range has to be > 0.")
        # exit with -1 error if we can't use it
        exit(-1)

    circ_rna_number = 0

    # define temporary files
    exon_storage_tmp = self.temp_dir + "circtools_flanking_exons.tmp"
    blast_storage_tmp = self.temp_dir + "circtools_blast_results.tmp"
    blast_xml_tmp = self.temp_dir + "circtools_blast_results.xml"
    output_html_file = self.output_dir + self.experiment_title.replace(
        " ", "_") + ".html"

    # erase old contents
    open(exon_storage_tmp, 'w').close()

    # define cache dicts
    exon_cache = {}
    flanking_exon_cache = {}
    primer_to_circ_cache = {}

    if self.input_circRNA:
        # FASTA input: coordinates are unknown, fake them as name_0_0_len_0.
        from Bio import SeqIO
        with open(exon_storage_tmp, 'a') as data_store:
            for record in SeqIO.parse(self.input_circRNA, "fasta"):
                # from the FASTA file we cannot tell the coordinates of the circRNA
                name = str(record.id) + "_0_0_" + str(len(
                    record.seq)) + "_0"
                data_store.write("\t".join(
                    [name, str(record.seq), "", "\n"]))
                exon_cache[name] = {1: str(record.seq), 2: ""}
    else:
        # DCC input: intersect each circRNA with annotated exons to find the
        # exons forming the back-splice junction.
        exons = self.read_annotation_file(self.gtf_file, entity="exon")
        with open(self.dcc_file) as fp:
            for line in fp:
                # make sure we remove the header
                if line.startswith('Chr\t'):
                    continue
                line = line.rstrip()
                current_line = line.split('\t')
                if current_line[3] == "not_annotated":
                    continue
                if self.gene_list and not self.id_list and current_line[
                        3] not in self.gene_list:
                    continue
                sep = "_"
                name = sep.join([
                    current_line[3], current_line[0], current_line[1],
                    current_line[2], current_line[5]
                ])
                if self.id_list and not self.gene_list and name not in self.id_list:
                    continue
                circrna_length = int(current_line[2]) - int(
                    current_line[1])
                if self.product_range[
                        0] > circrna_length or self.product_range[
                            1] > circrna_length:
                    print(
                        "Specified qPCR product size to large for circRNA \"%s\".\nCircRNA length:"
                        " %d, product size: %d to %d." %
                        (name, circrna_length, self.product_range[0],
                         self.product_range[1]))
                    exit(-1)
                flanking_exon_cache[name] = {}
                sep = "\t"
                bed_string = sep.join([
                    current_line[0], current_line[1], current_line[2],
                    current_line[3],
                    str(0), current_line[5]
                ])
                virtual_bed_file = pybedtools.BedTool(bed_string,
                                                      from_string=True)
                result = exons.intersect(virtual_bed_file, s=True)
                fasta_bed_line_start = ""
                fasta_bed_line_stop = ""
                start = 0
                stop = 0
                for result_line in str(result).splitlines():
                    bed_feature = result_line.split('\t')
                    # this is a single-exon circRNA
                    if bed_feature[1] == current_line[1] and bed_feature[
                            2] == current_line[2]:
                        fasta_bed_line_start += result_line + "\n"
                        start = 1
                        stop = 1
                    if bed_feature[1] == current_line[1] and start == 0:
                        fasta_bed_line_start += result_line + "\n"
                        start = 1
                    if bed_feature[2] == current_line[2] and stop == 0:
                        fasta_bed_line_stop += result_line + "\n"
                        stop = 1
                    # these exons are kept for correctly drawing the circRNAs later
                    # not used for primer design
                    # NOTE(review): this compares coordinates as *strings*
                    # (lexicographic, not numeric) — confirm intended.
                    if bed_feature[1] > current_line[1] and bed_feature[
                            2] < current_line[2]:
                        flanking_exon_cache[name][bed_feature[1] + "_" +
                                                  bed_feature[2]] = 1
                virtual_bed_file_start = pybedtools.BedTool(
                    fasta_bed_line_start, from_string=True)
                virtual_bed_file_stop = pybedtools.BedTool(
                    fasta_bed_line_stop, from_string=True)
                virtual_bed_file_start = virtual_bed_file_start.sequence(
                    fi=self.fasta_file)
                virtual_bed_file_stop = virtual_bed_file_stop.sequence(
                    fi=self.fasta_file)
                if stop == 0 or start == 0:
                    print(
                        "Could not identify the exact exon-border of the circRNA."
                    )
                    print(
                        "Will continue with non-annotated, manually extracted sequence."
                    )
                    # we have to manually reset the start position
                    fasta_bed_line = "\t".join([
                        current_line[0], current_line[1], current_line[2],
                        current_line[5]
                    ])
                    virtual_bed_file_start = pybedtools.BedTool(
                        fasta_bed_line, from_string=True)
                    virtual_bed_file_start = virtual_bed_file_start.sequence(
                        fi=self.fasta_file)
                    virtual_bed_file_stop = ""
                exon1 = ""
                exon2 = ""
                if virtual_bed_file_start:
                    exon1 = open(
                        virtual_bed_file_start.seqfn).read().split(
                            "\n", 1)[1].rstrip()
                if virtual_bed_file_stop:
                    exon2 = open(virtual_bed_file_stop.seqfn).read().split(
                        "\n", 1)[1].rstrip()
                circ_rna_number += 1
                print("extracting flanking exons for circRNA #",
                      circ_rna_number,
                      name,
                      end="\n",
                      flush=True)
                if exon2 and not exon1:
                    exon1 = exon2
                    exon2 = ""
                exon_cache[name] = {1: exon1, 2: exon2}
                with open(exon_storage_tmp, 'a') as data_store:
                    data_store.write("\t".join([name, exon1, exon2, "\n"]))

    if not exon_cache:
        print(
            "Could not find any circRNAs matching your criteria, exiting.")
        exit(-1)

    # need to define path top R wrapper
    primer_script = 'circtools_primex_wrapper.R'
    # ------------------------------------ run script and check output -----------------------
    script_result = os.popen(primer_script + " " + exon_storage_tmp + " " +
                             str(self.product_range[0]) + "," +
                             str(self.product_range[1]) + " " +
                             self.junction + " " +
                             str(self.num_pairs)).read()

    # this is the first time we look through the input file
    # we collect the primer sequences and unify everything in one blast query
    blast_object_cache = {}
    blast_result_cache = {}
    blast_input_file = ""
    if circ_rna_number < 50:
        for line in script_result.splitlines():
            entry = line.split('\t')
            circular_rna_id = entry[0].split('_')
            if entry[1] == "NA":
                continue
            # only blast 1
            elif entry[2] in blast_object_cache and not entry[
                    1] in blast_object_cache:
                blast_input_file += "\n>" + entry[1] + "\n" + entry[1]
                blast_object_cache[entry[1]] = 1
                primer_to_circ_cache[entry[1]] = circular_rna_id[0]
            # only blast 2
            elif entry[1] in blast_object_cache and not entry[
                    2] in blast_object_cache:
                blast_input_file += "\n>" + entry[2] + "\n" + entry[2]
                blast_object_cache[entry[2]] = 1
                primer_to_circ_cache[entry[2]] = circular_rna_id[0]
            # seen both already, skip
            elif entry[1] in blast_object_cache and entry[
                    2] in blast_object_cache:
                continue
            # nothing seen yet, blast both
            else:
                blast_input_file += "\n>" + entry[1] + "\n" + entry[
                    1] + "\n>" + entry[2] + "\n" + entry[2]
                blast_object_cache[entry[1]] = 1
                blast_object_cache[entry[2]] = 1
                primer_to_circ_cache[entry[1]] = circular_rna_id[0]
                primer_to_circ_cache[entry[2]] = circular_rna_id[0]
    else:
        print("Too many circRNAs selected, skipping BLAST step.")

    # FIX: run_blast must be initialised unconditionally.  Previously it was
    # only assigned inside the no_blast branch or the BLAST try-block, so when
    # BLAST was enabled but blast_input_file stayed empty (e.g. the >=50
    # circRNA path above) the later `if run_blast == 1` raised a NameError.
    run_blast = 0

    if self.no_blast:
        print("User disabled BLAST search, skipping.")
        run_blast = 0

    # check if we have to blast
    if not self.no_blast and blast_input_file:
        try:
            print("Sending " + str(len(blast_object_cache)) +
                  " primers to BLAST")
            print("This may take a few minutes, please be patient.")
            result_handle = self.call_blast(blast_input_file,
                                            self.organism)
            run_blast = 1
        except Exception as exc:
            print(exc)
            exit(-1)
        with open(blast_xml_tmp, "w") as out_handle:
            out_handle.write(result_handle.read())
        result_handle.close()
        result_handle = open(blast_xml_tmp)
        blast_records = NCBIXML.parse(result_handle)
        for blast_record in blast_records:
            if blast_record.query not in blast_result_cache:
                blast_result_cache[blast_record.query] = []
            for description in blast_record.descriptions:
                # filter out the host gene we're in now
                # also filter out all "PREDICTED" stuff
                if description.title.find(primer_to_circ_cache[blast_record.query]) == -1 and\
                        description.title.find("PREDICTED") == -1:
                    blast_result_cache[blast_record.query].append(
                        description.title)

    # if we encounter NAs nothing has been blasted, we manually set the values now
    blast_result_cache["NA"] = ["Not blasted, no primer pair found"]

    primex_data_with_blast_results = ""
    for line in script_result.splitlines():
        entry = line.split('\t')
        # split up the identifier for final plotting
        line = line.replace("_", "\t")
        if run_blast == 1:
            left_result = "No hits"
            right_result = "No hits"
        else:
            left_result = "Not blasted, no primer pair found"
            right_result = left_result
        if entry[1] in blast_result_cache:
            left_result = ";".join(blast_result_cache[entry[1]])
        if entry[2] in blast_result_cache:
            right_result = ";".join(blast_result_cache[entry[2]])
        # update line
        primex_data_with_blast_results += line + "\t" + left_result + "\t" + right_result + "\n"

    with open(blast_storage_tmp, 'w') as data_store:
        data_store.write(primex_data_with_blast_results)

    # need to define path top R wrapper
    primer_script = 'circtools_primex_formatter.R'
    # ------------------------------------ run script and check output -----------------------
    primex_data_formatted = os.popen(primer_script + " " +
                                     blast_storage_tmp + " " + "\"" +
                                     self.experiment_title + "\"").read()

    with open(output_html_file, 'w') as data_store:
        data_store.write(primex_data_formatted)
    print("Writing results to " + output_html_file)

    # here we create the circular graphics for primer visualisation
    for line in primex_data_with_blast_results.splitlines():
        entry = line.split('\t')
        # no primers, no graphics
        if entry[6] == "NA":
            continue
        circular_rna_id = "_".join(
            [entry[0], entry[1], entry[2], entry[3], entry[4]])
        if circular_rna_id in exon_cache:
            circular_rna_id_isoform = circular_rna_id + "_" + entry[5]
            circrna_length = int(entry[3]) - int(entry[2])
            exon1_length = len(exon_cache[circular_rna_id][1])
            exon2_length = len(exon_cache[circular_rna_id][2])
            exon2_colour = "#ffac68"
            # single-exon circRNA: draw it as two halves so the BSJ is visible
            if exon2_length == 0:
                exon1_length = int(
                    len(exon_cache[circular_rna_id][1]) / 2) + 1
                exon2_length = int(len(exon_cache[circular_rna_id][1]) / 2)
                exon2_colour = "#ff6877"
            forward_primer_start = int(
                entry[8].split(',')[0]) + circrna_length - exon2_length
            forward_primer_length = int(entry[8].split(',')[1])
            reverse_primer_start = int(
                entry[9].split(',')[0]) - exon2_length
            reverse_primer_length = int(entry[9].split(',')[1])
            product_size = entry[14]
            gdd = GenomeDiagram.Diagram('circRNA primer diagram')
            gdt_features = gdd.new_track(1, greytrack=True, name="", )
            gds_features = gdt_features.new_set()
            # the two junction exons
            feature = SeqFeature(FeatureLocation(0, exon1_length),
                                 strand=+1)
            gds_features.add_feature(feature,
                                     name="Exon 1",
                                     label=False,
                                     color="#ff6877",
                                     label_size=22)
            feature = SeqFeature(FeatureLocation(
                circrna_length - exon2_length, circrna_length),
                                 strand=+1)
            gds_features.add_feature(feature,
                                     name="Exon 2",
                                     label=False,
                                     color=exon2_colour,
                                     label_size=22)
            # the qPCR product (drawn in two arcs across the BSJ)
            feature = SeqFeature(FeatureLocation(forward_primer_start,
                                                 circrna_length),
                                 strand=-1)
            gds_features.add_feature(feature,
                                     name="Product",
                                     label=False,
                                     color="#6881ff")
            feature = SeqFeature(FeatureLocation(0, reverse_primer_start),
                                 strand=-1)
            gds_features.add_feature(feature,
                                     name="Product: " + product_size +
                                     "bp",
                                     label=False,
                                     color="#6881ff",
                                     label_size=22,
                                     label_position="middle")
            if self.junction == "f":
                feature = SeqFeature(FeatureLocation(
                    reverse_primer_start - reverse_primer_length,
                    reverse_primer_start),
                                     strand=-1)
                gds_features.add_feature(feature,
                                         name="Reverse",
                                         label=False,
                                         sigil="BIGARROW",
                                         color="#75ff68",
                                         arrowshaft_height=0.3,
                                         arrowhead_length=0.1,
                                         label_size=22)
                # the primer spans the BSJ, therefore we have to draw it in two pieces:
                # piece 1: primer start to circRNA end
                # piece 2: remaining primer portion beginning from 0
                # piece 1:
                feature = SeqFeature(
                    FeatureLocation(forward_primer_start, circrna_length))
                gds_features.add_feature(feature,
                                         name="Forward",
                                         label=False,
                                         sigil="BIGARROW",
                                         color="#75ff68",
                                         arrowshaft_height=0.3,
                                         arrowhead_length=0.1,
                                         label_size=22)
                # piece 2:
                feature = SeqFeature(
                    FeatureLocation(
                        0, forward_primer_length -
                        (circrna_length - forward_primer_start)))
                gds_features.add_feature(feature,
                                         name="Forward",
                                         label=False,
                                         sigil="BIGARROW",
                                         color="#75ff68",
                                         arrowshaft_height=0.3,
                                         arrowhead_length=0.1,
                                         label_size=22)
            elif self.junction == "r":
                # the primer spans the BSJ, therefore we have to draw it in two pieces:
                # piece 1: primer start of circRNA to circRNA end
                # piece 2: remaining primer portion beginning from 0
                # piece 1:
                feature = SeqFeature(FeatureLocation(
                    circrna_length - reverse_primer_start, circrna_length),
                                     strand=-1)
                gds_features.add_feature(feature,
                                         name="Reverse",
                                         label=False,
                                         sigil="BIGARROW",
                                         color="#75ff68",
                                         arrowshaft_height=0.3,
                                         arrowhead_length=0.1,
                                         label_size=22)
                # piece 2:
                feature = SeqFeature(FeatureLocation(
                    0, reverse_primer_start),
                                     strand=-1)
                gds_features.add_feature(feature,
                                         name="Reverse",
                                         label=False,
                                         sigil="BIGARROW",
                                         color="#75ff68",
                                         arrowshaft_height=0.3,
                                         arrowhead_length=0.1,
                                         label_size=22)
                feature = SeqFeature(
                    FeatureLocation(
                        forward_primer_start,
                        forward_primer_start + forward_primer_length))
                gds_features.add_feature(feature,
                                         name="Forward",
                                         label=False,
                                         sigil="BIGARROW",
                                         color="#75ff68",
                                         arrowshaft_height=0.3,
                                         arrowhead_length=0.1,
                                         label_size=22)
            else:
                # junction preference neither "f" nor "r": both primers inside
                feature = SeqFeature(FeatureLocation(
                    reverse_primer_start - reverse_primer_length,
                    reverse_primer_start),
                                     strand=-1)
                gds_features.add_feature(feature,
                                         name="Reverse",
                                         label=False,
                                         sigil="BIGARROW",
                                         color="#75ff68",
                                         arrowshaft_height=0.3,
                                         arrowhead_length=0.1,
                                         label_size=22)
                feature = SeqFeature(
                    FeatureLocation(
                        forward_primer_start,
                        forward_primer_start + forward_primer_length))
                gds_features.add_feature(feature,
                                         name="Forward",
                                         label=False,
                                         sigil="BIGARROW",
                                         color="#75ff68",
                                         arrowshaft_height=0.3,
                                         arrowhead_length=0.1,
                                         label_size=22)
            # mark the back-splice junction itself
            feature = SeqFeature(FeatureLocation(0, 1))
            gds_features.add_feature(feature,
                                     name="BSJ",
                                     label=True,
                                     color="white",
                                     label_size=22)
            if circular_rna_id in flanking_exon_cache:
                for exon in flanking_exon_cache[circular_rna_id]:
                    exon_start, exon_stop = exon.split('_')
                    exon_start = int(exon_start) - int(entry[2])
                    exon_stop = int(exon_stop) - int(entry[2])
                    feature = SeqFeature(FeatureLocation(
                        exon_start, exon_stop),
                                         strand=+1)
                    gds_features.add_feature(feature,
                                             name="Exon",
                                             label=False,
                                             color="grey",
                                             label_size=22)
            gdd.draw(format='circular',
                     pagesize=(600, 600),
                     circle_core=0.6,
                     track_size=0.3,
                     tracklines=0,
                     x=0.00,
                     y=0.00,
                     start=0,
                     end=circrna_length - 1)
            gdd.write(
                self.output_dir + "/" + circular_rna_id_isoform + ".svg",
                "svg")
def parse_gff(path):
    """Parses GFF and corresponding FASTA using GFFutils.

    Args:
        path (str): Path to GFF file. Should have a corresponding FASTA file of the
            same name with a valid FASTA suffix (.fa, .fasta, .fsa, .fna, .faa).
    Returns:
        list: SeqRecord objects corresponding to each scaffold in the file
    Raises:
        FileNotFoundError: no partner FASTA file found next to the GFF.
        ValueError: a scaffold contains no CDS features.
    """
    fasta = find_fasta(path)
    if not fasta:
        raise FileNotFoundError(f"Could not find partner FASTA file for {path}")

    # Parse FASTA and create GFFUtils database (in-memory, duplicates renamed)
    fasta = parse_fasta(fasta)
    gff = gffutils.create_db(
        str(path),
        ":memory:",
        force=True,
        merge_strategy="create_unique",
        sort_attribute_values=True
    )
    regions = find_regions(gff.directives)

    # Find features for each record in the FASTA file
    for record in fasta:
        # Offset of this scaffold: ##sequence-region start is 1-based, so
        # subtract 1 to get a 0-based shift; default 0 when absent.
        try:
            record_start, _ = regions[record.id]
            record_start -= 1
        except KeyError:
            record_start = 0

        # Normalise Feature location based on ##sequence-region directive.
        # Necessary for extracted GFF3 files that still store coordinates
        # relative to the entire region, not to the extracted FASTA.
        # If no sequence-region directive is found, assumes 1 (i.e. sequence start).
        cds_features = []
        for feature in gff.region(seqid=record.id, featuretype=["gene", "CDS"]):
            feature = biopython_integration.to_seqfeature(feature)
            feature.location = FeatureLocation(
                feature.location.start - record_start,
                feature.location.end - record_start,
                strand=feature.location.strand
            )
            # CDS parts are collected for merging; genes go straight on the record.
            if feature.type == "CDS":
                cds_features.append(feature)
            else:
                record.features.append(feature)

        if not cds_features:
            raise ValueError(f"Found no CDS features in {record.id} [{path}]")

        # Merge CDS features into singular SeqFeature objects, add them to record
        # Consecutive CDS rows sharing one ID are exon parts of the same CDS.
        previous = None
        for feature in sorted(cds_features, key=lambda f: f.location.start):
            seqid = feature.qualifiers["ID"][0]
            same_feature = previous == seqid
            if not previous:
                previous = seqid
            if same_feature:
                if feature.location.strand == 1:
                    record.features[-1].location += feature.location
                else:
                    # Reverse strand locations must be in biological order
                    old, new = record.features[-1].location, feature.location
                    record.features[-1].location = new + old
            else:
                record.features.append(feature)
                previous = seqid

        # Sort, then generate insertion tuples like with other formats
        record.features.sort(key=lambda f: f.location.start)

    return fasta
def short_sigils(self, glyph):
    """Draw sigils on top of grey box backgrounds."""
    # The blue boxes are only relevant for the BIGARROW sigil.
    # Use a taller track to emphasise any sigil drawing errors.
    self.gdt_features = self.gdd.new_track(1, greytrack=True, height=3)
    # A single feature set is enough for all test features.
    self.gds_features = self.gdt_features.new_set()
    # For the ARROW and BIGARROW sigils:
    # - Green arrows just have small heads (meaning if there is a mitre
    #   it will escape the bounding box).
    # - Red arrows should be small triangles (so short no shaft shown).
    # Three spans per strand group; the middle span of each group gets a
    # deliberately oversized arrowhead and is drawn in red.
    layout = (
        (+1, "Forward", ((15, 30), (55, 60), (75, 125))),
        (None, "Strandless", ((140, 155), (180, 185), (200, 250))),
        (-1, "Reverse", ((265, 280), (305, 310), (325, 375))),
    )
    for strand, label, spans in layout:
        for index, (left, right) in enumerate(spans):
            if strand is not None:
                # Opposite-strand blue backdrop (stranded groups only).
                backdrop = SeqFeature(FeatureLocation(left, right),
                                      strand=-strand)
                self.gds_features.add_feature(backdrop, color="blue")
            # Grey background box at the sigil's own location/strand.
            feature = SeqFeature(FeatureLocation(left, right), strand=strand)
            self.gds_features.add_feature(feature, color="grey")
            if index == 1:
                # Oversized head: should collapse to a small red triangle.
                self.gds_features.add_feature(feature, name=label,
                                              sigil=glyph,
                                              arrowhead_length=1000,
                                              color="red")
            else:
                self.gds_features.add_feature(feature, name=label,
                                              sigil=glyph,
                                              arrowhead_length=0.05)
    self.finish("GD_sigil_short_%s" % glyph)
continue # checking each feature and separating them into different color sets to make reading the # genome map easier if len(gd_feature_set) % 2 == 0: color = colors.turquoise else: color = colors.lightgreen #setting each feature to the color assigned and adding a label gd_feature_set.add_feature(feature, color=color, label=True) # checking for specific genome sequences and labeling them with their specific labels for site, name, color in [("GAATTC", "EcoRI", colors.blue), ("GGATCC", "BamHI", colors.red)]: index = 0 while True: index = record.seq.find(site, start=index) if index == -1: break feature = SeqFeature(FeatureLocation(index, index + len(site))) gd_feature_set.add_feature(feature, color=color, name=name, label=True, label_size=10, label_color=color) index += len(site) # creating the circular genome map as a png using biopython's built in diagram creation tool gd_diagram.draw(format="circular", circular=True, pagesize=(20 * cm, 20 * cm), start=0, end=len(record), circle_core=0.5) gd_diagram.write("ToCSV.png", "PNG") # using Python's Image Library to add a title to the top of the file by opening the png, writing the name of the virus # and then saving over the file with the title added. img = Image.open("ToCSV.png") draw = ImageDraw.Draw(img) draw.text((200, 10), "Tomato Curly Stunt Virus", (0, 0, 0)) img.save("ToCSV.png")
def setUp(self):
    """Build a 1 kb poly-A record plus a SubRegion covering bases 100-200."""
    poly_a = 1000 * "A"
    self.record = Record(Seq(poly_a))
    self.subregion = SubRegion(FeatureLocation(100, 200), tool="test")