def test_genbank_consistency(path):
    """
    Test whether the same annotation (if reasonable) can be read from a
    GFF3 file and a GenBank file.

    The GenBank file at `path` serves as reference; the GFF3 file with
    the same base name is the file under test.
    """
    gb_file = gb.GenBankFile.read(join(data_dir("sequence"), path))
    ref_annot = gb.get_annotation(gb_file)

    gff_file = gff.GFFFile.read(
        join(data_dir("sequence"), path[:-3] + ".gff3")
    )
    test_annot = gff.get_annotation(gff_file)

    # Remove qualifiers, since they will be different
    # in GFF3 and GenBank
    ref_annot = seq.Annotation(
        [seq.Feature(feature.key, feature.locs) for feature in ref_annot]
    )
    test_annot = seq.Annotation(
        [seq.Feature(feature.key, feature.locs) for feature in test_annot]
    )

    # Only CDS, gene, intron and exon should be equal
    # in GenBank and GFF3
    comparable_keys = ("CDS", "gene", "intron", "exon")
    # Iterate over the *reference* features: checking the features of
    # 'test_annot' against 'test_annot' itself could never fail
    for feature in ref_annot:
        if feature.key not in comparable_keys:
            continue
        try:
            assert feature in test_annot
        except AssertionError:
            # Show the offending feature before propagating the failure
            print(feature.key)
            for loc in feature.locs:
                print(loc)
            raise
def visualize_secondary_structure(sse, first_id):
    """
    Plot secondary structure elements as a feature map and show it.

    Parameters
    ----------
    sse : sequence of str
        One DSSP code per residue. Each code must be a key of the
        ``dssp_to_abc`` table below, otherwise a ``KeyError`` is raised.
    first_id : int
        Position assigned to the first element of `sse`.
    """
    dssp_to_abc = {
        "I": "c", "S": "c", "H": "a", "E": "b",
        "G": "c", "B": "b", "T": "c", "C": "c",
    }
    # Translate into a new list, so the caller's sequence is not
    # overwritten as a side effect
    abc_sse = [dssp_to_abc[code] for code in sse]

    def _add_sec_str(annotation, first, last, str_type):
        # Add a helix/sheet feature; coil elements are not drawn
        if str_type == "a":
            str_type = "helix"
        elif str_type == "b":
            str_type = "sheet"
        else:
            # coil
            return
        feature = seq.Feature(
            "SecStr", [seq.Location(first, last)], {"sec_str_type": str_type}
        )
        annotation.add_feature(feature)

    # Find the intervals for each secondary structure element
    # and add to annotation
    annotation = seq.Annotation()
    length = len(abc_sse)
    run_start = 0
    for i in range(1, length + 1):
        # An element ends where the type changes or the sequence ends;
        # its last (inclusive) index is therefore 'i - 1' in both cases
        if i == length or abc_sse[i] != abc_sse[run_start]:
            _add_sec_str(
                annotation,
                run_start + first_id,
                i - 1 + first_id,
                abc_sse[run_start],
            )
            run_start = i

    fig = plt.figure(figsize=(8.0, 3.0))
    ax = fig.add_subplot(111)
    graphics.plot_feature_map(
        ax, annotation, symbols_per_line=150,
        # loc_range has exclusive stop
        loc_range=(first_id, first_id + length),
        show_numbers=True, show_line_position=True,
        feature_plotters=[HelixPlotter(), SheetPlotter()],
    )
    fig.tight_layout()
    plt.show()
def test_feature_without_id():
    """
    Setting an annotation must raise a ``ValueError`` for a feature
    that has no 'ID' qualifier but more than one location, i.e. one
    that would be spread over multiple entries in the GFF3 file.
    """
    # Two separate locations -> two GFF3 entries for one feature
    split_cds = seq.Feature(
        "CDS",
        [seq.Location(1, 2), seq.Location(4, 5)],
        {"some": "qualifiers"},
    )
    annotation = seq.Annotation([split_cds])

    gff_file = gff.GFFFile()
    with pytest.raises(ValueError):
        gff.set_annotation(gff_file, annotation)
def make_feature_maps(gene):
    """
    Fetch a GenBank record and save one feature map image per feature key.

    The record is downloaded from the "nuccore" database into the
    temporary directory. For every distinct feature key in its
    annotation a PNG named ``<key>.png`` is written to
    ``app/static/images`` below the current working directory.
    On success ``session['valid_gene']`` is set to ``True``; if the
    record cannot be fetched or parsed, an error message is flashed
    instead.

    Parameters
    ----------
    gene : str
        Identifier passed to ``entrez.fetch``.

    Returns
    -------
    None
    """
    try:
        gb_path = entrez.fetch(
            gene, gettempdir(), suffix="gb",
            db_name="nuccore", ret_type="gb"
        )
        gb_file = gb.GenBankFile.read(gb_path)
        file_annotation = gb.get_annotation(gb_file)
    except Exception:
        # Deliberately broad: download and parsing errors alike are
        # reported to the user. Unlike a bare 'except', this does not
        # swallow KeyboardInterrupt/SystemExit.
        flash('The entered gene could not found. Please try again.', 'error')
        return None

    # Use the 'source' feature to plot all maps over the same range.
    # Without such a feature 'loc_range' stays None, so that
    # plot_feature_map() derives the range from the plotted features.
    loc_range = None
    for feature in file_annotation:
        if feature.key == "source":
            # loc_range has exclusive stop
            loc = list(feature.locs)[0]
            loc_range = (loc.first, loc.last + 1)

    unique_keys = sorted({feature.key for feature in file_annotation})
    image_dir = os.path.join(os.getcwd(), 'app', 'static', 'images')
    for key in unique_keys:
        fig, ax = plt.subplots(figsize=(8.0, 2.0))
        try:
            graphics.plot_feature_map(
                ax,
                seq.Annotation([
                    feature for feature in file_annotation
                    if feature.key == key
                ]),
                multi_line=False,
                loc_range=loc_range,
                show_line_position=True
            )
            ax.set_title('This plot is for {} features'.format(key))
            fig.savefig(
                os.path.join(image_dir, '{}.png'.format(key)), dpi=300
            )
        finally:
            # pyplot keeps every figure alive until it is closed;
            # release it to avoid accumulating figures across calls
            plt.close(fig)

    session['valid_gene'] = True
    return None
def visualize_secondary_structure(sse, first_id, linesize=200):
    """
    Draw secondary structure elements as a feature map.

    Parameters
    ----------
    sse : sequence of str
        One code per residue: ``"a"`` is drawn as helix, ``"b"`` as
        sheet, every other value is treated as coil and not drawn.
    first_id : int
        Position assigned to the first element of `sse`.
    linesize : int, optional
        Number of positions shown per line of the feature map.
    """
    length = len(sse)

    def _add_sec_str(annotation, first, last, str_type):
        # Add a helix/sheet feature; coil elements are not drawn
        if str_type == "a":
            str_type = "helix"
        elif str_type == "b":
            str_type = "sheet"
        else:
            # coil
            return
        feature = seq.Feature(
            "SecStr", [seq.Location(first, last)], {"sec_str_type": str_type}
        )
        annotation.add_feature(feature)

    # Find the intervals for each secondary structure element
    # and add to annotation
    annotation = seq.Annotation()
    run_start = 0
    for i in range(1, length + 1):
        # An element ends where the type changes or the sequence ends;
        # its last (inclusive) index is therefore 'i - 1' in both cases
        if i == length or sse[i] != sse[run_start]:
            _add_sec_str(
                annotation,
                run_start + first_id,
                i - 1 + first_id,
                sse[run_start],
            )
            run_start = i

    fig = plt.figure(figsize=(8.0, 3.0))
    ax = fig.add_subplot(111)
    graphics.plot_feature_map(
        ax, annotation, symbols_per_line=linesize,
        # The features are offset by 'first_id', so the displayed range
        # must be offset as well (exclusive stop)
        loc_range=(first_id, first_id + length),
        show_numbers=True, show_line_position=True,
        feature_plotters=[HelixPlotter(), SheetPlotter()],
    )
    fig.tight_layout()
def visualize_secondary_structure(sse, first_id):
    """
    Create a feature map of secondary structure elements.

    Parameters
    ----------
    sse : sequence of str
        One code per residue: ``"a"`` is drawn as helix, ``"b"`` as
        sheet, every other value is treated as coil and not drawn.
    first_id : int
        Position assigned to the first element of `sse`.

    Returns
    -------
    The result of ``FeatureMap.generate()``.
    """
    # 'length' was referenced but never defined in this function
    length = len(sse)

    def _add_sec_str(annotation, first, last, str_type):
        # Add a helix/sheet feature; coil elements are not drawn
        if str_type == "a":
            str_type = "helix"
        elif str_type == "b":
            str_type = "sheet"
        else:
            # coil
            return
        feature = seq.Feature(
            "SecStr", [seq.Location(first, last)], {"sec_str_type": str_type}
        )
        annotation.add_feature(feature)

    # Find the intervals for each secondary structure element
    # and add to annotation
    annotation = seq.Annotation()
    run_start = 0
    for i in range(1, length + 1):
        # An element ends where the type changes or the sequence ends;
        # its last (inclusive) index is therefore 'i - 1' in both cases
        if i == length or sse[i] != sse[run_start]:
            _add_sec_str(
                annotation,
                run_start + first_id,
                i - 1 + first_id,
                sse[run_start],
            )
            run_start = i

    feature_map = graphics.FeatureMap(
        annotation, line_length=150,
        # The features are offset by 'first_id', so the displayed range
        # is offset as well
        loc_range=(first_id, first_id + length),
    )
    feature_map.add_location_numbers(size=50)
    feature_map.drawfunc["SecStr"] = draw_secondary_strucure
    return feature_map.generate()
########################################################################
# Similarly to :class:`Alignment` objects, an Annotation can be
# visualized in a *feature map*.
# To avoid overlapping features, only the *CDS* feature is drawn.

# The *source* feature spans the entire annotation,
# hence it provides the range to be displayed
for feature in annotation:
    if feature.key != "source":
        continue
    loc = list(feature.locs)[0]
    # The stop of loc_range is exclusive
    loc_range = (loc.first, loc.last + 1)

cds_annotation = seq.Annotation(
    [feature for feature in annotation if feature.key == "CDS"]
)
fig, ax = plt.subplots(figsize=(8.0, 1.0))
graphics.plot_feature_map(
    ax,
    cds_annotation,
    multi_line=False,
    loc_range=loc_range,
    show_line_position=True,
)
fig.tight_layout()

########################################################################
# :class:`Annotation` objects can be indexed with slices that represent
# the start and the stop base/residue of the annotation from which the
# subannotation is created.
# All features that are not in this range are not included in the
# subannotation.
# In order to demonstrate this indexing method, we create a
# subannotation that includes only features in range of the gene itself
# (without the regulatory stuff).
y, dx, dy, self._tail_width * bbox.height, self._head_width * bbox.height, # Create head with 90 degrees tip # -> head width/length ratio = 1/2 head_ratio=0.5, draw_head=draw_head, color=biotite.colors["orange"], linewidth=0)) # Test our drawing functions with example annotation annotation = seq.Annotation([ seq.Feature("SecStr", [seq.Location(10, 40)], {"sec_str_type": "helix"}), seq.Feature("SecStr", [seq.Location(60, 90)], {"sec_str_type": "sheet"}), ]) fig = plt.figure(figsize=(8.0, 0.8)) ax = fig.add_subplot(111) graphics.plot_feature_map( ax, annotation, multi_line=False, loc_range=(1, 100), # Register our drawing functions feature_plotters=[HelixPlotter(), SheetPlotter()]) fig.tight_layout() ######################################################################## # Now let us do some serious application.
# Plot hydropathy ax.plot(np.arange(1 + ma_radius, len(hcn1) - ma_radius + 1), hydropathies, color=biotite.colors["dimorange"]) ax.axhline(0, color="gray", linewidth=0.5) ax.set_xlim(1, len(hcn1) + 1) ax.set_xlabel("HCN1 sequence position") ax.set_ylabel("Hydropathy (15 residues moving average)") # Draw boxes for annotated transmembrane helices for comparison # with hydropathy plot annotation = gb.get_annotation(gp_file, include_only=["Region"]) transmembrane_annotation = seq.Annotation([ feature for feature in annotation if feature.qual["region_name"] == "Transmembrane region" ]) for feature in transmembrane_annotation: first, last = feature.get_location_range() ax.axvspan(first, last, color=(0.0, 0.0, 0.0, 0.2), linewidth=0) # Plot similarity score as measure for conservation ax2 = ax.twinx() ax2.plot(np.arange(1 + ma_radius, len(hcn1) - ma_radius + 1), scores, color=biotite.colors["brightorange"]) ax2.set_ylabel("Similarity score (15 residues moving average)") ax.legend(handles=[ Patch(color=biotite.colors["dimorange"], label="Hydropathy"),
'start_aa': [start_aa], 'end_aa': [aa], 'sec_str_type': [previous_ss_seg] }) ss_segments = ss_segments.append(ss_unit_entry) last_ss = ss # At this point the df ss_segments also contains 'L' linkers # Create new df to store only those relevent for plotting ss_segments_plot = ss_segments.query('sec_str_type != "L"') #%% annotation = seq.Annotation() for _, start_aa, end_aa, ss_type in ss_segments_plot.itertuples(): if ss_type == "H": ss_type = "helix" elif ss_type == "S": ss_type = "sheet" feature = seq.Feature("SecStr", [seq.Location(start_aa, end_aa)], {"sec_str_type": ss_type}) annotation.add_feature(feature) #%% class HelixPlotter(graphics.FeaturePlotter): def __init__(self):
# Example annotation of a plasmid.
# The terminator feature was listed twice in this literal;
# the repeated entry has been removed.
annotation = seq.Annotation([
    seq.Feature(
        "source",
        [seq.Location(0, 1500)],
        {"organism": "Escherichia coli"}
    ),

    # Ori
    # NOTE(review): these qualifiers are identical to the promoter's
    # below, which looks like a copy-paste leftover - confirm
    seq.Feature(
        "rep_origin",
        [seq.Location(600, 700, seq.Location.Strand.REVERSE)],
        {"regulatory_class": "promoter", "note": "MyProm"}
    ),

    # Promoter
    seq.Feature(
        "regulatory",
        [seq.Location(1000, 1060)],
        {"regulatory_class": "promoter", "note": "MyProm"}
    ),
    seq.Feature(
        "protein_bind",
        [seq.Location(1025, 1045)],
        {"note": "repr"}
    ),

    # Gene A
    seq.Feature(
        "regulatory",
        [seq.Location(1070, 1080)],
        {"regulatory_class": "ribosome_binding_site"}
    ),
    seq.Feature(
        "CDS",
        [seq.Location(1091, 1150)],
        {"product": "geneA"}
    ),

    # Gene B
    seq.Feature(
        "regulatory",
        [seq.Location(1180, 1190)],
        {"regulatory_class": "ribosome_binding_site"}
    ),
    seq.Feature(
        "CDS",
        [seq.Location(1201, 1350)],
        {"product": "geneB"}
    ),
    seq.Feature(
        "regulatory",
        [seq.Location(1220, 1230)],
        {"regulatory_class": "ribosome_binding_site"}
    ),
    seq.Feature(
        "CDS",
        [seq.Location(1240, 1350)],
        {"product": "geneB2"}
    ),

    # Gene C
    seq.Feature(
        "regulatory",
        [seq.Location(1380, 1390)],
        {"regulatory_class": "ribosome_binding_site"}
    ),
    seq.Feature(
        "CDS",
        # CDS extends over periodic boundary -> two locations
        [seq.Location(1, 300), seq.Location(1402, 1500)],
        {"product": "geneC"}
    ),

    # Terminator
    seq.Feature(
        "regulatory",
        [seq.Location(310, 350)],
        {"regulatory_class": "terminator", "note": "MyTerm"}
    ),

    # Primers
    # The labels will be too long to be displayed on the map
    # If you want to display them nevertheless, set the
    # 'omit_oversized_labels' to False
    seq.Feature(
        "primer_bind",
        [seq.Location(1385, 1405)],
        {"note": "geneC"}
    ),
    seq.Feature(
        "primer_bind",
        [seq.Location(345, 365, seq.Location.Strand.REVERSE)],
        {"note": "geneC_R"}
    ),
])
def _iter_ss_segments(residues):
    """
    Yield ``(first, last, ss_code)`` for each run of equal codes.

    `residues` is an iterable of ``(position, ss_code)`` pairs.
    A run ends one position before the position at which the next run
    starts; the final run ends at the last position seen.
    """
    run_start = None
    run_code = None
    last_pos = None
    for pos, code in residues:
        if run_start is None:
            run_start, run_code = pos, code
        elif code != run_code:
            yield run_start, pos - 1, run_code
            run_start, run_code = pos, code
        last_pos = pos
    if run_start is not None:
        yield run_start, last_pos, run_code


def ss_csv_to_annotation(csv_path: str):
    """
    Convert a per-residue secondary structure CSV into an annotation.

    The CSV file must have exactly two columns: the residue position
    and the secondary structure code of that residue.
    Consecutive residues with the same code are merged into one
    "SecStr" feature. Segments with code ``"L"`` (linker) are omitted,
    ``"H"`` is stored as ``"helix"`` and ``"S"`` as ``"sheet"`` in the
    ``"sec_str_type"`` qualifier; any other code is stored unchanged.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file, passed to ``pandas.read_csv``.

    Returns
    -------
    The annotation holding one feature per retained segment;
    it is empty if the file contains no rows.
    """
    ss_names = {"H": "helix", "S": "sheet"}

    exported_ss = pd.read_csv(csv_path)
    # Plain (position, code) tuples; unpacking them in the helper
    # requires the file to have exactly two columns
    residues = exported_ss.itertuples(index=False, name=None)

    annotation = seq.Annotation()
    for start_aa, end_aa, ss_code in _iter_ss_segments(residues):
        if ss_code == "L":
            # Linkers are not relevant for plotting
            continue
        feature = seq.Feature(
            "SecStr",
            [seq.Location(start_aa, end_aa)],
            {"sec_str_type": ss_names.get(ss_code, ss_code)},
        )
        annotation.add_feature(feature)
    return annotation