def t_gff3_to_gff3(self):
    """Read in and write out GFF3 without any loss of information.

    Parses the test GFF3 file, writes the records to an in-memory handle,
    parses that output again and checks that the re-parsed record carries
    the same features (compared through their string form) as the original.
    """
    recs = SeqIO.to_dict(GFF.parse(self._test_gff_file))
    out_handle = StringIO.StringIO()
    GFF.write(recs.values(), out_handle)
    wrote_handle = StringIO.StringIO(out_handle.getvalue())
    recs_two = SeqIO.to_dict(GFF.parse(wrote_handle))

    orig_rec = list(recs.values())[0]
    # Look the record up in the *re-parsed* dictionary; reading it from
    # ``recs`` again would compare the original record with itself.
    re_rec = recs_two[orig_rec.id]
    assert len(orig_rec.features) == len(re_rec.features)
    for orig_f, re_f in zip(orig_rec.features, re_rec.features):
        assert str(orig_f) == str(re_f)
def not_t_full_celegans(self):
    """Test the full C elegans chromosome and GFF files.

    This is used to test GFF on large files and is not run as a standard
    test. You will need to download the files and adjust the paths to run
    this.
    """
    # read the sequence information
    seq_file = os.path.join(self._full_dir, "c_elegans.WS199.dna.fa")
    gff_file = os.path.join(self._full_dir, "c_elegans.WS199.gff3")
    # The context manager closes the FASTA handle even if parsing fails.
    with open(seq_file) as seq_handle:
        seq_dict = SeqIO.to_dict(SeqIO.parse(seq_handle, "fasta"))

    rnai_types = [('Orfeome', 'PCR_product'),
                  ('GenePair_STS', 'PCR_product'),
                  ('Promoterome', 'PCR_product')]
    gene_types = [('Non_coding_transcript', 'gene'),
                  ('Coding_transcript', 'gene'),
                  ('Coding_transcript', 'mRNA'),
                  ('Coding_transcript', 'CDS')]
    limit_info = dict(gff_source_type=rnai_types + gene_types)
    # Only exhaust the parser; the test passes if no exception is raised.
    for rec in GFF.parse(gff_file, seq_dict, limit_info=limit_info):
        pass
def gene_to_TSS(gene_name):
    """Find the transcription start site of a gene in the GTF annotation.

    Scans the ``transcript`` entries of
    ``crispr_app/Homo_sapiens.GRCh38.84.gtf`` one feature at a time. For
    every transcript whose first ``gene_name`` qualifier equals
    ``gene_name`` it keeps the smallest start (strand 1) or the largest end
    (strand -1). Scanning stops at the first non-matching transcript seen
    after a match.

    Returns a ``(transcription_start, strand, chromosome)`` tuple; all three
    are empty strings when no matching transcript is found.
    """
    # None marks "no coordinate seen yet"; the previous float('inf') seed
    # could not be converted with int() and raised OverflowError.
    transcription_start = None
    strand = ''
    chromosome = ''
    found_gene = False

    annotation_file = 'crispr_app/Homo_sapiens.GRCh38.84.gtf'
    limit_info = dict(gff_type=["transcript"])

    with open(annotation_file) as annotation_handle:
        for rec in GFF.parse(annotation_handle, limit_info=limit_info, target_lines=1):
            feature = rec.features[0]
            names = feature.qualifiers.get('gene_name') or ['']
            if names[0] == gene_name:
                found_gene = True
                chromosome = rec.id
                strand = feature.strand
                if strand == 1:
                    start = int(feature.location.start)
                    if transcription_start is None or start < transcription_start:
                        transcription_start = start
                elif strand == -1:
                    end = int(feature.location.end)
                    if transcription_start is None or end > transcription_start:
                        transcription_start = end
            elif found_gene:
                # Past the block of transcripts for this gene: stop reading.
                break

    if transcription_start is None:
        transcription_start = ''
    return (transcription_start, strand, chromosome)
def handle(self, *args, **options):
    """Create or fetch the organism, its reference sequences and its genes.

    Reads ``options['organism_name']``, ``options['taxon']`` and
    ``options['ebi_id']`` for the organism, ``options['fasta']`` for the
    reference sequences and ``options['gff3']`` for the gene features.
    Only features of type ``gene`` are stored.
    """
    organism = Organism.objects.get_or_create(
        common_name=options['organism_name'],
        taxon=options['taxon'],
        ebi_id=options['ebi_id'],
    )[0]

    # One RefSeq row per FASTA record.
    for fasta_record in SeqIO.parse(options['fasta'], "fasta"):
        RefSeq.objects.get_or_create(
            name=fasta_record.id,
            length=len(fasta_record.seq),
            organism=organism,
        )

    # One Gene row per 'gene' feature, attached to the matching RefSeq.
    for gff_record in GFF.parse(options['gff3']):
        refseq = RefSeq.objects.get(name=gff_record.id, organism=organism)
        for feature in gff_record.features:
            if feature.type == 'gene':
                Gene.objects.get_or_create(
                    start=feature.location.start,
                    end=feature.location.end,
                    strand=feature.location.strand,
                    refseq=refseq,
                    db_object_id=feature.id,
                    db_object_symbol=feature.id,
                )
def get_gff_dict(gfffile):
    """Creates a dictionary with product information from given gff file.

    Returns a dictionary keyed by contig id. Each value is a string with one
    entry per feature that has at least one ``product`` qualifier; entries
    are separated by ``,`` and each entry is the feature type followed by
    its products, separated by ``;``, for example::

        CDS;protein3;protein31,CDS;protein3

    Contigs without any such feature map to ``"N/A"``.
    """
    out_dict = {}
    for rec in GFF.parse(gfffile):
        entries = []
        for f in rec.features:
            # Features without a 'product' qualifier are simply skipped.
            products = f.qualifiers.get('product', [])
            if len(products) > 0:
                entries.append(";".join([f.type] + list(products)))
        out_dict[rec.id] = ",".join(entries) if entries else "N/A"
    return out_dict
def gene_to_early_exons(gene_name, num_exons):
    """Collect the first exons of a gene from the GTF annotation.

    Scans the ``exon`` entries of ``crispr_app/Homo_sapiens.GRCh38.84.gtf``
    one feature at a time and records exons whose first ``gene_name``
    qualifier equals ``gene_name``. Collection stops once more than
    ``num_exons`` matching exons were seen, or once the running count of
    matching exons exceeds the largest ``exon_number`` seen so far (which
    limits the result to the first listed version of the gene).

    Returns ``(exons, chromosome)`` where ``exons`` maps the exon number (as
    a string) to ``[start, end, strand]``. ``chromosome`` is an empty string
    when the gene is not found.
    """
    exons = {}
    exonCount = 0
    maxExons = 0
    # Initialised up front so the return below works when nothing matches.
    chromosome = ''
    strand = ''

    annotation_file = 'crispr_app/Homo_sapiens.GRCh38.84.gtf'
    limit_info = dict(gff_type=["exon"])

    with open(annotation_file) as annotation_handle:
        for rec in GFF.parse(annotation_handle, limit_info=limit_info, target_lines=1):
            feature = rec.features[0]
            qualifiers = feature.qualifiers
            names = qualifiers.get('gene_name') or ['']
            if names[0] != gene_name:
                continue
            chromosome = rec.id
            strand = feature.strand
            # Get only first version of gene in annotated data
            exonNum = qualifiers['exon_number'][0]
            maxExons = max(maxExons, int(exonNum))
            exonCount += 1
            if exonCount > maxExons:
                break
            if exonCount > num_exons:
                break
            exons[exonNum] = [int(feature.location.start), int(feature.location.end), strand]
    return exons, chromosome
def t_fasta_directive(self):
    """Parse FASTA sequence information contained in a GFF3 file.
    """
    parsed_records = GFF.parse(self._gff_file)
    records_by_id = SeqIO.to_dict(parsed_records)
    assert len(records_by_id) == 1
    # The single record must carry the sequence from the FASTA directive.
    chr17 = records_by_id['chr17']
    assert str(chr17.seq) == "GATTACAGATTACA"
def load_gff(gff):
    """Parses a single GFF file and returns a chromosome-indexed dict for
    that file.

    Arguments
    ---------
    gff: str
        Filepath to GFF; a path ending in ``.gz`` is read through gzip.

    Returns
    -------
    dict: A dictionary representation of the GFF entries, indexed by
          chromosome ID. Only entries whose first feature has type
          ``chromosome`` are kept.
    """
    if gff.endswith('.gz'):
        import gzip
        from io import TextIOWrapper
        fp = TextIOWrapper(gzip.open(gff))
    else:
        fp = open(gff)

    annotations = {}
    # Close the handle even when parsing raises part-way through.
    with fp:
        for entry in GFF.parse(fp):
            if len(entry.features) > 0 and entry.features[0].type == 'chromosome':
                annotations[entry.id] = entry
    return annotations
def gene_positions(genefile, include_chromosome=True, include_strand=True,
                   coding_only=False, ignore_strange_cases=False):
    """ Return a gene_ID:(chromosome, strand, start_pos, end_pos) dictionary based on GFF input file.

    The positions are 1-based, end-inclusive.
    If include_chromosome and/or include_strand is False, the corresponding values are missing from the output tuples.

    If coding_only is True, the start/end positions are the start and end of the first and last exon (i.e. excluding
    the UTRs). In that case, if a gene doesn't have an mRNA with exons, or has multiple mRNAs, raise an Exception,
    unless ignore_strange_cases is True, then just don't include it in the output.
    """
    # Sub-features are only needed for the coding-only case; otherwise
    # restrict parsing to gene lines.
    if coding_only:
        parsing_limits = {}
    else:
        parsing_limits = {'gff_type': ['gene']}

    positions_by_gene = {}
    with open(os.path.expanduser(genefile)) as GENEFILE:
        for chromosome_record in GFF.parse(GENEFILE, limit_info=parsing_limits):
            for gene_record in chromosome_record.features:
                pos_info = (chromosome_record.id,) if include_chromosome else ()
                if include_strand:
                    pos_info += (GFF_strands[gene_record.strand],)
                # The helpers convert BCBio's 0-based end-exclusive positions
                # to 1-based end-inclusive ones.
                if coding_only:
                    try:
                        start_end = get_gene_start_end_excluding_UTRs(gene_record)
                    except (NoRNAError, MultipleRNAError):
                        if ignore_strange_cases:
                            continue
                        raise
                else:
                    start_end = get_feature_start_end(gene_record)
                pos_info += start_end
                positions_by_gene[gene_record.id] = pos_info
    return positions_by_gene
def shortrna_regions(mirna_gff, star_csv, seq_file):
    """Return miRNA sequences with corresponding guide and star regions.

    ``star_csv`` holds ``name, guide, star`` rows. For every feature in
    ``mirna_gff`` the generator yields ``(record_id, start, end, name)`` for
    the feature itself, followed by one
    ``(record_id, start, end, "<seq_name>_<guide|star>")`` tuple for each
    non-empty guide/star sequence whose name starts with the feature ID and
    which is found inside the feature's region of ``seq_file``.

    Raises NotImplementedError (after printing the details) when such a
    sequence cannot be located in the region.
    """
    seq_index = SeqIO.index(seq_file, "fasta")
    mirna_seqs = dict()
    with open(star_csv) as in_handle:
        for name, guide, star in csv.reader(in_handle):
            mirna_seqs[name] = (guide.strip(), star.strip())
    for rec in GFF.parse(mirna_gff):
        cur_seq = str(seq_index[rec.id].seq)
        for f in rec.features:
            name = f.qualifiers["ID"][0]
            start, end = (f.location.nofuzzy_start, f.location.nofuzzy_end)
            yield (rec.id, start, end, name)
            region = cur_seq[start:end]
            # items() works on Python 2 and 3; the match list is built before
            # yielding, as the original comprehension did.
            matches = [(n, g, s) for n, (g, s) in mirna_seqs.items()
                       if n.startswith(name)]
            for seq_name, guide, star in matches:
                for find_seq, ext in [(guide, "guide"), (star, "star")]:
                    if not find_seq:
                        continue
                    if f.strand == -1:
                        find_seq = str(Seq(find_seq).reverse_complement())
                    pos = region.find(find_seq)
                    if pos > -1:
                        yield (rec.id, start + pos, start + pos + len(find_seq),
                               "%s_%s" % (seq_name, ext))
                    else:
                        # Same text the old print statement produced.
                        print("%s %s %s %s %s %s" % (
                            f.strand, name, ext, pos, find_seq, region))
                        raise NotImplementedError
def rebase(parent, child, interpro=False, protein2dna=False):
    """Re-locate child features onto their parent features and print GFF.

    ``child`` is read through ``__get_features``; the result is used as a
    mapping from parent feature ID to a list of child features. For every
    record in ``parent``, features selected by ``feature_lambda`` (those
    whose ``ID`` qualifier is one of the mapping keys) have their child
    features passed through ``__update_feature_location``. The record is
    then written to stdout with only those child features and with empty
    annotations.

    When ``interpro`` is true, the ``status`` and ``Target`` qualifiers are
    removed from every child feature.
    """
    child_features = __get_features(child, interpro=interpro)

    for rec in GFF.parse(parent):
        replacement_features = []
        matching_parents = feature_lambda(
            rec.features,
            feature_test_qual_value,
            {
                'qualifier': 'ID',
                'attribute_list': child_features.keys(),
            },
            subfeatures=False)
        for feature in matching_parents:
            for x in child_features[feature.id]:
                # Then update the location of the actual feature
                __update_feature_location(x, feature, protein2dna)
                if interpro:
                    for y in ('status', 'Target'):
                        # A missing qualifier is fine; nothing else is hidden.
                        x.qualifiers.pop(y, None)
                replacement_features.append(x)
        # We do this so we don't include the original set of features that we
        # were rebasing against in our result.
        rec.features = replacement_features
        rec.annotations = {}
        GFF.write([rec], sys.stdout)
def t_ensembl_nested_features(self):
    """Test nesting of features with GFF2 files using transcript_id.
    """
    records = SeqIO.to_dict(GFF.parse(self._ensembl_file))
    chrom_features = records["I"].features
    assert len(chrom_features) == 2
    # The first top-level feature carries the nested sub-features.
    transcript = chrom_features[0]
    assert len(transcript.sub_features) == 32
def rebase(parent, child, interpro=False, protein2dna=False):
    """Re-locate child features onto their parent features and print GFF.

    ``child`` is read through ``__get_features``; the result is used as a
    mapping from parent feature ID to a list of child features. For every
    record in ``parent``, each top-level feature whose ``id`` is a key of
    that mapping has its child features passed through
    ``__update_feature_location``. The record is then written to stdout
    with only those child features.

    When ``interpro`` is true, the ``status`` and ``Target`` qualifiers are
    removed from every child feature.
    """
    child_features = __get_features(child, interpro=interpro)

    for rec in GFF.parse(parent):
        # TODO, replace with recursion in case it's matched against a
        # non-parent feature. We're cheating a bit here right now...
        replacement_features = []
        for feature in rec.features:
            if feature.id not in child_features:
                continue
            # TODO: update starts
            for x in child_features[feature.id]:
                # Then update the location of the actual feature
                __update_feature_location(x, feature, protein2dna)
                if interpro:
                    for y in ('status', 'Target'):
                        # A missing qualifier is fine; nothing else is hidden.
                        x.qualifiers.pop(y, None)
                replacement_features.append(x)
        # We do this so we don't include the original set of features that we
        # were rebasing against in our result.
        rec.features = replacement_features
        GFF.write([rec], sys.stdout)
def prepareSample(filter_matrix, gff_path):
    """Pad ``filter_matrix`` with randomly chosen genes from a GFF file.

    Every ``gene`` feature in ``gff_path`` is counted; locus tags that are
    not already keys of ``filter_matrix`` become candidates. Randomly drawn
    candidates are then added to ``filter_matrix`` with the value ``(0, 0)``.

    ``filter_matrix`` is modified in place and also returned.
    """
    random.seed()
    candidate_list = []
    gene_count = 0
    with open(gff_path, 'r') as handle:
        for record in GFF.parse(handle):
            for feature in record.features:
                if feature.type != 'gene':
                    continue
                locus_tag = feature.qualifiers['locus_tag'][0]
                gene_count += 1
                if locus_tag not in filter_matrix:
                    candidate_list.append(locus_tag)

    # int() keeps range() happy where round() returns a float (Python 2).
    countToAdd = int(round(gene_count / 2)) - len(filter_matrix)
    if countToAdd > 0:
        # NOTE(review): range(1, countToAdd) draws countToAdd - 1 genes, one
        # fewer than the variable name suggests; kept as-is, confirm intent.
        for i in range(1, countToAdd):
            list_id = random.randint(0, len(candidate_list) - 1)
            locus_str = candidate_list[list_id]
            filter_matrix[locus_str] = (0, 0)
            candidate_list.remove(locus_str)
    return filter_matrix
def read_gff_transcripts(fobj, fname="", min_exons=1, merge=0):
    """Read transcripts and their exons from a GFF source.

    For every ``mRNA``, ``transcript`` or ``inferred_parent`` feature found
    by ``_gff_type_iterator``, the ``exon`` sub-features are collected as
    ``[chrom, start, end, strand]`` with strand given as ``"+"`` or ``"-"``
    (a strand of ``None`` is reported as ``"+"``).

    Returns a list of ``[transcript_id, fname, exons]`` entries for
    transcripts that have at least ``min_exons`` exons. ``merge`` is not
    supported for GFF input; a value above zero only triggers a warning.
    """
    # Setup logging
    logger = logging.getLogger('pita')

    if merge > 0:
        logger.warning("Merging exons not yet implemented for GFF files!")

    smap = {"1": "+", 1: "+", "-1": "-", -1: "-", None: "+"}
    transcript_types = ['mRNA', 'transcript', 'inferred_parent']
    transcripts = []
    for rec in GFF.parse(fobj):
        chrom = rec.id
        for feature in rec.features:
            for gene in _gff_type_iterator(feature, transcript_types):
                exons = []
                for exon in gene.sub_features:
                    if exon.type != 'exon':
                        continue
                    # int() on the position object itself avoids relying on
                    # a separate ``.position`` attribute.
                    start = int(exon.location.start)
                    end = int(exon.location.end)
                    strand = smap[exon.strand]
                    exons.append([chrom, start, end, strand])
                logger.debug("%s: %s - %s exons", fname, gene.id, len(exons))
                if len(exons) >= min_exons:
                    transcripts.append([gene.id, fname, exons])
    return transcripts
def doWork(args):
    """Draw one feature track per GFF file and save the panel.

    For each file in ``args.gffs`` only the first parsed record is used.
    CDS features whose first ``product`` is ``hypothetical protein`` are
    drawn in grey and other CDS features in blue; ``source`` features are
    drawn fully transparent and all remaining features in black. The panel
    is written to ``args.outfile`` with an x-range spanning the longest
    record.
    """
    panel = Panel(fig_width=900, padding=25, grid=None, xmin=0)
    seq_length = 0
    for gff in args.gffs:
        # next() works on Python 2 and 3, unlike the iterator's .next().
        seqrecord = next(GFF.parse(gff))
        if len(seqrecord) > seq_length:
            seq_length = len(seqrecord)
        cds_track = tracks.BaseTrack(sort_by='collapse')
        for feature in seqrecord.features:
            if feature.type == 'CDS':
                # A CDS without a product is drawn like any named product.
                product = (feature.qualifiers.get('product') or [''])[0]
                if product == 'hypothetical protein':
                    col = '#BDBDBD'
                else:
                    col = '#2B8CBE'
                feat = features.GenericSeqFeature(feature, color_by_cm=False, fc=col)
                cds_track.append(feat)
            elif feature.type == 'source':
                cds_track.append(features.GenericSeqFeature(
                    feature, color_by_cm=False, alpha=0.0, fc='1.0', ec='1.0'))
            else:
                cds_track.append(features.GenericSeqFeature(
                    feature, color_by_cm=False, fc='0.0', ec='0.0'))
        panel.add_track(cds_track)
    panel.save(args.outfile, xmin=0, xmax=seq_length)
def main(gff_file, fasta_file=None):
    """Convert a GFF file (optionally with a FASTA file) to GenBank.

    The output is written next to the input, using the input path with its
    extension replaced by ``.gb``.

    Exits with status 64 after printing an error when either input file is
    empty or cannot be stat'ed.
    """
    # Use splitext to remove the extension of the original input file
    out_file = "%s.gb" % os.path.splitext(gff_file)[0]

    # Compare the file *size* with zero. The previous check compared the
    # stat result itself, which aborted every run that passed a FASTA file.
    inputs = [gff_file] if fasta_file is None else [gff_file, fasta_file]
    try:
        unusable = any(os.stat(path).st_size == 0 for path in inputs)
    except OSError:
        unusable = True
    if unusable:
        print("ERROR: Empty file provided or cannot stat files")
        raise SystemExit(64)

    # Parser will differ slightly if fasta file is given
    if fasta_file is None:
        gff_iter = GFF.parse(gff_file)  # Parser/generator object
    else:
        fasta_input = SeqIO.to_dict(SeqIO.parse(fasta_file, "fasta", generic_dna))  # Process fasta file
        gff_iter = GFF.parse(gff_file, fasta_input)  # Give fasta file to parser
    # One line to call all the checking function and to write in genbank format
    SeqIO.write(_check_gff(_fix_ncbi_id(gff_iter)), out_file, "genbank")
def read_gff_file(gfffile):
    """Map each top-level feature id to the id of the record it belongs to.

    Only ``gene``, ``mRNA`` and ``CDS`` lines of ``gfffile`` are parsed.
    """
    wanted_types = {"gff_type": ["gene", "mRNA", "CDS"]}
    locations = {}
    with open(gfffile) as gff_handle:
        for record in GFF.parse(gff_handle, limit_info=wanted_types):
            locations.update(
                (feature.id, record.id) for feature in record.features)
    return locations
def parse_gff(fname, ftype='unknown'):
    """Read the top-level features of a GFF file into a flat list.

    Returns a list of ``[record_id_lowercased, start, end, feature]``
    entries and prints how many were read. ``ftype`` is accepted but not
    used by this function.
    """
    from BCBio import GFF

    entries = []
    for e in GFF.parse(fname):
        for f in e.features:
            entries.append([e.id.lower(), int(f.location.start), int(f.location.end), f])

    # Formatted so the text is identical on Python 2 and Python 3.
    print("%d entries read from GFF file %s" % (len(entries), fname))
    return entries
def main(argv):
    """Append per-record sequences to per-cluster FASTA files.

    Options (``argv`` is the argument list without the program name):

    * ``-g`` / ``--gtffile``: GTF/GFF annotation file
    * ``-s`` / ``--seqfile``: FASTA file with the annotated sequences
    * ``-f`` / ``--feature``: ``cds`` (default) to extract the first
      feature, or ``contigs`` to use the whole record (reverse-complemented
      when the first feature is on strand -1)
    * ``-t`` / ``--seqtype``: ``contigs`` or ``consensus``; selects the
      output directory

    The cluster number is taken from the second ``_``-separated field of the
    first feature's ``gene_id``; records whose field does not start with
    ``c`` are skipped. Exits via ``sys.exit`` on bad options.
    """
    gtf_filename = ''
    seqfilename = ''
    feature = 'cds'
    seqtype = ''
    try:
        # "gtffile=" must be declared or --gtffile is rejected by getopt;
        # "blastfile=" is still accepted (and ignored) for compatibility.
        opts, args = getopt.getopt(
            argv, "hg:s:f:t:",
            ["gtffile=", "seqfile=", "blastfile=", "feature=", "seqtype="])
    except getopt.GetoptError:
        print('Type GetOrthologGroups.py -h for options')
        sys.exit(2)
    for opt, arg in opts:
        if opt == "-h":
            print('GetOrthologGroups.py -g <gtf_file> -s <seqfile> -f <feature> -t <seqtype>')
            sys.exit()
        elif opt in ("-g", "--gtffile"):
            gtf_filename = arg
        elif opt in ("-s", "--seqfile"):
            seqfilename = arg
        elif opt in ("-f", "--feature"):
            feature = arg
        elif opt in ("-t", "--seqtype"):
            seqtype = arg
    if seqtype != "contigs" and seqtype != "consensus":
        sys.exit("seqtype must be either 'contigs' or 'consensus'")
    cluster_dirs = {"contigs": "ContigClusters", "consensus": "ConsensusCLusters"}

    with open(seqfilename) as seqfilehandle:
        seq_dict = SeqIO.to_dict(SeqIO.parse(seqfilehandle, "fasta"))

    with open(gtf_filename) as gtf_filehandle:
        for SeqRec in GFF.parse(gtf_filehandle, base_dict=seq_dict):
            if not SeqRec.features:
                continue  # Skip sequences that are not in the GFF
            first_feature = SeqRec.features[0]
            gene_id = first_feature.qualifiers['gene_id'][0]
            cluster_num = gene_id.split('_')[1]
            if not cluster_num[0] == 'c':
                # Skip sequences that match reference sequences that are not
                # part of a cluster
                continue
            cluster_num = cluster_num[1:]
            cluster_filename = path.join(
                path.expanduser("~"), "Bioinformatics", "Selaginella",
                cluster_dirs[seqtype], "Cluster_" + cluster_num + ".fa")

            if feature == 'cds':
                subseq = first_feature.extract(SeqRec)
            elif feature == 'contigs':
                # Work on the record, not its bare sequence: only records
                # provide the .format("fasta") used below.
                if first_feature.location.strand == -1:
                    subseq = SeqRec.reverse_complement()
                else:
                    subseq = SeqRec
            else:
                sys.exit("feature %s not recognized" % feature)
            subseq.id = gene_id
            subseq.description = gene_id
            with open(cluster_filename, "a") as cluster_file:
                cluster_file.write(subseq.format("fasta"))
def main(infile, gff, outfile, ftype='CDS', use_phase=False, translate=False):
    """Write the sequences of all features of one type to a FASTA file.

    ``infile`` is the reference FASTA and ``gff`` the annotation. Each
    top-level feature is expanded through ``subfeatures`` before features
    of type ``ftype`` are extracted. With ``use_phase`` the first ``phase``
    bases are dropped (phase 0 when the qualifier is absent); with
    ``translate`` the extracted sequence is translated. Each output record
    is described as ``scaffold:start-end[strand]``.
    """
    ref_seq = SeqIO.to_dict(SeqIO.parse(infile, format="fasta"))

    # bcbio-gff codes exons, mRNA etc as subfeatures, which is deprecated in
    # biopython; flatten them into the record's feature list.
    annotated_scaffolds = []
    for scaffold_record in GFF.parse(gff, base_dict=ref_seq):
        flattened = []
        for top_feature in scaffold_record.features:
            flattened.extend(subfeatures(top_feature))
        scaffold_record.features = flattened
        annotated_scaffolds.append(scaffold_record)

    # The annotated records lack scaffolds without any GFF features, so
    # overlay them onto the full set of reference records.
    ref_seq.update(SeqIO.to_dict(annotated_scaffolds))

    sequences = []
    for scaffold, sequence in ref_seq.items():
        for feature in sequence.features:
            if feature.type != ftype:
                continue

            start = feature.location.start
            end = feature.location.end
            try:
                phase = int(feature.qualifiers['phase'][0])
            except KeyError:
                phase = 0
            strand_symbol = '-' if feature.location.strand == -1 else '+'

            fseq = feature.extract(sequence)
            if use_phase:
                fseq = fseq[phase:]

            fseq.id = feature.id
            fseq.name = feature.id
            fseq.description = "{}:{}-{}[{}]".format(
                scaffold, start, end, strand_symbol)

            if translate:
                fseq.seq = fseq.seq.translate()

            sequences.append(fseq)

    SeqIO.write(sequences, outfile, 'fasta')
def main(gff_file, ref_file, ofile, seq_type="CDS"):
    """Write the records produced by ``protein_recs`` to a FASTA file.

    ``ref_file`` is the reference FASTA, ``gff_file`` the annotation and
    ``ofile`` the FASTA output. ``seq_type`` is forwarded to
    ``protein_recs``.
    """
    with open(ref_file) as in_handle:
        fasta_recs = SeqIO.to_dict(SeqIO.parse(in_handle, "fasta"))

    # The unused splitext() of gff_file was dropped; its result was never read.
    gff_iter = GFF.parse(gff_file, fasta_recs)
    recs = protein_recs(check_gff(gff_iter), fasta_recs, seq_type)
    SeqIO.write(recs, ofile, "fasta")
def main(expterm, fasta, gff3):
    """Run ``transterm`` per GFF3 record and yield records with terminators.

    For every record in ``gff3`` the gene coordinates are written to
    ``tmp.coords`` and the record to ``tmp.fasta`` in the working
    directory, ``transterm -p expterm fasta tmp.coords`` is executed, and
    the record's features are replaced by one ``terminator`` feature per
    parsed output line. Each record is yielded after processing.

    Raises subprocess.CalledProcessError when ``transterm`` exits non-zero.
    """
    with open(fasta, 'r') as handle:
        seq_dict = SeqIO.to_dict(SeqIO.parse(handle, "fasta"))

    # Example line:
    # TERM 1 4342 - 4366 + F 93 -11.5 -3.22878 | opp_overlap 4342, overlap 4340 4357
    # Compiled once instead of once per record.
    ttre = re.compile(
        r'^ (?P<name>.*) (?P<start>\d+) - (?P<end>\d+)\s+'
        r'(?P<strand>[-+])\s+(?P<loc>[GFRTHNgfr]+)\s+'
        r'(?P<conf>\d+)\s+(?P<hp>[0-9.-]+)\s+(?P<tail>[0-9.-]+)'
    )

    # Build coords file
    with open(gff3, 'r') as handle:
        for rec in GFF.parse(handle, base_dict=seq_dict):
            with open('tmp.coords', 'w') as coords:
                for feat in rec.features:
                    if feat.type == 'gene':
                        coords.write('\t'.join([
                            feat.id,
                            str(feat.location.start + 1),
                            str(feat.location.end),
                            rec.id,
                        ]) + '\n')
            with open('tmp.fasta', 'w') as fasta_handle:
                SeqIO.write(rec, fasta_handle, 'fasta')

            # NOTE(review): the full input FASTA, not tmp.fasta, is passed to
            # transterm; kept unchanged, confirm this is intended.
            cmd = ['transterm', '-p', expterm, fasta, 'tmp.coords']
            # Text mode so the str operations below also work on Python 3.
            output = subprocess.check_output(cmd, universal_newlines=True)

            rec.features = []
            for batch in output.split('SEQUENCE ')[1:]:
                # Strip the header
                interesting = batch.split('\n')[2:]
                unformatted = [x for x in interesting if x.startswith(' ')][0::2]
                for terminator in unformatted:
                    m = ttre.match(terminator)
                    if not m:
                        continue
                    start = int(m.group('start')) - 1
                    end = int(m.group('end'))
                    # '-' is the reverse strand (-1); 0 would drop the strand.
                    strand = 1 if m.group('strand') == '+' else -1
                    rec.features.append(SeqFeature(
                        FeatureLocation(start, end, strand=strand),
                        type="terminator",
                        qualifiers={
                            "source": "TransTermHP_2.09",
                            "score": m.group('conf'),
                            "ID": m.group('name'),
                        }
                    ))
            yield rec
def main(gff_file, fasta_file, outfile, oformat):
    """Convert a GFF file plus FASTA sequences to ``oformat``.

    Records are passed through ``add_translation`` and ``check_gff``; for
    the ``genbank`` / ``gb`` formats they also go through ``fix_ncbi_id``
    before ``check_gff``.
    """
    fasta_input = SeqIO.to_dict(SeqIO.parse(fasta_file, "fasta", generic_dna))
    records = add_translation(GFF.parse(gff_file, fasta_input))
    if oformat in ('genbank', 'gb'):
        records = fix_ncbi_id(records)
    SeqIO.write(check_gff(records), outfile, oformat)
def load_gff(gff_file):
    """Returns a list of parsed gff.

    Prints a progress line and returns every record yielded by
    ``GFF.parse`` for ``gff_file``.
    """
    with open(gff_file, "r") as f:
        print("Parsing file {}...".format(gff_file))
        # Consume the parser while the file is still open.
        return list(GFF.parse(f))
def load_features(reference, feature_names=None):
    """Load named features from a GFF or GenBank reference file.

    A path containing ``.gff`` (case-insensitive) is read as GFF; anything
    else is assumed to be GenBank.

    GFF: only ``gene`` entries are read. A feature is keyed by its ``gene``
    qualifier, or by ``locus_tag`` if that does not apply. When
    ``feature_names`` is given, only features whose ``gene`` or
    ``locus_tag`` is in it are kept, and names not found are reported.

    GenBank: ``CDS`` features are keyed by ``locus_tag`` (preferred) or
    ``gene``, filtered by ``feature_names`` when given; the ``source``
    feature is stored under ``'nuc'``.

    Returns the name -> feature dictionary, or None when the file does not
    exist or BCBio.GFF is needed but not installed.
    """
    # read in appropriately whether GFF or Genbank
    # checks explicitly for GFF otherwise assumes Genbank
    if not os.path.isfile(reference):
        print("ERROR: reference sequence not found. looking for", reference)
        return None

    features = {}
    if '.gff' in reference.lower():
        # looks for 'gene' and 'locus_tag' as best for TB
        try:
            from BCBio import GFF  # Package name is confusing - tell user exactly what they need!
        except ImportError:
            print("ERROR: Package BCBio.GFF not found! Please install using \'pip install bcbio-gff\' before re-running.")
            return None
        limit_info = dict(gff_type=['gene'])

        with open(reference) as in_handle:
            for rec in GFF.parse(in_handle, limit_info=limit_info):
                for feat in rec.features:
                    fname = None
                    # check both tags; user may have used either. Features
                    # carrying neither tag are skipped instead of raising.
                    for tag in ("gene", "locus_tag"):
                        values = feat.qualifiers.get(tag)
                        if values and (feature_names is None or values[0] in feature_names):
                            fname = values[0]
                            break
                    if fname:
                        features[fname] = feat

        if feature_names is not None:
            for fe in feature_names:
                if fe not in features:
                    print("Couldn't find gene {} in GFF or GenBank file".format(fe))

    else:
        from Bio import SeqIO
        for feat in SeqIO.read(reference, 'genbank').features:
            if feat.type == 'CDS':
                if "locus_tag" in feat.qualifiers:
                    fname = feat.qualifiers["locus_tag"][0]
                    if feature_names is None or fname in feature_names:
                        features[fname] = feat
                elif "gene" in feat.qualifiers:
                    fname = feat.qualifiers["gene"][0]
                    if feature_names is None or fname in feature_names:
                        features[fname] = feat
            elif feat.type == 'source':
                # read 'nuc' as well for annotations - need start/end of whole!
                features['nuc'] = feat

    return features
def t_unescaped_semicolons(self):
    """Parse inputs with unescaped semi-colons.

    This is a band-aid to not fail rather than correct parsing, since
    the combined feature will not be maintained.
    """
    gff_path = os.path.join(self._test_dir, "unescaped-semicolon.gff3")
    records = SeqIO.to_dict(GFF.parse(gff_path))
    description = records['chr1'].features[0].qualifiers["Description"][0]
    assert description.startswith('osFTL6')
    assert description.endswith('protein, expressed')
def t_wb_cds_nested_features(self):
    """Nesting of GFF2 features with a flat CDS key value pair.
    """
    rec_dict = SeqIO.to_dict(GFF.parse(self._wb_alt_file))
    assert len(rec_dict) == 2
    # list() is required because dict views cannot be indexed on Python 3.
    features = list(rec_dict.values())[1].features
    assert len(features) == 1
    tfeature = features[0]
    assert tfeature.id == "cr01.sctg102.wum.2.1"
    assert len(tfeature.sub_features) == 7
def load_gff(db, gff_file, fasta_file, fetch_taxonomy=False, taxid=None):
    """Load GFF records, combined with FASTA sequences, into ``db``.

    Every record parsed from ``gff_file`` is passed through ``add_taxid``
    with ``taxid``; the results are handed to ``db.load`` in one call, with
    ``fetch_taxonomy`` forwarded as ``fetch_NCBI_taxonomy``.
    """
    from BCBio import GFF

    with open(fasta_file) as seq_handle:
        seq_dict = SeqIO.to_dict(SeqIO.parse(seq_handle, "fasta"))

    tagged_records = [
        add_taxid(record, taxid) for record in GFF.parse(gff_file, seq_dict)
    ]
    db.load(tagged_records, fetch_NCBI_taxonomy=fetch_taxonomy)
def t_gff2_iteration(self):
    """Test iterated features with GFF2 files, breaking without parents.
    """
    recs = list(GFF.parse(self._wormbase_file, target_lines=15))
    assert len(recs) == 4
    first_features = recs[0].features
    assert first_features[0].type == 'region'
    assert first_features[1].type == 'SAGE_tag'
    assert len(first_features[2].sub_features) == 29
#!/usr/bin/env python import sys import argparse from BCBio import GFF from gff3 import feature_lambda, feature_test_type if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('gff3', type=argparse.FileType("r"), help='GFF3 annotations') parser.add_argument('types', type=str, nargs='+', help='Feature type to filter on') parser.add_argument('--invert', action='store_true') args = parser.parse_args() for rec in GFF.parse(args.gff3): rec.features = feature_lambda( rec.features, feature_test_type, {'types': args.types}, invert=args.invert, subfeatures=False, ) GFF.write([rec], sys.stdout)
def parse_gff(self, inputGFF):
    '''
    get a list of contigs plus 0-indexed gene-coordinates and sense-ness of protein coding regions from a gff file.
    Only tested with prokka GFF files.

    For every record, a list of ``([start, end], sense)`` tuples is built
    for the Prodigal-sourced features and appended to
    ``self.contigs[record_id].annotations``. Records that are not in
    ``self.contigs`` only trigger a warning, as do genes without a common
    start or stop codon. Exits with status 1 when the file cannot be parsed.
    '''
    from BCBio import GFF
    import warnings

    start_codons = ("ATG", "GTG", "TTG")
    stop_codons = ("TAG", "TAA", "TGA")

    def rev_comp(string):
        string = string.upper()
        complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N': 'N'}
        bases = [complement[base] for base in list(string)]
        bases.reverse()
        return ''.join(bases)

    def from_prodigal(source):
        return 'Prodigal' in source or 'prodigal' in source

    # Probe the parser once up front. ``except Exception`` (instead of a bare
    # except) keeps KeyboardInterrupt and SystemExit from being reported as
    # a parse failure.
    try:
        with open(inputGFF) as in_handle:
            _ = next(GFF.parse(in_handle))
    except Exception:
        print ('Parsing of GFF failed. This is probably because your biopython version is too new. Try downgrading to 1.76 or older')
        sys.exit(1)

    with open(inputGFF) as in_handle:
        for rec in GFF.parse(in_handle):
            tmp = []
            for r in rec.features:
                feature_source = r.qualifiers['source'][0]
                if "minced" in feature_source or "Minced" in feature_source:
                    # This catches CRISPR repeats.
                    continue
                # Prokka not only finds protein sequences, but also t-/r-RNA
                # sequences. To keep only protein coding sequences, look for
                # Prodigal in the source entry of the first sub-feature when
                # there is one, otherwise of the feature itself.
                # NOTE: sub_features is a deprecated attribute; no other way
                # to reach this information was found.
                if r.sub_features:
                    prodigal_bool = from_prodigal(r.sub_features[0].qualifiers['source'][0])
                else:
                    prodigal_bool = from_prodigal(feature_source)
                if not prodigal_bool:
                    continue

                # Read coordinates and strand from the location object rather
                # than parsing its string representation.
                indices = [int(r.location.start), int(r.location.end)]
                sense = "-" if r.location.strand == -1 else "+"
                if sense == "-":
                    gene_seq = rev_comp(rec.seq[indices[0]:indices[1]])
                else:
                    gene_seq = rec.seq[indices[0]:indices[1]]
                if str(gene_seq[0:3]) not in start_codons:
                    warnings.warn(str(r.id) + " doesn't start with a common start codon. Beware. Continuing.")
                if str(gene_seq[-3:]) not in stop_codons:
                    warnings.warn(str(r.id) + " doesn't stop with a usual stop codon. Beware. Continuing.")
                tmp.append((indices, sense))
            if str(rec.id) in self.contigs:
                self.contigs[str(rec.id)].annotations.append(tmp)
            else:
                warnings.warn(str(rec.id) + " is not tracked by the BAMFile.")
def density_adjusted(fname, chr_sam, minlength, maxlength, path_wig, path_den, path_gff):
    '''Count read ends per position from a SAM file and write density files.

    Density will be a size separated dictionary = {length : [reads at 0, reads at 1, ....]}
    this makes it easier to select a size range later for analysis

    Reads with flag ``0`` are counted on the plus strand and reads with flag
    ``16`` on the minus strand; reads with flag ``4`` and reads outside
    ``minlength``..``maxlength`` are ignored. Per-chromosome counts are
    scaled to reads per million mapped reads before being written through
    ``ribo_util`` (binary files, pickles and wig files).

    Returns 0 without doing any work when a length bound is negative;
    otherwise returns None.
    '''
    if minlength < 0 or maxlength < 0:
        print("Error. Length input not valid.")
        return 0

    # dictionaries to hold read counts
    density_plus = {}
    density_minus = {}
    density_plus_sizesep = {}
    density_minus_sizesep = {}

    # Makes 2 sets of indices, one for all reads, and another for size separated:
    for sequence in GFF.parse(path_gff):
        size = len(sequence) + 20
        density_plus[sequence.id] = [0] * size
        density_minus[sequence.id] = [0] * size
        # NOTE(review): the size-separated arrays are not keyed by
        # chromosome, so they are sized for the last GFF record (unchanged).
        for length in range(minlength, maxlength + 1):
            density_plus_sizesep[length] = [0] * size
            density_minus_sizesep[length] = [0] * size

    mapped_reads = 0

    # SAM alignment lines are tab-delimited. The with-block closes the file,
    # which was previously left open.
    with open(chr_sam) as f_samfile:
        for read in csv.reader(f_samfile, delimiter='\t'):
            if not read or read[0].startswith('@'):   # Ignore header lines.
                continue
            if read[1] == '4':                        # A bowtie mismatch.
                continue

            chrom = read[2]              # chromosome identified for read in bowtie
            startp = int(read[3]) - 1    # SAM positions are 1-based
            length = len(read[9])        # length of the read sequence

            if length < 23:
                length_shift = 24 - length
            else:
                length_shift = 0

            if chrom not in density_plus:
                # Skip the read instead of failing with KeyError further down.
                print("Error: Bowtie index and GFF do not match")
                continue

            # Filter to get rid of reads of particular length.
            if length < minlength or length > maxlength:
                continue

            mapped_reads += 1

            # Note that Bowtie reverse complements any sequence aligning to
            # the reverse strand, so read[3] is the 3'-end of minus strand
            # reads. 16 is the minus strand, 0 is the plus strand.
            if read[1] == '16':
                start = startp - length_shift
                density_minus[chrom][start] += 1
                density_minus_sizesep[length][start] += 1
            if read[1] == '0':
                start = startp + length - 1 + length_shift
                density_plus[chrom][start] += 1
                density_plus_sizesep[length][start] += 1

    path_oldformat = path_den + "binary/"
    if not os.path.exists(path_oldformat):
        os.makedirs(path_oldformat)

    # Scale every chromosome (not only the last GFF record) to reads per
    # million; with no mapped reads the counts are all zero and are only
    # converted to float.
    for chrom in density_plus:
        if mapped_reads:
            density_plus[chrom] = [float(i) * 1000000 / float(mapped_reads) for i in density_plus[chrom]]
            density_minus[chrom] = [float(i) * 1000000 / float(mapped_reads) for i in density_minus[chrom]]
        else:
            density_plus[chrom] = [float(i) for i in density_plus[chrom]]
            density_minus[chrom] = [float(i) for i in density_minus[chrom]]

    ribo_util.writebin(density_plus, path_oldformat + fname + "_plus_")
    ribo_util.makePickle(density_plus, path_den + "plus")
    ribo_util.makePickle(density_plus_sizesep, path_den + "plus_sizesep")
    ribo_util.countstowig(density_plus, path_wig + "_plus")

    ribo_util.writebin(density_minus, path_oldformat + fname + "_minus_")
    ribo_util.makePickle(density_minus, path_den + "minus")
    ribo_util.makePickle(density_minus_sizesep, path_den + "minus_sizesep")
    ribo_util.countstowig(density_minus, path_wig + "_minus")
def find_lipoprotein(gff3_file, fasta_genome, lipobox_mindist=10, lipobox_maxdist=60):
    """Annotate genes whose first CDS translation contains a lipobox motif.

    For every record in ``gff3_file`` (sequences taken from
    ``fasta_genome``), the first CDS of each gene is translated with table
    11 and searched for the lipobox pattern located
    ``lipobox_mindist``..``lipobox_maxdist`` residues from the start. For
    each match a ``Lipobox`` sub-feature covering the last four matched
    residues is added to the gene. Genes whose CDS cannot be translated are
    skipped.

    Yields a one-element list ``[record]`` per input record, with the
    record's features replaced by the genes that matched.
    """
    seq_dict = SeqIO.to_dict(SeqIO.parse(fasta_genome, "fasta"))

    CASES = [
        re.compile('^.{%s,%s}[ACGSILMFTV][^REKD][GASNL]C' % (lipobox_mindist, lipobox_maxdist)),
        # re.compile('^.{%s,%s}AWAC' % (lipobox_mindist, lipobox_maxdist)),
        # Make sure to not have multiple cases that share matches, will
        # introduce duplicate features into gff3 file
    ]

    for record in GFF.parse(gff3_file, base_dict=seq_dict):
        good_features = []

        genes = list(feature_lambda(
            record.features, feature_test_type, {'type': 'gene'}, subfeatures=True))
        for gene in genes:
            cdss = list(feature_lambda(
                gene.sub_features, feature_test_type, {'type': 'CDS'}, subfeatures=False))
            if len(cdss) == 0:
                continue

            # Someday this will bite me in the arse.
            cds = cdss[0]

            # Untranslatable CDSs are skipped on purpose; ``Exception`` rather
            # than a bare except so Ctrl-C and sys.exit() still propagate.
            try:
                tmpseq = str(cds.extract(record.seq).translate(table=11, cds=True)).replace("*", "")
            except Exception:
                continue

            for case in CASES:
                m = case.search(tmpseq)
                if not m:
                    continue
                # Convert the last four matched residues to nucleotide
                # coordinates, counted from the CDS start on its own strand.
                if cds.location.strand > 0:
                    start = cds.location.start + (3 * (m.end() - 4))
                    end = cds.location.start + (3 * m.end())
                else:
                    start = cds.location.end - (3 * (m.end() - 4))
                    end = cds.location.end - (3 * m.end())

                tmp = SeqFeature(
                    FeatureLocation(min(start, end), max(start, end),
                                    strand=cds.location.strand),
                    type='Lipobox',
                    qualifiers={
                        'source': 'CPT_LipoRy',
                        'ID': '%s.lipobox' % get_id(gene),
                    })
                tmp.qualifiers['sequence'] = str(tmp.extract(record).seq.translate())

                gene.sub_features.append(tmp)
                good_features.append(gene)

        record.features = good_features
        yield [record]
full = r[2] genes.add(dwg.rect(insert=(10, y), size=(full, 10), fill='black')) y += 15 for j, gene in enumerate(rec.features): # draw each gene in blue x = 10 + gene.location.start length = gene.location.end - gene.location.start genes.add(dwg.rect(insert=(x, y), size=(length, 10), fill='blue')) for sf in gene.sub_features: if sf.type == 'Shine_Dalgarno_sequence': x = 10 + sf.location.start length = sf.location.end - sf.location.start genes.add(dwg.rect(insert=(x, y), size=(length, 10), fill='blue')) y += 15 dwg.save() y += 20 if __name__ == '__main__': parser = argparse.ArgumentParser(description='output svgs for start sites') parser.add_argument('gff3', type=argparse.FileType("r"), help='gff3 file') args = parser.parse_args() records = GFF.parse(args.gff3) draw(records)
def load_gff3(
        self,
        organism,
        gff3,
        source=None,
        batch_size=1,
        test=False,
        use_name=False,
        disable_cds_recalculation=False,
        timing=False,
):
    """
    Load a full GFF3 into annotation track

    :type organism: str
    :param organism: Organism Common Name

    :type gff3: str
    :param gff3: GFF3 to load

    :type source: str
    :param source: URL where the input dataset can be found.

    :type batch_size: int
    :param batch_size: Size of batches before writing

    :type test: bool
    :param test: Run in dry run mode

    :type use_name: bool
    :param use_name: Use the given name instead of generating one.

    :type disable_cds_recalculation: bool
    :param disable_cds_recalculation: Disable CDS recalculation and instead use the one provided

    :type timing: bool
    :param timing: Output loading performance metrics

    :rtype: str
    :return: Loading report
    """
    # Resolve the organism by common name or by (stringified) id; exactly
    # one match is required.
    organisms = self._wa.organisms.get_organisms()
    org_ids = []
    for org in organisms:
        if organism == org['commonName'] or organism == str(org['id']):
            org_ids.append(org['id'])

    if len(org_ids) == 0:
        raise Exception("Organism name or id not found [" + organism + "]")

    if len(org_ids) > 1:
        raise Exception("More than one organism found for [" + organism + "]. Use an organism ID instead: " + str(org_ids))

    total_features_written = 0
    start_timer = default_timer()
    if timing:
        print('Times are in seconds.  If batch-size > 1 then .(total_batch_time/avg_feature_time)')

    all_processed = {'top-level': [], 'transcripts': []}
    loading_status = {}
    for rec in GFF.parse(gff3):
        self.set_sequence(organism, rec.id)
        try:
            log.info("Processing %s with features: %s" % (rec.id, rec.features))
            processed = self._process_gff_entry(
                rec,
                source=source,
                disable_cds_recalculation=disable_cds_recalculation,
                use_name=use_name)
            all_processed['top-level'].extend(processed['top-level'])
            all_processed['transcripts'].extend(processed['transcripts'])
            # Counts one per successfully processed GFF record.
            total_features_written += 1
            written_top = self._check_write(
                batch_size, test, all_processed['top-level'],
                FeatureType.FEATURE, timing)
            written_transcripts = self._check_write(
                batch_size, test, all_processed['transcripts'],
                FeatureType.TRANSCRIPT, timing)

            # A non-empty result means the pending batch was flushed.
            if len(written_top):
                all_processed['top-level'] = []
                loading_status = {**loading_status, **written_top}
            if len(written_transcripts):
                all_processed['transcripts'] = []
                loading_status = {**loading_status, **written_transcripts}

        except Exception as e:
            # Keep going with the next record, but report why this one
            # failed (first line of the message only).
            msg = str(e)
            if '\n' in msg:
                msg = msg[0:msg.index('\n')]
            log.error("Failed to load features from %s: %s" % (rec.id, msg))

    # Write the rest of things to write (ignore batch_size)
    written_top = self._check_write(0, test, all_processed['top-level'],
                                    FeatureType.FEATURE, timing)
    written_transcripts = self._check_write(0, test,
                                            all_processed['transcripts'],
                                            FeatureType.TRANSCRIPT, timing)

    if len(written_top):
        all_processed['top-level'] = []
        loading_status = {**loading_status, **written_top}
    if len(written_transcripts):
        all_processed['transcripts'] = []
        loading_status = {**loading_status, **written_transcripts}

    log.info("Finished loading")
    if timing:
        end_timer = default_timer()
        duration = end_timer - start_timer
        print(
            str(duration) + " seconds to write "
            + str(total_features_written) + " features")
        # Nothing loaded -> no meaningful average (and no ZeroDivisionError).
        if total_features_written:
            print("Avg write time (s) per feature: "
                  + str('{:.3f}'.format(duration / total_features_written)))

    return loading_status
def catch_middle_stop(gff3_files, genome_assembly_file, output_dir):
    """Flag suspicious gene models in each GFF3 and write summary files.

    For every mRNA, the CDS sub-features are concatenated and translated. An
    mRNA is marked bad when it has an intron shorter than 10 bp, an internal
    stop codon, more than 50% ``X`` residues, or a CDS that begins or ends
    with ``N``.

    :param gff3_files: iterable of GFF3 paths (one "run" per file; the run
        name is the file's basename without extension).
    :param genome_assembly_file: FASTA file with the genome sequences.
    :param output_dir: directory receiving ``bad_genes_stats.txt`` and the
        pickled ``D_bad`` dictionary (``D_bad.p``).
    """
    D_bad = defaultdict(bool)
    D_stop = defaultdict(int)
    D_toomanyX = defaultdict(int)
    D_gap = defaultdict(int)
    D_intron = defaultdict(int)
    # Every run gets a stats column, even if it has no internal stops.
    run_names = []

    for gff3_file in gff3_files:
        prefix = os.path.basename(os.path.splitext(gff3_file)[0])
        if prefix not in run_names:
            run_names.append(prefix)

        # Import genome sequence.  Re-read for each GFF3 on purpose: the
        # parsed records are handed to GFF.parse as base_dict.
        # NOTE(review): presumably the parser attaches features to these
        # records, so sharing them across files would mix runs -- confirm.
        with open(genome_assembly_file) as in_seq_handle:
            seq_dict = SeqIO.to_dict(SeqIO.parse(in_seq_handle, 'fasta'))

        # Import GFF3
        with open(gff3_file) as in_handle:
            for rec in GFF.parse(in_handle, base_dict=seq_dict):
                for gene_feature in rec.features:
                    for mrna_feature in gene_feature.sub_features:
                        mrna_key = (prefix, mrna_feature.id)
                        cds_features = [
                            f for f in sorted(mrna_feature.sub_features,
                                              key=lambda x: x.location.start)
                            if f.type == 'CDS'
                        ]
                        if not cds_features:
                            # Nothing to translate (reduce() would fail).
                            continue

                        seq_cds = [
                            rec.seq[f.location.start:f.location.end]
                            for f in cds_features
                        ]
                        coords = [(f.location.start, f.location.end)
                                  for f in cds_features]

                        # Count every intron shorter than 10 bp.
                        for (_, intron_start), (intron_end, _) in zip(coords, coords[1:]):
                            if intron_end - intron_start < 10:
                                D_bad[mrna_key] = True
                                D_intron[prefix] += 1

                        gene_seq = reduce(operator.add, seq_cds)

                        # If strand is -, get reverse complementary sequence;
                        # the first coding exon is then the last by position.
                        if mrna_feature.strand == -1:
                            gene_seq = gene_seq.reverse_complement()
                            phase = cds_features[-1].qualifiers['phase']
                        else:
                            phase = cds_features[0].qualifiers['phase']

                        phase = int(phase[0])
                        gene_seq = gene_seq[phase:]
                        protein_seq = str(gene_seq.translate())

                        # Check protein seq has stop codon in the middle
                        protein_seq2 = re.sub(r'\*$', '', protein_seq)
                        if protein_seq2.count('*') > 0:
                            D_bad[mrna_key] = True
                            D_stop[prefix] += 1

                        # Check if translation consists of more than 50% X
                        # residues.  Written without division so it is exact
                        # under Python 2 integer division and safe for an
                        # empty translation.
                        len_prot = len(protein_seq2)
                        len_X = protein_seq2.count('X')
                        if 2 * len_X > len_prot:
                            D_bad[mrna_key] = True
                            D_toomanyX[prefix] += 1

                        # Check if feature begins or ends in gap
                        gene_seq2 = str(gene_seq).lower()
                        if gene_seq2.startswith('n') or gene_seq2.endswith('n'):
                            D_bad[mrna_key] = True
                            D_gap[prefix] += 1

    outfile_stats = os.path.join(output_dir, 'bad_genes_stats.txt')
    stop_list = [str(D_stop[x]) for x in run_names]
    toomanyX_list = [str(D_toomanyX[x]) for x in run_names]
    gap_list = [str(D_gap[x]) for x in run_names]
    intron_list = [str(D_intron[x]) for x in run_names]
    with open(outfile_stats, 'w') as outhandle_stats:
        outhandle_stats.write('{}\t{}\n'.format('type', '\t'.join(run_names)))
        outhandle_stats.write('internal_stop\t{}\n'.format('\t'.join(stop_list)))
        outhandle_stats.write('start_with_gap\t{}\n'.format('\t'.join(gap_list)))
        outhandle_stats.write('toomanyX\t{}\n'.format('\t'.join(toomanyX_list)))
        outhandle_stats.write('short_intron\t{}\n'.format('\t'.join(intron_list)))

    D_bad_pickle = os.path.join(output_dir, 'D_bad.p')
    with open(D_bad_pickle, 'wb') as pickle_handle:
        cPickle.dump(D_bad, pickle_handle)
def main(gff_file, fasta_file):
    """Convert a GFF file plus its FASTA sequences into a GenBank file.

    The output is written next to ``gff_file`` with the extension replaced
    by ``.gb``.
    """
    stem = os.path.splitext(gff_file)[0]
    genbank_path = "%s.gb" % stem
    sequences = SeqIO.to_dict(SeqIO.parse(fasta_file, "fasta", generic_dna))
    # Records are streamed through the ID fixer and the checker lazily.
    records = _fix_ncbi_id(GFF.parse(gff_file, sequences))
    SeqIO.write(_check_gff(records), genbank_path, "genbank")
def reformat(data):
    """Write every GFF record in ``data`` to stdout with annotations cleared."""
    for rec in GFF.parse(data):
        rec.annotations = dict()
        # One write call per record, each with a single-record list.
        GFF.write([rec], sys.stdout)
def gff3_to_genbank(gff_file, fasta_file, transltbl):
    """Yield ``handle_record(record, transltbl)`` for each GFF record.

    Sequences for the records are taken from ``fasta_file``.
    """
    sequences = SeqIO.to_dict(SeqIO.parse(fasta_file, "fasta", generic_dna))
    for rec in GFF.parse(gff_file, sequences):
        yield handle_record(rec, transltbl)
def mutate(gff3, fasta, changes, customSeqs, new_id):
    """Assemble a new record from slices of a genome and custom sequences.

    Each entry of ``changes`` is either ``"start,end,strand"`` (1-based,
    inclusive region of the input genome; any strand other than ``+`` is
    reverse-complemented) or the id of a sequence in ``customSeqs``.

    :param gff3: GFF3 input for ``GFF.parse`` (only the first record is used).
    :param fasta: FASTA input with the genome sequence.
    :param changes: ordered list of change strings, see above.
    :param customSeqs: FASTA input with custom sequences; if it cannot be
        read, no custom sequences are available.
    :param new_id: id given to the assembled record.
    :return: generator yielding a single ``(new_record, chain)`` tuple, where
        ``chain`` lists, per copied region, the source and target coordinates.
    """
    # Change Language
    # - we can only accept ONE genome as an input. (TODO: support multiple?)
    # - we can only build ONE genome as an output. (TODO: support multiple?)
    # - must allow selection of various regions
    # '1,1000,+ 40,100,- custom_seq_1'
    try:
        custom_seqs = SeqIO.to_dict(SeqIO.parse(customSeqs, "fasta"))
    except Exception:
        # Best effort: custom sequences are optional.  Narrowed from a bare
        # ``except`` so Ctrl-C / SystemExit are not swallowed.
        custom_seqs = {}
    seq_dict = SeqIO.to_dict(SeqIO.parse(fasta, "fasta"))
    # Pull first and only record
    rec = list(GFF.parse(gff3, base_dict=seq_dict))[0]
    # Create a "clean" record
    new_record = copy.deepcopy(rec)
    new_record.id = new_id
    new_record.seq = Seq('')
    new_record.features = []
    new_record.annotations = {}

    def update_location(feature):
        # Shift a feature (and, recursively, its sub-features) by the current
        # length of the record being built.
        feature.location._start += len(new_record)
        feature.location._end += len(new_record)

        if hasattr(feature, 'sub_features'):
            for sf in feature.sub_features:
                update_location(sf)

    # Process changes.
    chain = []
    for change in changes:
        if ',' not in change:
            new_record.seq += custom_seqs[change].seq
            continue

        (start, end, strand) = change.split(',')
        start = int(start) - 1
        end = int(end)

        # Make any complaints
        broken_feature_start = list(feature_lambda(rec.features, feature_test_contains, {'index': start}, subfeatures=False))
        if len(broken_feature_start) > 0:
            pass
            # log.info("DANGER: Start index chosen (%s) is in the middle of a feature (%s %s). This feature will disappear from the output", start, broken_feature_start[0].id, broken_feature_start[0].location)
        broken_feature_end = list(feature_lambda(rec.features, feature_test_contains, {'index': end}, subfeatures=False))
        if len(broken_feature_end) > 0:
            pass
            # log.info("DANGER: End index chosen (%s) is in the middle of a feature (%s %s). This feature will disappear from the output", end, broken_feature_end[0].id, broken_feature_end[0].location)

        # Ok, fetch features
        if strand == '+':
            tmp_req = rec[start:end]
        else:
            tmp_req = rec[start:end].reverse_complement(
                id=True, name=True, description=True, features=True,
                annotations=True, letter_annotations=True, dbxrefs=True
            )

        for feature in tmp_req.features:
            update_location(feature)

        chain.append([
            rec.id,
            start + 1,
            end,
            strand,
            new_record.id,
            len(new_record) + 1,
            len(new_record) + (end - start),
            '+'
        ])

        new_record.seq += tmp_req.seq
        # NB: THIS MUST USE BIOPYTHON 1.67. 1.68 Removes access to
        # subfeatures, which means you will only get top-level features.
        new_record.features += tmp_req.features

    yield new_record, chain
def readGFF(gff):
    """Return the ``gene`` features found in the GFF file at path ``gff``.

    Genes from every record in the file are collected into one list (the
    previous code kept only the last record's genes and raised
    ``UnboundLocalError`` when the file contained no records).  An empty
    list is returned when nothing is found.
    """
    genes = []
    with open(gff, 'r') as gff_file:
        for rec in GFF.parse(gff_file, limit_info=dict(gff_type=["gene"])):
            genes.extend(rec.features)
    return genes
db = MySQLdb.connect(host="localhost", db="hg19", read_default_file="~/.my.cnf") cursor = db.cursor(MySQLdb.cursors.DictCursor) entryNumber = args.entryNumber # # Read over the gff file collecting data for ID mapping. # map the ID to what will be the displayed ID: the combination of the # name and accession number. For miRNAs, also record the ID of the # pre-miRNA that it was derived from. miRnaToPreMiRna = dict() idToLabel = dict() gffIter = GFF.parse(args.inputGff) for chrom in gffIter: for hit in chrom.features: id = hit.id label = "%s|%s" % (hit.qualifiers["Name"][0], hit.qualifiers["ID"][0]) idToLabel[id] = label if hit.type == "miRNA": miRnaToPreMiRna[hit.id] = hit.qualifiers["derives_from"][0] # # Read the bed file containing the GRCh37-lite coordinates. # While converting each line to GAF format, replace the ID # with the miRNA name, look up the name of the pre-miRNA that # the miRNA is derived from, and note that in the featureInfo field. miRnaBedFp = open(args.miRnaBed) for line in miRnaBedFp:
def from_TriTrypDB(name, gff, fasta, tax, tmp_dir=None):
    """Load a TriTrypDB genome (GFF + FASTA) into the document store.

    Creates the genome document from the first contig, saves one contig
    document per GFF record, then inserts a ``Protein`` document per
    ``(protein, CDS feature)`` pair produced by ``tritryp_protein_iter``, in
    batches, and finally runs ``_common_annotations``.

    :param name: genome / organism name; also used for the default tmp dir.
    :param gff: GFF input for ``GFF.parse``.
    :param fasta: FASTA input read with ``sp``.
    :param tax: taxonomy argument forwarded to ``BioDocFactory.create_genome``.
    :param tmp_dir: working directory; defaults to ``/tmp/<name>/``.
    :raises Exception: if a protein sequence is longer than 30000 residues.
    """
    genome = {x.id: x for x in sp(fasta)}
    from BCBio import GFF
    import re

    # EC number: four dot-separated fields, each after the first may be "-".
    # The last field previously matched a single character only, which
    # rejected e.g. "1.1.1.27".
    ec_pattern = re.compile(r'^[0-9]+\.[0-9\-]+\.[0-9\-]+\.[0-9\-]+$')
    # Ontology roots carry no information and are dropped.
    root_gos = ["GO:0008150", "GO:0003674", "GO:0005575"]

    annotation = list(GFF.parse(gff, base_dict=genome))
    contig = annotation[0]
    seqCol = BioDocFactory.create_genome(name, contig, tax, Tax)
    seqCol.save()
    if not tmp_dir:
        tmp_dir = "/tmp/" + name + "/"
    mkdir(tmp_dir)
    gene_ids = {}
    with tqdm(annotation) as pbar:
        for contig in pbar:
            pbar.set_description(contig.id)
            # Sequences above 15 Mbp are not stored with the contig.
            if len(contig.seq) > 15000000:
                contig.seq = ""
            contigDoc, gene_ids2 = BioDocFactory.create_contig(
                contig, seqCol,
                type_map={
                    "rRNA": "rRNA",
                    "ncRNA": "ncRNA",
                    NCBI.f_mRNA: "gene",
                    "exon": "exon",
                    "gene": "gene",
                    NCBI.f_CDS: NCBI.f_CDS,
                    "tRNA": "tRNA",
                    "tmRNA": "tmRNA",
                    "snoRNA": "snoRNA",
                    "three_prime_UTR": "three_prime_UTR",
                    "five_prime_UTR": "five_prime_UTR",
                })
            gene_ids.update(gene_ids2)
            contigDoc.save()

    prots = []
    with tqdm(tritryp_protein_iter(annotation)) as pbar:
        for (protein, cds_f) in pbar:
            protDoc = Protein(seq=str(protein.seq), name=protein.id)

            # First available of description / Note / product.
            if "description" in cds_f.qualifiers:
                protein_description = cds_f.qualifiers['description'][0]
            elif "Note" in cds_f.qualifiers:
                protein_description = cds_f.qualifiers['Note'][0]
            elif "product" in cds_f.qualifiers:
                protein_description = cds_f.qualifiers['product'][0]
            else:
                protein_description = ""
            protDoc.description = protein_description

            gos = []
            if "Ontology_term" in cds_f.qualifiers:
                gos = [
                    x.lower() for x in cds_f.qualifiers["Ontology_term"]
                    if "GO:" in x and (x not in root_gos)
                ]

            # The first word of the Note is taken as a candidate EC number.
            note = cds_f.qualifiers["Note"][0].split(
                " ")[0] if "Note" in cds_f.qualifiers else ""
            ecs = ["ec:" + note] if ec_pattern.match(note) else []
            ontologies = list(set(ecs + gos))

            protDoc.gene = [protein.id]
            protDoc.ontologies = ontologies
            protDoc.alias = [protein.id]

            if len(protDoc.seq) > 30000:
                raise Exception("No existen proteinas tan largas...")

            protDoc.gene_id = gene_ids[protein.id]
            protDoc.organism = name
            protDoc.auth = str(BioMongoDB.demo_id)
            protDoc.seq_collection_id = seqCol
            prots.append(protDoc)
            # Flush to the database every 1000 iterations of the progress bar.
            if pbar.n and ((pbar.n % 1000) == 0):
                Protein.objects.insert(prots)
                prots = []
    if prots:
        Protein.objects.insert(prots)

    _common_annotations(name, tmp_dir)
model_to_id = read_gene_annotation(args.gene_annotation) else: model_to_id = False #print model_to_id #exit() features = run(model_to_id, args.create_genes) '''Get gene filter if required''' if args.gene_filter: filt_genes = [x.strip("\n") for x in open(args.gene_filter)] else: filt_genes = False print len(features), "frameDP updates" in_file = args.input in_handle = open(in_file) rec_list = [] for rec in GFF.parse(in_handle): #print rec new_features = [] for gene in rec.features: if gene.id == "temp_gene_198": pass #print "orig",gene gene = parse_id(gene.id, features, gene, model_to_id, args.create_genes, filt_genes) if gene: for g in gene: if g.id == "Potri.007N006900": print g if g.id == "temp_gene_198": print g exit() if args.create_genes and gene:
def _open_gff(path):
    """Open a GFF file, transparently handling a ``.gz`` suffix."""
    # NOTE(review): gzip.open defaults to binary mode; under Python 3 the
    # str-based line checks in main() would need text mode -- confirm the
    # target interpreter before changing it.
    if path.endswith('.gz'):
        return gzip.open(path)
    return open(path)


def main():
    """Main script body

    Combines the site features of all GFF files named on the command line:
    sites of the same gene ending at the same coordinate have their scores
    summed, sites are renumbered per gene, and the result is written to
    ``<common prefix of inputs>combined.gff``.
    """
    # Do a quick check of inputs
    gffs = sys.argv[1:]
    if not gffs:
        sys.exit("No input file specified")

    for gff in gffs:
        if not os.path.isfile(gff):
            sys.exit("Invalid input file specified: %s" % gff)

    # Create a list to store output entries
    combined = []

    # Get chromosome entries for the first input file; as of TriTrypDB 29, the
    # GFF file no longer includes chromosome entries so we will use dicts
    # instead.
    chromosomes = {}

    with _open_gff(gffs[0]) as fp:
        # Get GFF header and chromosome entries from first input file (it's
        # the same for all input files)
        for line in fp:
            if line.startswith("#") or "\tchromosome\t" in line:
                combined.append(line)

        # Reset read counter
        fp.seek(0)

        for entry in GFF.parse(fp):
            if len(entry.features) > 0 and entry.features[0].type in [
                    'chromosome', 'contig'
            ]:
                chromosomes[entry.id] = entry

    # Add sites from all GFFs
    for gff in gffs:
        # The handle is closed as soon as the file has been consumed.
        with _open_gff(gff) as fp:
            for entry in GFF.parse(fp):
                for feature in entry.features:
                    # Add chromosome key if it doesn't already exist (needed
                    # for TriTrypDB 29+)
                    if entry.id not in chromosomes:
                        chromosomes[entry.id] = Chromosome()
                    chromosomes[entry.id].features.append(feature)

    # combined sites
    sites = {}

    # Sort and combine sites and add to results
    for ch_id in chromosomes:
        print("Parsing sites for %s" % ch_id)
        chromosomes[ch_id].features.sort()

        sites[ch_id] = {}

        for site in chromosomes[ch_id].features:
            # skip chromosomes
            if site.type in ['chromosome', 'contig']:
                continue

            # site info (pop() consumes the qualifier values)
            gene_id = site.qualifiers['Name'].pop()
            desc = ",".join(site.qualifiers['description'])
            score = int(site.qualifiers['score'].pop())
            source = site.qualifiers['source'].pop()
            feature_type = site.type

            if gene_id not in sites[ch_id]:
                sites[ch_id][gene_id] = {}

            # Sites are keyed by their end coordinate.
            if site.location.end not in sites[ch_id][gene_id]:
                # new entry
                sites[ch_id][gene_id][site.location.end] = {
                    'Name': gene_id,
                    'source': source,
                    'type': feature_type,
                    'score': score,
                    'strand': site.strand,
                    'description': desc
                }
            else:
                # updating score
                sites[ch_id][gene_id][site.location.end]['score'] += score

    # Add combined rows
    for ch_id in sites:
        print("Combining sites for %s" % ch_id)
        # iterate over genes on each chromosome
        for gene_id in sites[ch_id]:
            site_counter = 1

            # iterate over sites for gene
            for loc in sorted(sites[ch_id][gene_id].keys()):
                site = sites[ch_id][gene_id][loc]

                # feature type abbreviation
                if site['type'] == 'trans_splice_site':
                    feature_type = 'sl'
                else:
                    feature_type = 'polya'

                # assign a new site id
                site_id = "%s.%s.%d" % (site['Name'], feature_type,
                                        site_counter)

                # description
                desc = "ID=%s;Name=%s;description=%s" % (
                    site_id, site['Name'], site['description'])

                # create output row
                strand = '+' if site['strand'] == 1 else '-'
                row = "\t".join([
                    ch_id, site['source'], site['type'],
                    str(loc), str(loc),
                    str(site['score']), strand, '.', desc
                ])
                combined.append(row + "\n")

                site_counter += 1

    # write combined gff
    #gff_suffix = commonsuffix(gffs).replace('.gz', '')
    gff_suffix = ".gff"
    outfile = "".join([os.path.commonprefix(gffs), 'combined', gff_suffix])

    with open(outfile, 'w') as output:
        output.writelines(combined)

    print("Done!")
def GFF_to_dict(paths_in, gff_settings):
    '''Parse gff into dict:
        - feat_of_interest = what to look for in gff (protein_coding, tRNA, rRNA, etc)
        - name_qual = qualifier for alias/gene name (Name, gene_id)
        - name_qual_alt = alternative qualifier, if none, set as 'none'
        - biotype_qual = qualifier for type of feature (biotype, etc)

    These values must correspond to values in the GFF.

    The collected columns are pickled with ribo_util.makePickle, displayed as
    a pandas DataFrame and saved as <output path>.csv.  Nothing is returned.
    '''
    '''Unload gff_settings'''
    path_out = gff_settings['path_out']
    feat_of_interest = gff_settings['feat_of_interest']  #all, protein_coding, tRNA, rRNA
    name_qual = gff_settings['name_qual']
    name_qual_alt = gff_settings['name_qual_alt']
    remove_genes = gff_settings['remove_genes']
    # aSD_seq = gff_settings['aSD_seq']
    # NOTE(review): path_badgenes is only referenced by commented-out code below.
    path_badgenes = paths_in['path_badgenes']
    '''Output path can be defined, or use 0 to set as the annotation file for my main pipeline'''
    if path_out == 0:
        path_gff_dict = paths_in['path_gff_dict']
    else:
        path_gff_dict = path_out
    '''Parse GFF using BCBio'''
    GFFgen = GFF.parse(paths_in['path_gff'])
    # Counts every feature visited, kept or skipped; not used in the output.
    feat_num = 0
    '''Define data arrays: will be used as columns for pandas DateFrame'''
    gff_dict = {}
    chromosome = []
    aliaslist = []
    startlist = []
    stoplist = []
    seqlist = []
    typelist = []
    strandlist = []
    startcodon = []
    stopcodon = []
    SDaffinity = []
    G_content = []
    C_content = []
    A_content = []
    T_content = []

    aa_code, codon_code = ribo_util.get_genetic_code()
    aa_comp_dict = {}
    '''Make list of bad genes'''  # from Gene-Wei-Li
    # bad_genes = pd.read_csv(path_badgenes)
    # bad_genes = bad_genes.to_dict(orient='list')
    # bad_genes = bad_genes['GeneName']
    '''Sift through GFF for relevant information'''
    # At most 49 records are read from the parser; stops early when exhausted.
    for chromosome_number in range(1, 50):
        chr = next(GFFgen, None)
        if chr is None:
            break

        for feature in chr.features:
            chromosome_id = chr.id

            # Features without sub-features are ignored.
            if feature.sub_features == []:
                feat_num += 1
                continue

            if remove_genes == 'yes':
                '''Skip over non-CDS annotations'''
                if not feature.sub_features[0].type == feat_of_interest:
                    feat_num += 1
                    continue
                elif feature.qualifiers.has_key('pseudo') == True:
                    feat_num += 1
                    continue
                else:
                    # NOTE(review): labelled 'CDS' regardless of feat_of_interest.
                    feature_type = 'CDS'
            else:
                '''Add feat type to GFF, noting pseudogenes'''
                if feature.qualifiers.has_key('pseudo') == True:
                    feature_type = 'pseudo'
                else:
                    feature_type = feature.sub_features[0].type
            '''Get feature name'''
            if name_qual in feature.qualifiers:
                feat_name = feature.qualifiers[name_qual][0]
            elif name_qual_alt in feature.qualifiers:
                feat_name = feature.qualifiers[name_qual_alt][0]
            else:
                # Unnamed features are skipped entirely.
                feat_name = 'None'
                feat_num += 1
                continue
            '''Remove feature if bad'''
            # if remove_genes == 'yes':
            #     if feat_name in bad_genes:
            #         feat_num+=1
            #         continue
            # else:
            #     if feat_name in bad_genes:
            #         feature_type = 'bad'
            '''Get start, end, and strand position'''
            start = feature.location.start.position
            end = feature.location.end.position
            strand = feature.strand
            '''Analyze features of interest for feat information'''
            alias = feat_name
            '''Each strand is treated differently, + strand == 1'''
            if strand == 1:
                '''I save gene sequence + 50 bp from each end: makes it easier to analyze start and stop sequence context without using whole genome sequence'''
                if start < 50:  # if gene is near the beginning of genome sequence:
                    sequence = 'N' * (50 - start)  # TB GFF starts at 0, add N * 50
                    sequence = sequence + chr[0:end + 50].seq  # gene sequence + 50nt at each end
                else:
                    sequence = chr[start - 50:end + 50].seq  # gene sequence + 50nt at each end

                strand_val = '+'
                startcodon_pos = start
                stopcodon_pos = end - 1

                # upstream_seq is computed but not used later in this function.
                if start > 200:
                    upstream_seq = chr[start - 200:start + 100].seq

            else:
                '''For minus strand, 'end' is start codon, 'start' is stop codon and sequence is reverse compliment of gene sequence.'''
                # NOTE(review): unlike the + branch there is no padding when
                # start < 50, so start - 50 is a negative slice index here.
                sequence_rc = chr[start - 50:end + 50].seq
                sequence = sequence_rc.reverse_complement()

                strand_val = '-'
                startcodon_pos = end - 1
                stopcodon_pos = start

                if end + 200 > len(chr.seq):
                    upstream_seq = 'none'
                else:
                    upstream_seq_rc = chr[end - 100:end + 200].seq
                    upstream_seq = upstream_seq_rc.reverse_complement()

            sequence = str(sequence)

            # With the 50 nt flanks, the gene occupies sequence[50:-50].
            start_codon = sequence[50:53:1]
            stop_codon = sequence[-53:-50]
            '''get sequence from start to stop for GC analysis'''
            CDS_seq = sequence[50:-50:1]
            G, C, A, T = GC_of_CDS(CDS_seq)
            '''Calculate SD affinity'''
            # SD_seq = sequence[30:50:1]  # analyze 20 nt upstream of start codons
            # SD_affinity = shine_dalgarno_affinity(aSD_seq, SD_seq)
            '''Append data to lists'''
            # Debug output for a single gene.
            if alias == 'trmD':
                print sequence

            chromosome.append(chromosome_id)
            typelist.append(feature_type)
            aliaslist.append(alias)
            seqlist.append(sequence)
            strandlist.append(strand_val)
            startlist.append(startcodon_pos)
            stoplist.append(stopcodon_pos)
            startcodon.append(start_codon)
            stopcodon.append(stop_codon)
            # SDaffinity.append(SD_affinity)
            G_content.append(G)
            C_content.append(C)
            A_content.append(A)
            T_content.append(T)

            feat_num += 1
    '''Append lists to gff_dict'''
    gff_dict['Chromosome'] = chromosome
    gff_dict['Alias'] = aliaslist
    gff_dict['Strand'] = strandlist
    gff_dict['Start'] = startlist
    gff_dict['Stop'] = stoplist
    gff_dict['Sequence'] = seqlist
    gff_dict['Start_Codon'] = startcodon
    gff_dict['Stop_Codon'] = stopcodon
    gff_dict['Type'] = typelist
    # gff_dict['SD_affinity'] = SDaffinity
    gff_dict['G_content'] = G_content
    gff_dict['C_content'] = C_content
    gff_dict['A_content'] = A_content
    gff_dict['T_content'] = T_content
    '''Pickle dict for use later'''
    ribo_util.makePickle(gff_dict, path_gff_dict)
    '''print dataframe, and save as .csv for use later'''
    ## Print GFF to check
    gff_df = pd.DataFrame(gff_dict)
    display(gff_df)
    gff_df.to_csv(path_gff_dict + '.csv')

    return
def SD_affinity_genome(paths_in):
    '''Score every genome position with a Shine-Dalgarno octamer affinity.

    For each position of the first record in the GFF, the 8 nt before it
    (plus strand) and the reverse complement of the 8 nt after it (minus
    strand) are looked up in the octamer affinity table.  The two resulting
    lists are stored under every read length 10..45 and pickled, so the
    output can be used like a size-separated density dict.'''

    # load octamers
    octamer_table = pd.read_csv(paths_in['SD_affinity'])
    affinity_list = pd.Series(octamer_table.SD_affinity.values,
                              index=octamer_table.Octamer).to_dict()

    length_range = range(10, 46)

    GFFgen = GFF.parse(paths_in['path_gff'])
    first_record = GFFgen.next()

    affinity_plus = []
    affinity_minus = []

    sequence = first_record.seq
    # Result is not used below; the call itself is retained.
    sequence_rc = sequence.reverse_complement()
    genome_size = len(sequence)

    # Positions at which a progress marker is printed.
    milestones = (100000, 500000, 1000000, 2000000)
    placeholder = 'AAAAAAAA'

    for position in range(0, genome_size):
        # Within 8 nt of either end a full window is unavailable, so a
        # poly-A placeholder octamer is scored instead.
        if position < 8 or genome_size - position < 8:
            motif = placeholder
            motif_rc = placeholder
        else:
            motif = sequence[position - 8:position].transcribe()
            downstream = sequence[position:position + 8].transcribe()
            motif_rc = downstream.reverse_complement()

        if len(motif) != 8 or len(motif_rc) != 8:
            plus_score = 0.0
            minus_score = 0.0
        else:
            plus_score = affinity_list[motif]
            minus_score = affinity_list[motif_rc]

        if position in milestones:
            print(str(position))

        affinity_plus.append(plus_score)
        affinity_minus.append(minus_score)

    # Every length shares the same two lists.
    density_plus_sizesep = {}
    density_minus_sizesep = {}
    for length in length_range:
        density_plus_sizesep[length] = affinity_plus
        density_minus_sizesep[length] = affinity_minus

    path_den = inpath + 'density/density/SD1/'
    ribo_util.makePickle(density_plus_sizesep, path_den + "plus_sizesep")
    ribo_util.makePickle(density_minus_sizesep, path_den + "minus_sizesep")

    return
help="increase verbosity") parser.add_argument("-q", "--quiet", action="count", default=0, help="decrease verbosity") args = parser.parse_args() logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s', level=(5 - args.verbose + args.quiet) * 10, datefmt="%H:%M:%S") try: from BCBio import GFF for record in GFF.parse(args.gff_file): break for gff_feature in record.features: print gff_feature print "_" * 80 feature = Feature(gff_feature, args.translation_file, 1, feature_definition_dir="features", qualifier_definition_dir="qualifiers") print "_" * 80 print feature break except Exception as e: import traceback
#!/usr/bin/env python import logging import sys from BCBio import GFF logging.basicConfig(level=logging.INFO) log = logging.getLogger() if __name__ == "__main__": attr = sys.argv[2] for record in GFF.parse(sys.argv[1]): if len(record.features) == 0: continue for feature in sorted(record.features, key=lambda x: x.location.start): # chrom chromStart chromEnd # name score strand # thickStart thickEnd itemRgb kv = { "strand": 0 if not feature.location.strand else feature.location.strand, "value": feature.qualifiers.get("score", [0])[0], } if attr not in feature.qualifiers: continue name = feature.qualifiers[attr][0]
def _add_gencode_feature(genlist, feat, rec_id, get_exon_number):
    """Apply the CDS-over-exon bookkeeping for one feature; return genlist.

    A CDS replaces any ``gene`` entries of its gene, is added, and then
    evicts the gene's ``exon`` entries.  An exon is only added while the gene
    has no CDS entry.  Records whose id lacks ``chr`` are ignored.
    ``get_exon_number`` is called only when an entry is actually added.
    """
    if 'chr' not in rec_id or feat.type not in ('CDS', 'exon'):
        return genlist

    gene_name = feat.qualifiers['gene_name'][0]
    is_cds = feat.type == 'CDS'

    if not is_cds and containCDSGENE(genlist, gene_name, 'CDS'):
        return genlist
    if is_cds and containCDSGENE(genlist, gene_name, 'gene'):
        genlist = discarditem(genlist, gene_name)

    # Start is converted from 0-based to 1-based.
    genlist.add((gene_name,
                 (feat.location.start + 1, feat.location.end,
                  feat.location.strand),
                 feat.type, rec_id, get_exon_number()))

    if is_cds:
        stale_exons = [entry for entry in genlist
                       if entry[0] == gene_name and entry[2] == 'exon']
        for entry in stale_exons:
            genlist.remove(entry)
    return genlist


def extract_pos(gff, filter=None):
    """
    retrieves information from gff files, also is able to filter on certain genes.

    :param gff: path of the GFF file to read (exon and CDS lines only).
    :param filter: optional list of filter entries handed to ``containsTR``;
        if the first entry starts with ``#`` its second whitespace-separated
        field is a comma-separated chromosome list (``chr`` is prefixed) used
        to restrict parsing, and that entry is removed from the filter.
    :return: sorted list of tuples ``(gene_name, (start, end, strand), type,
        record_id, exon_number)``.
    """
    print("Processing gencode annotation..")
    limit_info = dict(gff_type=['exon', 'CDS'])
    if filter:
        print('Filtering on: ', filter)
        if filter[0].startswith('#'):
            chrfilter = filter[0].split()[1].strip().split(',')
            limit_info['gff_id'] = ['chr' + (str(x)) for x in chrfilter]
            filter = filter[1::]

    genlist = set()
    # The handle is closed once parsing is finished.
    with open(gff) as in_handle:
        for rec in GFF.parse(in_handle, limit_info=limit_info):
            for feature in rec.features:
                if feature.type == 'inferred_parent':
                    for f in feature.sub_features:
                        if containsTR(f.qualifiers['gene_name'][0], filter):
                            genlist = _add_gencode_feature(
                                genlist, f, rec.id,
                                lambda f=f: f.qualifiers['exon_number'][0])
                elif containsTR(feature.qualifiers['gene_name'][0], filter):
                    # Top-level features may lack an exon number; default 1.
                    try:
                        ex_number = feature.qualifiers['exon_number'][0]
                    except (KeyError, IndexError):
                        ex_number = 1
                    genlist = _add_gencode_feature(
                        genlist, feature, rec.id,
                        lambda n=ex_number: n)

    #for item in genlist:
    #    if "chr" not in item[3]:
    #        genlist.remove(item)
    return sorted(genlist)
def load_features(reference, feature_names=None):
    """Load named features from a GFF or GenBank reference file.

    A path containing ``.gff`` (case-insensitive) is read as GFF, using
    ``gene`` features named by their ``gene`` qualifier or, failing that,
    ``locus_tag``.  Anything else is read as GenBank, using ``CDS`` features
    (``locus_tag`` first, then ``gene``) plus the ``source`` feature stored
    under ``'nuc'``.

    :param reference: path to the reference file.
    :param feature_names: optional collection of names to keep; ``None``
        keeps everything.
    :return: dict mapping feature name to feature, or ``None`` if the file
        does not exist or BCBio.GFF is needed but not installed.
    """
    #read in appropriately whether GFF or Genbank
    #checks explicitly for GFF otherwise assumes Genbank
    if not os.path.isfile(reference):
        print("ERROR: reference sequence not found. looking for", reference)
        return None

    features = {}
    if '.gff' in reference.lower():
        #looks for 'gene' and 'gene' as best for TB
        try:
            from BCBio import GFF  #Package name is confusing - tell user exactly what they need!
        except ImportError:
            print(
                "ERROR: Package BCBio.GFF not found! Please install using \'pip install bcbio-gff\' before re-running."
            )
            return None
        limit_info = dict(gff_type=['gene'])

        with open(reference) as in_handle:
            for rec in GFF.parse(in_handle, limit_info=limit_info):
                for feat in rec.features:
                    # Candidate names in priority order; a feature with
                    # neither qualifier is skipped instead of raising
                    # KeyError.
                    candidates = [
                        feat.qualifiers[tag][0]
                        for tag in ("gene", "locus_tag")
                        if feat.qualifiers.get(tag)
                    ]
                    if feature_names is not None:
                        #check both tags; user may have used either
                        candidates = [c for c in candidates
                                      if c in feature_names]
                    fname = candidates[0] if candidates else None
                    if fname:
                        features[fname] = feat

        if feature_names is not None:
            for fe in feature_names:
                if fe not in features:
                    print("Couldn't find gene {} in GFF or GenBank file".
                          format(fe))

    else:
        from Bio import SeqIO
        for feat in SeqIO.read(reference, 'genbank').features:
            if feat.type == 'CDS':
                if "locus_tag" in feat.qualifiers:
                    fname = feat.qualifiers["locus_tag"][0]
                    if feature_names is None or fname in feature_names:
                        features[fname] = feat
                elif "gene" in feat.qualifiers:
                    fname = feat.qualifiers["gene"][0]
                    if feature_names is None or fname in feature_names:
                        features[fname] = feat
            elif feat.type == 'source':  #read 'nuc' as well for annotations - need start/end of whole!
                features['nuc'] = feat

    return features
logging.basicConfig(level=logging.INFO)
log = logging.getLogger()

if __name__ == "__main__":
    # Rename the single sequence of an annotation file and echo it to stdout.
    parser = argparse.ArgumentParser(description="Renumber genbank files")
    parser.add_argument("data", type=argparse.FileType("r"), help="Annotations")
    parser.add_argument("--filetype", type=str, help="Filetype")
    parser.add_argument("--new_name", type=str, help="New Name")
    args = parser.parse_args()

    # Iterate over our input data
    if args.filetype == "gff3":
        it = GFF.parse(args.data)
    else:
        it = SeqIO.parse(args.data, args.filetype)

    for idx, record in enumerate(it):
        # Can only handle a single name, so any record after the first
        # (idx 0) is an error.  The check used to be ``idx > 1``, which let
        # a second record through with the same new name.
        if idx > 0:
            raise Exception("Too many sequences")

        # Update name
        record.id = args.new_name
        record.name = args.new_name

        # Output according to type
        if args.filetype == "gff3":
            GFF.write([record], sys.stdout)
#NODE_10 1165936 1166465 # internal # ['Reference', ['2434777', '2434933', 'ancestral']], # ['2016-17702', ['2483546', '2485711', 'ancestral']], # ['2016-17705', ['2483546', '2485711', 'ancestral']], # ['2016-11776', ['2483546', '2485711', 'ancestral']], # ['2016-11778', ['2483546', '2485711', 'ancestral']], # ['2016-11777', ['2483546', '2485711', 'ancestral']], # ['2016-17701', ['2483546', '2485711', 'ancestral']], # ['Reference', ['2483546', '2485711', 'ancestral']]] if args.gubbins: msg('Parsing Gubbins file: {}'.format(coordfn)) with open(coordfn) as gff: for rec in GFF.parse(gff): for f in rec.features: taxa = f.qualifiers['taxa'][0].strip().split() state = ('ancestral', 'extant')[ len(taxa)==1 ] for taxon in taxa: seqLIST.append( [ taxon, [ int(f.location.start), int(f.location.end), state ] ] ) else: msg('Parsing ClonalFrameML file: {}'.format(coordfn)) with open(coordfn) as csvfile: RCseqs = csv.reader(csvfile, delimiter='\t') next(csvfile) for row in RCseqs: seq = row[0] RCstart = row[1] RCstop = row[2] node = t.search_nodes(name=seq)[0]
def armar_grafo_completo():
    """Build the transcription-factor -> gene graph and write it as GML.

    Steps:
      1. Read the regulatory regions and index them by scaffold id.
      2. Read the whole-genome annotation and map each gene's first
         sub-feature id to the contig it lives on.
      3. For every regulatory region pick one gene on the same contig
         (see the strand-specific rules below) and record a TF -> gene pair.
      4. Build a directed graph from those pairs and save it.

    Input and output paths are hard-coded. Nothing is returned.
    """
    regiones = list(
        GFF.parse("/data/organismos/ILEX_PARA2/regulation/ncbi_IP4.gff3.reg"))
    scaffolds_dict = defaultdict(list)
    for c in regiones:
        for f in c.features:
            motif = f.sub_features[0].qualifiers["description"][0]
            # NOTE(review): the original read an undefined name `motifs` here
            # (NameError) and used a Python 2 print statement. It is assumed
            # to be a diagnostic for regions whose sub-features carry more
            # than one distinct description -- confirm the intent.
            motifs = set(
                sf.qualifiers["description"][0] for sf in f.sub_features
                if sf.qualifiers.get("description"))
            if len(motifs) > 1:
                print(motifs)
            scaffolds_dict[c.id].append({
                "start": int(f.location.start),
                "end": int(f.location.end),
                "strand": f.location.strand,
                "tf": f.qualifiers["TF_id"][0],
                "motif": motif,
                "count": len(f.sub_features)
            })

    ann = list(GFF.parse("/data/organismos/ILEX_PARA2/ncbi_IP4.gff3.whole"))
    tf_gene = []
    contig_map = {}
    for contig in ann:
        for f in contig.features:
            if f.type == "gene" and f.sub_features:
                contig_map[f.sub_features[0].id] = contig.id

    for contig in ann:
        if contig.id not in scaffolds_dict:
            continue
        # Genes with at least one sub-feature; the same for every region of
        # this contig, so compute it once.
        gene_feats = [
            x for x in contig.features if x.type == "gene" and x.sub_features
        ]
        for reg in scaffolds_dict[contig.id]:
            if reg["strand"] == 1:
                # Genes starting at or after (region end - 100); take the one
                # with the smallest start.
                genes = sorted(
                    (x for x in gene_feats
                     if int(x.location.start + 100) >= reg["end"]),
                    key=lambda x: x.location.start)
                pick = 0
            else:
                # Genes ending at or before (region start + 100); take the
                # one with the largest end.
                genes = sorted(
                    (x for x in gene_feats
                     if (x.location.end - 100) <= reg["start"]),
                    key=lambda x: x.location.end)
                pick = -1
            if not genes:
                # No candidate gene: report and skip the region. Previously a
                # bare `except` printed this and then recorded whatever gene
                # the previous region had selected, and the plus-strand case
                # raised IndexError.
                print([contig.id, reg["start"]])
                continue
            gene = genes[pick].sub_features[0]
            tf_gene.append({
                "scaffold": contig.id,
                "tf": reg["tf"],
                "motif": reg["motif"],
                "count": reg["count"],
                "gene": gene.id
            })

    nodes = {}
    edges = defaultdict(list)
    tfs = {}
    for row in tf_gene:
        nodes[row["tf"]] = 1
        tfs[row["tf"]] = [row["motif"], row["count"]]
        nodes[row["gene"]] = 1
        edges[row["tf"]].append(row["gene"])

    diG = nx.DiGraph()
    for node in nodes:
        node = node.replace("*", "")
        if node == "unknown":
            continue
        # NOTE(review): contig_map is keyed by gene sub-feature ids and tfs by
        # the raw (un-stripped) TF id, so this lookup raises KeyError for any
        # node name missing from contig_map -- verify against the data.
        diG.add_node(node,
                     scaffold=contig_map[node],
                     count=tfs[node][1] if node in tfs else "",
                     motif=tfs[node][0] if node in tfs else "")
    for ft, genes in edges.items():
        ft = ft.replace("*", "")
        if ft == "unknown":
            continue
        for regulated in genes:
            if regulated == "unknown":
                continue
            regulated = regulated.replace("*", "")
            diG.add_edge(ft, regulated)
    assert "unknown" not in diG
    nx.write_gml(diG, "/data/organismos/ILEX_PARA2/regulation/graph.gml")
def _finalize_group(group, number, suffix, strand):
    """Give a finished group its qualifiers and the location spanning its members.

    ``suffix`` is the text cut off each member description before the
    descriptions are de-duplicated, sorted and joined with ``_``.
    """
    names = sorted(
        set(x.qualifiers["description"][0].split(suffix)[0]
            for x in group.sub_features))
    group.qualifiers = {
        "description": "_".join(names),
        "ID": ["ILEXPARARR" + str(number)]
    }
    group.location = FeatureLocation(
        start=min(x.location.start for x in group.sub_features),
        end=max(x.location.end for x in group.sub_features),
        strand=strand)
    assert group.location.start < group.location.end
    # Report unusually long groups.
    if (group.location.end - group.location.start) > 5000:
        print(group.qualifiers["ID"])


def agrupar_sitios():
    """Cluster regulatory sites into grouped regions and write them as GFF.

    For every record and each strand (1, then -1) the sites are sorted by
    start. A site joins the current group when its start lies within 1500 of
    the group's right-most end, or when it overlaps the group's last member;
    otherwise the current group is closed and a new one is started. Each
    closed group gets a sequential ``ILEXPARARR<n>`` ID.

    Input and output paths are hard-coded. Nothing is returned.
    """
    regiones = list(
        GFF.parse("/data/organismos/ILEX_PARA2/regulation/ncbi_IP4.gff3.reg"))
    ids = 1
    groups = {}
    for c in tqdm(regiones):
        groups[c.id] = []
        for strand in [1, -1]:
            fs = sorted([f for f in c.features if f.strand == strand],
                        key=lambda x: x.location.start)
            if not fs:
                continue
            # Built only after the empty check, so a record without features
            # no longer fails on c.features[0].
            group = SeqFeature(id=c.features[0],
                               type="grouped_transcription_regulatory_region",
                               location=c.features[0].location)
            group.sub_features = [fs[0]]
            for f in fs[1:]:
                end = max(x.location.end for x in group.sub_features)
                last = group.sub_features[-1]
                # Same result as intersecting the two position ranges, without
                # building a set per base.
                overlaps_last = (
                    max(f.location.start, last.location.start) <
                    min(f.location.end, last.location.end))
                if abs(f.location.start - end) < 1500 or overlaps_last:
                    group.sub_features.append(f)
                    continue
                # NOTE(review): groups closed here strip " regulatory region"
                # from descriptions while the last group of a strand strips
                # " binding site" -- confirm which one is intended.
                _finalize_group(group, ids, " regulatory region", strand)
                ids += 1
                groups[c.id].append(group)
                group = SeqFeature(
                    id=c.features[0],
                    type="grouped_transcription_regulatory_region",
                    location=f.location)
                group.sub_features = [f]
            if group:
                # All members were selected by `strand`, so use it directly.
                # The old `f.location.strand` read a loop variable that is
                # stale or undefined when the strand has a single site.
                _finalize_group(group, ids, " binding site", strand)
                ids += 1
                groups[c.id].append(group)

    # for _, v in groups.items():
    #     for x in v:
    #         x.sub_features = []
    records = [
        SeqRecord(id=k, name="", description="", seq=Seq(""), features=v)
        for k, v in groups.items()
    ]
    # Close the output file deterministically once writing is done.
    with open("/data/organismos/ILEX_PARA2/regulation/grouped.gff",
              "w") as out_handle:
        GFF.write(tqdm(records), out_handle)
def _flank_blocker(gene_track, start, stop, flank=15):
    """Scan positions ``start - flank`` .. ``stop + flank`` of an annotation track.

    Returns ``(blocked, label)``: ``blocked`` is True when the window runs
    past the end of the track or hits an entry other than ``'0'``; ``label``
    is that entry, or None when the window ran off the track.
    """
    for pos in range(start - flank, stop + 1 + flank):
        if pos >= len(gene_track):
            return True, None
        if gene_track[pos] != '0':
            return True, gene_track[pos]
    return False, None


def AreInitSitesUsed():
    """Append a per-sample density column to the init-site table and save it.

    Every row of ``init_sites.csv`` is extended, once per density file, with
    one of:
      * the sample name, for the row whose first cell is ``'start'``
        (presumably the CSV header);
      * the overlapping annotation entry, when the site or its 15 nt flanks
        touch an annotated position on the same strand;
      * reads summed over the site * 1000 / site length, otherwise;
      * ``''`` when no value could be computed (window past the end of the
        track, unknown strand, or empty site).

    All paths are hard-coded. Nothing is returned.
    """
    hitlist = []
    # NOTE: 'rU' is kept for Python 2 compatibility; the 'U' flag was removed
    # from open() in Python 3.11.
    with open(
            '/users/buskirk/documents/profiling/projects/sORFs/init_sites.csv',
            'rU') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        for row in reader:
            hitlist.append(row)

    GFFgen = GFF.parse(
        '/users/buskirk/documents/profiling/GFF/MG1655/coli3.gff')
    # Built-in next() works on both Python 2 and 3, unlike GFFgen.next().
    chrom = next(GFFgen)
    genes_plus = geneplot(chrom, 1)
    genes_minus = geneplot(chrom, -1)

    pathi = "/users/buskirk/documents/profiling/projects/sORFs/wigfiles/"
    filelist = ['r_control', 'o_control']
    for fname in filelist:
        density_filestring = pathi + fname
        counts_plus = readwig(density_filestring + "_plus")
        counts_minus = readwig(density_filestring + "_minus")

        for iH in hitlist:
            if iH[0] == 'start':
                iH.append(fname)
                continue

            start = int(iH[0]) - 1  # back to 0 based list
            stop = int(iH[1]) - 1
            rpm = 0.0
            counter = 0
            # Reset per hit: the value used to leak from the previous row
            # (or raise NameError on the first one) whenever no branch below
            # assigned it.
            rpkm = ''

            # start is left and stop right in chromosome for both strands;
            # only the track and the read offset differ.
            if iH[7] == 'plus':
                gene_track, counts, shift = genes_plus, counts_plus, 15
            elif iH[7] == 'minus':
                gene_track, counts, shift = genes_minus, counts_minus, -15
            else:
                gene_track = None

            if gene_track is not None:
                # no overlap for 15 nt on either side of new sORF allowed
                blocked, label = _flank_blocker(gene_track, start, stop)
                if blocked:
                    if label is not None:
                        rpkm = label
                else:
                    # + 1 to include last nt in stop codon
                    for iP in range(start, stop + 1):
                        rpm += counts[iP + shift]
                        counter += 1

            if counter != 0:
                rpkm = rpm * 1000 / counter
            iH.append(rpkm)

    writelisttoexcel(
        hitlist,
        '/users/buskirk/documents/profiling/projects/sORFs/init_sites_rpkm')
def load_legacy_gff3(self, organism, gff3, source=None):
    """
    Load a full GFF3 into annotation track (legacy version, kept for compatibility only)

    Only top-level ``gene`` and ``terminator`` features are loaded. A
    tab-separated report (feature ID, Apollo ID, status, message) is written
    to stdout, one row per processed feature.

    :type organism: str
    :param organism: Organism Common Name

    :type gff3: str
    :param gff3: GFF3 to load

    :type source: str
    :param source: URL where the input dataset can be found.

    :rtype: str
    :return: Loading report
    """
    sys.stdout.write('# ')
    sys.stdout.write('\t'.join(
        ['Feature ID', 'Apollo ID', 'Success', 'Messages']))
    sys.stdout.write('\n')

    # Qualifiers that are never copied onto the Apollo feature as attributes.
    bad_quals = [
        'date_creation', 'source', 'owner', 'date_last_modified', 'Name', 'ID'
    ]

    for rec in GFF.parse(gff3):
        self.set_sequence(organism, rec.id)
        for feature in rec.features:
            # We can only handle genes right now
            if feature.type not in ('gene', 'terminator'):
                continue
            # Convert the feature into a presentation that Apollo will accept
            feature_data = features_to_feature_schema([feature])

            # TODO: do we handle all top-types here?
            if 'children' in feature_data[0] and any(
                    child['type']['name'] == 'tRNA'
                    for child in feature_data[0]['children']):
                # We're experiencing a (transient?) problem where gene_001 to
                # gene_025 will be rejected. Thus, hardcode to a known working
                # gene name and update later.
                feature_data[0]['name'] = 'tRNA_000'
                tRNA_sf = [
                    child for child in feature.sub_features
                    if child.type == 'tRNA'
                ][0]
                tRNA_type = 'tRNA-' + tRNA_sf.qualifiers.get(
                    'Codon', ["Unk"])[0]

                # An explicit tRNA-* Name on the gene wins over the codon.
                if 'Name' in feature.qualifiers:
                    if feature.qualifiers['Name'][0].startswith('tRNA-'):
                        tRNA_type = feature.qualifiers['Name'][0]

                newfeature = self.add_feature(feature_data[0])

                def func0():
                    self.set_name(
                        newfeature['features'][0]['uniquename'],
                        tRNA_type,
                    )

                retry(func0)

                if source:
                    gene_id = newfeature['features'][0]['parent_id']

                    def setSource():
                        self.add_attribute(gene_id, 'DatasetSource', source)

                    retry(setSource)

                sys.stdout.write('\t'.join([
                    feature.id,
                    newfeature['features'][0]['uniquename'],
                    'success',
                ]))
            # `elif`, not `if`: as a separate `if`, a tRNA gene fell through
            # to the `else` below and was submitted a second time as
            # 'gene_000', gluing a second report row onto the first.
            elif feature_data[0]['type']['name'] == 'terminator':
                # We're experiencing a (transient?) problem where gene_001 to
                # gene_025 will be rejected. Thus, hardcode to a known working
                # gene name and update later.
                feature_data[0]['name'] = 'terminator_000'
                newfeature = self.add_feature(feature_data[0])

                def func0():
                    self.set_name(newfeature['features'][0]['uniquename'],
                                  'terminator')

                retry(func0)

                if source:
                    gene_id = newfeature['features'][0]['parent_id']

                    def setSource():
                        self.add_attribute(gene_id, 'DatasetSource', source)

                    retry(setSource)

                sys.stdout.write('\t'.join([
                    feature.id,
                    newfeature['features'][0]['uniquename'],
                    'success',
                ]))
            else:
                try:
                    # We're experiencing a (transient?) problem where gene_001 to
                    # gene_025 will be rejected. Thus, hardcode to a known working
                    # gene name and update later.
                    feature_data[0]['name'] = 'gene_000'
                    # Create the new feature
                    newfeature = self.add_feature(feature_data[0])
                    # Extract the UUIDs that apollo returns to us
                    mrna_id = newfeature['features'][0]['uniquename']
                    gene_id = newfeature['features'][0]['parent_id']
                    # Sleep to give it time to actually persist the feature. Apollo
                    # is terrible about writing + immediately reading back written
                    # data.
                    time.sleep(1)

                    # Extract CDS feature from the feature data, this will be used
                    # to set the CDS location correctly (apollo currently screwing
                    # this up (2.0.6)). CDS entries are looked for directly under
                    # each mRNA and one level below that.
                    cds_locs = []
                    for feat in feature_data[0]['children']:
                        # mRNA level
                        for subfeat in feat['children']:
                            # Can be exon or CDS
                            if subfeat['type']['name'] == 'CDS':
                                cds_locs.append(subfeat['location'])
                            if 'children' in subfeat:
                                for subsubfeat in subfeat['children']:
                                    if subsubfeat['type']['name'] == 'CDS':
                                        cds_locs.append(subsubfeat['location'])
                    # Both stay None when the gene has no CDS at all.
                    min_cds = None
                    max_cds = None
                    if cds_locs:
                        min_cds = min(loc['fmin'] for loc in cds_locs)
                        max_cds = max(loc['fmax'] for loc in cds_locs)

                    # Correct the translation start, but with strand specific log
                    if feature_data[0]['location']['strand'] == 1:
                        self.set_translation_start(mrna_id,
                                                   min(min_cds, max_cds))
                    else:
                        self.set_translation_start(
                            mrna_id,
                            max(min_cds, max_cds) - 1)

                    # Finally we set the name, this should be correct.
                    # 'product' is preferred, then 'Name', then "Unknown".
                    def func():
                        self.set_name(
                            mrna_id,
                            feature.qualifiers.get(
                                'product',
                                feature.qualifiers.get(
                                    'Name', ["Unknown"]))[0])

                    retry(func)

                    def func():
                        self.set_name(
                            gene_id,
                            feature.qualifiers.get(
                                'product',
                                feature.qualifiers.get(
                                    'Name', ["Unknown"]))[0])

                    retry(func)

                    if source:

                        def setSource():
                            self.add_attribute(gene_id, 'DatasetSource',
                                               source)

                        retry(setSource)

                    # 'Note' qualifiers become comments; every other allowed
                    # qualifier becomes an attribute on the gene.
                    extra_attr = {}
                    for (key, values) in feature.qualifiers.items():
                        if key in bad_quals:
                            continue

                        if key == 'Note':

                            def func2():
                                self.add_comments(gene_id, values)

                            retry(func2)
                        else:
                            extra_attr[key] = values

                    for key in extra_attr:

                        def func3():
                            self.add_attribute(gene_id, key, extra_attr[key])

                        retry(func3)

                    sys.stdout.write('\t'.join([
                        feature.id,
                        gene_id,
                        'success',
                    ]))
                except Exception as e:
                    # Report the failure for this feature and carry on; only
                    # the first line of the message is kept so the report
                    # stays one row per feature.
                    msg = str(e)
                    if '\n' in msg:
                        msg = msg[0:msg.index('\n')]
                    sys.stdout.write('\t'.join(
                        [feature.id, '', 'ERROR', msg]))
            sys.stdout.write('\n')
            sys.stdout.flush()
# Set static parameters method = 'genebuild' pANDe = ROOT.split('/')[-1] primary_assembly, ensembl_build = pANDe.split('.') # Create an accession to name dictionary ac_to_name_list = [] ac_to_name = {} name_to_ac = {} alignment_file = os.path.join(ALIGNMENTS, 'Homo_sapiens.%s.%s.chr_patch_hapl_scaff.gff3') % ( primary_assembly, ensembl_build) aln_handle = open(alignment_file) # Limiter which identifies the Ensembl tagged lines only (Note Capital E) limit_info = dict(gff_source=["GRCh37"]) for rec in GFF.parse(aln_handle, limit_info=limit_info): ac_to_name_list.append(rec.features[0].qualifiers.get('Alias')) # MT accession is not provided in the GFF file # Contig is 16569 nt so NC_012920.1 ac_to_name_list.append(['', 'NC_012920.1']) for a_n in ac_to_name_list: if a_n is None: continue # Need to work back from the refseq Accession workback = name_to_accession.to_name(a_n[-1]) a_n = [workback] + a_n # Filter out primary chromosomes try: ac_to_name[a_n[2]] = str(a_n[0]).replace('chr', '') name_to_ac[str(a_n[0]).replace('chr', '')] = str(a_n[2])
parser = argparse.ArgumentParser() parser.add_argument("-i", "--input_gff", action="store", dest="input_gff", help="Gff file with annotations to extract") parser.add_argument("-o", "--output_file", action="store", dest="output", default="stdout", help="Output file with information about transcripts") parser.add_argument("-l", "--longest_isoforms", action="store", dest="longest_isoforms", help="File to write longest isoforms") args = parser.parse_args() out_fd = sys.stdout if args.output == "stdout" else open(args.output, "w") for record in GFF.parse(open(args.input_gff)): for feature in record.features: #print feature if feature.type == "gene": transcript_id_list = [] transcript_len_list = [] CDS_len_list = [] for subfeature in feature.sub_features: #print subfeature #print(subfeature.type) if subfeature.type == "mRNA" or subfeature.type == "transcript": transcript_id_list.append(subfeature.id) transcript_len_list.append(len(subfeature)) CDS_len = 0 for subsubfeature in subfeature.sub_features: if subsubfeature.type == "CDS":