def make_index(gff_file):
    """
    Return a GFFutils database handle for ``gff_file``.

    The sqlite database lives next to the GFF file, at ``gff_file + ".db"``.
    When ``need_update(gff_file, db_file)`` is true, any existing database
    file is deleted and a fresh one is built with ``GFFutils.create_gffdb``
    before the handle is opened.
    """
    import GFFutils

    index_path = gff_file + ".db"
    stale = need_update(gff_file, index_path)

    # Remove the out-of-date database first so it is rebuilt from scratch.
    if stale and op.exists(index_path):
        os.remove(index_path)
    if stale:
        GFFutils.create_gffdb(gff_file, index_path)

    return GFFutils.GFFDB(index_path)
def to_sam(self, fasta):
    """
    Return the SAM text for every item in ``self.items`` as one string.

    ``fasta`` is handed to ``GFFutils.Genome``; the resulting genome object
    is passed to each item's ``to_sam``. The per-item strings are
    concatenated with no separator.
    """
    genome = GFFutils.Genome(fasta)
    return ''.join(entry.to_sam(genome) for entry in self.items)
def to_fastq(self, fasta):
    """
    Return FASTQ text (sequence plus fake quality scores) for all items.

    Sequence names are the same as the GFFFeature.id. ``fasta`` is handed
    to ``GFFutils.Genome`` and the resulting genome object is passed to each
    item's ``to_fastq``; the per-item strings are concatenated with no
    separator.
    """
    genome = GFFutils.Genome(fasta)
    return ''.join(entry.to_fastq(genome) for entry in self.items)
def main(gff_file, fasta_file, parents, children): db_file = gff_file + ".db" if not op.exists(db_file): GFFutils.create_gffdb(gff_file, db_file) f = Fasta(fasta_file) g = GFFutils.GFFDB(db_file) parents = set(parents.split(',')) parents_iter = [g.features_of_type(x) for x in parents] parents_list = itertools.chain(*parents_iter) children_list = set(children.split(',')) for feat in parents_list: children = [] for c in g.children(feat.id, 1): if c.featuretype not in children_list: continue child = f.sequence(dict(chr=c.chrom, start=c.start, stop=c.stop, strand=c.strand)) children.append((child, c)) if not children: print >>sys.stderr, "[warning] %s has no children with type %s" \ % (feat.id, ','.join(children_list)) continue # sort children in incremental position children.sort(key=lambda x: x[1].start) # reverse children if negative strand if feat.strand=='-': children.reverse() feat_seq = ''.join(x[0] for x in children) print ">%s" % feat.id print feat_seq
def main(gff_file, fasta_file, parents, children): db_file = gff_file + ".db" if not op.exists(db_file): GFFutils.create_gffdb(gff_file, db_file) f = Fasta(fasta_file) g = GFFutils.GFFDB(db_file) parents = set(parents.split(',')) parents_iter = [g.features_of_type(x) for x in parents] parents_list = itertools.chain(*parents_iter) children_list = set(children.split(',')) for feat in parents_list: children = [] for c in g.children(feat.id, 1): if c.featuretype not in children_list: continue child = f.sequence( dict(chr=c.chrom, start=c.start, stop=c.stop, strand=c.strand)) children.append((child, c)) if not children: print >>sys.stderr, "[warning] %s has no children with type %s" \ % (feat.id, ','.join(children_list)) continue # sort children in incremental position children.sort(key=lambda x: x[1].start) # reverse children if negative strand if feat.strand == '-': children.reverse() feat_seq = ''.join(x[0] for x in children) print ">%s" % feat.id print feat_seq
#!/usr/bin/python
import sys
import numpy as np
np.set_printoptions(threshold=np.inf)
#from itertools import count, tee, izip, islice
import GFFutils
import pyBigWig
import matplotlib.pyplot as plt

# Feature database and bigWig signal track used by the rest of the script.
G = GFFutils.GFFDB("/home/user/dm1.db")
C_bw = pyBigWig.open("/home/user/Symb_treatedVscontrol_50bin.bw")
Gene_final = []


def separate_exon_intron(EI_list, EI1, EI2, EI3):
    """
    Append the lengths of the first three elements of EI_list to EI1..EI3.

    The length of element 0 goes to EI1, element 1 to EI2 and element 2 to
    EI3. ``len()`` is still taken of every later element, but the result is
    not stored anywhere. The three target lists are modified in place;
    nothing is returned.
    """
    buckets = (EI1, EI2, EI3)
    for position, element in enumerate(EI_list):
        size = len(element)
        if position < len(buckets):
            buckets[position].append(size)


exon_len = []

# One independent accumulator list per region.
upstream_3000, exon1, intron1, exon2 = [], [], [], []
intron2, exon3, intron3, after_1000 = [], [], [], []
before_1000, intron_3, exon_3, intron_2 = [], [], [], []
exon_2, intron_1, exon_1, downstream_3000 = [], [], [], []

for mRNA in G.features_of_type('mRNA'):
    exons = list(G.children(mRNA, featuretype='exon'))
#!/usr/bin/python
usage = """ Create a GFF database from a GFF file, e.g., downloaded from FlyBase. Ryan Dale 2010 ([email protected]) """

import GFFutils
import optparse
import os
import sys

# Command-line interface: both --gff and --gffdb are required.
op = optparse.OptionParser(usage=usage)
op.add_option('--gff', dest='gff', help='Input GFF file')
op.add_option('--gffdb', dest='gffdb', help='Destination GFF database file')
options, args = op.parse_args()

# Failures must not look like success to the calling shell: sys.exit() with
# a string argument writes that string to stderr and exits with status 1,
# whereas a bare sys.exit() would exit with status 0.
if not options.gff or not options.gffdb:
    op.print_help()
    sys.exit('\nERROR: Please specify an input GFF file and an output GFF database file')

if not os.path.exists(options.gff):
    sys.exit('GFF file %s does not exist!' % options.gff)

# Build the database at the requested destination.
GFFutils.create_gffdb(options.gff, options.gffdb)
import GFFutils

# Feature database the script reads from (path relative to the working
# directory).
G = GFFutils.GFFDB('dm3.db')

# Module-level accumulators. None of them is filled in the portion of the
# script visible here; presumably later code appends to them -- TODO confirm.
exon_len = []
exon1 = []
exon2 = []
exon3 = []
exon_3 = []
exon_2 = []
exon_1 = []
intron1 = []
intron2 = []
intron3 = []
intron_3 = []
intron_2 = []
intron_1 = []
gene_count = 0

# For every mRNA, collect its exons and the features between them, then pick
# the three at each end of the transcript.
for mRNA in G.features_of_type('mRNA'):
    # print(mRNA)
    exons = list(G.children(mRNA, featuretype='exon'))
    # interfeatures() of the exon list is treated as the intron list.
    introns = list(G.interfeatures(exons))
    if mRNA.strand == "-":
        # Minus strand: "first" is taken from the end of the lists and "last"
        # from the start -- presumably because the database returns children
        # in ascending genomic coordinates; verify against GFFutils.
        first_3_exons = exons[-3:]
        last_3_exons = exons[:3]
        first_3_introns = introns[-3:]
        last_3_introns = introns[:3]
    else:
        # Any other strand value: list order is used as transcript order.
        first_3_exons = exons[:3]
        last_3_exons = exons[-3:]
        first_3_introns = introns[:3]
Optionally adds a "chr" to the beginning of each chromosome. Writes to stdout. """
# Command-line interface: one optional flag plus the GFF file name as the
# first positional argument.
op = optparse.OptionParser(usage=usage)
op.add_option('--addchr', action='store_true',
              help='Prefix each chromosome with "chr" in the output file.')
options, args = op.parse_args()
gfffn = args[0]

# this is a list of featuretypes that you want to remove.
culled_features = [
    'orthologous_to',
    'pcr_product',
    'BAC_cloned_genomic_insert'
]

f = GFFutils.GFFFile(gfffn)
for feature in f:
    # Drop features that have no start or stop coordinate at all.
    if (feature.start is None) or (feature.stop is None):
        continue
    # Drop features with negative coordinates or with start after stop.
    if (feature.start < 0) or (feature.stop < 0) or (feature.start > feature.stop):
        continue
    # Drop the unwanted feature types listed above.
    if feature.featuretype in culled_features:
        continue
    # With --addchr, rewrite the chromosome name before the feature is output.
    if options.addchr:
        feature.chr = 'chr' + feature.chr
    # Written as returned by tostring(), with nothing appended.
    sys.stdout.write(feature.tostring())
g10.parse(gene_models4)

gene_models5 = gene_models5.splitlines(True)
g11 = GenomeModel(
    chrom_start=1,
    scalar=5,
    read_length=3,
    debug=False)
g11.parse(gene_models5)

gene_models6 = gene_models6.splitlines(True)
g12 = GenomeModel(
    chrom='chr3R',
    chrom_start=101,
    scalar=5,
    read_length=3,
    debug=False)
g12.parse(gene_models6)

# Genome sequence that ships alongside this file.
here = os.path.dirname(__file__)
genome = GFFutils.Genome(os.path.join(here, 'data/dm3.chr2L.oneline.fa'))


def feature_exists(genome_model_obj, start, stop, featuretype, chrom='chr2L', strand='+'):
    # Returns True as soon as one feature of genome_model_obj matches the
    # requested coordinates, chromosome, strand and featuretype. Feature
    # names are deliberately not compared.
    for candidate in genome_model_obj.features:
        same_span = (candidate.start == start) and (candidate.stop == stop)
        if not same_span:
            continue
        same_place = (candidate.chrom == chrom) and (candidate.strand == strand)
        if same_place and (candidate.featuretype == featuretype):
            return True