def rebase(parent, child, interpro=False, protein2dna=False):
    """Re-map child features onto their parent features and print GFF3.

    For every record parsed from ``parent``, each top-level feature whose
    ``id`` is a key of the mapping returned by ``__get_features`` has the
    associated child features relocated with ``__update_feature_location``.
    The record's features are then replaced by the relocated children and
    the record is written to standard output.

    :param parent: GFF3 input handed to ``GFF.parse``.
    :param child: input handed to ``__get_features``.
    :param interpro: forwarded to ``__get_features``; when true the
        ``status`` and ``Target`` qualifiers are removed from each child.
    :param protein2dna: forwarded to ``__update_feature_location``.
    """
    child_features = __get_features(child, interpro=interpro)

    for rec in GFF.parse(parent):
        # TODO, replace with recursion in case it's matched against a
        # non-parent feature. We're cheating a bit here right now...
        replacement_features = []
        for feature in rec.features:
            if feature.id not in child_features:
                continue
            # TODO: update starts
            for x in child_features[feature.id]:
                # Then update the location of the actual feature
                __update_feature_location(x, feature, protein2dna)

                if interpro:
                    for y in ('status', 'Target'):
                        # A missing qualifier is fine; anything else should
                        # surface instead of being swallowed by a bare except.
                        x.qualifiers.pop(y, None)

                replacement_features.append(x)
        # We do this so we don't include the original set of features that we
        # were rebasing against in our result.
        rec.features = replacement_features
        GFF.write([rec], sys.stdout)
def t_write_from_recs(self):
    """Write out GFF3 from SeqRecord inputs.
    """
    gene_quals = {"source": "prediction", "score": 10.0,
                  "other": ["Some", "annotations"], "ID": "gene1"}
    exon_quals = {"source": "prediction"}

    # One gene covering the whole 20 bp sequence with an exon at each end.
    gene = SeqFeature(FeatureLocation(0, 20), type="gene", strand=1,
                      qualifiers=gene_quals)
    gene.sub_features = [
        SeqFeature(FeatureLocation(0, 5), type="exon", strand=1,
                   qualifiers=exon_quals),
        SeqFeature(FeatureLocation(15, 20), type="exon", strand=1,
                   qualifiers=exon_quals),
    ]
    record = SeqRecord(Seq("GATCGATCGATCGATCGATC"), "ID1")
    record.features = [gene]

    buf = StringIO.StringIO()
    GFF.write([record], buf)
    lines = buf.getvalue().split("\n")

    assert lines[0] == "##gff-version 3"
    assert lines[1] == "##sequence-region ID1 1 20"
    gene_row = ['ID1', 'prediction', 'gene', '1', '20', '10.0', '+', '.',
                'other=Some,annotations;ID=gene1']
    assert lines[2].split("\t") == gene_row
    exon_row = ['ID1', 'prediction', 'exon', '1', '5', '.', '+', '.',
                'Parent=gene1']
    assert lines[3].split("\t") == exon_row
def main():
    """Command-line entry point: convert a ``.sam`` file to GFF3.

    The output is written next to the input, with the ``.sam`` suffix
    replaced by ``.gff3``. When ``--input_fasta`` is given, a mapping of
    sequence id to sequence length is built from it and passed to
    ``GMAPSAMReader`` as ``query_len_dict``.

    Exits with status -1 when the input file name does not end in ``.sam``.
    """
    from argparse import ArgumentParser

    parser = ArgumentParser("Convert SAM to GFF3 format using BCBio GFF")
    parser.add_argument("sam_filename")
    parser.add_argument("-i", "--input_fasta", default=None,
                        help="(Optional) input fasta. If given, coverage will be calculated.")
    parser.add_argument("-s", "--source", required=True,
                        help="source name (ex: hg38, mm10)")
    args = parser.parse_args()

    if not args.sam_filename.endswith('.sam'):
        # sys.stderr.write works on both Python 2 and 3, unlike `print >>`.
        sys.stderr.write("Only accepts files ending in .sam. Abort!\n")
        sys.exit(-1)

    prefix = args.sam_filename[:-4]
    output_gff3 = prefix + '.gff3'

    q_dict = None
    if args.input_fasta is not None:
        # Close the FASTA handle deterministically once the lengths are read.
        with open(args.input_fasta) as fasta_handle:
            q_dict = dict((r.id, len(r.seq))
                          for r in SeqIO.parse(fasta_handle, 'fasta'))

    with open(output_gff3, 'w') as f:
        recs = [convert_sam_rec_to_gff3_rec(r0, args.source)
                for r0 in GMAPSAMReader(args.sam_filename, True,
                                        query_len_dict=q_dict)]
        # Records that could not be converted come back as None; skip them.
        BCBio_GFF.write([x for x in recs if x is not None], f)

    sys.stderr.write("Output written to {0}.\n".format(output_gff3))
def to_gff_file(self, file):
    """
    Export to GFF format, saving to provided file like object.

    One record is built per fragment of the genome; every annotation of the
    fragment becomes one feature carrying a ``name`` qualifier. The
    sequences are included in the output (``include_fasta=True``).
    """

    def build_feature(annotation):
        # FeatureLocation first bp is AfterPosition, so -1
        location = FeatureLocation(annotation.base_first - 1,
                                   annotation.base_last)
        quals = {'name': annotation.feature.name}
        strand = annotation.feature.strand
        if strand is None:
            # Features without a strand are written with strand 0.
            strand = 0
        return SeqFeature(location, type=annotation.feature.type,
                          strand=strand, qualifiers=quals)

    records = []
    for fragment in self.__genome.fragments.all():
        indexed = fragment.indexed_fragment()
        record = SeqRecord(Seq(indexed.sequence), "%s" % (indexed.name,))
        record.features = [build_feature(a) for a in indexed.annotations()]
        records.append(record)

    GFF.write(records, file, include_fasta=True)
def to_gff(self, filename):
    """
    Export to GFF format, saving to the specified filename.

    One record is built per fragment of the genome; every annotation of the
    fragment becomes one feature on strand 1 carrying a ``name`` qualifier.
    The sequences are included in the output (``include_fasta=True``).
    """

    def build_feature(annotation):
        # FeatureLocation first bp is AfterPosition, so -1
        location = FeatureLocation(annotation.base_first - 1,
                                   annotation.base_last)
        quals = {'name': annotation.feature.name}
        return SeqFeature(location, type=annotation.feature.type,
                          strand=1, qualifiers=quals)

    records = []
    for fragment in self.__genome.fragments.all():
        indexed = fragment.indexed_fragment()
        record = SeqRecord(Seq(indexed.sequence), "%s" % (indexed.name,))
        record.features = [build_feature(a) for a in indexed.annotations()]
        records.append(record)

    with open(filename, "w") as out_handle:
        GFF.write(records, out_handle, include_fasta=True)
def CpGIslandsToGFF(island_location):
    """Output methylation regions (CpG islands) to a GFF3 file.

    The file is created in the current working directory and named after
    the module-level ``base`` with its extension replaced by ``.gff``. A
    single ``region`` feature spanning the module-level ``cur_record`` is
    written, with one ``CpG_island`` sub-feature per entry of
    ``island_location``.

    :param island_location: iterable of indexable items; each island spans
        ``item[0] - item[1]/2`` to ``item[0] + item[1]/2`` (presumably
        centre and length -- confirm against the caller).
    """
    out_file = '%s/%s.gff' % (os.getcwd(), os.path.splitext(base)[0])

    rec = SeqRecord(cur_record.seq, "ID1")
    region_quals = {"source": "bssimulation", "score": '.',
                    "ID": cur_record.name}
    island_quals = {"source": "bssimulation"}
    region = SeqFeature(FeatureLocation(0, len(cur_record)), type="region",
                        strand=0, qualifiers=region_quals)

    for island in island_location:
        half_width = island[1] / 2
        begin = int(island[0] - half_width)
        end = int(island[0] + half_width)
        region.sub_features.append(
            SeqFeature(FeatureLocation(begin, end), type="CpG_island",
                       strand=0, qualifiers=island_quals))

    rec.features = [region]
    with open(out_file, "w") as out_handle:
        GFF.write([rec], out_handle)
def rebase(parent, child, interpro=False, protein2dna=False):
    """Re-map child features onto their parent features and print GFF3.

    For every record parsed from ``parent``, the features yielded by
    ``feature_lambda`` have their child features (looked up by ``feature.id``
    in the mapping returned by ``__get_features``) relocated with
    ``__update_feature_location``. The record's features are replaced by the
    relocated children, its annotations are cleared, and the record is
    written to standard output.

    :param parent: GFF3 input handed to ``GFF.parse``.
    :param child: input handed to ``__get_features``.
    :param interpro: forwarded to ``__get_features``; when true the
        ``status`` and ``Target`` qualifiers are removed from each child.
    :param protein2dna: forwarded to ``__update_feature_location``.
    """
    child_features = __get_features(child, interpro=interpro)

    for rec in GFF.parse(parent):
        replacement_features = []
        # Presumably yields only the features whose ID qualifier is one of
        # the child_features keys -- confirm against feature_lambda.
        for feature in feature_lambda(
                rec.features,
                feature_test_qual_value,
                {
                    'qualifier': 'ID',
                    'attribute_list': child_features.keys(),
                },
                subfeatures=False):
            for x in child_features[feature.id]:
                # Then update the location of the actual feature
                __update_feature_location(x, feature, protein2dna)

                if interpro:
                    for y in ('status', 'Target'):
                        # A missing qualifier is fine; anything else should
                        # surface instead of being swallowed by a bare except.
                        x.qualifiers.pop(y, None)

                replacement_features.append(x)
        # We do this so we don't include the original set of features that we
        # were rebasing against in our result.
        rec.features = replacement_features
        rec.annotations = {}
        GFF.write([rec], sys.stdout)
def genbank_to_gff(self, genbank_file):
    """Convert a GenBank file to GFF, including the sequences as FASTA.

    The output is written next to the input with the extension replaced by
    ``.gff``.

    :param genbank_file: path of the GenBank file to convert.
    :return: dict with the single key ``gff_file`` holding the output path.
    """
    from Bio import SeqIO
    from BCBio import GFF

    stem = os.path.splitext(genbank_file)[0]
    gff_file = "%s.gff" % (stem,)
    records = SeqIO.parse(genbank_file, "genbank")
    with open(gff_file, "w") as out_handle:
        GFF.write(records, out_handle, include_fasta=True)
    return {"gff_file": gff_file}
def main(gb_file,include_fasta=None):
    """Convert a GenBank file to GFF, written next to the input as ``.gff``.

    :param gb_file: path of the GenBank file to convert.
    :param include_fasta: optional string; FASTA output is requested only
        when it is ``"true"``, ``"yes"`` or ``"1"`` (case-insensitive).
    """
    wants_fasta = (include_fasta is not None
                   and include_fasta.lower() in ("true", "yes", "1"))
    stem = os.path.splitext(gb_file)[0]
    out_file = "%s.gff" % stem
    with open(out_file, "w") as out_handle:
        GFF.write(SeqIO.parse(gb_file, "genbank"), out_handle, wants_fasta)
def embl2gff(dat, org, gff):
    """
    Parse an EMBL file and extract mature miRNA location information.

    Records whose ``name`` starts with ``org`` are kept, their ``id`` is set
    to their ``name``, and they are written to ``gff`` with ``GFF.write``.
    """
    selected = []
    for entry in SeqIO.parse(dat, "embl"):
        # keep organism specific miRNAs only
        if not entry.name.startswith(org):
            continue
        entry.id = entry.name
        selected.append(entry)

    GFF.write(selected, gff)
def genbank_to_gff(gb_file):
    """Convert GenBank file to GFF for IGV display.

    The output path is the input path with its extension replaced by
    ``.gff3``. Nothing is done when that file already exists. Records are
    passed through ``_filter_features`` with a size limit of 1e4 before
    being written.
    """
    max_size = 1e4
    gff_file = os.path.splitext(gb_file)[0] + ".gff3"
    if os.path.exists(gff_file):
        return
    with open(gb_file) as in_handle, open(gff_file, "w") as out_handle:
        records = SeqIO.parse(in_handle, "genbank")
        GFF.write(_filter_features(records, max_size), out_handle)
def to_GFF(args):
    """
    Convert a GenBank or EMBL file to GFF.

    Biopython does not natively support GFF.
    Can be useful for QUAST (Quality Assessment Tool for Genome Assemblies).

    :param args: an argparse args list; ``inFormat`` (lower-cased) selects
        the input parser, ``input`` and ``output`` are file paths.
    """
    source_format = args.inFormat.lower()
    with open(args.input) as fin:
        with open(args.output, "w") as fout:
            records = SeqIO.parse(fin, source_format)
            GFF.write(records, fout)
def t_write_seqrecord(self):
    """Write single SeqRecords.
    """
    gene_quals = {"source": "prediction", "score": 10.0,
                  "other": ["Some", "annotations"], "ID": "gene1"}
    gene = SeqFeature(FeatureLocation(0, 20), type="gene", strand=1,
                      qualifiers=gene_quals)
    record = SeqRecord(Seq("GATCGATCGATCGATCGATC"), "ID1")
    record.features = [gene]

    buf = StringIO.StringIO()
    GFF.write([record], buf, include_fasta=True)

    # The third output line is expected to be the feature row; its first
    # column must be the record id.
    feature_row = buf.getvalue().split("\n")[2]
    assert feature_row.split("\t")[0] == "ID1"
def t_gff3_to_gff3(self):
    """Read in and write out GFF3 without any loss of information.
    """
    recs = SeqIO.to_dict(GFF.parse(self._test_gff_file))
    out_handle = StringIO.StringIO()
    GFF.write(recs.values(), out_handle)
    wrote_handle = StringIO.StringIO(out_handle.getvalue())
    recs_two = SeqIO.to_dict(GFF.parse(wrote_handle))

    orig_rec = list(recs.values())[0]
    # Compare against the re-parsed copy of the same record. Previously both
    # sides were taken from ``recs``, so the round trip was never checked.
    re_rec = recs_two[orig_rec.id]
    assert len(orig_rec.features) == len(re_rec.features)
    for i, orig_f in enumerate(orig_rec.features):
        assert str(orig_f) == str(re_rec.features[i])
def t_write_fasta(self):
    """Include FASTA records in GFF output.
    """
    seq = Seq("GATCGATCGATCGATCGATC")
    gene_quals = {"source": "prediction", "score": 10.0,
                  "other": ["Some", "annotations"], "ID": "gene1"}
    gene = SeqFeature(FeatureLocation(0, 20), type="gene", strand=1,
                      qualifiers=gene_quals)
    record = SeqRecord(seq, "ID1")
    record.features = [gene]

    buf = StringIO.StringIO()
    GFF.write([record], buf, include_fasta=True)

    # Everything from the fourth output line onwards is the FASTA section.
    fasta_parts = buf.getvalue().split("\n")[3:]
    expected = ("##FASTA", ">ID1 <unknown description>", str(seq))
    for offset, wanted in enumerate(expected):
        assert fasta_parts[offset] == wanted
def main():
    """Convert every ``*.gb`` file in the current directory to GFF.

    Each ``name.gb`` is written to ``name.gff`` in the same directory and a
    progress line is printed per converted file.
    """
    for gb_path in glob.glob("*.gb"):
        # Strip only the trailing ".gb". Splitting on the first dot made
        # dotted names such as "a.1.gb" and "a.2.gb" both write to "a.gff".
        out_file = gb_path.rsplit(".", 1)[0] + ".gff"
        # Context managers close both handles even if the conversion fails.
        with open(gb_path) as in_handle, open(out_file, "w") as out_handle:
            GFF.write(SeqIO.parse(in_handle, "genbank"), out_handle)
        # Parenthesised single-argument print works on Python 2 and 3.
        print("converted..." + str(gb_path))
def export(org_cn, seqs):
    """Export an organism's annotations as GFF3 and/or FASTA.

    The export text is requested through ``wa.io.write`` (restricted to
    ``seqs`` when that list is non-empty, otherwise all sequences), parsed
    with ``GFF.parse`` and written record by record to ``args.gff`` and
    ``args.fasta`` when those are set. Exits with status 2 when nothing
    could be parsed.

    :param org_cn: organism common name passed to the ``wa`` client.
    :param seqs: sequence names to export; empty means "all".
    :return: the value of ``wa.organisms.findOrganismByCn(org_cn)``.
    """
    org_data = wa.organisms.findOrganismByCn(org_cn)

    kwargs = dict(
        exportType='GFF3',
        seqType='genomic',
        exportGff3Fasta=True,
        output="text",
        exportFormat="text",
        organism=org_cn,
    )
    restrict = len(seqs) > 0
    exported = wa.io.write(
        exportAllSequences=not restrict,
        sequences=seqs if restrict else [],
        **kwargs
    )

    # NOTE(review): the payload is encoded before being written to a
    # StringIO; if `io` is the standard-library module this expects text --
    # confirm.
    data = io.StringIO()
    data.write(exported.encode('utf-8'))
    # Seek back to start
    data.seek(0)

    records = list(GFF.parse(data))
    if len(records) == 0:
        print("Could not find any sequences or annotations for this organism + reference sequence")
        sys.exit(2)

    for record in records:
        record.annotations = {}
        record.features = sorted(record.features,
                                 key=lambda feat: feat.location.start)
        if args.gff:
            GFF.write([record], args.gff)
        record.description = ""
        if args.fasta:
            SeqIO.write([record], args.fasta, 'fasta')

    return org_data
def gb2gff(gbname):
    """
    Write ``prefix.gff`` and ``prefix.fasta`` for a GenBank file.

    The file name is supposed to end as ``prefix.gb``.
    usage: python gb2gff.py "nigoni.gb"

    :param gbname: path of the GenBank file.
    """
    # NOTE(review): str.replace removes every ".gb" occurrence, not only the
    # trailing extension -- kept as is for backward compatibility.
    prefix = gbname.replace(".gb", "")

    # Context managers make sure the output files are closed on errors too.
    with open(gbname) as in_handle, open(prefix + ".gff", "w") as out_gff:
        GFF.write(SeqIO.parse(in_handle, "genbank"), out_gff)

    # have to reopen the file: the parser consumed the first handle
    with open(gbname) as in_handle, open(prefix + ".fasta", "w") as out_fasta:
        count = SeqIO.write(SeqIO.parse(in_handle, "genbank"), out_fasta, "fasta")

    print("Converted %i records" % count)
def parse_gff(id_start_end, gff3):
    """Print the features of ``gff3`` that overlap the given ranges.

    :param id_start_end: iterable of whitespace-separated lines of the form
        ``<record id> <start> <end>``; blank lines are ignored.
    :param gff3: GFF3 input handed to ``GFF.parse``.

    Every parsed record is written to standard output with its features
    reduced to those overlapping at least one range listed for the record
    id. A record with no listed range is written without features.
    """
    ids = {}
    for line in id_start_end:
        fields = line.split()
        if not fields:
            continue
        ids.setdefault(fields[0], []).append((int(fields[1]), int(fields[2])))

    for rec in GFF.parse(gff3):
        # Records absent from the range list simply match nothing.
        locs = ids.get(rec.id, [])
        feats = []
        for feat in rec.features:
            f_loc = (feat.location.start, feat.location.end)
            # any() keeps each feature once, even when it overlaps several
            # ranges; emitting it repeatedly would duplicate GFF rows.
            if any(min(f_loc) <= max(loc) and max(f_loc) >= min(loc)
                   for loc in locs):
                feats.append(feat)
        rec.features = feats
        GFF.write([rec], sys.stdout)
def gb2gff(gbname):
    """
    Write ``prefix.gff`` and ``prefix.fasta`` for a GenBank file.

    The file name is supposed to end as ``prefix.gb``.
    usage: python gb2gff.py "nigoni.gb"

    :param gbname: path of the GenBank file.
    """
    # NOTE(review): str.replace removes every ".gb" occurrence, not only the
    # trailing extension -- kept as is for backward compatibility.
    prefix = gbname.replace(".gb", "")

    # Context managers make sure the output files are closed on errors too.
    with open(gbname) as in_handle, open(prefix + ".gff", "w") as out_gff:
        GFF.write(SeqIO.parse(in_handle, "genbank"), out_gff)

    # have to reopen the file: the parser consumed the first handle
    with open(gbname) as in_handle, open(prefix + ".fasta", "w") as out_fasta:
        count = SeqIO.write(SeqIO.parse(in_handle, "genbank"), out_fasta,
                            "fasta")

    print("Converted %i records" % count)
def t_gff2_to_gff3(self):
    """Read in GFF2 and write out as GFF3.
    """
    recs = SeqIO.to_dict(GFF.parse(self._wormbase_file))
    out_handle = StringIO.StringIO()
    GFF.write(recs.values(), out_handle)
    wrote_handle = StringIO.StringIO(out_handle.getvalue())

    # check some tricky lines in the GFF2 file
    # Each entry: (marker identifying the line, lowest accepted index of the
    # marker within the line, text that must also appear on that line).
    tricky_lines = (
        ("Interpolated_map_position", 0, "RFLP=No"),
        ("Gene=WBGene00000138", 1, "ID=B0019.1"),
        ("translated_nucleotide_match\t12762127", 1,
         "Note=MSP:FADFSPLDVSDVNFATDDLAK"),
    )
    checks = 0
    for line in wrote_handle:
        for marker, lowest_index, required in tricky_lines:
            if line.find(marker) >= lowest_index:
                checks += 1
                assert line.find(required) > 0
    assert checks == 3, "Missing check line"
def genbank2gff(inputfile):
    """
    Convert a GenBank file to a GFF file.

    * inputfile: str. File name ending with ``gbk`` or ``gb``.

    The output name is the input name with its last three characters
    (names ending in ``gb``) or last four characters (anything else)
    replaced by ``.gff``.

    return the output file name
    """
    if inputfile.endswith("gb"):
        out = inputfile[:-3] + ".gff"
    else:
        out = inputfile[:-4] + ".gff"

    # Context managers close both handles even when parsing or writing fails.
    with open(inputfile, "r") as handle, open(out, "w") as out_handle:
        GFF.write(SeqIO.parse(handle, "genbank"), out_handle)
    return out
def write_gff_cluster(clusters, header, output_path, sample_name='sample', threads=1):
    """Write clusters as GFF entries.

    One empty-sequence record is created per entry of ``header.references``
    (keyed by its index). ``get_feature`` is run in a thread pool for every
    cluster whose ``exclude`` attribute is false; each result is a
    ``(tid, feature)`` pair and the feature is appended to record ``tid``.

    :param clusters: iterable of cluster objects.
    :param header: object exposing ``references``.
    :param output_path: path of the GFF file to write.
    :param sample_name: forwarded to ``get_feature``.
    :param threads: size of the thread pool.
    """
    records = OrderedDict((tid, SeqRecord(Seq(""), sn))
                          for tid, sn in enumerate(header.references))
    # The executor context manager shuts the pool down (waiting for workers)
    # even if collecting a result or writing the file raises.
    with open(output_path, "w") as out_handle, ThreadPoolExecutor(threads) as tp:
        futures = [tp.submit(get_feature, cluster, sample_name, i)
                   for i, cluster in enumerate(clusters)
                   if not cluster.exclude]
        # Collect in submission order so the output order is stable.
        for future in futures:
            tid, feature = future.result()
            records[tid].features.append(feature)
        GFF.write(records.values(), out_handle)
def t_gff2_to_gff3(self):
    """Read in GFF2 and write out as GFF3.
    """
    recs = SeqIO.to_dict(GFF.parse(self._wormbase_file))
    out_handle = StringIO.StringIO()
    GFF.write(recs.values(), out_handle)
    wrote_handle = StringIO.StringIO(out_handle.getvalue())

    # check some tricky lines in the GFF2 file
    # Each entry: (marker identifying the line, lowest accepted index of the
    # marker within the line, text that must also appear on that line).
    tricky_lines = (
        ("Interpolated_map_position", 0, "RFLP=No"),
        ("Gene=WBGene00000138", 1, "ID=B0019.1"),
        ("translated_nucleotide_match\t12762127", 1,
         "Note=MSP%3AFADFSPLDVSDVNFATDDLAK"),
    )
    checks = 0
    for line in wrote_handle:
        for marker, lowest_index, required in tricky_lines:
            if line.find(marker) >= lowest_index:
                checks += 1
                assert line.find(required) > 0
    assert checks == 3, "Missing check line"
def rebase(parent, child, interpro=False, protein2dna=False, map_by='ID'):
    """Re-map child features onto their parent features and print GFF3.

    :param parent: GFF3 input handed to ``GFF.parse``.
    :param child: input handed to ``__get_features``.
    :param interpro: forwarded to ``__get_features``; when true the
        ``status`` and ``Target`` qualifiers are removed from each child.
    :param protein2dna: forwarded to ``__update_feature_location``.
    :param map_by: name of the parent qualifier used to select features.

    Each parsed record has its features replaced by the relocated child
    features and its annotations cleared, then is written to stdout.
    """
    # get all of the features we will be re-mapping in a dictionary, keyed by parent feature ID
    child_features = __get_features(child, interpro=interpro)

    for rec in GFF.parse(parent):
        replacement_features = []
        # Horrifically slow I believe
        for feature in feature_lambda(
                rec.features,
                # Filter features in the parent genome by those that are
                # "interesting", i.e. have results in child_features array.
                # Probably an unnecessary optimisation.
                feature_test_qual_value,
                {
                    'qualifier': map_by,
                    'attribute_list': child_features.keys(),
                },
                subfeatures=False):
            # Features which will be re-mapped
            # NOTE(review): selection uses the `map_by` qualifier while the
            # lookup uses feature.id -- confirm these agree when
            # map_by != 'ID'.
            # TODO: update starts
            for x in child_features[feature.id]:
                # Then update the location of the actual feature
                __update_feature_location(x, feature, protein2dna)

                if interpro:
                    for y in ('status', 'Target'):
                        # A missing qualifier is fine; anything else should
                        # surface instead of being swallowed by a bare except.
                        x.qualifiers.pop(y, None)

                replacement_features.append(x)
        # We do this so we don't include the original set of features that we
        # were rebasing against in our result.
        rec.features = replacement_features
        rec.annotations = {}
        GFF.write([rec], sys.stdout)
def t_write_from_recs(self):
    """Write out GFF3 from SeqRecord inputs.
    """
    gene_quals = {
        "source": "prediction",
        "score": 10.0,
        "other": ["Some", "annotations"],
        "ID": "gene1"
    }
    exon_quals = {"source": "prediction"}

    # One gene covering the whole 20 bp sequence with an exon at each end.
    gene = SeqFeature(FeatureLocation(0, 20), type="gene", strand=1,
                      qualifiers=gene_quals)
    gene.sub_features = [
        SeqFeature(FeatureLocation(0, 5), type="exon", strand=1,
                   qualifiers=exon_quals),
        SeqFeature(FeatureLocation(15, 20), type="exon", strand=1,
                   qualifiers=exon_quals),
    ]
    record = SeqRecord(Seq("GATCGATCGATCGATCGATC"), "ID1")
    record.features = [gene]

    buf = StringIO()
    GFF.write([record], buf)
    lines = buf.getvalue().split("\n")

    assert lines[0] == "##gff-version 3"
    assert lines[1] == "##sequence-region ID1 1 20"
    gene_columns = lines[2].split("\t")
    print(gene_columns)
    assert gene_columns == [
        'ID1', 'prediction', 'gene', '1', '20', '10.0', '+', '.',
        'ID=gene1;other=Some,annotations'
    ]
    exon_columns = lines[3].split("\t")
    assert exon_columns == [
        'ID1', 'prediction', 'exon', '1', '5', '.', '+', '.', 'Parent=gene1'
    ]
def rebase(parent, child, interpro=False, protein2dna=False, map_by='ID'):
    """Re-map child features onto their parent features and print GFF3.

    :param parent: GFF3 input handed to ``GFF.parse``.
    :param child: input handed to ``__get_features``.
    :param interpro: forwarded to ``__get_features``; when true the
        ``status`` and ``Target`` qualifiers are removed from each child.
    :param protein2dna: forwarded to ``__update_feature_location``.
    :param map_by: name of the parent qualifier used to select features.

    Each parsed record has its features replaced by the relocated child
    features and its annotations cleared, then is written to stdout.
    """
    # get all of the features we will be re-mapping in a dictionary, keyed by parent feature ID
    child_features = __get_features(child, interpro=interpro)

    for rec in GFF.parse(parent):
        replacement_features = []
        for feature in feature_lambda(
                rec.features,
                # Filter features in the parent genome by those that are
                # "interesting", i.e. have results in child_features array.
                # Probably an unnecessary optimisation.
                feature_test_qual_value,
                {
                    'qualifier': map_by,
                    'attribute_list': child_features.keys(),
                },
                subfeatures=False):
            # Features which will be re-mapped
            # NOTE(review): selection uses the `map_by` qualifier while the
            # lookup uses feature.id -- confirm these agree when
            # map_by != 'ID'.
            # TODO: update starts
            for x in child_features[feature.id]:
                # Then update the location of the actual feature
                __update_feature_location(x, feature, protein2dna)

                if interpro:
                    for y in ('status', 'Target'):
                        # Only a missing qualifier is expected here, so drop
                        # it with pop() rather than hiding every exception.
                        x.qualifiers.pop(y, None)

                replacement_features.append(x)
        # We do this so we don't include the original set of features that we
        # were rebasing against in our result.
        rec.features = replacement_features
        rec.annotations = {}
        GFF.write([rec], sys.stdout)
def to_GFF(args):
    """
    Convert a GenBank or EMBL file to GFF.

    Mainly useful for QUAST (Quality Assessment Tool for Genome Assemblies).

    :param args: an argparse args list. ``in_file`` and ``out_file`` are
        user-expanded in place; ``embl`` selects EMBL instead of GenBank
        input; ``getfasta`` additionally writes the sequences to a ``.fa``
        file next to ``out_file``.
    """
    args.in_file = os.path.expanduser(args.in_file)
    args.out_file = os.path.expanduser(args.out_file)
    in_type = "embl" if args.embl else "genbank"

    with open(args.in_file) as fin, open(args.out_file, 'w') as fout:
        GFF.write(SeqIO.parse(fin, in_type), fout)

    if args.getfasta:
        # Same directory and stem as the GFF output, with a .fa extension.
        base = os.path.dirname(args.out_file)
        fasta = os.path.splitext(os.path.basename(args.out_file))[0] + '.fa'
        fasta_out = os.path.join(base, fasta)
        # The input is reopened because the first pass consumed the handle.
        with open(args.in_file) as fin, open(fasta_out, 'w') as opt_out:
            SeqIO.write(SeqIO.parse(fin, in_type), opt_out, "fasta")
def main(
    sam_filename: str = typer.Argument(...),
    input_fasta: Optional[str] = typer.Option(
        None,
        "--input_fasta",
        "-i",
        help="(Optional) input fasta. If given, coverage will be calculated.",
    ),
    source: str = typer.Option(
        ..., "--source", "-s", help="source name (ex: hg38, mm10)"
    ),
    version: bool = typer.Option(
        None,
        "--version",
        callback=version_callback,
        is_eager=True,
        help="Prints the version of the SQANTI3 package.",
    ),
):
    """Convert a ``.sam`` file to GFF3 using BCBio GFF.

    The output file is named ``<stem>.gff3`` where ``<stem>`` is the input
    file name without directory and suffix, so it is created relative to
    the current working directory.

    Raises:
        RuntimeError: if ``sam_filename`` does not have a ``.sam`` suffix.
    """
    sam_filename = Path(sam_filename)

    if sam_filename.suffix != ".sam":
        raise RuntimeError("Only accepts files ending in .sam. Abort!")

    prefix = sam_filename.stem
    output_gff3 = f"{prefix}.gff3"

    q_dict = None
    if input_fasta is not None:
        # Close the FASTA handle deterministically once the lengths are read.
        with open(input_fasta) as fasta_handle:
            q_dict = {r.id: len(r.seq) for r in SeqIO.parse(fasta_handle, "fasta")}

    with open(output_gff3, "w") as f:
        recs = [
            convert_sam_rec_to_gff3_rec(r0, source)
            for r0 in GMAPSAMReader(sam_filename, True, query_len_dict=q_dict)
        ]
        # Records that could not be converted come back as None; skip them.
        BCBio_GFF.write([x for x in recs if x is not None], f)

    logger.info("Output written to %s.", output_gff3)
def parse_gff(locations, gff3):
    """Print the features of ``gff3`` that contain any of the given bases.

    :param locations: iterable of lines, each holding one integer base
        location; blank lines are ignored.
    :param gff3: GFF3 input handed to ``GFF.parse``.

    Every parsed record is written to standard output with its features
    reduced to those containing at least one listed location.
    """
    locs = []
    for line in locations:
        # Consume lines from tabular list of base locations and convert into int list
        line = line.strip()
        if line:
            locs.append(int(line))
    # sort for speed
    locs.sort()

    for rec in GFF.parse(gff3):
        matched_features = []
        for feat in rec.features:
            for loc in locs:
                if loc in feat:
                    # base location is found within this feature's boundary;
                    # stop at the first hit so the feature is kept once even
                    # when several locations fall inside it
                    matched_features.append(feat)
                    break
                elif loc > feat.location.end:
                    # locations are now beyond this feature, skip checking
                    break
        rec.features = matched_features
        GFF.write([rec], sys.stdout)
def check_and_dump_annotations(annotations, fasta, outfile_path):
    """Check each annotation against ``fasta`` and dump the valid ones as GFF.

    ``annotation.check_annotation(fasta)`` returns an ``(orf, cause)`` pair.
    When ``orf`` is falsy the cause is printed; otherwise a record with one
    ``gene`` feature and one ``CDS`` sub-feature (spanning ``orf[0]`` to
    ``orf[1]``) is appended to the output file.

    :param annotations: iterable of annotation objects.
    :param fasta: sequence handed to ``check_annotation`` and ``SeqRecord``.
    :param outfile_path: path of the GFF file to write.
    """
    pending = iter(annotations)
    with open(outfile_path, 'w') as outfile:
        for annotation in pending:
            orf, cause = annotation.check_annotation(fasta)
            if not orf:
                print("Gene with id={0}: {1}".format(annotation.id(), cause))
                continue

            record = SeqRecord(fasta, str(annotation.id()))
            gene_quals = {"source": "prediction", "score": 10.0,
                          "other": ["Some", "annotations"],
                          "ID": annotation.id()}
            gene_location = FeatureLocation(annotation.start(),
                                            annotation.end())
            gene = SeqFeature(gene_location, type='gene',
                              strand=1 if annotation.is_forward() else -1,
                              qualifiers=gene_quals)
            cds_quals = {"source": "prediction"}
            cds = SeqFeature(FeatureLocation(orf[0], orf[1]), type="CDS",
                             strand=1 if annotation.is_forward() else -1,
                             qualifiers=cds_quals)
            gene.sub_features = [cds]
            record.features = [gene]
            GFF.write([record], outfile)
def t_write_seqrecord(self):
    """Write single SeqRecords.
    """
    gene_quals = {
        "source": "prediction",
        "score": 10.0,
        "other": ["Some", "annotations"],
        "ID": "gene1"
    }
    gene = SeqFeature(FeatureLocation(0, 20), type="gene", strand=1,
                      qualifiers=gene_quals)
    record = SeqRecord(Seq("GATCGATCGATCGATCGATC"), "ID1")
    record.features = [gene]

    buf = StringIO()
    GFF.write([record], buf, include_fasta=True)

    # The third output line is expected to be the feature row; its first
    # column must be the record id.
    feature_row = buf.getvalue().split("\n")[2]
    assert feature_row.split("\t")[0] == "ID1"
def generate_gff_from_genbank(ref_genome):
    """If this reference genome has a genbank but not a GFF, generate
    a GFF from the genbank.

    The GFF is written next to the GenBank file (same name, ``.gff``
    extension) and then registered through ``copy_and_add_dataset_source``.
    Returns ``None``; does nothing when a GFF dataset already exists.
    """
    # If a GFF already exists, then just return.
    if ref_genome.dataset_set.filter(
            type=Dataset.TYPE.REFERENCE_GENOME_GFF).exists():
        return

    # Check that a genbank exists.
    assert ref_genome.dataset_set.filter(
            type=Dataset.TYPE.REFERENCE_GENOME_GENBANK).exists()

    # Get genbank path and filename components (for creating GFF file name).
    genbank_path = get_dataset_with_type(
            ref_genome,
            type=Dataset.TYPE.REFERENCE_GENOME_GENBANK).get_absolute_location()

    genbank_dir, genbank_filename = os.path.split(genbank_path)
    genbank_noext = os.path.splitext(genbank_filename)[0]

    # Put the GFF file in the same dir, just change the extension to .gff.
    gff_filename = os.path.join(genbank_dir, (genbank_noext + '.gff'))

    # Get the individual records, each corresponding to a chromosome.
    genome_records = list(SeqIO.parse(genbank_path, 'genbank'))

    # SnpEFF takes the name attr, but the BioPython uses the id attr to make its
    # GFF file, so the two are made identical when converting to GFF.
    # NOTE(review): the code copies id into name, whereas the original
    # comment spoke of overwriting the id with the name -- confirm which
    # direction is intended.
    for genome_record in genome_records:
        genome_record.name = genome_record.id

    # Close (and therefore flush) the GFF before it is copied below.
    with open(gff_filename, 'w') as gff_handle:
        GFF.write(genome_records, gff_handle)

    dataset_type = IMPORT_FORMAT_TO_DATASET_TYPE['gff']
    copy_and_add_dataset_source(ref_genome, dataset_type,
            dataset_type, gff_filename)
def generate_gff_from_genbank(ref_genome):
    """If this reference genome has a genbank but not a GFF, generate
    a GFF from the genbank.

    The GFF is written next to the GenBank file (same name, ``.gff``
    extension) and then registered through ``copy_and_add_dataset_source``.
    Returns ``None``; does nothing when a GFF dataset already exists.
    """
    # If a GFF already exists, then just return.
    if ref_genome.dataset_set.filter(
            type=Dataset.TYPE.REFERENCE_GENOME_GFF).exists():
        return

    # Check that a genbank exists.
    assert ref_genome.dataset_set.filter(
            type=Dataset.TYPE.REFERENCE_GENOME_GENBANK).exists()

    # Get genbank path and filename components (for creating GFF file name).
    genbank_path = get_dataset_with_type(
            ref_genome,
            type=Dataset.TYPE.REFERENCE_GENOME_GENBANK).get_absolute_location()

    genbank_dir, genbank_filename = os.path.split(genbank_path)
    genbank_noext = os.path.splitext(genbank_filename)[0]

    # Put the GFF file in the same dir, just change the extension to .gff.
    gff_filename = os.path.join(genbank_dir, (genbank_noext + '.gff'))

    # Get the individual records, each corresponding to a chromosome.
    genome_records = list(SeqIO.parse(genbank_path, 'genbank'))

    # SnpEFF takes the name attr, but the BioPython uses the id attr to make its
    # GFF file, so the two are made identical when converting to GFF.
    # NOTE(review): the code copies id into name, whereas the original
    # comment spoke of overwriting the id with the name -- confirm which
    # direction is intended.
    for genome_record in genome_records:
        genome_record.name = genome_record.id

    # Close (and therefore flush) the GFF before it is copied below.
    with open(gff_filename, 'w') as gff_handle:
        GFF.write(genome_records, gff_handle)

    dataset_type = IMPORT_FORMAT_TO_DATASET_TYPE['gff']
    copy_and_add_dataset_source(ref_genome, dataset_type,
            dataset_type, gff_filename)
def split_into_frames(gff3):
    """Split the genes of a GFF3 file into six files, one per reading frame.

    For every record, the gene features yielded by ``feature_lambda`` are
    assigned a frame number: 1-3 for strand ``1`` (``start % 3 + 1``) and
    4-6 otherwise (``-(start - 1) % 3 + 4``). A copy of the record with its
    annotations cleared is then appended to ``rf1.gff3`` ... ``rf6.gff3``
    (in the current working directory), each time carrying only the genes
    of that frame.

    :param gff3: GFF3 input handed to ``GFF.parse``.
    """
    for rec in GFF.parse(gff3):
        # Index 0 holds frame 1, ..., index 5 holds frame 6. A list replaces
        # the former lookups of rf1..rf6 through locals().
        frames = [[] for _ in range(6)]
        dummy_rec = copy.deepcopy(rec)
        dummy_rec.annotations = {}
        for gene in feature_lambda(rec.features, feature_test_type,
                                   {'types': 'gene'}, subfeatures=True):
            if gene.location.strand == 1:
                frame = (gene.location.start % 3) + 1
            else:
                frame = (-(gene.location.start - 1) % 3) + 4
            frames[frame - 1].append(gene)

        for number, genes in enumerate(frames, 1):
            dummy_rec.features = genes
            # Append mode: every record adds to the same six files.
            with open('rf' + str(number) + '.gff3', 'a') as outfile:
                GFF.write([dummy_rec], outfile)
def main():
    """Command-line entry point: convert a ``.sam`` file to GFF3.

    The output is written next to the input, with the ``.sam`` suffix
    replaced by ``.gff3``. When ``--input_fasta`` is given, a mapping of
    sequence id to sequence length is built from it and passed to
    ``GMAPSAMReader`` as ``query_len_dict``.

    Exits with status -1 when the input file name does not end in ``.sam``.
    """
    from argparse import ArgumentParser

    parser = ArgumentParser("Convert SAM to GFF3 format using BCBio GFF")
    parser.add_argument("sam_filename")
    parser.add_argument(
        "-i",
        "--input_fasta",
        default=None,
        help="(Optional) input fasta. If given, coverage will be calculated.")
    parser.add_argument("-s",
                        "--source",
                        required=True,
                        help="source name (ex: hg38, mm10)")
    args = parser.parse_args()

    if not args.sam_filename.endswith('.sam'):
        # sys.stderr.write works on both Python 2 and 3, unlike `print >>`.
        sys.stderr.write("Only accepts files ending in .sam. Abort!\n")
        sys.exit(-1)

    prefix = args.sam_filename[:-4]
    output_gff3 = prefix + '.gff3'

    q_dict = None
    if args.input_fasta is not None:
        # Close the FASTA handle deterministically once the lengths are read.
        with open(args.input_fasta) as fasta_handle:
            q_dict = dict((r.id, len(r.seq))
                          for r in SeqIO.parse(fasta_handle, 'fasta'))

    with open(output_gff3, 'w') as f:
        recs = [
            convert_sam_rec_to_gff3_rec(r0, args.source)
            for r0 in GMAPSAMReader(args.sam_filename, True,
                                    query_len_dict=q_dict)
        ]
        # Records that could not be converted come back as None; skip them.
        BCBio_GFF.write([x for x in recs if x is not None], f)

    sys.stderr.write("Output written to {0}.\n".format(output_gff3))
def t_write_fasta(self):
    """Include FASTA records in GFF output.
    """
    seq = Seq("GATCGATCGATCGATCGATC")
    gene_quals = {
        "source": "prediction",
        "score": 10.0,
        "other": ["Some", "annotations"],
        "ID": "gene1"
    }
    gene = SeqFeature(FeatureLocation(0, 20), type="gene", strand=1,
                      qualifiers=gene_quals)
    record = SeqRecord(seq, "ID1")
    record.features = [gene]

    buf = StringIO.StringIO()
    GFF.write([record], buf, include_fasta=True)

    # Everything from the fourth output line onwards is the FASTA section.
    fasta_parts = buf.getvalue().split("\n")[3:]
    expected = ("##FASTA", ">ID1 <unknown description>", str(seq))
    for offset, wanted in enumerate(expected):
        assert fasta_parts[offset] == wanted
def merge_interpro(gff3, interpro):
    """Merge selected InterPro qualifiers into a GFF3 file and print it.

    Only the ``Dbxref`` and ``Ontology_term`` qualifiers are carried over.
    Their values are collected per record id of ``interpro`` and appended
    to the qualifiers of every feature of ``gff3`` whose ``id`` equals such
    a record id. Each record of ``gff3`` is written to standard output with
    its annotations cleared.

    :param gff3: GFF3 input whose features receive the extra qualifiers.
    :param interpro: GFF3 input providing the qualifiers.
    """
    # blacklist = ('Name', 'ID', 'Target', 'date', 'status', 'signature_desc', 'source', 'md5', 'score')
    whitelist = ("Dbxref", "Ontology_term")

    ipr_additions = {}
    for rec in GFF.parse(interpro):
        collected = {}
        for feature in rec.features:
            for key, values in feature.qualifiers.items():
                # Qualifiers outside the whitelist are never merged, so they
                # are not collected in the first place.
                if key not in whitelist:
                    continue
                collected.setdefault(key, set()).update(values)
        ipr_additions[rec.id] = collected

    for rec in GFF.parse(gff3):
        for feature in feature_lambda(rec.features, feature_test_true, None,
                                      subfeatures=True):
            extra = ipr_additions.get(feature.id)
            if extra is None:
                continue
            for key, values in extra.items():
                feature.qualifiers.setdefault(key, [])
                feature.qualifiers[key] += list(values)
        rec.annotations = {}
        GFF.write([rec], sys.stdout)
def gb2gff(infile, outfile):
    """Translate GenBank file to GFF3 file.

    TODO: the procedure now does not handle join correctly

    Args:
        infile (str): input GenBank file
        outfile (str): output GFF3 file

    Returns:
        The return value of ``GFF.write`` (documented by the original
        author as the number of records written -- NOTE(review): confirm
        that ``GFF.write`` actually returns a count).
    """
    # Both handles are closed on exit; previously the input handle was never
    # closed and the output handle leaked when writing failed.
    with open(infile, 'r') as gb_handle, open(outfile, 'w') as gff_handle:
        res = GFF.write(SeqIO.parse(gb_handle, "gb"), gff_handle)
    return res
for hit in hits: rec_a_hits_in_b.append(rec_b_map[hit]) for feature in rec_b.features: hits = tree_a.find_range( (int(feature.location.start), int(feature.location.end))) for hit in hits: rec_b_hits_in_a.append(rec_a_map[hit]) rec_a.features = set(rec_a_hits_in_b) rec_b.features = set(rec_b_hits_in_a) return rec_a, rec_b if __name__ == '__main__': parser = argparse.ArgumentParser( description='rebase gff3 features against parent locations', epilog="") parser.add_argument('a', type=argparse.FileType("r")) parser.add_argument('b', type=argparse.FileType("r")) parser.add_argument('--oa', type=str, default='a_hits_in_b.gff') parser.add_argument('--ob', type=str, default='b_hits_in_a.gff') args = parser.parse_args() b, a = intersect(args.a, args.b) with open(args.oa, 'w') as handle: GFF.write([a], handle) with open(args.ob, 'w') as handle: GFF.write([b], handle)
for record in parse_transterm(transterm_output): yield record if __name__ == '__main__': parser = argparse.ArgumentParser(description='Export corresponding sequence in genome from GFF3', epilog="") parser.add_argument('fasta', help='Fasta Genome') parser.add_argument('gff3', help='GFF3 File') parser.add_argument('--min_conf', type=int, default=76, help='Only output terminators with confidence >= n') # parser.add_argument('--gc', type=float, default=-2.3, help='Score of a G-C pair') # parser.add_argument('--au', type=float, default=-0.9, help='Score of an A-U pair') # parser.add_argument('--gu', type=float, default=1.3, help='Score of a G-U pair') # parser.add_argument('--mm', type=float, default=3.5, help='Score of any other pair') # parser.add_argument('--gap', type=int, default=6, help='Score of a gap in the hairpin') # parser.add_argument('--max_hp_score', type=float, default=-2, help='Maximum allowable hairpin score') # parser.add_argument('--max_tail_score', type=float, default=-2.5, help='Maximum allowable tail score') # parser.add_argument('--max_len', type=int, default=59, help='Total extent of hairpin <= n NT long') # parser.add_argument('--min_stem', type=int, default=4, help='Stem must be n nucleotides long') # parser.add_argument('--max_loop', type=int, default=13, help='The loop portion can be no longer than n') # parser.add_argument('--min_loop', type=int, default=3, help='Loop portion of the hairpin must be at least n long') # parser.add_argument('--uwin_require', type=int, default=3, help='Number of "U" nucleotides in the --uwin_length long region.') # parser.add_argument('--loop_penalty', default='1,2,3,4,5,6,7,8,9,10,11', help='The cost of loops of various lengths can be set using --loop_penalty=f1,f2,f3,f4,f5,...fn, where f1 is the cost of a loop of length --min_loop, f2 is the cost of a loop of length --min_loop+1, as so on. 
If there are too few terms to cover up to max_loop, the last term is repeated.',) args = parser.parse_args() for record in main(existing_expterm=os.path.join(SCRIPT_PATH, 'expterm.dat'), **vars(args)): GFF.write([record], sys.stdout)
def write_gff(self):
    """Write the merged annotation (kept base genes + Apollo genes) as GFF.

    Reads ``self.filtered_base_gff`` and ``self.apollo_gff``, writes
    ``self.out_gff`` and updates the counters ``num_merged``,
    ``num_kept_base``, ``num_replaced_base`` and ``num_total_genes``.
    Records left without features are not written.
    """
    all_merged = [j for i in self.blacklist_merged.values() for j in i]
    self.num_merged = len(all_merged)
    # Set used for membership tests only; the count above keeps duplicates.
    merged_ids = set(all_merged)

    recs = []

    # First write genes kept from the base annotation...
    with open(self.filtered_base_gff) as in_handle:
        for rec in GFF.parse(in_handle):
            rec.annotations = {}
            rec.seq = ""
            new_feats = []

            for f in rec.features:  # gene
                gene_id = f.qualifiers['ID'][0]
                search = re.search(self.id_regex, gene_id)
                raw_gene_id = self.id_syntax.replace('{id}', search.group(1))
                if raw_gene_id not in self.blacklist_base and raw_gene_id not in merged_ids:
                    cleaned_f = self.clean_feature(f)
                    cleaned_f = self.guess_exons(cleaned_f)
                    cleaned_f = self.renumber_exons(cleaned_f)
                    new_feats.append(cleaned_f)
                    self.num_kept_base += 1
                    self.num_total_genes += 1
                else:
                    self.num_replaced_base += 1

            rec.features = new_feats

            if len(rec.features):
                recs.append(rec)

    # ... then write genes models from apollo
    with open(self.apollo_gff) as in_handle:
        for rec in GFF.parse(in_handle):
            rec.annotations = {}
            rec.seq = ""
            new_feats = []

            for f in rec.features:  # gene
                gene_id = f.qualifiers['ID'][0]
                # NOTE(review): only the renaming is treated as conditional
                # on name_map; the gene itself is always cleaned and kept.
                # Confirm this nesting against the original layout.
                if gene_id in self.name_map:
                    f.qualifiers['Alias'] = [gene_id]
                    f.qualifiers['ID'][0] = self.name_map[gene_id]
                nfeat = self.clean_feature(f)
                nfeat = self.guess_utrs(nfeat)
                nfeat = self.renumber_exons(nfeat)
                new_feats.append(nfeat)
                self.num_total_genes += 1

            rec.features = new_feats

            if len(rec.features):
                recs.append(rec)

    with open(self.out_gff, "w") as out_handle:
        GFF.write(recs, out_handle)
def main(gb_file):
    """Convert the GenBank file *gb_file* to GFF3.

    The output is written beside the input, with the input's extension
    replaced by ``.gff``.
    """
    stem, _extension = os.path.splitext(gb_file)
    out_file = "%s.gff" % stem
    with open(out_file, "w") as out_handle:
        genbank_records = SeqIO.parse(gb_file, "genbank")
        GFF.write(genbank_records, out_handle)
# -*- coding: utf-8 -*-
# @Author: Marylette B. Roa
# @Date: 2018-03-08 20:16:26
# @Last Modified by: Marylette B. Roa
# @Last Modified time: 2018-07-18 10:36:23
"""Convert the GenBank file ``sequence.gb`` into GFF3 (``sequence.gff``)."""

import sys
# Makes the packages of a specific virtualenv importable before Bio/BCBio load.
sys.path.append('/mnt/e/virtual_envs/windowsEnv/lib/python3.4/site-packages')

from Bio import SeqIO
from BCBio import GFF

in_file = "sequence.gb"
out_file = "sequence.gff"

# Context managers close both handles even when parsing or writing fails;
# the input is still opened before the output is created.
with open(in_file) as in_handle, open(out_file, "w") as out_handle:
    GFF.write(SeqIO.parse(in_handle, "genbank"), out_handle)
parser.add_argument("-o", "--output_file", action="store", dest="output_file", help="Output file with extracted_annotations") parser.add_argument("-d", "--ids_file", action="store", dest="ids_file", help="File with ids of annotations to extract") parser.add_argument("-t", "--annotation_types", action="store", dest="annotation_types", default=["gene"], type=lambda s: s.split(","), help="Comma-separated list of annotation types to extract") args = parser.parse_args() annotation_ids = IdList() annotation_ids.read(args.ids_file, comments_prefix="#") #print args.annotation_types out_fd = open(args.output_file, "w") GFF.write( record_with_extracted_annotations_generator(args.input_gff, args.annotation_types), out_fd) out_fd.close()
def cigar_from_string(query, match, subject, strict_m=True):
    """Return the CIGAR string for an aligned query/match/subject triple.

    The three strings are first reduced to a match line by
    ``_qms_to_matches``; an empty match line yields an empty string,
    otherwise the result of ``_matchline_to_cigar`` is returned.
    """
    matchline = _qms_to_matches(query, match, subject, strict_m=strict_m)
    if len(matchline) == 0:
        return ""
    return _matchline_to_cigar(matchline)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Convert Blast XML to gapped GFF3', epilog='')
    # Command-line options as (flags, settings) pairs, registered in order.
    option_specs = [
        (('blastxml',), dict(type=open, help='Blast XML Output')),
        (('--min_gap',), dict(
            type=int,
            help='Maximum gap size before generating a new match_part',
            default=3)),
        (('--trim',), dict(
            action='store_true',
            help='Trim blast hits to be only as long as the parent feature')),
        (('--trim_end',), dict(
            action='store_true',
            help='Cut blast results off at end of gene')),
    ]
    for flags, settings in option_specs:
        parser.add_argument(*flags, **settings)

    args = parser.parse_args()
    # Every parsed option is forwarded as a keyword argument.
    GFF.write(blastxml2gff3(**vars(args)), sys.stdout)
record.features = [] tmprec = copy.deepcopy(record) tmprec.annotations = {} tmprec.features = [] record.features = record_features for feature in record.features: props = [] if 'record_id' in args.keys: props.append(record.id) if 'source' in args.keys: props.append(feature.qualifiers['source'][0]) if 'target' in args.keys: props.append(feature.qualifiers['Target'][0]) propkey = '|'.join(map(str, props)) if propkey not in file_handles: filename = args.joiner.join(props) path = os.path.join('out', filename + '.gff3') logging.info("Opening %s", path) file_handles[propkey] = open(path, 'a') tmprec.features = [feature] GFF.write([tmprec], file_handles[propkey]) # SeqIO.write([record], args.fasta, 'fasta') # sys.exit() for key in file_handles: file_handles[key].close()
gene_end = end gene = SeqFeature(FeatureLocation(gene_start, gene_end), type="gene", strand=strand, qualifiers={ 'Source': 'MGA', 'ID': '%s.%s' % (current_record.id, gene_id), }) gene.sub_features = [cds_feat] if rbs_feat is not None: gene.sub_features.append(rbs_feat) current_record.features.append(gene) yield current_record if __name__ == '__main__': parser = argparse.ArgumentParser(description='Convert MGA to GFF3', epilog="") parser.add_argument('mga_output', type=argparse.FileType("r"), help='MetaGeneAnnotator Output') parser.add_argument('genome', type=argparse.FileType("r"), help='Fasta Genome') args = parser.parse_args() for result in mga_to_gff3(**vars(args)): GFF.write([result], sys.stdout)
def export_gff3(self, organism_id):
    """
    Export organism features as GFF3

    For each matching organism, every feature is turned into a SeqFeature,
    features are nested under their parents following ``part_of``
    relationships, and the resulting record is written to stdout as GFF3.
    Progress messages are written to stderr.

    :type organism_id: int
    :param organism_id: Organism ID

    :rtype: str
    :return: An empty string (so that printing the result does not show "None")
    """

    # check if the organism exists
    res = self.session.query(self.model.organism).filter(
        self.model.organism.organism_id.in_([organism_id]))
    sys.stderr.write("Processing %s sequences\n" % res.count())

    for org in res:
        # TODO: can we do this properly?
        # Placeholder sequence made of a single "A".
        seq = Seq("A" * 1, IUPAC.unambiguous_dna)

        # Annotation features
        # Outer join: a feature without a featureloc row still comes back,
        # paired with None (handled when building the SeqFeature below).
        features = self.session.query(self.model.feature, self.model.featureloc) \
            .filter_by(organism_id=org.organism_id) \
            .join(self.model.featureloc, self.model.feature.feature_id == self.model.featureloc.feature_id, isouter=True)
        sys.stderr.write("\tProcessing %s features\n" % features.count())

        # feature_id -> SeqFeature, for every feature of this organism
        biopy_features = {}
        # Pre-set so the final progress line still works when there are no rows
        idx = 0
        for idx, (feature, featureloc) in enumerate(features):
            if idx % 5000 == 0:
                sys.stderr.write("\t%s / %s\n" % (idx, features.count()))
            # [u'dbxref_id', u'feature_id', u'is_analysis', u'is_obsolete',
            # u'md5checksum', u'name', u'organism_id', u'residues', u'seqlen',
            # u'timeaccessioned', u'timelastmodified', u'type_id',
            # u'uniquename']

            # [u'feature_id', u'featureloc_id', u'fmax', u'fmin',
            # u'is_fmax_partial', u'is_fmin_partial', u'locgroup', u'phase',
            # u'rank', u'residue_info', u'srcfeature_id', u'strand']

            # One featureprop query per feature: cvterm name -> property value
            qualifiers = {
                self.ci.get_cvterm_name(prop.type_id): prop.value for prop in
                self.session.query(self.model.featureprop).
                filter_by(feature_id=feature.feature_id).all()
            }
            # Set last, so it replaces any property that is itself named "ID"
            qualifiers['ID'] = feature.uniquename

            biopy_features[feature.feature_id] = SeqFeature(
                location=BioFeatureLocation(featureloc.fmin, featureloc.fmax) if featureloc else None,
                id=feature.uniquename,
                type=self.ci.get_cvterm_name(feature.type_id),
                strand=featureloc.strand if featureloc else None,
                qualifiers=qualifiers)
        sys.stderr.write("\t%s / %s\n" % (idx + 1, features.count()))

        # res = self.session.query(self.model.organism).filter(self.model.organism.organism_id.in_(organism_id))
        # Only relationships whose subject (child) is one of the features above
        relationships = self.session.query(self.model.feature_relationship) \
            .filter(self.model.feature_relationship.subject_id.in_(biopy_features.keys()))
        sys.stderr.write("\tProcessing %s relationships\n" % relationships.count())
        # feature_relationship_id | subject_id | object_id | type_id | value | rank
        # -------------------------+------------+-----------+---------+-------+------
        # 1 | 4 | 3 | 37 | | 0
        # 2 | 5 | 4 | 37 | | 0
        # 3 | 6 | 4 | 37 | | 0
        # 4 | 7 | 4 | 37 | | 0

        # Top-level features of the output record; children hang off
        # ``sub_features`` of their parent.
        features = []

        def findById(feature_list, id):
            # Depth-first search through feature_list and nested sub_features,
            # yielding every feature whose id matches.
            for feature in feature_list:
                if feature.id == id:
                    yield feature

                if hasattr(feature, 'sub_features'):
                    for x in findById(feature.sub_features, id):
                        yield x

        # Now to re-parent things properly
        idx = 0
        for idx, rel in enumerate(relationships):
            if idx % 5000 == 0:
                sys.stderr.write("\t%s / %s\n" % (idx, relationships.count()))
            term = self.ci.get_cvterm_name(rel.type_id)
            if term != 'part_of':
                sys.stderr.write(
                    "\tCannot handle non-part_of relationships (%s %s %s)\n" % (rel.subject_id, term, rel.object_id))
                continue

            # Try and find the features in features.
            child = list(
                findById(features, biopy_features[rel.subject_id].id))
            parent = list(
                findById(features, biopy_features[rel.object_id].id))
            # Each feature is expected to occur at most once in the tree
            assert len(child) <= 1
            assert len(parent) <= 1
            alreadyProcessedParent = False
            alreadyProcessedChild = False

            # If they aren't there, pull them from the complete set.
            if len(child) == 0:
                child = biopy_features[rel.subject_id]
            else:
                child = child[0]
                alreadyProcessedChild = True

            if len(parent) == 0:
                parent = biopy_features[rel.object_id]
            else:
                parent = parent[0]
                alreadyProcessedParent = True

            if not hasattr(parent, 'sub_features'):
                parent.sub_features = []
            parent.sub_features.append(child)

            if alreadyProcessedChild and alreadyProcessedParent:
                # Here we've seen both (they're BOTH in the list), so we need to remove
                # child and not touch parent since we added to parent already
                if child in features:
                    features.remove(child)
            elif alreadyProcessedChild and not alreadyProcessedParent:
                # Here our child is already in features, so we need to remove it from
                # the feature set, add to the parent (done) and re-place in features.
                # NOTE(review): unlike the branch above there is no membership
                # check; list.remove raises ValueError if the child was found
                # nested rather than at the top level -- confirm.
                features.remove(child)
                features.append(parent)
            elif not alreadyProcessedChild and alreadyProcessedParent:
                # In this case we've seen the parent before, already in list, no need to do anything
                # features.append(parent)
                pass
            else:
                # Otherwise, completely new feature.
                features.append(parent)
        sys.stderr.write("\t%s / %s\n" % (idx + 1, relationships.count()))

        # Record name: the organism's common name, else "org_<organism_id>"
        n = org.common_name if org.common_name else 'org_%s' % org.organism_id
        record = SeqRecord(
            seq,
            id=n,
            name=n,
            description="%s %s" % (org.genus, org.species),
        )

        # NOTE(review): features built without a featureloc have
        # location=None; this sort key would fail for any that end up at the
        # top level -- confirm whether that can happen.
        record.features = sorted(features, key=lambda f: f.location.start)

        GFF.write([record], sys.stdout)

    # NOTE(review): the flattened source does not show whether this return sits
    # inside the organism loop; it is placed after it -- confirm.
    return ""  # Return an empty string to avoid getting a "None" when printing on stdout
def shinefind(
    genbank_file,
    gff3_output=None,
    table_output=None,
    lookahead_min=5,
    lookahead_max=15,
    top_only=False,
    add=False,
):
    """Report Shine-Dalgarno (SD) hits upstream of each CDS in a GenBank file.

    For every record, a tab-separated row per hit is written to
    ``table_output``, the (optionally augmented) record is written to stdout
    as GenBank, and a record holding only the SD features is written to
    ``gff3_output`` as GFF3.

    :param genbank_file: GenBank input, passed to ``SeqIO.parse``.
    :param gff3_output: Handle passed to ``GFF.write``.
    :param table_output: Handle receiving the table. Written to
        unconditionally, so the ``None`` default cannot actually be used.
    :param lookahead_min: Passed to ``testFeatureUpstream`` as ``sd_min`` and
        added to each hit's spacing in the table.
    :param lookahead_max: Passed to ``testFeatureUpstream`` as ``sd_max``.
    :param top_only: Stop after the first SD of each CDS.
    :param add: Also append each reported SD feature to the GenBank record.
    """
    # Table header
    table_output.write("\t".join([
        "ID",
        "Name",
        "Terminus",
        "Terminus",
        "Strand",
        "Upstream Sequence",
        "SD",
        "Spacing",
    ]) + "\n")

    sd_finder = NaiveSDCaller()
    # Parse GenBank records
    for record in list(SeqIO.parse(genbank_file, "genbank")):
        # Sometimes you have a case where TWO CDS features have the same start. Only handle ONE.
        seen = {}
        # Shinefind's "gff3_output".
        gff3_output_record = SeqRecord(record.seq, record.id)
        # Loop over all CDS features
        # NOTE(review): with add=True, SD features are appended to
        # record.features while it is being iterated; presumably they are not
        # of type "CDS" and are skipped by the check below -- confirm.
        for feature in record.features:
            if feature.type != "CDS":
                continue

            # Key on the start for forward-strand features, the end otherwise
            seen_loc = (feature.location.start if feature.strand > 0 else feature.location.end)
            if seen_loc in seen:
                continue
            else:
                seen[seen_loc] = True

            sds, start, end, seq = sd_finder.testFeatureUpstream(
                feature, record, sd_min=lookahead_min, sd_max=lookahead_max)

            feature_id = get_id(feature)
            sd_features = sd_finder.to_features(sds, feature.location.strand, start, end, feature_id=feature.id)

            human_strand = "+" if feature.location.strand == 1 else "-"

            # http://book.pythontips.com/en/latest/for_-_else.html
            log.debug("Found %s SDs", len(sds))
            for (sd, sd_feature) in zip(sds, sd_features):
                # One table row per SD. With top_only the loop breaks after
                # the first row, and a break skips the else: block below; the
                # else: block runs only when the loop ends without a break.
                table_output.write("\t".join(
                    map(
                        str,
                        [
                            feature.id,
                            feature_id,
                            feature.location.start,
                            feature.location.end,
                            human_strand,
                            sd_finder.highlight_sd(seq, sd["start"], sd["end"]),
                            sd["hit"],
                            int(sd["spacing"]) + lookahead_min,
                        ],
                    )) + "\n")

                if add:
                    # Append the top RBS to the gene feature
                    record.features.append(sd_feature)
                # Also register the feature with the separate GFF3 output
                # NOTE(review): treated as unconditional (not nested under
                # `if add`); the flattened source does not show this -- confirm.
                gff3_output_record.features.append(sd_feature)

                if top_only:
                    break
            else:
                if len(sds) != 0:
                    log.debug("Should not reach here if %s", len(sds) != 0)
                    # This is reached whenever the loop above finishes without
                    # a break (i.e. top_only is false and SDs were found);
                    # the rows are already written, so move to the next CDS.
                    continue
                # No SD found: write a placeholder row (SD None, spacing -1)
                table_output.write("\t".join(
                    map(
                        str,
                        [
                            feature.id,
                            feature_id,
                            feature.location.start,
                            feature.location.end,
                            human_strand,
                            seq,
                            None,
                            -1,
                        ],
                    )) + "\n")

        # GenBank copy of the record goes to stdout, features ordered by start
        record.features = sorted(record.features, key=lambda x: x.location.start)
        SeqIO.write([record], sys.stdout, "genbank")

        # SD-only record goes to the GFF3 handle
        gff3_output_record.features = sorted(gff3_output_record.features, key=lambda x: x.location.start)
        gff3_output_record.annotations = {}
        GFF.write([gff3_output_record], gff3_output)
o_data.append(float(parts[3])) m_data.append(float(parts[4])) bigwig_store(bw_i, record.id, i_data) bigwig_store(bw_o, record.id, o_data) bigwig_store(bw_m, record.id, m_data) yield record if __name__ == '__main__': parser = argparse.ArgumentParser(description='Process TMHMM outputs in GFF3, BigWig') parser.add_argument('--bw_i', default='tmhmm_i.wig') parser.add_argument('--bw_o', default='tmhmm_o.wig') parser.add_argument('--bw_m', default='tmhmm_m.wig') args = parser.parse_args() bw_i = open(args.bw_i, 'w') bw_o = open(args.bw_o, 'w') bw_m = open(args.bw_m, 'w') bigwig_add_header(bw_i, 'i', name='TMHMM') bigwig_add_header(bw_o, 'o', name='TMHMM') bigwig_add_header(bw_m, 'm', name='TMHMM') for sequence in convert(None, bw_i, bw_o, bw_m): GFF.write([sequence], sys.stdout) bw_i.close() bw_o.close() bw_m.close()
if fasta: if len(rec.seq) == rec.seq.count("?"): log.error( "ERROR: You have provided a fasta file but the sequence ID in the fasta file DID NOT MATCH THE GFF. THIS IS BAD." ) yield rec if __name__ == '__main__': parser = argparse.ArgumentParser( description='Reopen a set of GFF3 annotations') parser.add_argument('gff3', type=argparse.FileType("r"), help='GFF3 annotations') parser.add_argument('--fasta', type=argparse.FileType("r"), help='Optional fasta file') parser.add_argument('--fasta_output', type=argparse.FileType("w"), help='Optional fasta file output', default='reopened.fasta') parser.add_argument('index', type=int, help='Index to reopen genome at') args = parser.parse_args() for rec in gff_reopen(**vars(args)): GFF.write([rec], sys.stdout) if args.fasta: SeqIO.write([rec], args.fasta_output, 'fasta')
#!/usr/bin/env python import sys import argparse from Bio import SeqIO from BCBio import GFF if __name__ == "__main__": parser = argparse.ArgumentParser( description= "Sample script to add an attribute to a feature via web services") parser.add_argument("data", type=argparse.FileType("r"), help="GFF3 File") parser.add_argument( "--gff", type=argparse.FileType("w"), help="Output Annotations", default="data.gff3", ) parser.add_argument( "--fasta", type=argparse.FileType("w"), help="Output Sequence", default="data.fa", ) args = parser.parse_args() for record in GFF.parse(args.data): GFF.write([record], args.gff) record.description = "" SeqIO.write([record], args.fasta, "fasta") sys.exit()