def gene_sequence_helper(assembly, geneid):
    """Return the sequence of a gene from an assembly's reference FASTA.

    The gene is looked up by ``geneid`` in the gffutils database stored at
    ``/fastdata/zlab-genomes/gffutils/<assembly>.db`` and its sequence is
    extracted from ``/fastdata/refseq/<assembly>.refseq.fa``.

    Args:
        assembly: Assembly name interpolated into the database and FASTA paths.
        geneid: Key used to fetch the feature from the database.

    Returns:
        Whatever ``Feature.sequence`` returns for the FASTA file above
        (the sequence letters of the gene).
    """
    db = gffutils.FeatureDB(
        '/fastdata/zlab-genomes/gffutils/{0}.db'.format(assembly),
        keep_order=True)
    gene = db[geneid]
    # The SeqFeature conversion that used to happen here was never read,
    # so the dead call has been removed.
    return gene.sequence("/fastdata/refseq/{0}.refseq.fa".format(assembly))
def test_roundtrip():
    """
    A Feature converted to a SeqFeature and back again must compare equal
    to the Feature we started from.
    """
    example_gff = gffutils.example_filename("gff_example1.gff3")
    database = gffutils.create_db(example_gff, ':memory:')

    original = database['ENSMUSG00000033845']
    original.keep_order = True
    original_dialect = original.dialect

    seq_feature = bp.to_seqfeature(original)
    location = seq_feature.location
    # The converted start is expected one lower than the GFF start, while
    # the end is expected to equal the GFF stop.
    assert location.start.position == original.start - 1
    assert location.end.position == original.stop

    restored = bp.from_seqfeature(
        seq_feature, dialect=original_dialect, keep_order=True)
    assert original == restored
def parse_cds_features(features, record_start):
    """Convert features to SeqFeatures, rebase them and split out the CDS.

    Every input feature is converted with
    ``biopython_integration.to_seqfeature`` and its location is replaced by
    a new ``FeatureLocation`` whose start and end are shifted down by
    ``record_start`` (the strand is carried over unchanged).

    Args:
        features: Iterable of features accepted by ``to_seqfeature``.
        record_start: Offset subtracted from each start and end.

    Returns:
        tuple: ``(cds, gene)`` -- a list of the converted features whose
        type is ``"CDS"`` and a list of all the remaining ones, both in
        input order.
    """
    coding, others = [], []
    for raw_feature in features:
        converted = biopython_integration.to_seqfeature(raw_feature)
        span = converted.location
        converted.location = FeatureLocation(
            span.start - record_start,
            span.end - record_start,
            strand=span.strand,
        )
        bucket = coding if converted.type == "CDS" else others
        bucket.append(converted)
    return coding, others
def gene_fasta_helper(assembly, geneid, returntype="filename"):
    """Export a gene region as a FASTA file and return its path or content.

    The gene is fetched from the gffutils database of ``assembly``, its
    sequence is read from the assembly's RefSeq FASTA, and a single-record
    FASTA file is written under ``/fastdata/webserver/tmp``.

    Args:
        assembly: Assembly name interpolated into the database/FASTA paths.
        geneid: Key of the gene in the database; also used as record id.
        returntype: ``"filename"`` to get the path of the written file, or
            ``"text"`` to get the file content.

    Returns:
        str: The output file name or the FASTA text, depending on
        ``returntype``.

    Raises:
        Exception: If ``returntype`` is neither ``"filename"`` nor
            ``"text"``. This is now checked before any work is done, so an
            invalid value no longer leaves a stray file behind.
    """
    if returntype not in ("filename", "text"):
        raise Exception("unknown return type {0}".format(returntype))

    db = gffutils.FeatureDB(
        '/fastdata/zlab-genomes/gffutils/{0}.db'.format(assembly),
        keep_order=True)
    gene = db[geneid]
    sf = biopython_integration.to_seqfeature(gene)
    seq_letters = gene.sequence(
        "/fastdata/refseq/{0}.refseq.fa".format(assembly))

    record = SeqRecord(
        Seq(seq_letters, Alphabet.DNAAlphabet()),
        id=geneid,
        name=gene.attributes["Name"][0],
        description="gene region exported by crispr.mit.edu",
        features=[sf])

    # int() formats identically to the former long() and also exists on
    # Python 3.
    fname = "/fastdata/webserver/tmp/{0}.fa".format(
        int(random.random() * 1000000))
    with open(fname, "w") as f:
        SeqIO.write(record, f, "fasta")

    if returntype == "filename":
        return fname
    with open(fname) as fopen:
        return fopen.read()
def parse_gff(path):
    """Build a Cluster from a GFF3 file and its matching FASTA file.

    For every FASTA record, the CDS features on that sequence are read from
    an in-memory gffutils database, shifted so they are relative to the
    start given by the record's ``##sequence-region`` directive (1 when no
    directive is found), merged when consecutive features share an ID, and
    turned into ``Gene`` objects grouped in one ``Locus`` per record.

    Args:
        path: Path to the GFF3 file.

    Returns:
        Cluster: Named after the file stem, holding one Locus per record.

    Raises:
        FileNotFoundError: If ``find_fasta`` yields no FASTA path.
        ValueError: If a record has no CDS features.
    """
    fasta_path = find_fasta(path)
    if not fasta_path:
        raise FileNotFoundError("Could not find matching FASTA file")

    records = parse_fasta(fasta_path)
    gff = gffutils.create_db(
        str(path),
        ":memory:",
        force=True,
        merge_strategy="create_unique",
        sort_attribute_values=True,
    )
    regions = find_regions(gff.directives)

    loci = []
    for record in records:
        try:
            record_start, _ = regions[record.id]
        except KeyError:
            record_start = 1

        cds_features = sorted(
            gff.region(seqid=record.id, featuretype="CDS"),
            key=lambda cds: cds.start,
        )
        if not cds_features:
            raise ValueError(f"Found no CDS features in {record.id} [{path}]")

        # ##sequence-region starts are not zero-indexed; turn the start into
        # a zero-based offset.
        record_start -= 1

        last_id = None
        for cds in cds_features:
            # Consecutive features carrying the same ID are pieces of one
            # coding sequence and get merged below.
            cds_id = cds.attributes["ID"][0]
            is_continuation = last_id == cds_id
            last_id = cds_id

            # Extracted GFF3 files may still store coordinates relative to
            # the whole region, so rebase on the record start.
            # to_seqfeature already zero-indexes the coordinates, hence no
            # additional -1 here.
            seq_feature = biopython_integration.to_seqfeature(cds)
            shifted = FeatureLocation(
                seq_feature.location.start - record_start,
                seq_feature.location.end - record_start,
                strand=seq_feature.location.strand,
            )
            seq_feature.location = shifted

            if not is_continuation:
                record.features.append(seq_feature)
                continue

            merged = record.features[-1]
            if shifted.strand == 1:
                merged.location += shifted
            else:
                # Parts that are not on the forward strand must be kept in
                # biological order, so the new part goes first.
                merged.location = shifted + merged.location

        # Prefer the coordinates of the parent gene feature; when none is
        # found (e.g. malformed GFF without ID=/Parent= attributes) warn and
        # let Gene.from_seqfeature receive start=None/end=None.
        genes = []
        for seq_feature in record.features:
            parents = list(
                gff.parents(gff[seq_feature.id], featuretype="gene")
            )
            if parents:
                gene_start = parents[0].start - record_start - 1
                gene_end = parents[0].end - record_start
            else:
                gene_start = gene_end = None
                LOG.warning(f"Could not find parent gene of {seq_feature.id}."
                            " Using coding sequence coordinates instead.")
            genes.append(
                Gene.from_seqfeature(
                    seq_feature, record, start=gene_start, end=gene_end
                )
            )

        loci.append(Locus(record.id, genes, 0, len(record)))

    return Cluster(Path(path).stem, loci)
def gene_genbank_spacers_data_helper(data, assembly, geneid, returntype="filename", spacer_sequence_filter=None, tool_filter=None, min_score=90):
    """Export a gene and its guide spacers as a GenBank file.

    The gene is fetched from the gffutils database of ``assembly`` and its
    sequence from the assembly's RefSeq FASTA. Each spacer in ``data`` that
    passes the filters becomes a ``<tool>_guide`` SeqFeature, and the record
    is written under ``/fastdata/webserver/tmp``.

    Args:
        data: Mapping with a ``"spacers"`` list under the ``"cas9"`` and
            ``"cpf1"`` keys. Only the tools that pass ``tool_filter`` are
            looked up. Each spacer provides ``guide_sequence``, ``score``,
            ``pam_before``, ``pam_after``, ``guide_start``, ``guide_length``
            and ``guide_strand``.
        assembly: Assembly name interpolated into the database/FASTA paths.
        geneid: Key of the gene in the database; also used as record id.
        returntype: ``"filename"`` for the written path, ``"text"`` for the
            file content.
        spacer_sequence_filter: When truthy, keep only spacers whose
            ``guide_sequence`` equals this value.
        tool_filter: When not None, keep only spacers of this tool
            (``"cas9"`` or ``"cpf1"``).
        min_score: When not None, drop spacers scoring below this value.

    Returns:
        str: The output file name or the GenBank text, depending on
        ``returntype``.

    Raises:
        Exception: If ``returntype`` is neither ``"filename"`` nor
            ``"text"`` (checked before any work is done).
    """
    if returntype not in ("filename", "text"):
        raise Exception("unknown return type {0}".format(returntype))

    db = gffutils.FeatureDB(
        '/fastdata/zlab-genomes/gffutils/{0}.db'.format(assembly),
        keep_order=True)
    gene = db[geneid]
    gene_name = gene.attributes["Name"][0]
    sf = biopython_integration.to_seqfeature(gene)
    seq_letters = gene.sequence(
        "/fastdata/refseq/{0}.refseq.fa".format(assembly))

    sfs = []
    # A fixed tuple keeps the guide numbering deterministic. Each tool reads
    # its own spacer list: the cpf1 list used to be read from data["cas9"],
    # which duplicated the cas9 spacers under the cpf1 label.
    for tool in ("cas9", "cpf1"):
        if tool_filter is not None and tool != tool_filter:
            continue
        for s in data[tool]["spacers"]:
            if spacer_sequence_filter and s["guide_sequence"] != spacer_sequence_filter:
                continue
            if min_score is not None and s["score"] < min_score:
                continue

            quals = {}
            if s["pam_before"]:
                quals["upstream_pam"] = s["pam_before"]
            if s["pam_after"]:
                quals["downstream_pam"] = s["pam_after"]
            quals.update({"score": s["score"],
                          "tool": tool,
                          "target_seq": s["guide_sequence"]})

            sfs.append(SeqFeature(
                FeatureLocation(s["guide_start"],
                                s["guide_start"] + s["guide_length"],
                                strand=s["guide_strand"]),
                # Guides are numbered in the order they are emitted.
                id="guide{0}".format(len(sfs)),
                qualifiers=quals,
                type="{0}_guide".format(tool)))

    record = SeqRecord(
        Seq(seq_letters, Alphabet.DNAAlphabet()),
        id=geneid,
        name=gene_name,
        description="{2} gene {1} exported by crispr.mit.edu, with all spacer sequences scored >{0}".format(min_score, gene_name, assembly),
        features=[sf] + sfs,
        annotations={"organism": assembly})

    # int() formats identically to the former long() and also exists on
    # Python 3.
    fname = "/fastdata/webserver/tmp/{0}.gb".format(
        int(random.random() * 1000000))
    with open(fname, "w") as f:
        SeqIO.write(record, f, "genbank")

    if returntype == "filename":
        return fname
    with open(fname) as fopen:
        return fopen.read()
def getRandomFusions(db, names, num=5, pStay=0.0):
    """Simulate random gene fusions between pairs of distinct genes.

    Two different entries of ``names`` are drawn at random as donor and
    acceptor; transcripts and exons are picked for both and, when both exon
    lists are available and non-empty, a ``FusionEvent`` is recorded. The
    global ``random`` state is used as-is (no seeding happens here).

    Args:
        db: The feature database (per the original notes, the one built in
            module.py); indexed by gene id.
        names: Sequence of gene ids (per the original notes, ENSG ids of
            protein coding genes only).
        num: Number of fusions to simulate; gene pairs are drawn while
            fewer than ``num`` fusions have been recorded.
        pStay: Probability of staying on the same gene pair to generate
            another fusion isoform. Use 0.0 to get only one isoform.

    Returns:
        list: The recorded ``FusionEvent`` objects.

    Note:
        Calls ``exit(1)`` when fewer than two names are given or when more
        than ``MAX_TOSS_NUM`` draws had to be discarded.
    """
    fusions = []

    if len(names) < 2:
        print("Not enough protein coding genes.")
        exit(1)

    produced = 0
    pairTosses = 0
    while produced < num:
        # Draw donor first, acceptor second.
        donorIdx = random.randint(0, len(names) - 1)
        acceptorIdx = random.randint(0, len(names) - 1)

        # A gene cannot be fused with itself; redraw.
        if donorIdx == acceptorIdx:
            pairTosses += 1
            if pairTosses > MAX_TOSS_NUM:
                print("Tossed > " + str(MAX_TOSS_NUM) + " times in generating a pair of genes.")
                exit(1)
            continue

        donorGene = db[names[donorIdx]]
        acceptorGene = db[names[acceptorIdx]]

        # Keep producing isoforms for this pair for as long as isStay()
        # says so; the decision for the next round is taken up front.
        junctionTosses = 0
        stayOnPair = True
        while stayOnPair is True:
            stayOnPair = isStay(pStay)

            donorStrand, donorTran = getTranscript(db, donorGene)
            acceptorStrand, acceptorTran = getTranscript(db, acceptorGene)
            if donorTran is None or acceptorTran is None:
                continue

            donorOk, donorExons = getExons(db, donorTran)
            acceptorOk, acceptorExons = getExons(db, acceptorTran)
            if not (donorOk and acceptorOk):
                junctionTosses += 1
                if junctionTosses > MAX_TOSS_NUM:
                    print("Tossed > " + str(MAX_TOSS_NUM) + " times in generating fusion junctions.")
                    exit(1)
                continue

            donorExonSF = [
                biopython_integration.to_seqfeature(exon) for exon in donorExons
            ]
            acceptorExonSF = [
                biopython_integration.to_seqfeature(exon) for exon in acceptorExons
            ]
            if donorExonSF and acceptorExonSF:
                # Create the fusion event object (junction positions are
                # adjusted to be 0-based, per the original note).
                fusions.append(
                    FusionEvent(donorExonSF, acceptorExonSF, donorStrand, acceptorStrand)
                )
                produced += 1

    return fusions
def cluster_from_gff(path, ranges=None):
    """Build a Cluster from a GFF3 file, optionally restricted to ranges.

    For every FASTA record the working range is taken, in increasing
    priority, from: the full record (1 to its length), its
    ``##sequence-region`` directive, or a user-specified entry in
    ``ranges``. The record is sliced to that range, the CDS features lying
    completely within it are rebased and merged per ID, and one ``Locus``
    of ``Gene`` objects is produced per record.

    Args:
        path: Path to the GFF3 file.
        ranges: Optional mapping of record id to ``(start, end)``.

    Returns:
        Cluster: Named after the file stem, holding one Locus per record.

    Raises:
        FileNotFoundError: If ``find_fasta`` yields no FASTA path.
        ValueError: If a record has no CDS features inside its range.
    """
    fasta_path = find_fasta(path)
    if not fasta_path:
        raise FileNotFoundError("Could not find matching FASTA file")

    records = parse_fasta(fasta_path)
    gff = gffutils.create_db(
        str(path),
        ":memory:",
        force=True,
        merge_strategy="create_unique",
        sort_attribute_values=True,
    )
    regions = find_regions(gff.directives)

    loci = []
    for record in records:
        # Start from the ##sequence-region directive, or the whole record.
        try:
            record_start, record_end = regions[record.id]
        except KeyError:
            record_start, record_end = 1, len(record)

        # A user-specified range takes precedence.
        if ranges and record.id in ranges:
            record_start, record_end = ranges[record.id]
            LOG.info(" Parsing range %s:%i-%i", record.id, record_start, record_end)

        # Trim the FASTA record to the chosen range, then zero-index the
        # start for all the arithmetic that follows.
        record = record[record_start - 1:record_end]
        record_start -= 1

        cds_features = sorted(
            gff.region(
                seqid=record.id,
                featuretype="CDS",
                start=record_start,
                end=record_end,
                completely_within=True,
            ),
            key=lambda cds: cds.start,
        )
        if not cds_features:
            raise ValueError(f"Found no CDS features in {record.id} [{path}]")

        last_id = None
        for cds in cds_features:
            # Consecutive features carrying the same ID are pieces of one
            # coding sequence and get merged below.
            cds_id = cds.attributes["ID"][0]
            is_continuation = last_id == cds_id
            last_id = cds_id

            # Extracted GFF3 files may still store coordinates relative to
            # the whole region, so rebase on the record start.
            # to_seqfeature already zero-indexes the coordinates, hence no
            # additional -1 here.
            seq_feature = biopython_integration.to_seqfeature(cds)
            shifted = FeatureLocation(
                seq_feature.location.start - record_start,
                seq_feature.location.end - record_start,
                strand=seq_feature.location.strand,
            )
            seq_feature.location = shifted

            if not is_continuation:
                record.features.append(seq_feature)
                continue

            merged = record.features[-1]
            if shifted.strand == 1:
                merged.location += shifted
            else:
                # Parts that are not on the forward strand must be kept in
                # biological order, so the new part goes first.
                merged.location = shifted + merged.location

        # Prefer the coordinates of the parent gene feature; when none is
        # found (e.g. malformed GFF without ID=/Parent= attributes) warn and
        # fall back to the CDS coordinates.
        # NOTE(review): parent.start/end are used as stored in the GFF while
        # the fallback adds the zero-indexed record_start -- confirm which
        # convention Gene.from_seqfeature expects.
        genes = []
        for seq_feature in record.features:
            parents = list(
                gff.parents(gff[seq_feature.id], featuretype="gene")
            )
            if parents:
                parent = parents[0]
                # e.g. the CDS is within range, but the gene UTR is not.
                if not (record_start <= parent.start and parent.end <= record_end):
                    continue
                gene_start, gene_end = parent.start, parent.end
            else:
                LOG.warning(f"Could not find parent gene of {seq_feature.id}."
                            " Using coding sequence coordinates instead.")
                gene_start = seq_feature.location.start + record_start
                gene_end = seq_feature.location.end + record_start
            genes.append(
                Gene.from_seqfeature(
                    seq_feature, record, start=gene_start, end=gene_end
                )
            )

        loci.append(Locus(record.id, genes, start=record_start, end=record_end))

    return Cluster(Path(path).stem, loci)
def _matches_spacer_filter(guide_sequence, spacer_sequence_filter):
    """Tell whether ``guide_sequence`` passes ``spacer_sequence_filter``."""
    if not spacer_sequence_filter:
        return True
    # (str, type(u"")) covers str and unicode on Python 2, str on Python 3.
    if isinstance(spacer_sequence_filter, (str, type(u""))):
        return guide_sequence == spacer_sequence_filter
    if isinstance(spacer_sequence_filter, (list, tuple, set, frozenset)):
        return guide_sequence in spacer_sequence_filter
    # Any other filter type is ignored, as before.
    return True


def gene_genbank_spacers_helper(assembly, geneid, returntype="filename", spacer_sequence_filter=None, tool_filter=None, min_score=90):
    """Export a gene and its stored guide spacers as a GenBank file.

    The gene is fetched from the gffutils database of ``assembly`` and its
    sequence from the assembly's RefSeq FASTA. Spacer data is read from the
    second line of
    ``/fastdata/crispr/gene_queries/<assembly>_<geneid>_data.json``; each
    spacer that passes the filters becomes a ``<tool>_guide`` SeqFeature
    carrying its score, PAMs and off-target alignments as qualifiers. The
    record is written under ``/fastdata/webserver/tmp``.

    Args:
        assembly: Assembly name interpolated into the file paths.
        geneid: Key of the gene in the database; also used as record id.
        returntype: ``"filename"`` for the written path, ``"text"`` for the
            file content.
        spacer_sequence_filter: A single guide sequence (string) or a
            collection of them (list; tuples and sets are accepted too).
            When truthy, only matching spacers are kept.
        tool_filter: When not None, keep only spacers of this tool
            (``"cas9"`` or ``"cpf1"``).
        min_score: When not None, drop spacers scoring below this value.

    Returns:
        str: The output file name or the GenBank text, depending on
        ``returntype``.

    Raises:
        Exception: If ``returntype`` is neither ``"filename"`` nor
            ``"text"`` (checked before any work is done).
    """
    if returntype not in ("filename", "text"):
        raise Exception("unknown return type {0}".format(returntype))

    db = gffutils.FeatureDB(
        '/fastdata/zlab-genomes/gffutils/{0}.db'.format(assembly),
        keep_order=True)
    gene = db[geneid]
    gene_name = gene.attributes["Name"][0]
    sf = biopython_integration.to_seqfeature(gene)
    seq_letters = gene.sequence(
        "/fastdata/refseq/{0}.refseq.fa".format(assembly))

    gene_queries_directory = "/fastdata/crispr/gene_queries"
    query_data_basename = "{assembly}_{geneid}_data.json".format(
        assembly=assembly, geneid=geneid)
    query_data_file = os.path.join(gene_queries_directory, query_data_basename)
    with open(query_data_file) as fopen:
        # The first line holds a status record that is not needed here;
        # the spacer data is on the second line.
        next(fopen)
        data = sjson.loads(next(fopen))

    sfs = []
    # A fixed tuple keeps the guide numbering deterministic.
    for tool in ("cas9", "cpf1"):
        if tool_filter is not None and tool != tool_filter:
            continue
        for s in data[tool]["spacers"]:
            if not _matches_spacer_filter(s["guide_sequence"], spacer_sequence_filter):
                continue
            if min_score is not None and s["score"] < min_score:
                continue

            quals = {}
            if s["pam_before"]:
                quals["upstream_pam"] = s["pam_before"]
            if s["pam_after"]:
                quals["downstream_pam"] = s["pam_after"]
            quals.update({"score": s["score"],
                          "tool": tool,
                          "target_seq": s["guide_sequence"]})

            # Mismatch counts first, loci second, to keep the qualifier
            # order of the previous implementation.
            offtargets = s["offtarget_alignments"]
            quals.update({
                "offtarget_{0}_mms".format(i): ot["mismatches"]
                for i, ot in enumerate(offtargets)
            })
            quals.update({
                "offtarget_{0}_locus".format(i): "{0} {1}{2}".format(
                    ot["chrom"], ot["strand"], ot["start"])
                for i, ot in enumerate(offtargets)
            })

            sfs.append(SeqFeature(
                FeatureLocation(s["guide_start"],
                                s["guide_start"] + s["guide_length"],
                                strand=s["guide_strand"]),
                # Guides are numbered in the order they are emitted.
                id="guide{0}".format(len(sfs)),
                qualifiers=quals,
                type="{0}_guide".format(tool)))

    record = SeqRecord(
        Seq(seq_letters, Alphabet.DNAAlphabet()),
        id=geneid,
        name=gene_name,
        description="{2} gene {1} exported by crispr.mit.edu, with all spacer sequences scored >{0}".format(min_score, gene_name, assembly),
        features=[sf] + sfs,
        annotations={"organism": assembly})

    # int() formats identically to the former long() and also exists on
    # Python 3.
    fname = "/fastdata/webserver/tmp/{0}.gb".format(
        int(random.random() * 1000000))
    with open(fname, "w") as f:
        SeqIO.write(record, f, "genbank")

    if returntype == "filename":
        return fname
    with open(fname) as fopen:
        return fopen.read()
def parse_gff(path):
    """Parse a GFF file and its partner FASTA file using GFFutils.

    Gene and CDS features are attached to the FASTA record they belong to,
    with coordinates rebased on the record's ``##sequence-region`` start.
    CDS features that share an ID are merged into a single feature.

    Args:
        path (str): Path to GFF file. Should have a corresponding FASTA file
            of the same name with a valid FASTA suffix (.fa, .fasta, .fsa,
            .fna, .faa).
    Returns:
        The object returned by ``parse_fasta`` (SeqRecord objects, one per
        scaffold in the file), with features added to each record.
    Raises:
        FileNotFoundError: If ``find_fasta`` yields no FASTA path.
        ValueError: If a record has no CDS features.
    """
    fasta_path = find_fasta(path)
    if not fasta_path:
        raise FileNotFoundError(f"Could not find partner FASTA file for {path}")

    # Parse FASTA and create GFFUtils database
    fasta = parse_fasta(fasta_path)
    gff = gffutils.create_db(
        str(path),
        ":memory:",
        force=True,
        merge_strategy="create_unique",
        sort_attribute_values=True,
    )
    regions = find_regions(gff.directives)

    for record in fasta:
        # Zero-based offset of the record inside its sequence region; 0 when
        # there is no ##sequence-region directive (i.e. sequence start).
        try:
            region_start, _ = regions[record.id]
        except KeyError:
            offset = 0
        else:
            offset = region_start - 1

        # Extracted GFF3 files may still store coordinates relative to the
        # entire region rather than to the extracted FASTA, so rebase every
        # feature. Genes go straight onto the record, CDS are held back for
        # merging.
        coding = []
        for gff_feature in gff.region(seqid=record.id, featuretype=["gene", "CDS"]):
            seq_feature = biopython_integration.to_seqfeature(gff_feature)
            span = seq_feature.location
            seq_feature.location = FeatureLocation(
                span.start - offset,
                span.end - offset,
                strand=span.strand,
            )
            if seq_feature.type == "CDS":
                coding.append(seq_feature)
            else:
                record.features.append(seq_feature)

        if not coding:
            raise ValueError(f"Found no CDS features in {record.id} [{path}]")

        # Merge CDS parts that share an ID into one SeqFeature on the record.
        last_id = None
        for cds in sorted(coding, key=lambda part: part.location.start):
            cds_id = cds.qualifiers["ID"][0]
            is_continuation = last_id == cds_id
            last_id = cds_id

            if not is_continuation:
                record.features.append(cds)
            elif cds.location.strand == 1:
                record.features[-1].location += cds.location
            else:
                # Reverse strand locations must be in biological order, so
                # the new part goes in front of what was merged so far.
                merged_so_far = record.features[-1].location
                record.features[-1].location = cds.location + merged_so_far

        # Leave the record's features ordered by start position.
        record.features.sort(key=lambda feat: feat.location.start)

    return fasta