def fetch(args):
    """Write the requested FASTA regions to standard output.

    ``args`` is an argparse-style namespace providing:

    * ``fasta``      -- path handed to ``Fasta``
    * ``regions``    -- iterable of region strings (``name`` or
      ``name:start-end``, 1-based inclusive coordinates)
    * ``list``       -- optional open file with one region per line; only
      the first whitespace-separated field of each line is used
    * ``complement`` / ``reverse`` -- apply the corresponding sequence
      transformation before writing
    * ``name``       -- emit a ``>name`` header line before each sequence

    A region whose interval is missing or cannot be parsed is fetched in
    full.
    """
    # Copy so that regions read from the list file are not appended to the
    # caller's list.
    regions = list(args.regions or [])
    if args.list:
        with args.list as listfile:
            regions.extend(line.rstrip() for line in listfile)

    fasta = Fasta(args.fasta)
    try:
        for region in regions:
            fields = region.split()
            if not fields:
                # Blank line (e.g. trailing newline in the list file).
                continue
            region = fields[0]
            try:
                rname, interval = region.split(':')
            except ValueError:
                rname = region
                interval = None
            try:
                start, end = interval.split('-')
                # Region strings are 1-based inclusive; slices are 0-based.
                sequence = fasta[rname][int(start) - 1:int(end)]
            except (AttributeError, ValueError):
                # No interval given, or it is not of the form start-end.
                sequence = fasta[rname][:]
            if args.complement:
                sequence = sequence.complement
            if args.reverse:
                sequence = sequence.reverse
            # Wrap the output at the line length recorded in the index.
            line_len = fasta[rname]._fa.faidx.index[rname]['lenc']
            if args.name:
                sys.stdout.write('>' + sequence.name + '\n')
            for line in wrap_sequence(line_len, sequence.seq):
                sys.stdout.write(line)
    finally:
        # Release the file handle even when a region lookup fails.
        fasta.close()
def fa_to_tabular(ifn, oufn, start, end):
    """Convert a FASTA file into a tab-separated table.

    Each record name is split on ``_`` and its first four fields are written
    as the ``mm``, ``hash``, ``umi`` and ``counts`` columns, followed by the
    ``[start:end]`` slice of the record's sequence.

    :param ifn: path of the input FASTA file.
    :param oufn: path of the table to write (overwritten if it exists).
    :param start: slice start applied to every sequence.
    :param end: slice end applied to every sequence.
    :raises IndexError: if a record name has fewer than four ``_`` fields.
    """
    columns = ["mm", "hash", "umi", "counts", "seq"]
    fa = Fasta(ifn)
    try:
        # ``with`` guarantees the table is flushed and closed on any error.
        with open(oufn, 'w') as fout:
            fout.write("\t".join(columns) + '\n')
            for record in fa:
                name = record.name.split("_")
                mm, hash_key, umi, counts = name[0], name[1], name[2], name[3]
                seq = record[:].seq[start:end]
                fout.write("\t".join([mm, hash_key, umi, counts, seq]) + '\n')
    finally:
        fa.close()
def _get_seq(self, chrom, start, stop): if self.in_mem: seq = self.fasta[chrom][start:stop] else: if self.thread_safe: fasta = Fasta(self.fasta, as_raw=True, sequence_always_upper=True, read_ahead=self.read_ahead) seq = np.array(list(fasta[chrom][start:stop])) fasta.close() else: seq = np.array(list(self.fasta[chrom][start:stop])) return seq
def test_reverse_var(self):
    ''' check that reverse_var works correctly
    '''
    genome = Fasta(self.fa)
    try:
        var = self.Var(chrom='chrN', pos=11, ref='G', alts=['A', 'C'])
        rev = reverse_var(var, genome)

        self.assertEqual(rev.ref, 'C')
        # the position stays the same
        # NOTE(review): this asserts on the input ``var``, not on ``rev`` --
        # confirm which one was intended.
        self.assertEqual(var.pos, 11)

        # multi-allelic variants with indels return None
        var = self.Var(pos=10, ref='G', alts=['A', 'CC'], info={})
        self.assertIsNone(reverse_var(var, genome))
    finally:
        # Close the genome even when one of the assertions above fails.
        genome.close()
class FastaStringExtractor(BaseExtractor):
    """Extract sequences from a FASTA file as plain strings.

    NOTE: The extractor is not thread-safe. If you wish to use it with
    multiprocessing, create a new extractor object in each process.

    # Arguments
        fasta_file (str): path to the FASTA file
        use_strand (bool): if True, the extracted sequence is reverse
            complemented in case interval.strand == "-"
        force_upper (bool): force uppercase output
    """

    def __init__(self, fasta_file, use_strand=False, force_upper=False):
        # Imported lazily so pyfaidx is only needed once an extractor is built.
        from pyfaidx import Fasta

        self.fasta_file = fasta_file
        self._use_strand = use_strand
        self.fasta = Fasta(self.fasta_file)
        self.force_upper = force_upper

    def extract(self, interval: Interval, use_strand=None, **kwargs) -> str:
        """Return the FASTA sequence of ``interval`` as a string.

        Args:
            interval: the interval to query
            use_strand (bool, optional): if True, the extracted sequence is
                reverse complemented in case interval.strand == "-".
                Overrides `self.use_strand`
            **kwargs: accepted but not used here

        Returns:
            sequence of the requested interval
        """
        # An explicit argument wins; otherwise use the instance setting.
        strand_aware = self.use_strand if use_strand is None else use_strand
        reverse_complement = strand_aware and interval.strand == "-"

        # pyfaidx wants a 1-based interval, hence start + 1.
        record = self.fasta.get_seq(
            interval.chrom,
            interval.start + 1,
            interval.stop,
            rc=reverse_complement,
        )
        sequence = str(record.seq)

        return sequence.upper() if self.force_upper else sequence

    def close(self):
        """Close the underlying FASTA handle."""
        return self.fasta.close()
class FastaWrapper(GenomeWrapper):
    """Genome wrapper backed by a FASTA file.

    Depending on the constructor flags, ``self.fasta`` ends up being:

    * ``in_mem=True``: a dict mapping record name to a numpy array of single
      characters (the file is closed; ``thread_safe`` is forced to True),
    * ``thread_safe=True``: the FASTA path itself, and every ``_get_seq``
      call opens and closes its own handle,
    * otherwise: one open ``Fasta`` object shared by all calls.

    NOTE(review): ``self.in_mem`` and ``self.thread_safe`` are read below but
    not assigned here for the non-``in_mem`` case -- presumably the base
    class stores them; confirm against ``GenomeWrapper``.
    """

    def __init__(self, fasta_file, alpha='dna', one_hot=True,
                 channel_last=True, in_mem=False, thread_safe=False,
                 read_ahead=10000):
        super().__init__(alpha, one_hot, channel_last, in_mem, thread_safe)
        self.fasta = Fasta(fasta_file, as_raw=True,
                           sequence_always_upper=True, read_ahead=read_ahead)
        self._chroms = list(self.fasta.keys())
        seq_lens = [len(self.fasta[chrom]) for chrom in self._chroms]
        self._chroms_size = dict(zip(self._chroms, seq_lens))
        self.read_ahead = read_ahead
        if in_mem:
            try:
                sequences = self._encode_seqs(self.fasta)
            finally:
                # The file is no longer needed, even if loading failed.
                self.fasta.close()
            self.fasta = sequences
            # The in-memory dict holds no file handle for close() to release.
            self.thread_safe = True
        elif thread_safe:
            # Keep only the path; handles are opened per call in _get_seq.
            self.fasta.close()
            self.fasta = fasta_file

    def close(self):
        """Close the shared FASTA handle, if this wrapper holds one."""
        if not self.thread_safe:
            self.fasta.close()

    @staticmethod
    def _encode_seqs(fasta):
        """Load every record of ``fasta`` into memory.

        Returns a dict mapping record name to a numpy array holding one
        character per element (no one-hot encoding is done here).
        """
        fasta_dict = {}
        pbar = tqdm(fasta)
        for record in pbar:
            pbar.set_description(desc='Loading sequence: ' + record.name)
            fasta_dict[record.name] = np.array(list(record[:]))
        return fasta_dict

    def _get_seq(self, chrom, start, stop):
        """Return the ``[start:stop]`` slice of ``chrom``.

        In-memory wrappers return a slice of the stored array; otherwise a
        new numpy array with one character per element is built.
        """
        if self.in_mem:
            return self.fasta[chrom][start:stop]

        if not self.thread_safe:
            return np.array(list(self.fasta[chrom][start:stop]))

        fasta = Fasta(self.fasta, as_raw=True, sequence_always_upper=True,
                      read_ahead=self.read_ahead)
        try:
            return np.array(list(fasta[chrom][start:stop]))
        finally:
            # Close the per-call handle even when the lookup raises.
            fasta.close()
def integrate(self, output_table, gtf_file, fasta_file):
    """Annotate, pair up and export the entries of this object.

    Steps, in order:

    1. Write the header to ``output_table``: ``shared-id``, ``fusion``, then
       ``self.header`` with seven annotation columns inserted before its
       last five columns.
    2. For every entry: when ``gtf_file`` is given and both RNA strands are
       known, collect frame-shift and exon annotations for the main break
       and for the additional breaks listed in ``e.structure``; when
       ``fasta_file`` is given, call ``e.is_on_splice_junction_motif``.
       Every entry is indexed at both break positions in ``self.idx``.
    3. Each intronic linear entry is matched with the best other entry that
       lies within 200000 bases of both of its breaks and has the same
       strands.
    4. Groups are written by descending score; every entry is exported once
       and entries of one group share an id.

    :param output_table: path of the tab-separated file to write.
    :param gtf_file: gene annotation; if falsy, frame-shift detection is
        skipped.
    :param fasta_file: reference FASTA; if falsy, the splice-motif check is
        skipped.
    """
    log.info("Integrating results")

    def insert_in_index(index, entries, score, i):
        # index is a two-level dict: score -> key -> list of entries.
        if score not in index:
            index[score] = {}
        # The key is built from the first entry's two breakpoints; the
        # counter i keeps keys of otherwise identical groups distinct.
        key = entries[0].chrA + ':' + str(
            entries[0].posA
        ) + '(' + entries[0].strandA + ')-' + entries[0].chrB + ':' + str(
            entries[0].posB) + '(' + entries[0].strandB + ')|' + str(i)
        index[score][key] = entries

    with open(output_table, 'w') as fh_out:
        header = self.header.split("\t")
        header = "\t".join(header[:-5] + [
            'full-gene-dysregulation', 'frameshift=0', 'frameshift=+1',
            'frameshift=+2', 'splice-motif-edit-distance',
            "exons from (5')", "exons to (3')"
        ] + header[-5:])
        # No newline is appended here -- presumably self.header already ends
        # with one; TODO confirm.
        fh_out.write("shared-id\tfusion\t" + header)
        # index used to find duplicates
        self.idx = HTSeq.GenomicArrayOfSets("auto", stranded=True)
        # index used to annotate gene names: TMPRSS2->ERG
        gene_annotation = GeneAnnotation(gtf_file)
        dfs = DetectFrameShifts(gtf_file) if gtf_file else None
        # NOTE(review): ffs is only closed on the normal path below; an
        # exception inside the loop leaves the FASTA handle open.
        ffs = Fasta(fasta_file) if fasta_file else None
        intronic_linear = []
        remainder = []
        # Find 'duplicates' or fusions that belong to each other
        log.info(
            "Searching for intronic and exonic breaks that belong to the same event"
        )
        for e in self:
            if dfs and e.RNAstrandA != '.' and e.RNAstrandB != '.':
                # Break descriptions that were already evaluated.
                done_breaks = set([])
                # The argument order passed to dfs.evaluate depends on which
                # side has the larger donor value.
                if e.donorA > e.donorB:
                    exons_from, exons_to, frame_shifts = dfs.evaluate(
                        [e.chrA, e.posA, e.RNAstrandA],
                        [e.chrB, e.posB, e.RNAstrandB], 2)
                else:
                    exons_from, exons_to, frame_shifts = dfs.evaluate(
                        [e.chrB, e.posB, e.RNAstrandB],
                        [e.chrA, e.posA, e.RNAstrandA], 2)
                # Register the main break in the same textual form that is
                # compared against the items of e.structure below.
                done_breaks.add(e.chrA + ':' + str(e.posA) + '/' +
                                str(e.posA + 1) + '(' + e.strandA + ')->' +
                                e.chrB + ':' + str(e.posB) + '/' +
                                str(e.posB + 1) + '(' + e.strandB + ')')
                fgd = [x[0] + '->' + x[1] for x in frame_shifts['fgd']]
                frameshifts_0 = [
                    x[0][0] + '->' + x[1][0] for x in frame_shifts[0]
                ]
                frameshifts_1 = [
                    x[0][0] + '(+' + str(x[0][1]) + ')->' + x[1][0] + '(+' +
                    str(x[1][1]) + ')' for x in frame_shifts[1]
                ]
                frameshifts_2 = [
                    x[0][0] + '(+' + str(x[0][1]) + ')->' + x[1][0] + '(+' +
                    str(x[1][1]) + ')' for x in frame_shifts[2]
                ]
                # e.structure holds '&'-separated additional breaks; each is
                # '<break description>:(<type>:<count>,...)'.
                for additional_breaks in e.structure.split('&'):
                    if additional_breaks != '':
                        params = additional_breaks.split(':(')
                        # Sum the counts of every evidence type except
                        # discordant mates.
                        n_split_reads = sum([
                            int(x.split(':')[1])
                            for x in params[1].rstrip(')').split(',')
                            if x.split(':')[0] != 'discordant_mates'
                        ])
                        posAB = params[0].split(':')
                        posA, posB = int(posAB[1].split('/')[0]), int(
                            posAB[2].split('/')[0])
                        if params[0] not in done_breaks and n_split_reads > 0:
                            if e.donorA > e.donorB:  # nice, use same thing to swap if necessary
                                exons_from_, exons_to_, frame_shifts = dfs.evaluate(
                                    [e.chrA, posA, e.RNAstrandA],
                                    [e.chrB, posB, e.RNAstrandB], 2)
                            else:
                                exons_from_, exons_to_, frame_shifts = dfs.evaluate(
                                    [e.chrB, posB, e.RNAstrandB],
                                    [e.chrA, posA, e.RNAstrandA], 2)
                            exons_from += exons_from_
                            exons_to += exons_to_
                            del (exons_from_, exons_to_)
                            fgd += [
                                x[0] + '->' + x[1]
                                for x in frame_shifts['fgd']
                            ]
                            frameshifts_0 += [
                                x[0][0] + '->' + x[1][0]
                                for x in frame_shifts[0]
                            ]
                            frameshifts_1 += [
                                x[0][0] + '(+' + str(x[0][1]) + ')->' +
                                x[1][0] + '(+' + str(x[1][1]) + ')'
                                for x in frame_shifts[1]
                            ]
                            frameshifts_2 += [
                                x[0][0] + '(+' + str(x[0][1]) + ')->' +
                                x[1][0] + '(+' + str(x[1][1]) + ')'
                                for x in frame_shifts[2]
                            ]
                        done_breaks.add(params[0])
                # Store de-duplicated, sorted annotations on the entry.
                e.exons_from = sorted(list(set(exons_from)))
                e.exons_to = sorted(list(set(exons_to)))
                del (exons_from, exons_to)
                e.fgd = ','.join(sorted(list(set(fgd))))
                e.frameshift_0 = ','.join(sorted(list(set(frameshifts_0))))
                e.frameshift_1 = ','.join(sorted(list(set(frameshifts_1))))
                e.frameshift_2 = ','.join(sorted(list(set(frameshifts_2))))
                del (fgd, frameshifts_0, frameshifts_1, frameshifts_2)
            if ffs:
                e.is_on_splice_junction_motif(ffs)
            if e.x_onic == 'intronic' and e.circ_lin == 'linear':
                intronic_linear.append(e)
            else:
                remainder.append(e)

            # NOTE(review): this helper is re-created on every iteration; it
            # does not depend on the loop and could be defined once.
            def insert(pos, e):
                # The index uses chromosome names without a 'chr' prefix.
                if pos[0][0:3] == 'chr':
                    chrom = pos[0][3:]
                else:
                    chrom = pos[0]
                # position_accession = HTSeq.GenomicPosition(pos[0], pos[1], pos[2])
                position_accession = HTSeq.GenomicInterval(
                    chrom, pos[1], pos[1] + 1, pos[2])
                position = self.idx[position_accession]
                position += e

            insert((e.chrA, e.posA, e.strandA), e)
            insert((e.chrB, e.posB, e.strandB), e)
        # NOTE(review): 'is not None' is the idiomatic comparison here.
        if ffs != None:
            ffs.close()
        # Reorder
        log.info("Re-order and find matching entries")
        idx2 = {}
        q = 0
        for e in intronic_linear:
            # Candidates found near break A (index 0) and break B (index 1).
            results_split = [set([]), set([])]
            positions = [(e.chrA, e.posA, e.strandA),
                         (e.chrB, e.posB, e.strandB)]
            for pos_i in [0, 1]:
                pos = positions[pos_i]
                # Search window of 200000 bases: below the break on the '-'
                # strand, above it otherwise.
                if pos[2] == '-':
                    pos1 = pos[1] - 200000
                    pos2 = pos[1]
                else:
                    pos1 = pos[1]
                    pos2 = pos[1] + 200000
                if pos[0][0:3] == 'chr':
                    chrom = pos[0][3:]
                else:
                    chrom = pos[0]
                for step in self.idx[HTSeq.GenomicInterval(
                        chrom, max(0, pos1), pos2, pos[2])].steps():
                    for e2 in [_ for _ in step[1] if _ != e]:
                        if e2.strandA == e.strandA and e2.strandB == e.strandB:
                            results_split[pos_i].add(e2)
            # Only entries found near both breaks are candidates.
            results = results_split[0].intersection(results_split[1])
            top_result = (None, 9999999999999)
            for r in results:
                d1 = (r.posA - e.posA)
                d2 = (r.posB - e.posB)
                # Euclidean distance between the two breakpoint pairs,
                # divided by the root mean square of both scores; the
                # candidate with the lowest penalty wins.
                sq_d = math.sqrt(pow(d1, 2) + pow(d2, 2))
                shared_score = math.sqrt(
                    (pow(e.score, 2) + pow(r.score, 2)) * 0.5)
                penalty = 1.0 * sq_d / shared_score
                if penalty < top_result[1]:
                    top_result = (r, penalty)
            if top_result[0]:
                insert_in_index(idx2, [e, top_result[0]],
                                e.score + top_result[0].score, q)
            else:
                insert_in_index(idx2, [e], e.score, q)
            q += 1
        for e in remainder:
            insert_in_index(idx2, [e], e.score, q)
            q += 1
        log.info("Determining fusion gene names and generate output")
        # Generate output
        i = 1
        exported = set([])
        for score in sorted(idx2.keys(), reverse=True):
            for key in sorted(idx2[score].keys()):
                added = 0
                for entry in idx2[score][key]:
                    # An entry can occur in several groups; write it once.
                    if entry not in exported:
                        acceptors_donors = entry.get_donors_acceptors(
                            gene_annotation)
                        # NOTE(review): fgd, frameshift_* and exons_* are
                        # only assigned above when frame-shift detection ran
                        # for the entry -- presumably defaults are set
                        # elsewhere; confirm.
                        line = entry.line[:-5] + [
                            entry.fgd, entry.frameshift_0,
                            entry.frameshift_1, entry.frameshift_2,
                            entry.edit_dist_to_splice_motif, ",".join(
                                entry.exons_from), ",".join(entry.exons_to)
                        ] + entry.line[-5:]
                        fh_out.write(
                            str(i) + "\t" + acceptors_donors + "\t" +
                            "\t".join(line) + "\n")
                        exported.add(entry)
                        added += 1
                # The shared id only advances when the group wrote a row.
                if added > 0:
                    i += 1
def main():
    """Download query sequences that are missing from a local FASTA file.

    The query file (``--query``) lists one id per line. Ids already present
    in ``--fasta`` are dropped. With ``--download`` the remaining ids are
    fetched from the NCBI nucleotide database and appended to ``--fasta``;
    without it only the summary is printed and the program exits.

    Ids starting with ``URS`` (RNAcentral) are counted in the summary but are
    not downloaded, exactly as before: the old RNAcentral branch sat behind
    an unconditional ``continue`` and never ran, so it has been removed.
    """
    parser = argparse.ArgumentParser(description='Download genebank sequence')
    parser.add_argument('--query', '-q', help="Genebank id for download",
                        required=True)
    parser.add_argument('--fasta', '-o',
                        help="Full length sequence in genebank",
                        default="Rfam/seeds/CMfinder-set-full-length.fasta")
    parser.add_argument('--download', '-d', action="store_true",
                        help="whether perform downloading", default=False)
    args = parser.parse_args()
    Entrez.email = "*****@*****.**"

    with open(args.query) as query_file:
        gbIds = np.unique(query_file.read().strip().split("\n"))

    print("Load downloaded sequences ...")
    fastaObj = Fasta(args.fasta)
    try:
        downloaded = list(fastaObj.keys())
    finally:
        fastaObj.close()
    print("Done .")
    # Use the saved key list: the Fasta object is closed at this point.
    print("Total downloaded sequences: {}".format(len(downloaded)))
    print("{} query sequence".format(gbIds.shape[0]))
    gbIds = np.setdiff1d(gbIds, downloaded)
    print("{} query sequence are not downloaded".format(gbIds.shape[0]))
    N = sum(1 for gbId in gbIds if gbId.startswith("URS"))
    print("{} query sequence are in RNAcentral annotation".format(N))
    if not args.download:
        sys.exit(0)

    try:
        with open(args.fasta, "a") as fout:
            for gbId in gbIds:
                if gbId.startswith("URS"):
                    # RNAcentral ids are not retrieved (see docstring).
                    continue
                try:
                    print("Start retriving {} from ncbi nucleotide...".format(
                        gbId), file=sys.stderr)
                    handle = Entrez.efetch(db="nucleotide", id=gbId,
                                           rettype="fasta", retmode="text")
                    try:
                        content = handle.read().strip()
                    finally:
                        handle.close()
                    contents = content.split("\n")
                    # Header line, then the sequence joined onto one line.
                    entry = contents[0] + "\n" + "".join(contents[1:])
                    print(entry, file=fout)
                    print("Done.")
                except Exception:
                    # Best effort: report and move on to the next id.
                    # ``Exception`` (not a bare except) lets Ctrl-C through.
                    print("Error retriving {}, skip ...".format(gbId),
                          file=sys.stderr)
                # Pause between requests.
                time.sleep(0.5)
    finally:
        # The FASTA may have grown, so any existing index is stale; remove
        # it even when the download loop was aborted.
        if os.path.exists(args.fasta + ".fai"):
            os.remove(args.fasta + ".fai")
def get_reference_sequence(CHROM):
    """Load one chromosome of the reference genome into the module caches.

    Opens the FASTA file configured under ``ref``/``genome``, stores the full
    sequence of ``CHROM`` in ``sequence_by_chromosome`` and its length in
    ``chromosome_lengths``.

    :param CHROM: chromosome identifier; it is converted with ``str`` for
        the FASTA lookup and used unchanged as the cache key.
    """
    genome = Fasta(cfg.get("ref", "genome"))
    try:
        sequence_by_chromosome[CHROM] = genome[str(CHROM)][:].seq
    finally:
        # Release the file handle even when the chromosome is missing.
        genome.close()
    chromosome_lengths[CHROM] = len(sequence_by_chromosome[CHROM])
class Reference(object):
    """Interface to the human genome reference file.

    This class uses ``pyfaidx`` to parse the genome reference file referenced
    by ``settings.REFERENCE_PATH``. This can only be a single plain fasta
    file. Also note that if the path is not in the ``~/.gtconfig/gtrc.ini``
    file, gepyto will look for an environment variable named
    ``REFERENCE_PATH``.

    If the genome file can't be found, this class falls back to the Ensembl
    remote API to get the sequences. This behaviour can also be forced by
    using the ``remote=True`` argument.

    Instances can be used as context managers; the reference is closed on
    exit.

    """
    def __init__(self, remote=False):
        ref = None
        if not remote:
            try:
                ref = Fasta(settings.REFERENCE_PATH)
            except IOError:
                # Local genome file unavailable: use the remote API instead.
                ref = None
        if ref is None:
            ref = _RemoteReference(settings.BUILD)
        self.ref = ref

        # Add a get method. This will not be sensitive to "chr" prefixes.
        def get(fasta, chrom):
            if chrom.startswith("chr"):
                # There was a prefix, so the alternative is without it.
                alternative = chrom[3:]
            else:
                # There was no prefix, so the alternative is with it.
                alternative = "chr{}".format(chrom)

            for name in (chrom, alternative):
                try:
                    return fasta[name]
                except KeyError:
                    continue

            # If it is a true mismatch, we return None.
            return None

        self.ref.get = functools.partial(get, self.ref)

    def check_variant_reference(self, variant, flip=False):
        """Given a variant, makes sure that the 'ref' allele is consistent
        with the human genome reference.

        :param variant: The variant to verify.
        :type variant: :py:class:`gepyto.structures.variants.Variant` subclass

        :param flip: If ``True`` incorrect ``(ref, alt)`` pairs will be
                     flipped (Default: False).
        :type flip: bool

        :returns: If flip is True, it returns the correct variant or raises
                  a ``ValueError`` in case it is not salvageable. If flip
                  is False, a bool is simply returned.

        :raises TypeError: If the variant lacks a ``chrom``, ``pos``,
                           ``ref`` or ``alt`` attribute.

        """
        type_message = ("Unsupported argument to check_variant_reference. "
                        "A SNP or Indel object has to be provided.")

        required = ("chrom", "pos", "ref", "alt")
        if not all(hasattr(variant, name) for name in required):
            raise TypeError(type_message)

        # Single nucleotide on both sides and no "-" placeholder: a SNP.
        if (len(variant.ref) == len(variant.alt) == 1 and
                "-" not in (variant.ref + variant.alt)):
            return check_snp_reference(variant, self, flip)
        else:
            return check_indel_reference(variant, self, flip)

    def get_nucleotide(self, chrom, pos):
        """Get the nucleotide at the given genomic position. """
        return self.get_sequence(str(chrom), pos, length=1)

    def get_sequence(self, chrom, start, end=None, length=None):
        """Get the nucleotide sequence at the given genomic locus.

        :param chrom: The chromosome.
        :type chrom: str

        :param start: The start position of the locus.
        :type start: int

        :param end: The end position.
        :type end: int

        :param length: The length of the sequence to fetch.
        :type length: int

        :returns: The upper-cased sequence.
        :rtype: str

        :raises TypeError: Unless exactly one of ``end`` and ``length`` is
                           given.
        :raises InvalidMapping: If the chromosome is not in the reference.

        Either an ``end`` or a ``length`` parameter has to be provided.

        The ranges are inclusive, this means that (start, end) positions will
        both be included in the sequence. The chromosome name is looked up
        as given; the prefix-insensitive ``ref.get`` helper is not used
        here.

        """
        # Compare against None so that 0 counts as a provided value: a
        # length of 0 yields an empty sequence instead of silently running
        # to the end of the chromosome.
        if (end is None) == (length is None):
            raise TypeError("get_sequence needs either an 'end' OR 'length' "
                            "argument.")

        if length is not None:
            end = start + length - 1

        try:
            # Positions are 1-based inclusive, slices are 0-based.
            seq = self.ref[str(chrom)][start - 1:end]
        except KeyError:
            seq = None

        if seq is None:
            error_message = "chr{}:{}-{} is an invalid genomic mapping"
            error_message = error_message.format(chrom, start, end)
            raise InvalidMapping(error_message)

        return str(seq.seq).upper()

    def close(self):
        """Close the underlying reference."""
        self.ref.close()

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.close()
class Reference(object):
    """Interface to the human genome reference file.

    This class uses ``pyfaidx`` to parse the genome reference file referenced
    by ``settings.REFERENCE_PATH``. This can only be a single plain fasta
    file. Also note that if the path is not in the ``~/.gtconfig/gtrc.ini``
    file, gepyto will look for an environment variable named
    ``REFERENCE_PATH``.

    If the genome file can't be found, this class falls back to the Ensembl
    remote API to get the sequences. This behaviour can also be forced by
    using the ``remote=True`` argument.

    Instances can be used as context managers; the reference is closed on
    exit.

    """
    def __init__(self, remote=False):
        ref = None
        if not remote:
            try:
                ref = Fasta(settings.REFERENCE_PATH)
            except IOError:
                # Local genome file unavailable: use the remote API instead.
                ref = None
        if ref is None:
            ref = _RemoteReference(settings.BUILD)
        self.ref = ref

        # Add a get method. This will not be sensitive to "chr" prefixes.
        def get(fasta, chrom):
            if chrom.startswith("chr"):
                # There was a prefix, so the alternative is without it.
                alternative = chrom[3:]
            else:
                # There was no prefix, so the alternative is with it.
                alternative = "chr{}".format(chrom)

            for name in (chrom, alternative):
                try:
                    return fasta[name]
                except KeyError:
                    continue

            # If it is a true mismatch, we return None.
            return None

        self.ref.get = functools.partial(get, self.ref)

    def check_variant_reference(self, variant, flip=False):
        """Given a variant, makes sure that the 'ref' allele is consistent
        with the human genome reference.

        :param variant: The variant to verify.
        :type variant: :py:class:`gepyto.structures.variants.Variant` subclass

        :param flip: If ``True`` incorrect ``(ref, alt)`` pairs will be
                     flipped (Default: False).
        :type flip: bool

        :returns: If flip is True, it returns the correct variant or raises
                  a ``ValueError`` in case it is not salvageable. If flip
                  is False, a bool is simply returned.

        :raises TypeError: If the variant lacks a ``chrom``, ``pos``,
                           ``ref`` or ``alt`` attribute.

        """
        type_message = ("Unsupported argument to check_variant_reference. "
                        "A SNP or Indel object has to be provided.")

        required = ("chrom", "pos", "ref", "alt")
        if not all(hasattr(variant, name) for name in required):
            raise TypeError(type_message)

        # Single nucleotide on both sides and no "-" placeholder: a SNP.
        if (len(variant.ref) == len(variant.alt) == 1 and
                "-" not in (variant.ref + variant.alt)):
            return check_snp_reference(variant, self, flip)
        else:
            return check_indel_reference(variant, self, flip)

    def get_nucleotide(self, chrom, pos):
        """Get the nucleotide at the given genomic position. """
        return self.get_sequence(str(chrom), pos, length=1)

    def get_sequence(self, chrom, start, end=None, length=None):
        """Get the nucleotide sequence at the given genomic locus.

        :param chrom: The chromosome.
        :type chrom: str

        :param start: The start position of the locus.
        :type start: int

        :param end: The end position.
        :type end: int

        :param length: The length of the sequence to fetch.
        :type length: int

        :returns: The upper-cased sequence.
        :rtype: str

        :raises TypeError: Unless exactly one of ``end`` and ``length`` is
                           given.
        :raises InvalidMapping: If the chromosome is not in the reference.

        Either an ``end`` or a ``length`` parameter has to be provided.

        The ranges are inclusive, this means that (start, end) positions will
        both be included in the sequence. The chromosome name is looked up
        as given; the prefix-insensitive ``ref.get`` helper is not used
        here.

        """
        # Compare against None so that 0 counts as a provided value: a
        # length of 0 yields an empty sequence instead of silently running
        # to the end of the chromosome.
        if (end is None) == (length is None):
            raise TypeError("get_sequence needs either an 'end' OR 'length' "
                            "argument.")

        if length is not None:
            end = start + length - 1

        try:
            # Positions are 1-based inclusive, slices are 0-based.
            seq = self.ref[str(chrom)][start - 1:end]
        except KeyError:
            seq = None

        if seq is None:
            error_message = "chr{}:{}-{} is an invalid genomic mapping"
            error_message = error_message.format(chrom, start, end)
            raise InvalidMapping(error_message)

        return str(seq.seq).upper()

    def close(self):
        """Close the underlying reference."""
        self.ref.close()

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.close()
# Off-target prediction pipeline for a set of gRNAs.
# NOTE(review): gen_path, out_path, gRNA_path, mismatch, gpu, cell, cp and
# genome are assigned before this point (not visible here) -- presumably
# from the same parsed arguments; confirm.
cid_path = args.cid
en_path = args.encode
# Drop one trailing slash from each directory path.
# NOTE(review): indexing [-1] raises IndexError on an empty path.
if en_path[-1] == '/':
    en_path = en_path[:-1]
if gen_path[-1] == '/':
    gen_path = gen_path[:-1]
if out_path[-1] == '/':
    out_path = out_path[:-1]
# os.mkdir creates a single level only; the parent directory must exist.
if not os.path.exists(out_path):
    os.mkdir(out_path)
# read fa file of gRNAs.
fa_gRNA = Fasta(gRNA_path, sequence_always_upper=True)
gid = list(fa_gRNA.keys())
gRNAs = [fa_gRNA[i][:].seq for i in gid]
fa_gRNA.close()
## get POT list of gRNA based on Cas-OFFinder.
cas_input(gen_path, gRNAs, mismatch, gpu)
f_gRNA, f_pot = pot(gid, gRNAs)
## encode ots and predict with deepcrispr
# The cid file is a two-column, tab-separated table without a header row.
f_cid = pd.read_csv(cid_path, sep='\t', names=['cid', 'cell'])
# First cid listed for the requested cell; IndexError if the cell is absent.
cid = f_cid.cid[f_cid.cell == cell].tolist()[0]
encode(f_gRNA, en_path, cid)
f_deep = deepots(f_pot, gpu)
## integrate to igwos
igwos(gRNA_path, f_deep, out_path, mismatch)
if cp == 1:
    print("visualize the genome-wide off-target profile with the circos plot")
    # NOTE(review): the command is built by string formatting and run through
    # the shell; paths containing spaces or shell metacharacters will break
    # it. Consider subprocess.run with an argument list.
    os.system("./circos.sh {0}/igwos.tab {1} {2}".format(
        out_path, genome, out_path))