def model_region_singletons(data_container, vcf_path, fasta_path, kmer_size, region):
    """Tally reference k-mers and singleton-SNV transitions for one region.

    The reference sequence of ``region`` is fetched with ``kmer_mid_idx``
    extra bases on each side and k-mer counted with ``ek.kmer_search``.
    Every variant in the region that passes ``ek.is_singleton_snv`` then adds
    one to ``transitions[context][alt]``, where ``context`` is the upper-cased
    reference slice around the variant and ``alt`` is the index of the ALT
    base in A, C, G, T order.  Both tallies are merged into the object held
    by ``data_container``.

    Args:
        data_container: Object exposing ``get()`` and ``set()``; the value it
            holds must provide ``add_kmer_counts`` and ``add_transition``.
        vcf_path: Path handed to ``VCF``.
        fasta_path: Path handed to ``Fasta``.
        kmer_size: Numeric k-mer length used for counting and for the
            context window around each variant.
        region: Object with ``chrom``, ``start``, ``stop`` and ``strand``.

    Returns:
        None.  When the region cannot be fetched (``KeyError`` or
        ``FetchError``) a message is written to stderr and the container is
        left untouched.
    """
    start = time.time()
    fasta = Fasta(fasta_path)
    vcf = VCF(vcf_path)
    # kmer_mid_idx is int(kmer_size / 2): the index of the centre base inside
    # a context window; start_idx_offset is one more than that.
    start_idx_offset = int(kmer_size / 2 + 1)
    kmer_mid_idx = int(start_idx_offset - 1)

    try:
        # The strand test stays inside the try so a KeyError raised while
        # inspecting the region is handled exactly like a failed fetch.
        use_complement = (region.strand is not None
                          and ek.is_dash(region.strand))
        record = fasta.get_seq(region.chrom,
                               region.start - kmer_mid_idx,
                               region.stop + kmer_mid_idx)
        if use_complement:
            # NOTE(review): this is the complement, not the reverse
            # complement -- confirm that is what kmer_search should see.
            record = record.complement
        sequence = record.seq.upper()
    except (KeyError, FetchError):
        print('Region %s not found in fasta, continuing...' % str(region),
              file=sys.stderr,
              flush=True)
        return

    region_ref_counts = ek.kmer_search(sequence, kmer_size)
    r_string = str(region.chrom) + ':' + str(region.start) + '-' + str(
        region.stop)
    transitions = defaultdict(lambda: array.array('L', [0, 0, 0, 0]))
    # Column of each ALT nucleotide inside a transitions row.
    nuc_idx = {'A': 0, 'C': 1, 'G': 2, 'T': 3}

    for variant in vcf(r_string):
        if not ek.is_singleton_snv(variant):
            continue
        new_var = Variant(variant=variant, fields=['vep'])
        # Slice bounds are chosen so that index kmer_mid_idx of adj_seq is the
        # variant's own base (it is compared against REF just below).
        adj_seq = fasta[str(new_var.CHROM)][
            (new_var.POS - start_idx_offset):(new_var.POS + kmer_mid_idx)].seq
        if str(adj_seq[kmer_mid_idx]).upper() != str(variant.REF).upper():
            print('WARNING: Reference mismatch\tFasta REF: %s\tVCF REF: %s' %
                  (adj_seq[kmer_mid_idx], variant.REF),
                  file=sys.stderr,
                  flush=True)
        if ek.complete_sequence(adj_seq):
            context = adj_seq.upper()
            transitions[context][nuc_idx[new_var.ALT[0]]] += 1

    # Read-modify-write of the shared accumulator.
    temp = data_container.get()
    temp.add_kmer_counts(region_ref_counts)
    temp.add_transition(transitions)
    data_container.set(temp)
    print('Finished region %s in %s' % (str(region), str(time.time() - start)),
          flush=True)
    return
def test_get_seq_rc(self):
    """Exercise ``Fasta.get_seq`` with ``rc`` switched off and on."""
    genome = Fasta('data/chr17.hg19.part.fa')

    # Forward strand: print the diagnostics first, then assert.
    forward = genome.get_seq("chr17", 11, 20, rc=False)
    forward_expected = "CCCTGTTCCT"
    print("normal")
    print(forward.seq)
    print(forward_expected)
    assert forward.seq == forward_expected

    # Same interval with rc=True: assert first, then print the diagnostics.
    reverse = genome.get_seq("chr17", 11, 20, rc=True)
    reverse_expected = "AGGAACAGGG"
    assert reverse.seq == reverse_expected
    print("rc")
    print(reverse.seq)
    print(reverse_expected)
class FastaStringExtractor(BaseExtractor):
    """Extract sequences from a FASTA file as plain strings.

    NOTE: The extractor is not thread-safe. If you wish to use it with
    multiprocessing, create a new extractor object in each process.

    # Arguments
        fasta_file (str): path to the fasta_file
        use_strand (bool): if True, the extracted sequence
            is reverse complemented in case interval.strand == "-"
        force_upper (bool): Force uppercase output
    """

    def __init__(self, fasta_file, use_strand=False, force_upper=False):
        # Imported here so pyfaidx is only needed once an extractor is built.
        from pyfaidx import Fasta

        self.fasta_file = fasta_file
        self._use_strand = use_strand
        self.fasta = Fasta(self.fasta_file)
        self.force_upper = force_upper

    def extract(self, interval: Interval, use_strand=None, **kwargs) -> str:
        """Return the FASTA sequence of ``interval`` as a string.

        Args:
            interval: the interval to query
            use_strand (bool, optional): if True, the extracted sequence
                is reverse complemented in case interval.strand == "-".
                Overrides the instance-level setting when not None.
            **kwargs: accepted but not used by this method.

        Returns:
            sequence of requested interval
        """
        # An explicit argument wins; otherwise use the instance setting.
        # NOTE(review): `self.use_strand` is not assigned in this class (only
        # `_use_strand` is) -- presumably BaseExtractor exposes it; confirm.
        strand_aware = self.use_strand if use_strand is None else use_strand
        reverse_complement = strand_aware and interval.strand == "-"

        # pyfaidx wants a 1-based interval, hence the +1 on the start.
        record = self.fasta.get_seq(
            interval.chrom,
            interval.start + 1,
            interval.stop,
            rc=reverse_complement,
        )
        sequence = str(record.seq)

        return sequence.upper() if self.force_upper else sequence

    def close(self):
        """Close the underlying FASTA handle and return its result."""
        return self.fasta.close()
def process_chrom_bin(region, kmer_size, vcf_path, fasta_path, AF=False):
    """Compute per-k-mer singleton frequencies for one chromosome bin.

    The region's reference sequence is k-mer counted (together with GC
    content and N count) via ``ek.kmer_search``.  Singleton SNVs in the
    region are tallied per reference context and ALT base, weighted by the
    variant's ``AF`` INFO field when ``AF`` is true and by 1 otherwise.

    Args:
        region: Object with ``chrom``, ``start`` and ``stop``.
        kmer_size: Numeric k-mer length.
        vcf_path: Path handed to ``VCF``.
        fasta_path: Path handed to ``Fasta``.
        AF: Accumulate allele frequencies (float rows) instead of counts.

    Returns:
        ``(region, freq_dict)`` when both tallies are non-empty,
        ``(region, None)`` when either is empty, and bare ``None`` when the
        region could not be fetched from the FASTA.
    """
    start = time.time()
    fasta = Fasta(fasta_path)
    vcf = VCF(vcf_path)
    start_idx_offset = int(kmer_size / 2 + 1)
    kmer_mid_idx = int(start_idx_offset - 1)

    try:
        sequence = fasta.get_seq(region.chrom, region.start, region.stop).seq.upper()
    except (KeyError, FetchError):
        print('Region %s not found in fasta, continuing...' % str(region), file=sys.stderr)
        return

    region_ref_counts, gc_content, n_count = ek.kmer_search(
        sequence, kmer_size, count_gc=True, count_n=True)
    r_string = '%s:%s-%s' % (region.chrom, region.start, region.stop)

    # Float rows when accumulating allele frequencies, unsigned ints for counts.
    typecode = 'd' if AF else 'L'
    transitions = defaultdict(lambda: array.array(typecode, [0, 0, 0, 0]))
    # Column of each ALT nucleotide inside a transitions row.
    nuc_idx = {'A': 0, 'C': 1, 'G': 2, 'T': 3}

    for variant in vcf(r_string):
        if not ek.is_singleton_snv(variant):
            continue
        new_var = Variant(variant=variant)
        # Slice bounds are chosen so that index kmer_mid_idx of adj_seq is the
        # variant's own base (it is compared against REF just below).
        adj_seq = fasta[str(new_var.CHROM)][
            (new_var.POS - start_idx_offset):(new_var.POS + kmer_mid_idx)].seq
        if str(adj_seq[kmer_mid_idx]).upper() != str(variant.REF).upper():
            print('WARNING: Reference mismatch\tFasta REF: %s\tVCF REF: %s' % (adj_seq[kmer_mid_idx], variant.REF),
                  file=sys.stderr, flush=True)
        if ek.complete_sequence(adj_seq):
            row = transitions[adj_seq.upper()]
            alt_idx = nuc_idx[new_var.ALT[0]]
            row[alt_idx] += variant.INFO.get('AF') if AF else 1

    # Nothing to report unless both tallies have at least one entry.
    if not transitions or len(region_ref_counts.keys()) == 0:
        print('Finished region %s in %s' % (str(region), str(time.time() - start)), flush=True)
        return region, None

    bin_trans = pd.DataFrame.from_dict(transitions, orient='index')
    bin_trans = bin_trans.sort_index()
    bin_kcounts = pd.DataFrame.from_dict(region_ref_counts, orient='index')
    bin_kcounts = bin_kcounts.sort_index()
    bin_trans['counts'] = bin_kcounts[0]
    # NOTE(review): DataFrame.apply defaults to column-wise application; a
    # helper named row_multinomial suggests axis=1 may be intended -- confirm.
    bin_trans['freq'] = bin_trans.apply(row_multinomial)
    bin_trans.loc['GC_content', 'freq'] = gc_content
    bin_trans.loc['N_count', 'freq'] = n_count
    print('Finished region %s in %s' % (str(region), str(time.time() - start)), flush=True)
    return region, bin_trans['freq'].to_dict()
def check_clinvar(vcf_path, fasta_path, kmer_size, left_context=0, right_context=0, counts_path=None):
    """Collect expected values for the sequence around each autosomal variant.

    For every VCF record whose chromosome is in
    ``get_autosome_names_grch38()``, the reference sequence from
    ``POS - left_context`` to ``POS + right_context`` is fetched, upper-cased
    and passed to ``KmerWindow.calculate_expected``.

    Args:
        vcf_path: Path handed to ``VCF``.
        fasta_path: Path handed to ``Fasta``.
        kmer_size: k-mer length handed to ``KmerWindow``.
        left_context: Bases to include before the variant position.
        right_context: Bases to include after the variant position.
        counts_path: Forwarded to ``KmerWindow`` as ``counts_path``.

    Returns:
        A list of ``(CHROM, POS, CLNSIG, expected, sequence)`` tuples, one per
        retained variant, in VCF iteration order.
    """
    # VCF is 1-based, closed
    # BED is 0-based, half-open
    autosomes = get_autosome_names_grch38()
    reader = VCF(vcf_path)
    reference = Fasta(fasta_path, read_ahead=10_000_000)
    window = KmerWindow(kmer_size, counts_path=counts_path)

    records = []
    for variant in reader:
        if variant.CHROM in autosomes:
            first = variant.POS - left_context
            last = variant.POS + right_context
            seq = reference.get_seq(variant.CHROM, first, last).seq.upper()
            records.append((
                variant.CHROM,
                variant.POS,
                variant.INFO.get("CLNSIG"),
                window.calculate_expected(seq),
                seq,
            ))
    return records
def match_seq(rec: pd.Series, sequences: pyfaidx.Fasta) -> pyfaidx.Sequence:
    """Build a named :class:`pyfaidx.Sequence` for one annotation feature.

    Parameters
    ----------
    rec : :class:`~pandas.Series`
        Information for a feature (i.e. gene, exon, etc...). The following
        entries are read: seqname, gene_name, feature, strand, start, end,
        seq_hash.
    sequences : :class:`~pyfaidx.Fasta`
        Indexed FASTA queried with ``get_seq`` for the feature's interval;
        the request is made with ``rc=True`` when ``rec["strand"] == "-"``.

    Returns
    -------
    :class:`~pyfaidx.Sequence`
        Named ``gene_name_feature_strand_start_end_seq_hash`` and carrying the
        fetched sequence. If a ``ValueError`` is raised along the way, a
        message is printed and ``None`` is returned instead.
    """
    name_fields = ("gene_name", "feature", "strand", "start", "end", "seq_hash")
    try:
        is_reverse: bool = bool(rec["strand"] == "-")
        label = "_".join(f"{rec[field]}" for field in name_fields)
        fetched = sequences.get_seq(
            name=rec["seqname"],
            start=rec["start"],
            end=rec["end"],
            rc=is_reverse,
        )
        return pyfaidx.Sequence(name=label, seq=fetched.seq)
    except ValueError:
        print(f"problem with {rec['gene_name']} {rec['start']} "
              f"{rec['end']} {rec['seqname']} {rec['strand']}")
        return None
def model_region_nonsingletons(data_container, vcf_path, fasta_path, kmer_size,
                               region, AC_cutoff):
    """Tally reference k-mers and AC/AN-weighted SNV transitions for a region.

    The reference sequence of ``region`` is fetched with ``kmer_mid_idx``
    extra bases on each side and k-mer counted with ``ek.kmer_search``.
    Every variant that passes ``ek.is_quality_snv`` adds its ``AC`` and ``AN``
    (as exposed by ``Variant``) to two tables keyed by the upper-cased
    reference context and indexed by the ALT base in A, C, G, T order.  The
    results are merged into the object held by ``data_container``.

    Args:
        data_container: Object exposing ``get()`` and ``set()``; the value it
            holds must provide ``add_kmer_counts``, ``add_transition`` and
            ``add_transition2``.
        vcf_path: Path handed to ``VCF``.
        fasta_path: Path handed to ``Fasta``.
        kmer_size: k-mer length; must convert to an integer >= 1, otherwise
            a message is printed and the process exits with status 1.
        region: Object with ``chrom``, ``start``, ``stop`` and ``strand``.
        AC_cutoff: Optional value forwarded to ``ek.is_quality_snv``.  A
            value that ``int()`` rejects with ``ValueError`` is replaced by
            ``None`` after a warning on stderr.

    Returns:
        None.  When the region cannot be fetched (``KeyError`` or
        ``FetchError``) a message is written to stderr and the container is
        left untouched.

    Raises:
        SystemExit: if ``kmer_size`` is not a positive integer.
    """
    if AC_cutoff is not None:
        try:
            AC_cutoff = int(AC_cutoff)
        except ValueError:
            AC_cutoff = None
            print(
                'AC cutoff must be a positive integer. Ignoring user value and using SNVs with any AC.',
                file=sys.stderr,
                flush=True)
    try:
        kmer_size = int(kmer_size)
        if kmer_size < 1:
            raise ValueError
    except ValueError:
        print('kmer_size must be a positive integer. Please check arguments.',
              file=sys.stderr,
              flush=True)
        # sys.exit rather than the interactive-only `exit` helper, which the
        # site module may not install in every interpreter configuration.
        sys.exit(1)

    start = time.time()
    fasta = Fasta(fasta_path)
    vcf = VCF(vcf_path)
    # kmer_mid_idx is int(kmer_size / 2): the index of the centre base inside
    # a context window; start_idx_offset is one more than that.
    start_idx_offset = int(kmer_size / 2 + 1)
    kmer_mid_idx = int(start_idx_offset - 1)

    try:
        # The strand test stays inside the try so a KeyError raised while
        # inspecting the region is handled exactly like a failed fetch.
        use_complement = (region.strand is not None
                          and ek.is_dash(region.strand))
        record = fasta.get_seq(region.chrom,
                               region.start - kmer_mid_idx,
                               region.stop + kmer_mid_idx)
        if use_complement:
            record = record.complement
        sequence = record.seq.upper()
    except (KeyError, FetchError):
        print('Region %s not found in fasta, continuing...' % str(region),
              file=sys.stderr,
              flush=True)
        return

    region_ref_counts = ek.kmer_search(sequence, kmer_size)
    r_string = str(region.chrom) + ':' + str(region.start) + '-' + str(
        region.stop)
    ac_transitions = defaultdict(lambda: array.array('L', [0, 0, 0, 0]))
    an_transitions = defaultdict(lambda: array.array('L', [0, 0, 0, 0]))
    # Column of each ALT nucleotide inside a transitions row.
    nuc_idx = {'A': 0, 'C': 1, 'G': 2, 'T': 3}

    for variant in vcf(r_string):
        if not ek.is_quality_snv(variant, AC_cutoff=AC_cutoff):
            continue
        new_var = Variant(variant=variant)
        # Slice bounds are chosen so that index kmer_mid_idx of adj_seq is the
        # variant's own base (it is compared against REF just below).
        adj_seq = fasta[str(new_var.CHROM)][
            (new_var.POS - start_idx_offset):(new_var.POS + kmer_mid_idx)].seq
        if str(adj_seq[kmer_mid_idx]).upper() != str(variant.REF).upper():
            print('WARNING: Reference mismatch\tFasta REF: %s\tVCF REF: %s' %
                  (adj_seq[kmer_mid_idx], variant.REF),
                  file=sys.stderr,
                  flush=True)
        if ek.complete_sequence(adj_seq):
            context = adj_seq.upper()
            alt_idx = nuc_idx[new_var.ALT[0]]
            ac_transitions[context][alt_idx] += new_var.AC
            an_transitions[context][alt_idx] += new_var.AN

    # Read-modify-write of the shared accumulator.
    temp = data_container.get()
    temp.add_kmer_counts(region_ref_counts)
    temp.add_transition(ac_transitions)
    temp.add_transition2(an_transitions)
    data_container.set(temp)
    print('Finished region %s in %s' % (str(region), str(time.time() - start)),
          flush=True)
    return
class FastaReader:
    """Class for reading and querying fasta file."""

    # Translation table for the four unambiguous bases; every other
    # character is left untouched by str.translate.
    _COMPLEMENT_TABLE = str.maketrans("ACTG", "TGAC")

    def __init__(self, fasta_location):
        """
        Parameters
        ---------
        fasta_location : string
                         Path to fasta file

        """
        self.fasta_location = fasta_location
        try:
            self.fasta = Fasta(fasta_location, as_raw=True, sequence_always_upper=True)
        except Exception as e:
            raise Exception(
                "Error reading fasta file {} : {}".format(
                    os.path.abspath(self.fasta_location), e
                )
            )

    def query(self, intervals):
        """Query regions for sequence.

        Parameters
        ----------
        intervals: list of Interval
                   Each item must expose ``chrom``, ``start`` and ``end``.
                   The intervals for fasta are one-based and full-closed.

        Returns
        -------
        sequences: list
                   One entry per interval whose chromosome is present in the
                   fasta, in input order. Intervals on unknown chromosomes
                   only trigger a ``UserWarning`` and are skipped, so the
                   result can be shorter than ``intervals``. Strand
                   information is not used.

        Raises
        ------
        Exception
            If an interval's start or end lies beyond the chromosome length.
        """
        sequences = []
        chrom_lengths = self.chromosomes
        for interval in intervals:
            if interval.chrom not in chrom_lengths:
                warnings.warn(
                    "Chromosome {} does not appear in the fasta".format(interval.chrom),
                    UserWarning,
                )
                continue
            chrom_length = chrom_lengths[interval.chrom]
            if interval.start > chrom_length:
                raise Exception(
                    "Chromsome start point exceeds chromosome length: {}>{}".format(
                        interval.start, chrom_length
                    )
                )
            if interval.end > chrom_length:
                raise Exception(
                    "Chromsome end point exceeds chromosome length: {}>{}".format(
                        interval.end, chrom_length
                    )
                )
            sequences.append(
                self.fasta.get_seq(interval.chrom, interval.start, interval.end)
            )
        return sequences

    def complement(self, seq):
        """Complement a FASTA sequence.

        Parameters
        ----------
        seq: str
             String fasta sequence

        Returns
        -------
        complement_seq: str
                        Upper-cased complement of the input; characters other
                        than A, C, G and T are passed through unchanged.
        """
        return seq.upper().translate(self._COMPLEMENT_TABLE)

    def reverse_complement(self, seq):
        """Reverse-complement a FASTA sequence.

        Parameters
        ----------
        seq: str
             String fasta sequence

        Returns
        -------
        complement_seq: str
                        Upper-cased complement of the input, reversed.
        """
        uppercased = seq.upper()
        return self.complement(uppercased)[::-1]

    @property
    def chromosomes(self):
        """Return chromosomes and their sizes as in the fasta file.

        Returns
        -------
        chroms : dict
                 Ordered mapping of chromosome name to sequence length, in
                 the order the fasta reports its keys.

        .. currentmodule:: .FastaReader
        .. autosummary::
            .FastaReader

        """
        return OrderedDict(
            (chrom, len(self.fasta[chrom])) for chrom in list(self.fasta.keys())
        )
def query_bed_region(region, vcf_path, fasta, kmer_size, counts_path,
                     count_frequency):
    """Compare observed variation in a region against its expected value.

    The region's reference sequence (padded by ``kmer_size // 2`` bases on
    both sides) is scored with ``KmerWindow.calculate_expected``.  The four
    reported fields depend on ``count_frequency``:

    * true  -- AC, AN, AF from ``count_regional_AF`` and the expected value;
    * false -- non-counted variants (all minus observed), observed variants,
      the expected value, and observed / expected (0 when expected is 0).

    A ``KeyError`` or ``FetchError`` anywhere in that computation sets all
    four fields to 0.  The row is printed in fixed-width form and also
    returned tab-separated.

    Args:
        region: Object with ``chrom``, ``start``, ``stop``, ``strand`` and a
            ``printstr`` method; ``str(region)`` is used as the VCF query.
        vcf_path: Path handed to ``VCF``.
        fasta: Path handed to ``Fasta``.
        kmer_size: Integer k-mer length.
        counts_path: Forwarded to ``KmerWindow``; per the original notes this
            table is required for ``count_frequency`` to work.
        count_frequency: Selects the allele-frequency report (see above).

    Returns:
        str: ``region.printstr()`` and the four fields joined by tabs and
        terminated by a newline.
    """
    # TODO: Add binning somehow (either keep equal size or equal number of bins
    vcf = VCF(vcf_path)
    fasta = Fasta(fasta)
    window = KmerWindow(kmer_size, counts_path=counts_path)
    # The first kmer is centered on the first nucleotide of the region, so the
    # fetched interval is extended by half a kmer upstream and downstream.
    shift = kmer_size // 2
    try:
        use_complement = region.strand is not None and is_dash(region.strand)
        record = fasta.get_seq(region.chrom, region.start - shift,
                               region.stop + shift)
        if use_complement:
            record = record.complement
        sequence = record.seq.upper()
        # The expectation is computed on the strand-adjusted sequence ...
        exp = window.calculate_expected(sequence)
        if count_frequency:
            AF, AC, AN = count_regional_AF(vcf(str(region)))
            field1 = AC
            field2 = AN
            field3 = AF
            field4 = exp
        else:
            # ... whereas the variant counts do not account for strandedness.
            all_vars, observed_variants = count_regional_variants(
                vcf(str(region)))
            field1 = all_vars - observed_variants
            field2 = observed_variants
            field3 = exp
            field4 = 0 if exp == 0 else observed_variants / exp
    except (KeyError, FetchError):
        field1 = field2 = field3 = field4 = 0

    print('{0:<30} {1:>10} {2:>20} {3:>20} {4:>20}'.format(
        (region.printstr(delim=' ')), str(field1), str(field2), str(field3),
        str(field4)),
          flush=True)
    return "%s\t%s\t%s\t%s\t%s\n" % (region.printstr(), str(field1),
                                     str(field2), str(field3), str(field4))
def process_bed_region(region, kmer_size, vcf_path, fasta_path, AF=False, delim=','):
    """Print one delimited row of per-k-mer singleton frequencies for a region.

    The region's reference sequence (padded by ``kmer_mid_idx`` bases on both
    sides) is k-mer counted together with GC content and N count.  Singleton
    SNVs are tallied per reference context and ALT base, weighted by the
    variant's ``AF`` INFO field when ``AF`` is true and by 1 otherwise.  The
    printed row is ``region.str_name()`` followed by one value per k-mer from
    ``ek.generate_kmers(kmer_size)`` and then ``GC_content`` and ``N_count``.

    When either tally is empty the row is all zeros.  Its width is taken from
    the same k-mer list as populated rows, so every row has the same number
    of columns (the earlier ``kmer_size ** 4 + 2`` width did not match).

    Args:
        region: Object with ``chrom``, ``start``, ``stop``, ``strand`` and
            the methods ``vcf_str()`` and ``str_name()``.
        kmer_size: Numeric k-mer length.
        vcf_path: Path handed to ``VCF``.
        fasta_path: Path handed to ``Fasta``.
        AF: Accumulate allele frequencies (float rows) instead of counts.
        delim: Column separator of the printed row.

    Returns:
        None.  When the region cannot be fetched (``KeyError`` or
        ``FetchError``) a message goes to stderr and no row is printed.
    """
    fasta = Fasta(fasta_path)
    vcf = VCF(vcf_path)
    # kmer_mid_idx is int(kmer_size / 2): the index of the centre base inside
    # a context window; start_idx_offset is one more than that.
    start_idx_offset = int(kmer_size / 2 + 1)
    kmer_mid_idx = int(start_idx_offset - 1)

    try:
        use_complement = (region.strand is not None
                          and ek.is_dash(region.strand))
        record = fasta.get_seq(region.chrom,
                               region.start - kmer_mid_idx,
                               region.stop + kmer_mid_idx)
        if use_complement:
            record = record.complement
        sequence = record.seq.upper()
    except (KeyError, FetchError):
        print('Region %s not found in fasta, continuing...' % str(region), file=sys.stderr)
        return

    region_ref_counts, gc_content, n_count = ek.kmer_search(
        sequence, kmer_size, count_gc=True, count_n=True)

    # Float rows when accumulating allele frequencies, unsigned ints for counts.
    typecode = 'd' if AF else 'L'
    transitions = defaultdict(lambda: array.array(typecode, [0, 0, 0, 0]))
    # Column of each ALT nucleotide inside a transitions row.
    nuc_idx = {'A': 0, 'C': 1, 'G': 2, 'T': 3}

    for variant in vcf(region.vcf_str()):
        if not ek.is_singleton_snv(variant):
            continue
        new_var = Variant(variant=variant)
        # Slice bounds are chosen so that index kmer_mid_idx of adj_seq is the
        # variant's own base (it is compared against REF just below).
        adj_seq = fasta[str(new_var.CHROM)][
            (new_var.POS - start_idx_offset):(new_var.POS + kmer_mid_idx)].seq
        if str(adj_seq[kmer_mid_idx]).upper() != str(variant.REF).upper():
            print('WARNING: Reference mismatch\tFasta REF: %s\tVCF REF: %s' % (adj_seq[kmer_mid_idx], variant.REF),
                  file=sys.stderr, flush=True)
        if ek.complete_sequence(adj_seq):
            row = transitions[adj_seq.upper()]
            alt_idx = nuc_idx[new_var.ALT[0]]
            row[alt_idx] += variant.INFO.get('AF') if AF else 1

    # Column layout shared by populated and all-zero rows.
    kkeys = list(ek.generate_kmers(kmer_size))
    kkeys.append('GC_content')
    kkeys.append('N_count')

    if len(transitions.keys()) > 0 and len(region_ref_counts.keys()) > 0:
        bin_trans = pd.DataFrame.from_dict(transitions, orient='index')
        bin_trans.sort_index(inplace=True)
        bin_trans['tot'] = bin_trans.sum(axis=1)
        bin_kcounts = pd.DataFrame.from_dict(region_ref_counts, orient='index')
        bin_kcounts.sort_index(inplace=True)
        bin_kcounts.columns = ['counts']
        kmer_freq = pd.concat([bin_trans.loc[:, 'tot'], bin_kcounts],
                              join='outer', axis=1, sort=True)
        kmer_freq.fillna(0, inplace=True)
        kmer_freq['freq'] = kmer_freq.tot / kmer_freq.counts
        kmer_freq.loc['GC_content', 'freq'] = gc_content
        kmer_freq.loc['N_count', 'freq'] = n_count

        values = []
        for k in kkeys:
            try:
                values.append(str(kmer_freq.loc[k, 'freq']))
            except KeyError:
                # k-mer absent from both tallies: report it as zero.
                values.append('0')
    else:
        values = ['0'] * len(kkeys)

    print(region.str_name() + delim + delim.join(values), flush=True)
def model_region(datacontainer, vcf_path, fasta_path, kmer_size, region,
                 AC_cutoff):
    """Tally reference k-mers and four kinds of SNV transitions for a region.

    The reference sequence of ``region`` is fetched with ``kmer_mid_idx``
    extra bases on each side and k-mer counted with ``kmer_search``.  Every
    variant that passes ``is_quality_snv`` adds its INFO ``AC``, ``AN`` and
    ``AF`` to separate tables keyed by the upper-cased reference context and
    indexed by the ALT base in A, C, G, T order; variants with ``AC == 1``
    additionally add one to the singleton table.  The tables are merged into
    the object held by ``datacontainer`` under the keys ``'singleton'``,
    ``'AC'``, ``'AN'`` and ``'AF'``.

    Args:
        datacontainer: Object exposing ``get()`` and ``set()``; the value it
            holds must provide ``add_kmer_counts`` and
            ``add_transition(table, key)``.
        vcf_path: Path handed to ``VCF``.
        fasta_path: Path handed to ``Fasta``.
        kmer_size: k-mer length; must convert to an integer >= 1, otherwise
            a message is printed and the process exits with status 1.
        region: Object with ``chrom``, ``start``, ``stop`` and ``strand``.
        AC_cutoff: Optional value forwarded to ``is_quality_snv``.  A value
            that ``int()`` rejects with ``ValueError`` is replaced by
            ``None`` after a warning on stderr.

    Returns:
        None.  When the region cannot be fetched (``KeyError`` or
        ``FetchError``) a message is written to stderr and the container is
        left untouched.

    Raises:
        SystemExit: if ``kmer_size`` is not a positive integer.
    """
    if AC_cutoff is not None:
        try:
            AC_cutoff = int(AC_cutoff)
        except ValueError:
            AC_cutoff = None
            print(
                'AC cutoff must be a positive integer. Ignoring user value and using SNVs with any AC.',
                file=sys.stderr,
                flush=True)
    try:
        kmer_size = int(kmer_size)
        if kmer_size < 1:
            raise ValueError
    except ValueError:
        print('kmer_size must be a positive integer. Please check arguments.',
              file=sys.stderr,
              flush=True)
        # sys.exit rather than the interactive-only `exit` helper, which the
        # site module may not install in every interpreter configuration.
        sys.exit(1)

    start = time.time()
    fasta = Fasta(fasta_path)
    vcf = VCF(vcf_path)
    # kmer_mid_idx is int(kmer_size / 2): the index of the centre base inside
    # a context window; start_idx_offset is one more than that.
    start_idx_offset = int(kmer_size / 2 + 1)
    kmer_mid_idx = int(start_idx_offset - 1)

    try:
        # The strand test stays inside the try so a KeyError raised while
        # inspecting the region is handled exactly like a failed fetch.
        use_complement = region.strand is not None and is_dash(region.strand)
        record = fasta.get_seq(region.chrom,
                               region.start - kmer_mid_idx,
                               region.stop + kmer_mid_idx)
        if use_complement:
            record = record.complement
        sequence = record.seq.upper()
    except (KeyError, FetchError):
        print('Region %s not found in fasta, continuing...' % str(region),
              file=sys.stderr,
              flush=True)
        return

    region_ref_counts = kmer_search(sequence, kmer_size)
    r_string = str(region.chrom) + ':' + str(region.start) + '-' + str(
        region.stop)
    singleton_transitions = defaultdict(lambda: array.array('L', [0, 0, 0, 0]))
    ac_transitions = defaultdict(lambda: array.array('L', [0, 0, 0, 0]))
    an_transitions = defaultdict(lambda: array.array('L', [0, 0, 0, 0]))
    af_transitions = defaultdict(lambda: array.array('d', [0, 0, 0, 0]))
    # Column of each ALT nucleotide inside a transitions row.
    nuc_idx = {'A': 0, 'C': 1, 'G': 2, 'T': 3}

    for variant in vcf(r_string):
        if not is_quality_snv(variant, AC_cutoff=AC_cutoff):
            continue
        # Slice bounds are chosen so that index kmer_mid_idx of adj_seq is the
        # variant's own base (it is compared against REF just below).
        adj_seq = fasta[str(variant.CHROM)][
            (variant.POS - start_idx_offset):(variant.POS + kmer_mid_idx)].seq
        if str(adj_seq[kmer_mid_idx]).upper() != str(variant.REF).upper():
            print('WARNING: Reference mismatch\tFasta REF: %s\tVCF REF: %s' %
                  (adj_seq[kmer_mid_idx], variant.REF),
                  file=sys.stderr,
                  flush=True)
        if complete_sequence(adj_seq):
            context = adj_seq.upper()
            alt_idx = nuc_idx[variant.ALT[0]]
            allele_count = variant.INFO.get('AC')
            ac_transitions[context][alt_idx] += allele_count
            an_transitions[context][alt_idx] += variant.INFO.get('AN')
            af_transitions[context][alt_idx] += variant.INFO.get('AF')
            if allele_count == 1:
                singleton_transitions[context][alt_idx] += 1

    data = {
        'singleton': singleton_transitions,
        'AC': ac_transitions,
        'AN': an_transitions,
        'AF': af_transitions
    }
    # Read-modify-write of the shared accumulator.
    temp = datacontainer.get()
    temp.add_kmer_counts(region_ref_counts)
    for k, v in data.items():
        temp.add_transition(v, k)
    datacontainer.set(temp)
    print('Finished region %s in %s' % (str(region), str(time.time() - start)),
          flush=True)
    return
import os

from pybedtools import BedTool
from pyfaidx import Fasta
import ggplot as gg
import pandas as pd

# Exploratory script: load a BED file of ATAC-seq peaks, print the reference
# sequence under one peak, and plot peak length against peak start.
ATAC_peaks_regions = BedTool('../islet_ATAC_peaks/GSE76268_RAW/GSM1978246_ACFQ363beta2_.bed.gz')
genome = Fasta(os.path.expanduser('~/Desktop/genomes/hg19/chr1.fa.gz.bgz'))

# BED starts are 0-based (half-open) while Fasta.get_seq takes 1-based
# inclusive coordinates, so the start is shifted by one and the end is kept.
# print(...) with a single argument is valid on both Python 2 and Python 3.
print(genome.get_seq(
    'chr1',
    int(ATAC_peaks_regions[100][1]) + 1,
    int(ATAC_peaks_regions[100][2])
))

# One row per peak, indexed by the first BED field of each interval.
ATAC_peaks_regions_pd = pd.DataFrame(columns=["start", "end"],
                                     index=[i[0] for i in ATAC_peaks_regions])
ATAC_peaks_regions_pd["start"] = [int(i[1]) for i in ATAC_peaks_regions]
ATAC_peaks_regions_pd["end"] = [int(i[2]) for i in ATAC_peaks_regions]
# Length is end minus start; the reverse order produced negative lengths.
ATAC_peaks_regions_pd["length"] = ATAC_peaks_regions_pd["end"] - ATAC_peaks_regions_pd["start"]
# Only displays anything in an interactive session; the result is discarded.
ATAC_peaks_regions_pd.head()

p = gg.ggplot(data=ATAC_peaks_regions_pd, aesthetics=gg.aes(x="start", y="length")) + gg.geom_point()
def query_bed_region(region, vcf_path, fasta, kmer_size, singleton_path,
                     af_path, an_path, ac_path, model_dir):
    """Report observed and expected allele statistics for one region.

    The region's reference sequence (padded by ``kmer_size // 2`` bases on
    both sides) is scored with ``QueryWindow.calculate_expected``; the
    observed values come from ``count_regional_alleles`` over the VCF records
    returned for ``str(region)``.  A ``KeyError`` or ``FetchError`` anywhere
    in that computation sets all nine fields to 0.  The row is printed in
    fixed-width form and also returned tab-separated.

    Args:
        region: Object with ``chrom``, ``start``, ``stop``, ``strand`` and a
            ``str_name()`` method returning a tab-separated name with at
            least five fields.
        vcf_path: Path handed to ``VCF``.
        fasta: Path handed to ``Fasta``.
        kmer_size: Integer k-mer length.
        singleton_path: Forwarded to ``QueryWindow``.
        af_path: Forwarded to ``QueryWindow``.
        an_path: Forwarded to ``QueryWindow``.
        ac_path: Forwarded to ``QueryWindow``.
        model_dir: Forwarded to ``QueryWindow``.

    Returns:
        str: ``region.str_name()`` followed by NumSNVs, Singletons, AC, AN,
        AF, ExpectedSingletons, ExpectedAC, ExpectedAN and ExpectedAF, joined
        by tabs and terminated by a newline.
    """
    # TODO: Add binning somehow (either keep equal size or equal number of bins
    vcf = VCF(vcf_path)
    fasta = Fasta(fasta)
    window = QueryWindow(kmer_size,
                         singleton_path=singleton_path,
                         af_path=af_path,
                         an_path=an_path,
                         ac_path=ac_path,
                         model_dir=model_dir)
    # The first kmer is centered on the first nucleotide of the region, so the
    # fetched interval is extended by half a kmer upstream and downstream.
    shift = kmer_size // 2
    try:
        use_complement = region.strand is not None and is_dash(region.strand)
        record = fasta.get_seq(region.chrom, region.start - shift,
                               region.stop + shift)
        if use_complement:
            record = record.complement
        sequence = record.seq.upper()
        # The expectation is computed on the strand-adjusted sequence.
        exp = window.calculate_expected(sequence)
        AF, AC, AN, singletons, count = count_regional_alleles(
            vcf(str(region)))
        field1 = count  # 'NumSNVs'
        field2 = singletons  # 'Singletons'
        field3 = AC  # 'AC'
        field4 = AN  # 'AN'
        field5 = AF  # 'AF'
        field6 = exp.get('singleton')  # 'ExpectedSingletons'
        field7 = exp.get('AC')  # 'ExpectedAC'
        field8 = exp.get('AN')  # 'ExpectedAN'
        field9 = exp.get('AF')  # 'ExpectedAF'
    except (KeyError, FetchError):
        # Same column order as above, all zeroed.
        field1 = field2 = field3 = field4 = field5 = 0
        field6 = field7 = field8 = field9 = 0

    regname = region.str_name().split('\t')
    print(
        '{: <8} {: <12} {: <12} {: <20} {: <8} {: <10} {: <12} {: <10} {: <10} {: <24} {: <22} {: <20} {: <20} {: <20}'.format(
            str(regname[0]), str(regname[1]), str(regname[2]),
            str(regname[3]), str(regname[4]), str(field1), str(field2),
            str(field3), str(field4), str(field5), str(field6), str(field7),
            str(field8), str(field9)),
        flush=True)
    return '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % (
        region.str_name(), str(field1), str(field2), str(field3), str(field4),
        str(field5), str(field6), str(field7), str(field8), str(field9))
class FastaHandler(object):
    """Thin convenience wrapper around a ``Fasta`` object.

    The FASTA is opened with ``sequence_always_upper=True``.  Note that the
    lookup methods differ in what they return (see each docstring).
    """

    def __init__(self, fasta_file):
        """Remember the path, announce loading on stdout and open the FASTA."""
        self.fasta_file = fasta_file
        print("Loading fasta file...please wait...")
        self.fasta_handler = Fasta(fasta_file, sequence_always_upper=True)

    def _read_whole_record(self, identifier):
        """Return the full sequence string of ``identifier``.

        Any exception raised while fetching is reported on stdout and turned
        into a ``None`` result.
        """
        try:
            record_length = len(self.fasta_handler[identifier])
            whole = self.fasta_handler.get_seq(identifier, 1, record_length)
            return whole.seq
        except Exception as e:
            print('Failed to get seq id: ' + str(identifier) + " " + str(e))
            return None

    def get_fasta_seq_by_location(self, chrom, start, end):
        """Return ``.seq`` of the slice ``[start:end]`` of record ``chrom``.

        Raises:
            ValueError: if no FASTA handler is set.
        """
        if self.fasta_handler is None:
            raise ValueError("Fasta Handler not initialized")
        return self.fasta_handler[chrom][start:end].seq

    def get_fasta_seq_by_id(self, identifier, start=None, end=None):
        """Fetch a record, or a sub-interval of it, by identifier.

        With both ``start`` and ``end`` given, the object returned by
        ``get_seq`` is handed back as-is (not its ``.seq``).  Otherwise the
        whole record's sequence string is returned, or ``None`` if fetching
        it failed.

        Raises:
            ValueError: if no FASTA handler is set.
        """
        if self.fasta_handler is None:
            raise ValueError("Fasta seq not found for id " + identifier)
        if start is None or end is None:
            return self._read_whole_record(identifier)
        return self.fasta_handler.get_seq(identifier, start, end)

    def get_sequence_by_id(self, identifier):
        """Return the whole sequence string of ``identifier``.

        Returns ``None`` when no FASTA handler is set or fetching failed.
        """
        if self.fasta_handler is None:
            return None
        return self._read_whole_record(identifier)

    def get_seq_record_by_id_location(self, identifier, start=None, end=None, strand=None):
        """Return the sequence string of a record or of a sub-interval.

        With both ``start`` and ``end`` given, ``.seq`` of the ``get_seq``
        result is returned (``None`` if ``get_seq`` returned ``None``).
        Otherwise the whole record's sequence string is returned, or ``None``
        if fetching it failed.  ``strand`` is accepted but not used.

        Raises:
            ValueError: if no FASTA handler is set.
        """
        if self.fasta_handler is None:
            raise ValueError("Fasta seq not found for id " + identifier)
        if start is None or end is None:
            return self._read_whole_record(identifier)
        seq_record = self.fasta_handler.get_seq(identifier, start, end)
        return None if seq_record is None else seq_record.seq