def scaffold_dict_init(self, file, introns):
    """Map each intron's scaffold id to its full scaffold sequence."""
    sequences_object = FastaFile(file)
    scaffold_seqs = {}  # avoid shadowing the builtin 'dict'
    for i in introns:
        if i.scaffold_id not in scaffold_seqs:
            scaffold_seqs[i.scaffold_id] = sequences_object.fetch(i.scaffold_id)
    return scaffold_seqs
def data_iter(genome_path, label_file):
    """
    Extract 1000 bp sequences with corresponding labels.

    Parameters
    ----------
    genome_path : '.genome.fa' file with genome sequence
    label_file : '.bed' file with label & location information

    Returns
    -------
    Generator of extracted 1000 bp sequences,
    format: ((region, label), sequence)
    """
    min_region_size = 1000
    genome = FastaFile(genome_path)
    for region, label in iter_peaks_and_labels(label_file):
        # create a new region exactly min_region_size basepairs long centered
        # on region (integer division keeps the coordinates ints for fetch)
        expanded_start = region[1] + (region[2] - region[1]) // 2 - min_region_size // 2
        expanded_stop = expanded_start + min_region_size
        region = (region[0], expanded_start, expanded_stop)
        yield ((region, label), genome.fetch(*region))
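# Usage sketch (illustrative, not from the original source); "genome.fa" and
# "labels.bed" are hypothetical placeholder paths.
from itertools import islice

for (region, label), seq in islice(data_iter("genome.fa", "labels.bed"), 5):
    print(region, label, len(seq))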
class FastaHandler:
    """
    Handles fasta files using the pysam API.
    """
    def __init__(self, reference_file_path):
        """
        Create a fasta file object given the path to a fasta reference file.
        :param reference_file_path: full path to a fasta reference file
        """
        self.fasta_file_path = reference_file_path
        try:
            self.fasta = FastaFile(self.fasta_file_path)
        except (IOError, ValueError) as e:
            raise IOError("FASTA FILE READ ERROR: {}".format(e))

    def get_sequence(self, chromosome_name, start, stop):
        """
        Return the sequence of a query region.
        :param chromosome_name: Chromosome name
        :param start: Region start
        :param stop: Region end
        :return: Sequence of the region
        """
        return self.fasta.fetch(region=chromosome_name, start=start, end=stop).upper()

    def get_chr_sequence_length(self, chromosome_name):
        """
        Get the sequence length of a chromosome. This is used for selecting
        windows for parallel processing.
        :param chromosome_name: Chromosome name
        :return: Length of the chromosome reference sequence
        """
        return self.fasta.get_reference_length(chromosome_name)
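# Usage sketch (illustrative, not from the original source). Assumes an
# indexed reference "ref.fa" containing a contig named "chr1"; both names
# are hypothetical placeholders.
handler = FastaHandler("ref.fa")
contig_len = handler.get_chr_sequence_length("chr1")
# fetch the first kilobase (pysam uses 0-based, half-open coordinates)
first_kb = handler.get_sequence("chr1", 0, min(1000, contig_len))
print(contig_len, first_kb[:50])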
def extract_fasta_to_file(fasta, output_dir, mode='2D_transpose_bcolz', overwrite=False):
    assert mode in _array_writer
    makedirs(output_dir, exist_ok=overwrite)
    fasta_file = FastaFile(fasta)
    file_shapes = {}
    for chrom, size in zip(fasta_file.references, fasta_file.lengths):
        data = np.zeros((size, NUM_SEQ_CHARS), dtype=np.float32)
        seq = fasta_file.fetch(chrom)
        one_hot_encode_sequence(seq, data)
        # record the transposed shape in the metadata
        file_shapes[chrom] = data.shape[::-1]
        _array_writer[mode](data, os.path.join(output_dir, chrom))
    with open(os.path.join(output_dir, 'metadata.json'), 'w') as fp:
        json.dump({'file_shapes': file_shapes,
                   'type': 'array_{}'.format(mode),
                   'source': fasta}, fp)
def __init__(self, ref_fa_path=None, vcf_path=None, idx_path=None,
             batch_size=32, bin_size=100, tie='r'):
    '''
    :param str ref_fa_path: Path to indexed reference fasta
    :param str vcf_path: Path to indexed vcf
    :param str idx_path: Path to bed-file which will contain the names and
        locations of compatible variants
    :param int batch_size: Batch size
    :param int bin_size: Length of the DNA sequences (centered on the start
        position of the variant)
    '''
    # require a pre-built index before opening, so pysam does not silently
    # build one for us
    assert os.path.isfile(ref_fa_path + '.fai'), \
        'Error: no index found for Fasta-file: {}'.format(ref_fa_path)
    self.vcf = VariantFile(vcf_path)
    self.ref = FastaFile(ref_fa_path)
    self.idx_path = idx_path
    self.batch_size = batch_size
    self.bin_size = bin_size
    assert tie in ['l', 'r']
    self.tie = tie
    if not bin_size % 2:
        # even bin size: shift by one base depending on the tie-breaking side
        self.offset = 0 if tie == 'r' else 1
    else:
        self.offset = 0
    self.n_variants = self._initialize_index()
    self._verify_refmatch()
def load_sequences_and_labels(regions_fname, genome_fa_fname, balanced):
    seqs, labels = [], []
    min_region_size = 1000
    genome = FastaFile(genome_fa_fname)
    for region, label in iter_peaks_and_labels(regions_fname):
        # create a new region exactly min_region_size basepairs long
        # centered on region
        expanded_start = region[1] + (region[2] - region[1]) // 2 - min_region_size // 2
        expanded_stop = expanded_start + min_region_size
        region = (region[0], expanded_start, expanded_stop)
        seqs.append(genome.fetch(*region))
        if label == 'promoter':
            labels.append(1)
        elif label == 'enhancer':
            labels.append(0)
        else:
            assert False, "unexpected label: {}".format(label)
    if balanced:
        # balance the data: downsample promoters to the number of enhancers
        sequences = pd.DataFrame(seqs)
        sequences['Labels'] = pd.Series(labels)
        p_seqs = sequences[sequences['Labels'] == 1].reset_index(drop=True)
        e_seqs = sequences[sequences['Labels'] == 0].reset_index(drop=True)
        p_seqs_sample = p_seqs.sample(len(e_seqs))
        balanced_seqs = pd.concat([p_seqs_sample, e_seqs]).reset_index(drop=True)
        shuffled = balanced_seqs.reindex(
            np.random.permutation(balanced_seqs.index)).reset_index(drop=True)
        return (one_hot_encode_sequences(shuffled.iloc[:, 0].to_numpy())[:, None, :, :],
                shuffled['Labels'].to_numpy())
    return one_hot_encode_sequences(seqs)[:, None, :, :], np.array(labels)
def shotgun_library(fasta_file, mu, sigma, direction=(1, -1)):
    """Generate random fragment sequences from an input fasta file.

    :param fasta_file: input fasta file.
    :param mu: mean fragment length.
    :param sigma: stdv of fragment length.
    :param direction: tuple representing direction of output sequences with
        respect to the input sequence.

    :yields: sequence fragments.

    .. note:: Could be made more efficient using buffers for random samples
        and handling cases separately.
    """
    fasta = FastaFile(fasta_file)
    seq_lens = [fasta.get_reference_length(x) for x in fasta.references]
    total_len = sum(seq_lens)
    seq_probs = [x / total_len for x in seq_lens]
    # FastaFile.fetch is slow, so read everything up front
    refs = fasta.references
    fasta = {k: fasta.fetch(k) for k in refs}

    def random_buffer(probs, size=10000):
        while True:
            buf = []
            for x, n in zip(range(len(probs)), np.random.multinomial(size, probs)):
                buf.extend([x] * n)
            np.random.shuffle(buf)
            for x in buf:
                yield x

    seq_chooser = random_buffer(seq_probs)

    # parameters for the lognormal fragment-length distribution
    mean = np.log(mu / np.sqrt(1 + sigma**2 / mu**2))
    stdv = np.sqrt(np.log(1 + sigma**2 / mu**2))

    while True:
        # choose a seq based on length
        seq_i = next(seq_chooser)
        seq = fasta[refs[seq_i]]
        seq_len = seq_lens[seq_i]
        start = np.random.randint(0, seq_len)
        frag_length = int(np.random.lognormal(mean, stdv))
        move = np.random.choice(direction)
        end = max(0, start + move * frag_length)
        start, end = sorted([start, end])
        if end - start < 2:
            # Expand a bit to ensure we grab at least one base.
            start = max(0, start - 1)
            end += 1
        frag_seq = seq[start:end]
        if move == -1:
            frag_seq = reverse_complement(frag_seq)
        yield frag_seq, refs[seq_i], start, end, '+' if move == 1 else '-'
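# Usage sketch (illustrative, not from the original source). shotgun_library
# is an infinite generator, so take a fixed number of fragments with islice;
# "reads.fa" is a hypothetical input fasta.
from itertools import islice

for frag, ref, start, end, strand in islice(
        shotgun_library("reads.fa", mu=500, sigma=50), 10):
    print(ref, start, end, strand, len(frag))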
class FastaHandler:
    """
    Handles fasta files using the pysam API.
    """
    def __init__(self, reference_file_path):
        """
        Create a fasta file object given the path to a fasta reference file.
        :param reference_file_path: full path to a fasta reference file
        """
        self.fasta_file_path = reference_file_path
        try:
            self.fasta = FastaFile(self.fasta_file_path)
        except (IOError, ValueError) as e:
            raise IOError("FASTA FILE READ ERROR: {}".format(e))

    def get_sequence(self, chromosome_name, start, stop):
        """
        Return the sequence of a query region.
        :param chromosome_name: Chromosome name
        :param start: Region start
        :param stop: Region end
        :return: Sequence of the region
        """
        return self.fasta.fetch(region=chromosome_name, start=start, end=stop).upper()

    def get_chr_sequence_length(self, chromosome_name):
        """
        Get the sequence length of a chromosome. This is used for selecting
        windows for parallel processing.
        :param chromosome_name: Chromosome name
        :return: Length of the chromosome reference sequence
        """
        return self.fasta.get_reference_length(chromosome_name)

    def get_contig_names(self):
        return self.fasta.references

    def get_ref_of_region(self, contig, site):
        """
        Return a string containing the reference sequence of a site.
        :param contig: Contig [ex chr3]
        :param site: Site [ex 100000-200000]
        :return: (sequence, error flag)
        """
        ret_val = ""
        error_val = 0
        try:
            ret_val = self.fasta.fetch(region=contig + site).upper()
        except (IOError, ValueError, KeyError):
            print("ERROR IN REF FETCH: ", contig, site)
            error_val = 1
        return ret_val, error_val

    def close(self):
        self.fasta.close()
def fasta_extract_exons(fasta_file, database_file, output, raw=False):
    start = time.time()

    if isinstance(fasta_file, FastaFile):
        fasta = fasta_file
    else:
        fasta_file = g2g_fu.check_file(fasta_file)
        fasta = FastaFile(fasta_file)

    database_file = g2g_fu.check_file(database_file)

    fasta_out = sys.stdout
    if output:
        output = g2g_fu.check_file(output, 'w')
        fasta_out = open(output, "w")

    LOG.info("FASTA FILE: {0}".format(fasta.filename))
    LOG.info("DATABASE FILE: {0}".format(database_file))
    LOG.info("OUTPUT FILE: {0}".format(fasta_out.name))

    try:
        transcripts = get_transcripts_simple(database_file)
        for i, transcript in enumerate(transcripts):
            if transcript.seqid not in fasta.references:
                continue

            for ensembl_id, exon in transcript.exons.items():
                LOG.debug("Exon={0}".format(exon))

                partial_seq = fasta.fetch(exon.seqid, exon.start - 1, exon.end)
                partial_seq_str = partial_seq

                if transcript.strand == -1:
                    partial_seq_str = str(reverse_complement_sequence(partial_seq))

                LOG.debug("{0}:{1}-{2} (Length: {3})\n{4}".format(
                    exon.seqid, exon.start, exon.end, len(partial_seq), partial_seq_str))

                if raw:
                    fasta_out.write(partial_seq_str)
                else:
                    fasta_id = ">{0} {1}:{2}-{3}\n".format(
                        exon.ensembl_id, exon.seqid, exon.start, exon.end)
                    fasta_out.write(fasta_id)

                    for line in wrap_sequence(partial_seq_str):
                        fasta_out.write(line.strip())
                        fasta_out.write('\n')
    except G2GValueError as e:
        LOG.info(e.msg.rstrip())
        raise e
    except G2GFastaError as e:
        LOG.info(e.msg.rstrip())
        raise e

    LOG.info("Execution complete: {0}".format(format_time(start, time.time())))
def count_callable(callable_file, chrom, start=None, end=None):
    callable_file = FastaFile(callable_file)
    seq = callable_file.fetch(chrom, start, end)
    chrom_length = len(seq)
    callable_sites = seq.count('0')
    return chrom_length, callable_sites
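# Usage sketch (illustrative, not from the original source). The "callable"
# fasta is assumed to encode per-base callability as a character mask whose
# '0' characters are counted; "callable.fa" and "chr1" are placeholders.
length, callable_sites = count_callable("callable.fa", "chr1")
print(length, callable_sites)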
def _extract(self, intervals, out, **kwargs):
    fasta = FastaFile(self._datafile)
    for index, interval in enumerate(intervals):
        seq = fasta.fetch(str(interval.chrom), interval.start, interval.stop)
        out[index, :, :, 0] = one_hot_encode_sequence(seq)
    return out
class IndexedFasta(DataSource):
    name = "indexed_fasta"
    version = "0.1.0"
    container = "python"
    partition_access = True
    description = "A bgzipped and indexed fasta file"

    def __init__(self, urlpath, metadata=None):
        self._urlpath = urlpath
        self._dataset = None
        self._dtype = None
        self._chroms = None
        super().__init__(metadata=metadata)

    def _open_dataset(self):
        self._dataset = FastaFile(self._urlpath)

    def _get_schema(self):
        if self._dataset is None:
            self._open_dataset()
        self._chroms = list(self._dataset.references)
        chrom_lengths = [{"chrom": t[0], "length": t[1]}
                         for t in zip(self._dataset.references, self._dataset.lengths)]
        return Schema(
            datashape=None,
            dtype=None,
            shape=None,
            npartitions=len(self._chroms),
            extra_metadata={"chroms": chrom_lengths},
        )

    def _get_partition(self, i):
        chrom = self._chroms[i]
        return [{"seqid": chrom, "seq": self._dataset.fetch(chrom)}]

    def read_chunked(self):
        self._load_metadata()
        for i in range(self.npartitions):
            yield self._get_partition(i)

    def to_dask(self):
        from dask import bag as db
        self._load_metadata()
        return db.from_delayed([
            dask.delayed(self._get_partition(i))
            for i in range(self.npartitions)
        ])

    def _close(self):
        # close any files, sockets, etc.
        if self._dataset is not None:
            self._dataset.close()
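# Usage sketch (illustrative, not from the original source). Streams the
# partitions, one contig per partition; "genome.fa.gz" is a hypothetical
# bgzipped, faidx-indexed fasta path.
source = IndexedFasta("genome.fa.gz")
for partition in source.read_chunked():
    for record in partition:
        print(record["seqid"], len(record["seq"]))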
def shotgun_library(fasta_file, mu, sigma, direction=(1, -1)):
    """Generate random fragment sequences from an input fasta file.

    :param fasta_file: input fasta file.
    :param mu: mean fragment length.
    :param sigma: stdv of fragment length.
    :param direction: tuple representing direction of output sequences with
        respect to the input sequence.

    :yields: sequence fragments.

    .. note:: Could be made more efficient using buffers for random samples
        and handling cases separately.
    """
    fasta = FastaFile(fasta_file)
    seq_lens = [fasta.get_reference_length(x) for x in fasta.references]
    total_len = sum(seq_lens)
    seq_probs = [x / total_len for x in seq_lens]
    # FastaFile.fetch is slow, so read everything up front
    refs = fasta.references
    fasta = {k: fasta.fetch(k) for k in refs}

    def random_buffer(probs, size=10000):
        while True:
            buf = []
            for x, n in zip(range(len(probs)), np.random.multinomial(size, probs)):
                buf.extend([x] * n)
            np.random.shuffle(buf)
            for x in buf:
                yield x

    seq_chooser = random_buffer(seq_probs)

    while True:
        # choose a seq based on length
        seq_i = next(seq_chooser)
        seq = fasta[refs[seq_i]]
        seq_len = seq_lens[seq_i]
        start = np.random.randint(0, seq_len)
        frag_length = int(np.random.normal(mu, sigma))
        move = np.random.choice(direction)
        end = max(0, start + move * frag_length)
        start, end = sorted([start, end])
        if end - start < 2:
            # Expand a bit to ensure we grab at least one base.
            start = max(0, start - 1)
            end += 1
        frag_seq = seq[start:end]
        if move == -1:
            frag_seq = reverse_complement(frag_seq)
        yield frag_seq
def _chrom_sizes(fasta_file):
    """Get the chromosome sizes for a fasta file"""
    from pysam import FastaFile
    fa = FastaFile(fasta_file)
    chrom_lens = OrderedDict([(name, l) for name, l in zip(fa.references, fa.lengths)])
    if len(chrom_lens) == 0:
        raise ValueError(f"no chromosomes found in fasta file: {fasta_file}. "
                         "Make sure the file path is correct and that the fasta "
                         f"index file {fasta_file}.fai is up to date")
    fa.close()
    return chrom_lens
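# Usage sketch (illustrative, not from the original source); "genome.fa" is a
# hypothetical indexed fasta path.
for chrom, length in _chrom_sizes("genome.fa").items():
    print(chrom, length)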
def __init__(self, reference_file_path):
    """
    Create a fasta file object given the path to a fasta reference file.
    :param reference_file_path: full path to a fasta reference file
    """
    self.fasta_file_path = reference_file_path
    try:
        self.fasta = FastaFile(self.fasta_file_path)
    except (IOError, ValueError) as e:
        raise IOError("FASTA FILE READ ERROR: {}".format(e))
def main(): min_region_size = 1000 genome = FastaFile("GRCh38.genome.fa") for region, label in iter_peaks_and_labels(sys.argv[1]): # create a new region exactly min_region_size basepairs long centered on # region expanded_start = region[1] + (region[2] - region[1])/2 - min_region_size/2 expanded_stop = expanded_start + min_region_size region = (region[0], expanded_start, expanded_stop) print region, label print genome.fetch(*region) return
def generate_homopolymer_plots(bed_file, fasta_file, bam_file):
    bed_file_records = open(bed_file, 'r')
    # open the alignment and assembly files once, outside the record loop
    samfile = pysam.AlignmentFile(bam_file, "rb")
    assembly_fasta_file = FastaFile(fasta_file)
    for line in bed_file_records:
        contig, start_pos, end_pos = line.rstrip().split('\t')
        start_pos = int(start_pos)
        end_pos = int(end_pos)
        if start_pos < 1000:
            continue
        if end_pos - start_pos > 50:
            continue
        reference_sequence = assembly_fasta_file.fetch(
            reference=contig, start=start_pos, end=start_pos + 200)

        # measure the homopolymer run in the reference
        reference_homopolymer_index_start = 1
        reference_homopolymer_index_end = 1
        homopolymer_base = reference_sequence[reference_homopolymer_index_start]
        while (reference_homopolymer_index_end < len(reference_sequence) and
               reference_sequence[reference_homopolymer_index_end] == homopolymer_base):
            reference_homopolymer_index_end += 1
        reference_homopolymer_length = (reference_homopolymer_index_end -
                                        reference_homopolymer_index_start)

        # measure the corresponding homopolymer run in each overlapping read
        all_reads = samfile.fetch(contig, start_pos - 1, end_pos)
        read_homopolymers = []
        for read in all_reads:
            aligned_pairs = read.get_aligned_pairs()
            start_index = 0
            for index, position in aligned_pairs:
                if index is None:
                    continue
                if position == start_pos:
                    start_index = index + 1
                    break
            if read.query_sequence is None:
                continue
            if start_index == len(read.query_sequence):
                continue
            homopolymer_base = read.query_sequence[start_index]
            end_index = start_index
            while (end_index < len(read.query_sequence) and
                   read.query_sequence[end_index] == homopolymer_base):
                end_index += 1
            read_homopolymers.append(end_index - start_index)

        print(contig + "\t" + str(start_pos) + "\t" + str(end_pos) + "\t" +
              str(reference_homopolymer_length) + "\t" +
              ','.join([str(x) for x in read_homopolymers]))
def __init__(self, datafile, use_strand=False, **kwargs):
    """Fasta file extractor

    NOTE: The extractor is not thread-safe. If you wish to use it with
    multiprocessing, create a new extractor object in each process.

    Args:
      datafile (str): path to the fasta file
      use_strand (bool): if True, the extracted sequence is reverse
        complemented in case interval.strand == "-"
    """
    super(FastaExtractor, self).__init__(datafile, **kwargs)
    self.use_strand = use_strand
    self.fasta = FastaFile(self._datafile)
def close(self):
    if self._fh:
        self._fh.close()
        self._fh = None
        subprocess.check_call([self._bgzip_exe, "--force", self._basepath])
        os.rename(self._basepath + ".gz", self.filename)

        # open file with FastaFile to create indexes, then make all read-only
        _fh = FastaFile(self.filename)
        _fh.close()
        os.chmod(self.filename, stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH)
        os.chmod(self.filename + ".fai", stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH)
        os.chmod(self.filename + ".gzi", stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH)

        logger.info("{} written; added {} sequences".format(
            self.filename, len(self._added)))
def main():
    min_region_size = 1000
    genome = FastaFile("./genome/GRCh38.genome.fa")
    train_path = "./train_data/"
    list_dir = os.listdir(train_path)
    for filename in list_dir:
        for region, label in iter_peaks_and_labels(train_path + filename):
            # create a new region exactly min_region_size basepairs long
            # centered on region
            expanded_start = region[1] + (region[2] - region[1]) // 2 - min_region_size // 2
            expanded_stop = expanded_start + min_region_size
            region = (region[0], expanded_start, expanded_stop)
            print(genome.fetch(*region), label)
def data_generator_pysam(my_args, name, start, stop, is_bulk):
    fasta_file = FastaFile(my_args.fasta)
    ref = fasta_file.fetch(name, start, stop)

    my_arg = {
        'fastafile': fasta_file,
        'stepper': 'samtools',
        'adjust_capq_threshold': 50,
        'contig': name,
        'start': start,
        'stop': stop,
        'min_mapping_quality': 0 if is_bulk else 20,
        'min_base_quality': 13,
    }
    if is_bulk:
        bam_file = AlignmentFile(my_args.bulk, 'rb')
    else:
        bam_file = AlignmentFile(my_args.bam, 'rb')

    read_bases_list = []
    for pileup_column in bam_file.pileup(**my_arg):
        pos = pileup_column.reference_pos
        if pos >= stop:
            break
        if pos < start:
            continue

        read_bases_list = pileup_column.get_query_sequences(
            mark_matches=True, mark_ends=True, add_indels=True)
        read_bases = ''.join(read_bases_list).upper()

        n = pileup_column.get_num_aligned()
        if n == 0:
            read_bases = '*'
            base_q = '*'
            map_q = '*'
        else:
            base_q = ''.join([chr(int(i) + PHREDSCORE)
                              for i in pileup_column.get_query_qualities()])
            map_q = ''.join([chr(int(i) + PHREDSCORE)
                             for i in pileup_column.get_mapping_qualities()])

        yield [name, pos, ref[pos - start], str(n), read_bases, base_q, map_q]
    yield None
class FastaExtractor(BaseExtractor):

    def __init__(self, datafile, use_strand=False, **kwargs):
        """Fasta file extractor

        NOTE: The extractor is not thread-safe. If you wish to use it with
        multiprocessing, create a new extractor object in each process.

        Args:
          datafile (str): path to the fasta file
          use_strand (bool): if True, the extracted sequence is reverse
            complemented in case interval.strand == "-"
        """
        super(FastaExtractor, self).__init__(datafile, **kwargs)
        self.use_strand = use_strand
        self.fasta = FastaFile(self._datafile)

    def _extract(self, intervals, out, **kwargs):
        for index, interval in enumerate(intervals):
            seq = self.fasta.fetch(str(interval.chrom), interval.start, interval.stop)
            one_hot_encode_sequence(seq, out[index, :, :])

            # reverse-complement the sequence on the negative strand
            if self.use_strand and interval.strand == "-":
                out[index, :, :] = out[index, ::-1, ::-1]

        return out

    @staticmethod
    def _get_output_shape(num_intervals, width):
        return (num_intervals, width, NUM_SEQ_CHARS)
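# Usage sketch (illustrative, not from the original source). The extractor
# fills a pre-allocated (num_intervals, width, 4) one-hot array; Interval is a
# hypothetical stand-in for a pybedtools-style interval, "genome.fa" a
# placeholder path, and the surrounding module's BaseExtractor and
# one_hot_encode_sequence are assumed importable.
import numpy as np
from collections import namedtuple

Interval = namedtuple("Interval", ["chrom", "start", "stop", "strand"])
intervals = [Interval("chr1", 0, 100, "+"), Interval("chr1", 200, 300, "-")]
extractor = FastaExtractor("genome.fa", use_strand=True)
out = np.zeros(FastaExtractor._get_output_shape(len(intervals), 100))
onehot = extractor._extract(intervals, out)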
def split_variants_to_files(vcf_file, genome_file, bi_file, multi_file):
    vcf = VariantFile(vcf_file)
    genome = FastaFile(genome_file)
    vcf.header.add_line('##INFO=<ID=multi,Number=0,Type=Flag,'
                        'Description="Variant with multiple alleles">')
    vcf.header.add_line('##INFO=<ID=duplicated,Number=0,Type=Flag,'
                        'Description="Duplicated in position">')
    vcf.header.formats.add('GT', 1, 'String', "Genotype")
    vcf.header.add_sample("Genotype")
    with open(bi_file, 'wt') as outbi, open(multi_file, 'wt') as outmu:
        outbi.write(str(vcf.header))
        outmu.write(str(vcf.header))
        for multi_alleles, duplicated, record in iter_wanted_variants(vcf, genome):
            record = record_to_string(record) + ['GT', '0/1']
            record = '\t'.join(record)
            if duplicated:
                continue
            if multi_alleles:
                outmu.write(record + '\n')
            else:
                outbi.write(record + '\n')
def initWorker(localWindowSize, fastaFile, k, N, M):
    global FA, windowSize, kSize, useN, method
    windowSize = localWindowSize
    FA = FastaFile(fastaFile)
    kSize = k
    useN = N
    method = M
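# Usage sketch (illustrative, not from the original source). initWorker is
# shaped like a multiprocessing Pool initializer: each worker process opens
# its own FastaFile handle, since pysam handles cannot be pickled and shared.
# All argument values below are hypothetical placeholders.
from multiprocessing import Pool

pool = Pool(processes=4,
            initializer=initWorker,
            initargs=(10000, "genome.fa", 21, False, "kmer"))
pool.close()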
def method2(basefl):
    fa = FLAGS.input + ".feature.fa"
    loader = FastaFile(fa)
    fl1 = FLAGS.input + ".feature.tsv"
    output = open("%s/20bp.fa" % (basefl), "w")
    for i in open(fl1, "r"):
        ele = i.rstrip().split()
        ids, pos = ele[0].split("|")[:-1]
        pos = int(pos)
        try:
            seq = loader.fetch(ids, pos - 30, pos + 30)
            output.write(">%s|%s\n%s\n" % (ids, pos, seq))
        except (KeyError, ValueError):
            print("ids %s %s, error" % (ids, pos))
    output.close()
    align_hisat2()
def _chrom_names(fasta_file):
    """Get the list of chromosome names from a fasta file"""
    from pysam import FastaFile
    with FastaFile(fasta_file) as fa:
        chroms = list(fa.references)
    return chroms
def split_variants(vcf_file, genome_file):
    vcf = VariantFile(vcf_file)
    genome = FastaFile(genome_file)
    vcf.header.add_line('##INFO=<ID=multi,Number=0,Type=Flag,'
                        'Description="Variant with multiple alleles">')
    vcf.header.add_line('##INFO=<ID=duplicated,Number=0,Type=Flag,'
                        'Description="Duplicated in position">')
    vcf.header.formats.add('GT', 1, 'String', "Genotype")
    vcf.header.add_sample("Genotype")
    print(vcf.header, end='')
    for multi_alleles, duplicated, record in iter_wanted_variants(vcf, genome):
        record = record_to_string(record) + ['GT', '0/1']
        if multi_alleles:
            # prepend a separator only when the field already has content
            record[6] += ";multi" if record[6] else "multi"
        if duplicated:
            record[6] += ";duplicated" if record[6] else "duplicated"
        record = '\t'.join(record)
        print(record)
def main():
    genome_fname = sys.argv[1]
    regions_fname = sys.argv[2]
    genome = FastaFile(genome_fname)
    print("Loaded genome")
    motifs = load_all_motifs()
    tfname_id_map = load_tfname_tfid_mapping()
    print("Loaded Motifs")
    with open(regions_fname) as fp:
        regions = load_regions_in_bed(fp)
    print("Loaded regions")

    # build the binary peak-overlap matrix
    with open(os.path.basename(regions_fname) + ".peaks.txt", "w") as ofp:
        ofp.write("\t".join(["region".ljust(30), ] +
                            [motif.name for motif in motifs]) + "\n")
        ofp.write("\t".join(["region".ljust(30), ] +
                            [motif.factor for motif in motifs]) + "\n")
        for i, region in enumerate(regions):
            print(i, region)
            overlapping_peaks = load_overlapping_peaks(*region)
            motif_overlap_scores = []
            for motif in motifs:
                if motif.factor in overlapping_peaks:
                    motif_overlap_scores.append(1)
                else:
                    motif_overlap_scores.append(0)
            ofp.write("%s\t%s\n" % (
                "_".join(map(str, region)).ljust(30),
                "\t".join("%i" % motif_overlap
                          for motif_overlap in motif_overlap_scores)))
    print("Finished building peak overlap matrix")

    # build the motif score matrix
    with open(os.path.basename(regions_fname) + ".TFscores.txt", "w") as ofp:
        ofp.write("\t".join(["region".ljust(30), ] +
                            [motif.name for motif in motifs]) + "\n")
        ofp.write("\t".join(["region".ljust(30), ] +
                            [motif.factor for motif in motifs]) + "\n")
        for i, region in enumerate(regions):
            if i % 100 == 0:
                print(i, len(regions), os.path.basename(regions_fname))
            seq = genome.fetch(*region).upper()
            try:
                scores = score_region(motifs, seq)
            except Exception:
                continue
            ofp.write("%s\t%s\n" % (
                "_".join(map(str, region)).ljust(30),
                "\t".join("%.4f" % score for score in scores)))
    print("Finished building score matrix")
def extract_fasta_to_file(fasta, output_dir, overwrite):
    """
    Returns a compressed version of a fasta file as a quickly accessible
    memory map.

    Args:
        fasta: fasta file to be converted
        output_dir: output directory for the memory map location
        overwrite: boolean - whether to overwrite the current memory map
    """
    # at most two passes: one to read existing metadata, one to (re)create it
    for i in [0, 1]:
        if overwrite:
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)
            fasta_file = FastaFile(fasta)
            file_shapes = {}
            for chrom, size in zip(fasta_file.references, fasta_file.lengths):
                seq = fasta_file.fetch(chrom)
                data = one_hot_encode_sequence(seq)
                file_shapes[chrom] = data.shape
                bcolz.carray(data, rootdir=os.path.join(output_dir, chrom),
                             cparams=_blosc_params, mode='w').flush()
            mode = '2D_transpose_bcolz'
            metadata = {
                'file_shapes': file_shapes,
                'type': 'array_{}'.format(mode),
                'extractor': 'CompressedFastaExtractor',
                'source': fasta
            }
            with open(os.path.join(output_dir, 'metadata.json'), 'w') as fp:
                json.dump(metadata, fp)
            overwrite = False
        else:
            try:
                with open(os.path.join(output_dir, 'metadata.json'), 'r') as fp:
                    metadata = json.load(fp)
                break
            except IOError as e:
                print("I/O error({0}): {1} for {2}".format(
                    e.errno, e.strerror, output_dir))
                print("There is a problem with opening the metadata. "
                      "Recreating the mmap files and overwriting...")
                overwrite = True
    return metadata
def get_contig_list_from_fasta(fasta_path, with_length=False):
    """Obtain a list of contigs from a fasta file; all alternative contigs
    are pooled into the string MISC_ALT_CONTIGS_SCMO.

    Args:
        fasta_path (str or pysam.FastaFile): Path or handle to fasta file
        with_length (bool): also return a list of contig lengths

    Returns:
        contig_list (list): List of contigs + ['MISC_ALT_CONTIGS_SCMO'] if
        any alt contig is present in the fasta file
    """
    contig_list = []
    has_alt = False
    if with_length:
        lens = []

    if isinstance(fasta_path, str):
        fa = FastaFile(fasta_path)
    elif isinstance(fasta_path, FastaFile):
        fa = fasta_path
    else:
        raise TypeError('Supply pysam.FastaFile or str')

    for reference, length in zip(fa.references, fa.lengths):
        if is_main_chromosome(reference):
            contig_list.append(reference)
            if with_length:
                lens.append(length)
        else:
            has_alt = True

    # Close the handle if we just opened one
    if isinstance(fasta_path, str):
        fa.close()

    if has_alt:
        contig_list.append('MISC_ALT_CONTIGS_SCMO')
        if with_length:
            lens.append(None)

    if with_length:
        return contig_list, lens
    return contig_list
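# Usage sketch (illustrative, not from the original source); "genome.fa" is a
# hypothetical indexed fasta. Any alt contigs collapse into the single
# MISC_ALT_CONTIGS_SCMO entry, whose length is reported as None.
contigs, lengths = get_contig_list_from_fasta("genome.fa", with_length=True)
for contig, length in zip(contigs, lengths):
    print(contig, length)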
def load_sequences_and_labels(regions_fname, genome_fa_fname):
    seqs, labels = [], []
    min_region_size = 1000
    genome = FastaFile(genome_fa_fname)
    for region, label in iter_peaks_and_labels(regions_fname):
        # create a new region exactly min_region_size basepairs long
        # centered on region
        expanded_start = region[1] + (region[2] - region[1]) // 2 - min_region_size // 2
        expanded_stop = expanded_start + min_region_size
        region = (region[0], expanded_start, expanded_stop)
        seqs.append(genome.fetch(*region))
        if label == 'promoter':
            labels.append(1)
        elif label == 'enhancer':
            labels.append(0)
        else:
            assert False, "unexpected label: {}".format(label)
    return one_hot_encode_sequences(seqs)[:, None, :, :], np.array(labels)
def extract_fasta_to_npy(fasta, output_dir):
    fasta_file = FastaFile(fasta)
    file_shapes = {}
    for chrom, size in zip(fasta_file.references, fasta_file.lengths):
        data = np.empty((NUM_SEQ_CHARS, size), dtype=np.float32)
        seq = fasta_file.fetch(chrom)
        one_hot_encode_sequence(seq, data)
        np.save('{}.npy'.format(os.path.join(output_dir, chrom)), data)
        file_shapes[chrom] = data.shape
    with open(os.path.join(output_dir, 'metadata.json'), 'w') as fp:
        json.dump({'file_shapes': file_shapes,
                   'type': 'array',
                   'source': fasta}, fp)
def __init__(self, reference_file_path):
    """
    Create a fasta file object given the path to a fasta reference file.
    :param reference_file_path: full path to a fasta reference file
    """
    self.fasta_file_path = reference_file_path
    assert os.path.exists(reference_file_path), \
        "Reference path does not exist: {}".format(reference_file_path)
    try:
        self.fasta = FastaFile(self.fasta_file_path)
    except Exception as e:
        print(e)
        raise IOError(
            "Fasta File Read Error: Try indexing reference with "
            "'samtools faidx {}'".format(reference_file_path))
def generate_header(reference_fa: str, tag: str) -> VariantHeader:
    """
    Generates the header for the minimal VCF.

    :param reference_fa: Path to reference fasta file.
    :param tag: The filter tag to use.
    """
    header = VariantHeader()
    header.filters.add(tag, None, None, "Failed dToxoG")
    fasta = FastaFile(reference_fa)
    try:
        for contig in fasta.references:
            header.contigs.add(contig, length=fasta.get_reference_length(contig))
    finally:
        fasta.close()
    return header
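# Usage sketch (illustrative, not from the original source); "ref.fa" is a
# hypothetical indexed reference and "dtoxog" an arbitrary filter tag name.
header = generate_header("ref.fa", "dtoxog")
print(str(header))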
def extract_seq(interval, variant, fasta_file, one_hot=False):
    """
    Note: in case the variant is an indel, the anchor point at the
    beginning is used.

    Args:
        interval: pybedtools.Interval where to extract the sequence from
        variant: Variant class with attributes: chr, pos, ref, alt
        fasta_file: file path or pysam.FastaFile instance
        one_hot: if True, one-hot-encode the output sequence

    Returns:
        sequence
    """
    if isinstance(fasta_file, str):
        from pysam import FastaFile
        fasta_file = FastaFile(fasta_file)

    if variant is not None and variant.pos - 1 >= interval.start and variant.pos <= interval.stop:
        inside = True
        lendiff = len(variant.alt) - len(variant.ref)
    else:
        inside = False
        lendiff = 0
    seq = fasta_file.fetch(str(interval.chrom), interval.start, interval.stop - lendiff)

    if not inside:
        out = seq
    else:
        # now, mutate the sequence
        pos = variant.pos - interval.start - 1
        expect_ref = seq[pos:(pos + len(variant.ref))]
        if expect_ref != variant.ref:
            raise ValueError(
                f"Expected reference: {expect_ref}, observed reference: {variant.ref}")
        # Anchor at the beginning
        out = seq[:pos] + variant.alt + seq[(pos + len(variant.ref)):]
    # sequence length has to be correct at the end
    assert len(out) == interval.stop - interval.start
    if one_hot:
        out = encodeDNA([out.upper()])[0]
    return out
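# Usage sketch (illustrative, not from the original source). Interval and
# Variant are hypothetical stand-ins carrying the attributes extract_seq
# expects; "genome.fa" is a placeholder path, and the variant's ref base must
# actually match the reference at that position.
from collections import namedtuple

Interval = namedtuple("Interval", ["chrom", "start", "stop"])
Variant = namedtuple("Variant", ["chr", "pos", "ref", "alt"])

interval = Interval("chr1", 100, 200)
variant = Variant("chr1", 150, "A", "T")  # a SNV inside the interval
seq = extract_seq(interval, variant, "genome.fa")
assert len(seq) == 100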
def main(args):
    sample_name = extract_sample_name(args.input_path)
    with open(args.input_path) as cnv_input, \
            FastaFile(args.genome_ref) as genome_ref, \
            open(args.output_path, 'w') as vcf_output:
        is_full_chrom_name = genome_ref.references[0].startswith('chr')
        cnv_reader = csv.DictReader(cnv_input, delimiter='\t')
        vcf_output.write('\n'.join(get_vcf_headers(sample_name, genome_ref)) + '\n')
        for cnv_line in cnv_reader:
            vcf_line = get_vcf_line(cnv_line, genome_ref, is_full_chrom_name)
            vcf_output.write(vcf_line + '\n')
def __call__(self, intervals, to_mirror=None, **kwargs):
    NUM_SEQ_CHARS = 4
    fasta = FastaFile(self._datafile)

    width = intervals[0].stop - intervals[0].start
    data = np.zeros((len(intervals), 1, NUM_SEQ_CHARS, width))

    for index, interval in enumerate(intervals):
        seq = fasta.fetch(str(interval.chrom), interval.start, interval.stop)
        one_hot_encode_sequence(seq, data[index, 0, :, :])

    # This is performing a reverse complement operation
    if to_mirror is not None:
        for index, mirror in enumerate(to_mirror):
            if mirror:
                data[index, :, :, :] = data[index, :, ::-1, ::-1]

    return data
def vcf2chain(input_file, fasta_file, strain, output_file, vcf_keep=False,
              passed=False, quality=False, diploid=False):
    """
    Convert a VCF file into a chain file for the given strain.

    :param input_file: indexed VCF file
    :param fasta_file: reference fasta file
    :param strain: sample name in the VCF
    :param output_file: chain file to write
    :param vcf_keep: write discarded VCF records to a side file
    :param passed: only use records passing the FILTER field
    :param quality: apply the quality filter
    :param diploid: treat the sample as diploid
    :return:
    """
    start = time.time()

    input_file = g2g_fu.check_file(input_file)
    fasta_file = g2g_fu.check_file(fasta_file)

    if not strain:
        raise G2GValueError("No strain was specified.")

    output_file = g2g_fu.check_file(output_file, 'w')
    output_file_dir = os.path.dirname(output_file)

    LOG.info("VCF FILE: {0}".format(input_file))
    LOG.info("FASTA FILE: {0}".format(fasta_file))
    LOG.info("CHAIN FILE: {0}".format(output_file))

    vcf_discard_file = None
    if vcf_keep:
        vcf_discard_file = "{0}.errors.vcf".format(os.path.basename(input_file))
        vcf_discard_file = os.path.join(output_file_dir, vcf_discard_file)
        LOG.info("VCF DISCARD FILE: {0}".format(vcf_discard_file))

    LOG.info("STRAIN: {0}".format(strain))
    LOG.info("PASS FILTER ON: {0}".format(str(passed)))
    LOG.info("QUALITY FILTER ON: {0}".format(str(quality)))
    LOG.info("DIPLOID: {0}".format(str(diploid)))

    if not isinstance(fasta_file, FastaFile):
        fasta_file = FastaFile(fasta_file)

    tb = TabixFile(input_file)

    sample_index = None
    for h in tb.header:
        if h[:6] == '#CHROM':
            try:
                elems = h.split('\t')
                samples = elems[9:]
                samples = dict(zip(samples, range(len(samples))))
                sample_index = samples[strain]
            except KeyError:
                raise G2GVCFError(
                    "Unknown strain '{0}', valid strains are: {1}".format(
                        strain, ", ".join(samples)))
def main():
    min_region_size = 1000
    cell_type = "combination"
    genome = FastaFile("/srv/scratch/zho/GRCh38.genome.fa")
    k = 8  # k in k-mer
    sequence_list = []
    labels_list = []
    attributes_map = get_attributes_map(['A', 'C', 'G', 'T'], k)
    for region, label in iter_peaks_and_labels(sys.argv[1]):
        # create a new region exactly min_region_size basepairs long
        # centered on region
        expanded_start = region[1] + (region[2] - region[1]) // 2 - min_region_size // 2
        if expanded_start < 0:
            expanded_start = 0
        expanded_stop = expanded_start + min_region_size
        region = (region[0], expanded_start, expanded_stop)
        print(region, label)
        # note: 1 = promoter, 0 = enhancer
        if label == "promoter":
            labels_list.append(1)
        else:
            labels_list.append(0)
        print(genome.fetch(*region))
        sequence_list.append(genome.fetch(*region))
    sequence_series = pd.Series(sequence_list)
    X = createAttributeMatrix(sequence_series, k, attributes_map)
    X.to_csv("/srv/scratch/zho/" + str(min_region_size) + "_" + cell_type +
             "_" + str(k) + "mer_train_matrix.csv")
    labels_series = pd.Series(labels_list)
    labels_series.to_csv("/srv/scratch/zho/" + str(min_region_size) + "_" +
                         cell_type + "_" + str(k) + "mer_output_vector.csv")
def test_vutil_homoRunForOneVariant():
    assert_equal(vutil._calHrunSize('tcggg'), 0)
    assert_equal(vutil._calHrunSize('ttcggg'), 2)
    assert_equal(vutil._calHrunSize('AATTGAGACTACAGAGCAAC'), 2)
    assert_equal(vutil._calHrunSize('ACTCACAGGTTTTATAAAAC'[::-1]), 0)

    fa = FastaFile('tests/data/ex1.fa')
    vcf_readers = vcreader(['tests/data/ex1.vcf.gz'])

    varlist = vcf_readers.variants(chrom='chr1', nosnp=False)
    vutil.homoRunForOneVariant(fa, varlist[0])

    varlist = vcf_readers.variants(chrom='chr2', nosnp=False)
    assert_equal(784, varlist[2].POS)
    assert_equal('ACTCACAGGTTTTATAAAAC',
                 fa.fetch('chr2', varlist[2].POS - 20, varlist[2].POS))
    assert_equal('AATTGAGACTACAGAGCAAC',
                 fa.fetch('chr2', varlist[2].POS, varlist[2].POS + 20))
    assert_equal('ACTCACAGGTTTTATAAAACAATTGAGACTACAGAGCAAC',
                 fa.fetch('chr2', varlist[2].POS - 20, varlist[2].POS + 20))

    hr = vutil.homoRunForOneVariant(fa, varlist[2])
    assert_equal(2, hr)

    varlist[2].POS = fa.get_reference_length('chr2')
    hr = vutil.homoRunForOneVariant(fa, varlist[2])
    assert_equal(0, hr)
class FabgzReader(object):
    def __init__(self, filename):
        self._fh = FastaFile(filename)

    def fetch(self, seq_id, start=None, end=None):
        return self._fh.fetch(seq_id.encode("ascii"), start, end)

    def keys(self):
        return self._fh.references

    def __len__(self):
        return self._fh.nreferences

    def __getitem__(self, ac):
        return self.fetch(ac)

    @property
    def filename(self):
        return self._fh.filename
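# Usage sketch (illustrative, not from the original source); "seqs.fa.bgz" is
# a hypothetical bgzip-compressed, indexed fasta written by a matching writer.
reader = FabgzReader("seqs.fa.bgz")
print(len(reader), list(reader.keys())[:5])
seq_id = next(iter(reader.keys()))
print(reader[seq_id][:50])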
ac_dict = {}
af_dict = {}
sites_dict = {}
bed_dict = {}
ac_list = []
af_list = []
sites_list = []

if args.bed[-3:] == '.gz':
    bed_file = gzip.open(args.bed, 'rt')
elif args.bed[-4:] == '.bed':
    bed_file = open(args.bed, 'r')
else:
    sys.exit("\nIs this a bed file? Is it compressed?\n")

callable_f = FastaFile(args.callable)

for rec in bed_file:
    col = rec.split()
    if ';' in col[4]:
        feature_name = get_feature_name(col[4])
    else:
        feature_name = col[4]
    if feature_name not in bed_dict:
        bed_dict[feature_name] = []
    bed_dict[feature_name].append((col[0], int(col[1]), int(col[2])))

with open(args.outfile, 'w') as outfile:
    print('Population', 'Chromosome', 'Feature_name', 'Feature_type',
          'Sites', 'S', 'thetaW', 'pi', 'tajd', sep='\t',
# Compute confusion matrix
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
# Standardize features by removing the mean and scaling to unit variance
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold
# Grid-search Random Forest parameters
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

min_region_size = 1000
genomeDirectory = './genome/'
dataDirectory = './train_data/'
genome = FastaFile("./genome/GRCh38.genome.fa")
dataFiles = ['E114.bed', 'E116.bed', 'E117.bed', 'E118.bed', 'E119.bed']
# , 'E120.bed', 'E121.bed', 'E122.bed', 'E123.bed', 'E124.bed',
# 'E126.bed', 'E127.bed', 'E128.bed', 'E129.bed'
c = 0
regions = []
labels = []

def iter_peaks_and_labels(fname):
    # yields a region and its label, e.g. (('chrY', 20575266, 20576266), 'promoter')
    with open(fname) as fp:
        for line in fp:
            data = line.split()
            yield (data[0], int(data[1]), int(data[2])), data[3]

def plot_confusion_matrix(cm, title='Confusion matrix', cmap=plt.cm.Blues):
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
def __init__(self, filename):
    self._fh = FastaFile(filename)
def fasta_transform(fasta_file, chain_file, locations, output_file, bgzip=False, reverse=False):
    """
    Transform a fasta file with a chain file.

    :param fasta_file: fasta file to transform
    :param chain_file: chain file describing the transformation
    :param locations: list of locations to transform (all references if None)
    :param output_file: output fasta file
    :param bgzip: compress and index the output with bgzip
    :param reverse: reverse the direction of the chain file
    :return:
    """
    start = time.time()

    if not isinstance(fasta_file, FastaFile):
        fasta_file = g2g_fu.check_file(fasta_file)

    if not isinstance(chain_file, ChainIter):
        chain_file = g2g_fu.check_file(chain_file)

    output_file = g2g_fu.check_file(output_file, 'w')
    g2g_fu.delete_file(output_file)
    g2g_fu.delete_index_files(output_file)

    LOG.info("FASTA FILE: {0}".format(fasta_file))
    LOG.info("CHAIN FILE: {0}".format(chain_file))
    LOG.info("OUTPUT FILE: {0}".format(output_file))
    LOG.info("BGZIP: {0}".format(bgzip))
    LOG.info("REVERSE: {0}".format(reverse))

    if isinstance(fasta_file, FastaFile):
        fasta = fasta_file
    else:
        fasta = FastaFile(fasta_file)

    if not isinstance(chain_file, ChainIter):
        chain_file = ChainIter(chain_file, reverse=reverse)

    seq_ids = []

    if locations:
        LOG.debug("Have locations")
        new_locations = []
        for l in locations:
            if isinstance(l, Location):
                new_locations.append(l)
            else:
                new_locations.append(parse_location(l))
            seq_ids.append(new_locations[-1].seqid)
        locations = new_locations
    else:
        LOG.debug("Calculating locations")
        locations = [parse_location("{0}:1-{1}".format(
                         a, fasta.get_reference_length(a)), 1)
                     for a in fasta.references]
        seq_ids = [a for a in fasta.references]

    temp_output_file = output_file

    if bgzip:
        if g2g_fu.get_extension(output_file) != 'gz':
            output_file = "{0}.gz".format(output_file)
        else:
            temp_output_file = temp_output_file[:-3]

    fasta_out = open(temp_output_file, "w")

    LOG.info("Transforming...")

    chr_info = {}

    try:
        # will need a better way, but for now...
        LOG.info("Parsing chain file...")
        for line in chain_file:
            if len(line) > 7:
                LOG.debug("Adding chromosome {0}".format(
                    chain_file.current_chain_header[1]))
                chr_info[chain_file.current_chain_header[1]] = {
                    'from_size': line[2], 'from_start': line[4], 'from_end': line[5],
                    'to_size': line[7], 'to_start': line[9], 'to_end': line[10],
                    'header_chain': chain_file.current_chain_header, 'lines': []}
            else:
                chr_info[chain_file.current_chain_header[1]]['lines'].append(line)

        LOG.info("Chain file parsed")

        insertion_bases = 0
        deletion_bases = 0

        for location in locations:
            LOG.info("Processing chromosome={0}".format(location.seqid))
            LOG.debug("Location: {0}".format(location))

            chrom_size_from = chr_info[location.seqid]['from_size']
            chrom_size_to = chr_info[location.seqid]['to_size']

            last_pos = chr_info[location.seqid]['from_start']
            new_sequence = StringIO()
            chain_file.reset()

            for chain_line in chr_info[location.seqid]['lines']:
                LOG.debug("\nLINE: {0} : {1}".format(chain_file.line_no, chain_line))

                if len(chain_line) == 1:
                    # last line
                    fragment = chain_line[0]

                    partial_seq = fasta.fetch(location.seqid, last_pos, last_pos + fragment)
                    new_sequence.write(str(partial_seq))

                    if len(new_sequence.getvalue()) < chrom_size_to:
                        LOG.warn("Lengths do not match, chromosome length in chain: "
                                 "{0}, sequence length: {1}".format(
                                     chrom_size_to, len(new_sequence.getvalue())))

                    fasta_out.write(">{0} {1}:{2}-{3}\n".format(
                        location.seqid, location.seqid,
                        chr_info[location.seqid]['from_start'] + 1, chrom_size_to))

                    for l in wrap_sequence(new_sequence.getvalue()):
                        fasta_out.write(l.strip())
                        fasta_out.write('\n')

                    break
                else:
                    # fragment_size dt_size dq_size same_bases dt_bases dq_bases
                    fragment = chain_line[0]
                    dt = chain_line[1 if not reverse else 2]
                    dq = chain_line[2 if not reverse else 1]
                    same = chain_line[3]
                    dt_bases = chain_line[4 if not reverse else 5]
                    dq_bases = chain_line[5 if not reverse else 4]

                    partial_seq = fasta.fetch(location.seqid, last_pos, last_pos + fragment)
                    new_sequence.write(partial_seq)

                    if dq > 0:
                        # insertion
                        LOG.debug("INSERTION")
                        new_sequence.write(dq_bases)
                        LOG.debug("{0}:{1}-{2} (Length: {3})".format(
                            location.seqid, last_pos, last_pos + fragment,
                            len(partial_seq)))
                        if len(partial_seq) > 100:
                            LOG.debug("{0}...{1}".format(
                                partial_seq[:10], partial_seq[-10:]))
                        else:
                            LOG.debug(partial_seq)
                        LOG.debug("Adding {0}".format(dq_bases))
                        LOG.debug("SAME={0}, {1}".format(
                            same, partial_seq[-(len(same)):]))
                        insertion_bases += dq

                    if dt > 0:
                        # deletion
                        LOG.debug("DELETION")
                        last_pos += dt
                        LOG.debug("skipping ahead {0} bases".format(dt))
                        deletion_bases += dt

                    last_pos += fragment

                    LOG.debug("LAST_POS={0}, INSERTIONS={1}, DELETIONS={2}, "
                              "DIFF={3}".format(last_pos, insertion_bases,
                                                deletion_bases,
                                                insertion_bases - deletion_bases))

        # bgzip and index
        if bgzip:
            LOG.info("Compressing and indexing...")
            g2g_fu.bgzip_index(temp_output_file, output_file, 'fa')
    except G2GLocationError as le:
        LOG.debug("Unable to parse location, {0}".format(le.message))
        raise le
def test_vutil_get_sequence_context():
    fa = FastaFile('tests/data/ex1.fa')
    vcf_readers = vcreader(['tests/data/ex1.vcf.gz'], 'options')
    varlist = vcf_readers.variants('chr2')
    vutil.get_sequence_context(fa.fetch('chr2'), varlist[0])