def test_reverse_insertion(self):
    ''' check that reverse_indel works correctly for insertions

    The genome is opened as a context manager so the handle is released
    even when one of the assertions below fails.
    '''
    with Fasta(self.fa) as genome:
        var = self.Var(pos=11, chrom='N', ref='G', alts=['GAA'])
        rev = reverse_indel(var, genome)

        self.assertEqual(rev.pos, 10)
        self.assertEqual(rev.ref, 'G')
        self.assertEqual(rev.alts, ['GTT'])
def test_reverse_deletion(self):
    ''' check that reverse_indel works correctly for deletions

    The genome is opened as a context manager so the handle is released
    even when one of the assertions below fails.
    '''
    with Fasta(self.fa) as genome:
        var = self.Var(pos=10, chrom='N', ref='CTA', alts=['C'])
        rev = reverse_indel(var, genome)

        self.assertEqual(rev.pos, 7)
        self.assertEqual(rev.ref, 'CTA')
        self.assertEqual(rev.alts, ['C'])
def INDEX_GENOME(OUTDIR, GENOME_FILE):
    """Copy the ``.fai`` table of GENOME_FILE to a path built from OUTDIR.

    Args:
        OUTDIR: directory joined in front of the index file name.
        GENOME_FILE: path of the FASTA file.

    Returns:
        The path the five-column index table was written to.
    """
    LOGGER.info('Indexing the genome')
    # The handle itself is not used afterwards; presumably opening the FASTA
    # is what makes ``GENOME_FILE + '.fai'`` exist -- TODO confirm.
    fasta_handle = Fasta(GENOME_FILE)

    index_table = pd.read_csv(
        GENOME_FILE + '.fai',
        sep='\t',
        names=['SCAFFOLD', 'SCAFF_LENGTH', 'three', 'four', 'five'],
    )

    # NOTE(review): the directory part of GENOME_FILE is kept in the prefix,
    # so an absolute GENOME_FILE makes os.path.join ignore OUTDIR.
    prefix = os.path.splitext(GENOME_FILE)[0]
    INDEX = os.path.join(OUTDIR, prefix + '.fai')
    index_table.to_csv(INDEX, sep='\t', header=False, index=False)
    return INDEX
def parse_fasta(self, file_name: str) -> None:
    """Open a FASTA source and record its store and ids on the instance.

    When ``self.naked`` is truthy, ``file_name`` is written (UTF-8 encoded)
    into a temporary file and that temporary file is opened instead;
    otherwise ``file_name`` is opened directly.

    Sets ``self.store`` (the ``Fasta`` object) and ``self.ids`` (its keys).

    The temporary file is managed by a ``with`` block so it is released
    even if opening the FASTA raises.
    """
    with tempfile.NamedTemporaryFile() as tf:
        if self.naked:
            tf.seek(0)
            tf.write(bytes(file_name, 'utf-8'))
            tf.flush()
            file_name = tf.name
        self.store = Fasta(file_name)
        self.ids = self.store.keys()
    # NOTE(review): as before, the temporary file is closed here while
    # self.store may still refer to it -- confirm this is acceptable on all
    # target platforms.
def extract_chromosome_data(chromosome_key, start, end):
    """Return the ``[start:end]`` slice of one chromosome's sequence.

    Args:
        chromosome_key: passed to ``path_for_chromosome_data`` to locate the
            file, and used as the record name inside it.
        start: slice start passed to the record.
        end: slice end passed to the record.

    Returns:
        The ``.seq`` attribute of the sliced record.

    Raises:
        Exception: if the data file for the chromosome does not exist.
    """
    data_path = path_for_chromosome_data(chromosome_key)
    if not data_path.exists():
        raise Exception("Chromosome data not downloaded")

    # Context manager closes the FASTA handle; the sequence is extracted
    # before the handle is released.
    with Fasta(str(data_path)) as all_data:
        return all_data[chromosome_key][start:end].seq
def gc_correct(input, output, reference, frac_n, frac_r, iter, frac_lowess):
    """Run ``correct`` over the lines of a tab-separated file and write the result.

    Args:
        input: path of the tab-separated input; every line becomes one
            ``BedLine`` after each field is passed through ``attempt_numeric``.
        output: path the corrected lines are written to (UTF-8 bytes, one
            ``str(line)`` per row).
        reference: path of the reference FASTA handed to ``correct``.
        frac_n, frac_r, iter, frac_lowess: forwarded unchanged to ``correct``.

    The input file and the FASTA handle are both closed on exit, including
    on error. The FASTA stays open until writing finishes in case
    ``correct`` yields its results lazily.
    """
    with Fasta(reference) as fasta:
        with open(input) as ihandle:
            bed_lines = [
                BedLine(*map(attempt_numeric, x.split("\t")))
                for x in ihandle
            ]
        corrected = correct(bed_lines, fasta, frac_n, frac_r, iter, frac_lowess)
        with open(output, "wb") as ohandle:
            for line in corrected:
                ohandle.write(bytes(str(line) + "\n", 'utf-8'))
def get_prot_lens(faa_file, phage):
    """Map each record's locus tag to the length of its sequence string.

    Args:
        faa_file: path of the FASTA file; also passed to ``get_digits``.
        phage: forwarded to ``get_locus_tag``.

    Returns:
        dict of ``get_locus_tag(record_key, ...)`` -> ``len(str(record))``.
    """
    digits = get_digits(faa_file)
    records = Fasta(faa_file)
    return {
        get_locus_tag(key, digits=digits, phage=phage): len(str(records[key]))
        for key in records.keys()
    }
def post(self):
    """Return the guide documents for the requested genes.

    Reads ``gene_ids``, ``edit`` and ``genome`` from the JSON request body,
    validates them, and runs a ``$match`` aggregation on ``guide_collection``.
    When ``edit`` is truthy, the genomic sequence spanning the gene's exons
    is attached to the first result under the ``'sequence'`` key.

    Raises:
        BadRequest: if ``gene_ids`` is empty, ``genome`` is not one of
            ``'hg19'``/``'mm10'``, or ``edit`` is set with more than one gene.
    """
    gene_ids = request.get_json(force=True)['gene_ids']
    edit = request.get_json(force=True)['edit']
    genome = request.get_json(force=True)['genome']
    if not gene_ids:  # TODO improve
        raise BadRequest('gene_ids not set')
    if genome not in ['hg19', 'mm10']:  # only these two assemblies are accepted
        raise BadRequest(f'{genome} not supported')
    if edit and len(gene_ids) != 1:
        raise BadRequest('gene_ids needs to have length 1 if editing..')
    # TODO here goes all the computation for checking whether SNP and CNSD
    # influence the guides.
    # NOTE(review): an earlier comment said "for now return the 6 best
    # guides", but the active pipeline below only filters; it does not sort
    # or limit.
    aggregation_pipeline = [
        # keep only documents for the requested genes and genome
        {
            '$match': {
                '$and': [{
                    'gene_id': {
                        '$in': gene_ids
                    }
                }, {
                    'genome': genome
                }]
            }
        },
        # Disabled stages (sort guides by score inside each document):
        # unwind guides so we can access their score
        # {'$unwind': '$guides'},
        # # sort by score
        # {'$sort': {'guides.score': -1}},
        # # group guides together again (contrary of unwind)
        # {'$group': {
        #     '_id': '$_id',
        #     'gene_id': {'$first': '$gene_id'},
        #     'chromosome': {'$first': '$chromosome'},
        #     'pdbs': {'$first': '$pdbs'},
        #     'exons': {'$first': '$exons'},
        #     'guides': {'$push': '$guides'}
        # }},
    ]
    result = list(guide_collection.aggregate(aggregation_pipeline))
    if edit:
        df = gencode_exons(genome)
        exons = df[(df.gene_id == gene_ids[0])]
        # NOTE(review): .iloc[0] raises if no exon row matches the gene.
        chromosome = exons.seqname.iloc[0]
        # TODO here i have to change things..
        # NOTE(review): this formats the path with the module-level GENOME,
        # not the request's `genome` -- confirm which one is intended.
        fasta = Fasta(GENOME_FILE.format(GENOME), as_raw=True)
        # sequence from the smallest exon start to the largest exon end
        seq = fasta[chromosome][min(exons.start):max(exons.end)]
        # if self.strand == '-':  # i think this is done on the client...
        #     seq = seq.reverse.complement
        # NOTE(review): result[0] raises IndexError when the query matched
        # nothing.
        result[0]['sequence'] = seq
    return result
def raw_error_rate(fig_fn):
    """Write a per-consensus table of copy number, raw error and consensus error.

    Iterates in lockstep over the module-level ``samples``, ``read_fas``,
    ``ref_fns``, ``cons_info_fns`` and ``cons_ep_fn`` sequences. For every
    consensus record in a sample's reference FASTA (skipping consecutive
    records that share a read name) it writes one tab-separated row to
    ``raw_cons_error.out`` next to ``fig_fn``, then prints the Rscript
    command line.

    Args:
        fig_fn: figure path; its directory receives the table, and it is
            inserted into the printed command.

    Fixes relative to the previous version:
      * the loop variable no longer reuses the name ``cons_ep_fn``; binding
        it locally made the ``zip(...)`` argument raise UnboundLocalError;
      * the error-profile line is parsed from ``eline`` (``iline`` was
        undefined);
      * the error-rate field is converted with ``float`` before dividing;
      * the error-profile and info files are loaded into dicts once per
        sample; previously the file iterators were exhausted after the
        first consensus, so later records always got 0.
    """
    n = 0
    tmp_out = os.path.dirname(os.path.abspath(fig_fn)) + '/raw_cons_error.out'
    for sample, read_fn, ref_fn, info_fn, ep_path in zip(
            samples, read_fas, ref_fns, cons_info_fns, cons_ep_fn):
        read_fa = Fasta(read_fn)
        ref_fa = Fasta(ref_fn)
        # NOTE(review): mode 'w' truncates the table for every sample, so
        # only the last sample's rows survive -- confirm whether the file
        # should be opened once outside the loop.
        with open(ep_path) as cons_ep_fp, open(info_fn) as info_fp, \
                open(tmp_out, 'w') as out_fp:
            out_fp.write('Sample\tCopyNum\tRawError\tConsError\n')

            # consensus name -> error rate as a fraction; the last matching
            # line wins, as in the original scan
            cons_errors = {}
            for eline in cons_ep_fp:
                if eline.startswith('#'):
                    continue
                ele = eline.rsplit()
                # [:-1] drops the trailing character of the field
                # (presumably a '%' sign -- TODO confirm)
                rate = float(ele[ep_idx['ERR_RATE']][:-1]) / 100.0
                cons_errors[ele[ep_idx['#READ_NAME']]] = rate

            # consensus name -> copy-number field (still text here)
            copy_nums = {}
            for sline in info_fp:
                ele = sline.rsplit()
                copy_nums[ele[info_idx['CONS_NAME']]] = ele[info_idx['COPY_NUM']]

            last_name = ''
            for cons_name in ref_fa.keys():
                read_name = cons_name.rsplit('_')[0]
                if read_name == last_name:
                    continue
                ref_seq = ref_fa[cons_name][:].seq.upper()
                read_seq = read_fa[read_name][:].seq.upper()
                raw_error = get_mp_error_rate(ref_seq, read_seq)
                if raw_error < 0:
                    continue
                cons_error = cons_errors.get(cons_name, 0)
                copy_num = int(copy_nums.get(cons_name, 0))
                out_fp.write('{}\t{}\t{}\t{}\n'.format(
                    sample, copy_num, raw_error, cons_error))
                last_name = read_name
                n += 1
                # NOTE(review): this stops the whole process after ten rows
                # and looks like a debugging leftover; kept to preserve
                # behaviour.
                if n == 10:
                    sys.exit(1)
    # NOTE(review): `ep_fn` is not defined in this function; presumably a
    # module-level name (or `tmp_out` was intended) -- confirm.
    cmd = 'Rscript /home/gaoy1/program/circ_plot/error_rate.R {} {}'.format(ep_fn, fig_fn)
    print(cmd)
def __init__(self, ref_fasta_fn):
    """Create one empty per-reference site table and a fresh counter.

    Args:
        ref_fasta_fn: path of the reference FASTA; the ``name`` of every
            record becomes a key of ``self.sites``.
    """
    # One empty ordered mapping per reference, in FASTA iteration order
    self.sites = OrderedDict()
    with Fasta(ref_fasta_fn) as fa:
        self.sites.update((ref.name, OrderedDict()) for ref in fa)

    # Remaining instance state
    self.counter = Counter()
def __init__(self, reference, annot_file, desc):
    """
    Usage: PrimerDesign(reference, annotation, description)

    Initialise a design object with a reference assembly and annotation
    file(s).

    Attributes set:
        reference: ``Fasta`` object opened on ``reference``.
        annotations: ``BedTool`` opened on ``annot_file``.
        desc: the description, stored unchanged.
        genome: path of the FASTA index, i.e. the reference file name with
            ``.fai`` appended.
    """
    self.reference = Fasta(reference)
    self.annotations = BedTool(annot_file)
    self.desc = desc
    # Previously built with regexes that only matched names ending in "fa"
    # or "fasta"; any other extension (".fna", ".fa.gz", ...) left this
    # pointing at the FASTA itself. Appending gives the same result for
    # those two endings and a real index path for the rest.
    self.genome = self.reference.filename + ".fai"
def regex_filer(_fname, _regex, _v):
    """Filter a FASTA in place and report which records were dropped.

    The file is first renamed to ``<_fname>_to_regex``; ``filter_fasta`` then
    writes its output back to ``_fname``.

    Returns:
        list of keys present in the renamed input but absent from the
        filtered output, in input order.
    """
    staged = _fname + "_to_regex"
    os.rename(_fname, staged)

    # filter the fasta and remember which keys survived
    kept = filter_fasta(staged, outfa=_fname, regex=_regex, v=_v, force=True).keys()

    dropped = []
    for key in Fasta(staged).keys():
        if key not in kept:
            dropped.append(key)
    return dropped
def fasta_extract_regions(fa_fname, intervals):
    """Lazily yield the sequence of every region in ``intervals``.

    Args:
        fa_fname: name of the indexed FASTA file (opened with ``as_raw=True``).
        intervals: object whose ``by_chromosome()`` yields
            ``(chromosome, sub-array)`` pairs; each sub-array's ``coords()``
            yields ``(seq_id, start, end)`` with ``start``/``end`` exposing
            ``.item()``.

    Yields:
        ``fasta[seq_id][start:end]`` for each region. The original
        documentation describes the coordinates as 1-based; the values are
        used directly as slice bounds -- TODO confirm the convention.
    """
    with Fasta(fa_fname, as_raw=True) as genome:
        for chrom_name, chrom_rows in intervals.by_chromosome():
            logging.info("Extracting sequences from chromosome %s", chrom_name)
            for seq_id, lo, hi in chrom_rows.coords():
                record = genome[seq_id]
                yield record[lo.item():hi.item()]
def test_revcomp_whole_entry(self):
    """Reverse complement of a whole record must match Biopython's result.

    Skipped (``SkipTest``) when ``test_bio`` is falsy.
    """
    fasta = Fasta('data/genes.fasta')
    if not test_bio:
        raise SkipTest
    # Plain "r": the old "rU" mode raises ValueError on Python >= 3.11, and
    # universal newlines are already the default for text mode.
    with open('data/genes.fasta', "r") as fh:
        seqio = SeqIO.to_dict(SeqIO.parse(fh, "fasta"))
    assert str(
        fasta['gi|557361099|gb|KF435150.1|'][:].reverse.complement
    ) == str(
        seqio['gi|557361099|gb|KF435150.1|'].reverse_complement().seq)
def fasta(fasta_file):
    """Open an organism FASTA file and return it with a status message.

    Args:
        fasta_file: full path of the organism's FASTA file.

    Returns:
        tuple ``(message, handle)`` where ``message`` is
        ``'<fasta_file> accessed'`` and ``handle`` is the ``Fasta`` object.

    Note:
        The original documentation asks for a ``.fai`` index (e.g. from
        ``samtools faidx``) to be present next to the FASTA file.
    """
    org = Fasta(fasta_file)
    status = '%s accessed' % fasta_file
    return status, org
def split_target_sequence(target_chroms, target_fasta_name, inter_files):
    """Write one FASTA file per requested chromosome and return the genome size.

    Args:
        target_chroms: iterable of record names to export.
        target_fasta_name: path of the source FASTA; records are keyed by
            the first whitespace-separated token of their header.
        inter_files: directory receiving ``<name>.fa`` for each chromosome.

    Returns:
        Sum of the lengths of all records in the source FASTA.

    Each output file is written inside a ``with`` block so it is flushed
    and closed; previously the handles were left open.
    """
    Faidx(target_fasta_name)
    target_fasta = Fasta(target_fasta_name, key_function=lambda x: x.split()[0])
    genome_size = sum(len(value) for value in target_fasta.values())

    for chrm in target_chroms:
        # entries equal to the FASTA file name itself are skipped
        if chrm != target_fasta_name:
            with open(inter_files + "/" + chrm + ".fa", 'w') as out:
                out.write(">" + chrm + "\n" + str(target_fasta[chrm]))
    return genome_size
def get_transcripts(reference_file, transcript_file, vcf_file):
    """Take a FASTA reference file and a VCF file, and generate a FASTA file
    with changes from the vcf file.

    The reference is first copied to ``transcript_file``; the copy is then
    opened mutable and, for every ``(accession, pos, ref, alt)`` produced by
    ``get_variations``, the slice ``[pos - 1:pos]`` of that accession is
    overwritten with ``alt``.

    Raises:
        ValueError: if a VCF accession is not present in the reference.

    The mutable FASTA is opened as a context manager so it is closed (and
    pending edits released to disk) on exit, including on error.
    """
    shutil.copyfile(reference_file, transcript_file)
    with Fasta(transcript_file, mutable=True) as transcripts:
        with open(vcf_file) as f:
            for (accession, pos, ref, alt) in get_variations(f):
                if accession not in transcripts:
                    raise ValueError(
                        'VCF accession {0} not found in reference'.format(accession))
                transcripts[accession][(pos - 1):pos] = alt
def test_fetch_whole_entry(self):
    """A whole record's sequence and name must match Biopython's result.

    Skipped (``SkipTest``) when ``test_bio`` is falsy.
    """
    fasta = Fasta('data/genes.fasta')
    if not test_bio:
        raise SkipTest
    # Plain "r": the old "rU" mode raises ValueError on Python >= 3.11, and
    # universal newlines are already the default for text mode.
    with open('data/genes.fasta', "r") as fh:
        seqio = SeqIO.to_dict(SeqIO.parse(fh, "fasta"))
    assert str(fasta['gi|557361099|gb|KF435150.1|']) == str(
        seqio['gi|557361099|gb|KF435150.1|'].seq)
    assert fasta['gi|557361099|gb|KF435150.1|'].name == str(
        seqio['gi|557361099|gb|KF435150.1|'].name)
def processMAF(args, subtypes_dict):
    """Count mutation subtypes per sample from a tab-separated MAF file.

    Args:
        args: object providing ``fastafile``, ``length``, ``input`` and
            ``groupvar`` (the column whose value identifies the sample).
        subtypes_dict: mapping indexed with every subtype encountered.

    Returns:
        ``Out(M, samples)`` named tuple: ``M`` is the values of the
        sample-by-subtype count table with missing cells filled with 0,
        ``samples`` is the sorted list of sample names.

    Only rows whose ``Variant_Type`` is ``"SNP"`` are counted. Lines whose
    first character is ``#`` are skipped. The input file is now closed via
    a ``with`` block.
    """
    fasta_reader = Fasta(args.fastafile, read_ahead=1000000)
    nbp = (args.length - 1) // 2
    samples_dict = {}
    chrseq = '0'
    counter = 0

    with open(args.input, 'r', encoding="ISO-8859-1") as f:
        reader = csv.DictReader(
            filter(lambda row: row[0] != '#', f), delimiter='\t')
        for row in reader:
            if row['Variant_Type'] != "SNP":
                continue

            pos = int(row['Start_position'])
            ref = row['Reference_Allele']
            alt = row['Tumor_Seq_Allele2']
            sample = row[args.groupvar]

            # fetch the record only when the chromosome changes
            if row['Chromosome'] != chrseq:
                sequence = fasta_reader[row['Chromosome']]
                chrseq = row['Chromosome']

            counter += 1
            mu_type = ref + alt
            category = getCategory(mu_type)
            # window of nbp bases on each side of the site
            lseq = sequence[pos - (nbp + 1):pos + nbp].seq
            motif_a = getMotif(pos, lseq)
            subtype = str(category + "." + motif_a)
            # Lookup kept from the original although its value is unused:
            # with a plain dict it raises KeyError for an unknown subtype.
            _ = subtypes_dict[subtype]

            sample_counts = samples_dict.setdefault(sample, {})
            sample_counts[subtype] = sample_counts.get(subtype, 0) + 1

            if counter % 1000 == 0:
                util_log.debug(
                    args.input + ": " + str(counter) + " sites counted")

    # NOTE(review): rows of M follow the DataFrame's sample order while
    # `samples` is sorted independently -- confirm the two always agree.
    M = DataFrame(samples_dict).T.fillna(0).values
    samples = sorted(samples_dict)
    out = collections.namedtuple('Out', ['M', 'samples'])(M, samples)
    return out
def read_pep_fa(protein_file):
    """Load a protein FASTA into a DataFrame, one row per record.

    Each record's ``long_name`` is split on spaces (at most 8 splits): the
    first two tokens become ``protein_id`` and ``protein_type``, and every
    later token is split once on ``":"`` into a column name and value. The
    sequence string is stored under ``seq``.

    Args:
        protein_file: path (or path-like) of the FASTA file.

    Returns:
        ``pandas.DataFrame`` built from the per-record dicts.
    """
    import pandas as pd

    proteins = Fasta(str(protein_file))
    records = []
    for entry in proteins:
        fields = entry.long_name.split(" ", 8)
        record = {"protein_id": fields[0], 'protein_type': fields[1]}
        # remaining header tokens are "key:value" pairs
        record.update(item.split(":", 1) for item in fields[2:])
        record['seq'] = str(proteins[entry.name])
        records.append(record)
    return pd.DataFrame(records)
def generate_fasta(intersection_bedtool, fasta_filename, revcomp, verbose):
    """Extract the sequences of a BedTool's intervals and open them as a FASTA.

    Args:
        intersection_bedtool: object whose ``sequence(fi=..., s=True)`` call
            produces a result exposing ``seqfn``.
        fasta_filename: reference FASTA passed as ``fi``.
        revcomp: unused here; kept for interface compatibility.
        verbose: when truthy, a progress line is written to stderr.

    Returns:
        ``Fasta`` opened on the extracted-sequence file.
    """
    if verbose:
        # sys.stderr.write replaces the Python 2 only `print >> sys.stderr`
        # form, which raises TypeError on Python 3.
        sys.stderr.write(">> generating fasta of positions ...\n")

    # s=True: force strandedness
    fasta_seqs = intersection_bedtool.sequence(fi=fasta_filename, s=True)
    return Fasta(fasta_seqs.seqfn)
def fasta_extract_regions(fa_fname, intervals):
    """Lazily yield the sequence of every ``(seq_id, start, end)`` region.

    Args:
        fa_fname: name of the indexed FASTA file (opened with ``as_raw=True``).
        intervals: iterable of ``(seq_id, start, end)`` triples; consecutive
            triples sharing a ``seq_id`` are grouped for logging only.

    Yields:
        ``fasta[seq_id][start:end]`` for each region. The original
        documentation describes the coordinates as 1-based; the values are
        used directly as slice bounds -- TODO confirm the convention.
    """
    def _seq_id(cse):
        return cse[0]

    with Fasta(fa_fname, as_raw=True) as genome:
        for chrom_name, regions in groupby(intervals, _seq_id):
            logging.info("Extracting sequences from chromosome %s", chrom_name)
            for seq_id, lo, hi in regions:
                record = genome[seq_id]
                yield record[lo:hi]
def set_peak_sequences_using_fasta(self, fasta_file_location="grch38.fasta"):
    """Ask every peak in ``self.peaks`` to set its sequence from a FASTA.

    Args:
        fasta_file_location: path of the FASTA file to open.

    Progress is logged before every 10000th peak, starting with the first.
    """
    logging.info("Setting peak sequences using fasta index")
    genome = Fasta(fasta_file_location)
    for done, peak in enumerate(self.peaks):
        if done % 10000 == 0:
            logging.info("%d/%d peaks processed" % (done, len(self.peaks)))
        peak.set_sequence_using_fasta_index(genome)
def __select_ref(self, ref_reads, min_coverage, min_ref_length, downsample_high_coverage):
    """Select ref_id with a minimal coverage in both sample + downsample if needed

    Args:
        ref_reads: mapping ``ref_id -> {condition -> {sample -> read list}}``.
        min_coverage: minimum number of reads every sample must have.
        min_ref_length: references must be strictly longer than this.
        downsample_high_coverage: if truthy, read lists longer than this are
            randomly sampled down to this size.

    Returns:
        OrderedDict with the same nesting, holding only the references that
        pass both filters.

    The filters are explicit ``if`` checks; they used to be ``assert``
    statements, which are stripped under ``python -O`` and would have
    silently disabled the filtering.
    """
    valid_ref_reads = OrderedDict()
    c = Counter()
    with Fasta(self._fasta_fn) as fasta:
        for ref_id, ref_dict in ref_reads.items():
            # valid_dict stays None when the reference must be discarded
            valid_dict = None

            # Discard reference transcripts shorter than the threshold
            if len(fasta[ref_id]) > min_ref_length:
                valid_dict = OrderedDict()
                for cond_lab, cond_dict in ref_dict.items():
                    valid_dict[cond_lab] = OrderedDict()
                    for sample_lab, read_list in cond_dict.items():
                        logger.trace(
                            f"Asserting if {ref_id} has enough coverage in {sample_lab}"
                        )
                        # Filter out if coverage too low
                        if len(read_list) < min_coverage:
                            valid_dict = None
                            break
                        logger.trace(
                            f"ref_id {ref_id} has {len(read_list)} reads in {sample_lab}"
                        )
                        # Downsample if coverage too high
                        if downsample_high_coverage and len(
                                read_list) > downsample_high_coverage:
                            read_list = random.sample(
                                read_list, downsample_high_coverage)
                        valid_dict[cond_lab][sample_lab] = read_list
                    if valid_dict is None:
                        break

            if valid_dict is None:
                # the same message is used for the length filter, as before
                logger.trace(
                    f"ref_id {ref_id} does not have enough coverage in at least one sample: discarding it"
                )
                c["invalid_ref_id"] += 1
                continue

            # If all valid add to new dict
            logger.trace(
                f"ref_id {ref_id} has enough coverage in all samples: keeping it"
            )
            valid_ref_reads[ref_id] = valid_dict

            # Save extra info for debug
            c["valid_ref_id"] += 1
            for cond_lab, cond_dict in valid_dict.items():
                for sample_lab, read_list in cond_dict.items():
                    lab = "{} {} Reads".format(cond_lab, sample_lab)
                    c[lab] += len(read_list)

    logger.debug(counter_to_str(c))
    logger.info(
        "\tReferences remaining after reference coverage filtering: {}".
        format(len(valid_ref_reads)))
    return valid_ref_reads
def __init__(self, ref_fasta_path, vcf_path, kmer_size, nprocs):
    """Open the reference FASTA and VCF and work out which sequences to use.

    Args:
        ref_fasta_path: path of the reference FASTA.
        vcf_path: path of the VCF.
        kmer_size: stored as ``self.kmer_size``.
        nprocs: stored as ``self.nprocs``.

    ``self.keys`` holds the VCF sequence names that also exist in the
    reference; if there are none, it falls back to the reference's keys and
    prints a notice.
    """
    self.vcf_path = vcf_path
    self.fasta_path = ref_fasta_path
    self.kmer_size = kmer_size
    self.nprocs = nprocs
    self.directory = None

    self.ref = Fasta(ref_fasta_path)
    self.vcf = VCF(vcf_path)

    ref_keys = self.ref.keys()
    shared = [name for name in self.vcf.seqnames if name in ref_keys]
    if shared:
        self.keys = shared
    else:
        self.keys = self.ref.keys()
        print('No common keys found. Using reference.')
def faabed(faa, output):
    """Create a fake bed file to keep backwards compatibility.

    One line is written per FASTA record: a synthetic chromosome name
    (``chrom_1``, ``chrom_2``, ...), start ``1``, the record length as end,
    a ``.`` placeholder and the record name.

    Args:
        faa: path of the FASTA file.
        output: path of the bed file to write.

    Returns:
        ``output``, unchanged.
    """
    records = Fasta(faa)
    with open(output, "w") as bed_handle:
        for number, record in enumerate(records, start=1):
            bed_handle.write(
                "chrom_{}\t1\t{}\t.\t{}\n".format(number, len(record), record.name))
    return output
def load_fasta_sequences(fasta_file, return_keys=False):
    """Read a FASTA file and return its sequences.

    Args:
        fasta_file: path of the FASTA file (opened with ``as_raw=True`` and
            ``sequence_always_upper=True``).
        return_keys: when true, the record keys are returned as well.

    Returns:
        The list of full-record slices, or ``(sequences, keys)`` when
        ``return_keys`` is true.
    """
    fasta = Fasta(fasta_file, as_raw=True, sequence_always_upper=True)
    seqs = [record[:] for record in fasta]
    # keys are only collected on request, before the handle is closed
    keys = list(fasta.keys()) if return_keys else None
    fasta.close()

    if not return_keys:
        return seqs
    return seqs, keys
def __get_kmer_list(self, ref_id, start, end, kmer_size=5):
    """Extract the k-mers starting at each position of ``[start, end)``.

    Args:
        ref_id: name of the reference record in ``self._fasta_fn``.
        start: first position.
        end: position after the last k-mer start.
        kmer_size: k-mer length. It is now honoured; the body previously
            hard-coded 5, so the default gives the old result.

    Returns:
        list of ``end - start`` strings, one per start position.

    Raises:
        NanocomporeError: if the FASTA lookup raises ``KeyError``.
    """
    try:
        with Fasta(self._fasta_fn) as fasta:
            record = fasta[ref_id]
            # read kmer_size extra bases so the last k-mer can be full length
            seq = str(record[start:end + kmer_size])
    except KeyError:
        raise NanocomporeError("Reference id not present in fasta file")
    return [seq[i:i + kmer_size] for i in range(end - start)]
def __init__(self, speciesNumber, genomeFileList, gffFileList, speciesName, speciesShortName):
    """Record a species' identifiers and pick its genome and GFF files.

    Args:
        speciesNumber: substring used to recognise this species' files.
        genomeFileList: candidate genome FASTA paths; each path containing
            ``speciesNumber`` is opened, and the last one is kept as
            ``self.genome``.
        gffFileList: candidate GFF paths; the last one containing both
            ``speciesNumber`` and ``'PAC'`` is kept as ``self.gffFile``.
        speciesName: full species name.
        speciesShortName: abbreviated species name.

    ``self.genome`` / ``self.gffFile`` are left unset when nothing matches.
    """
    self.speciesNumber = speciesNumber
    self.speciesName = speciesName
    self.speciesShortName = speciesShortName
    self.conservedElementsBed = '%s_ConservedElements.bed' % self.speciesName

    for genome_path in genomeFileList:
        if self.speciesNumber in genome_path:
            self.genome = Fasta(genome_path)

    for gff_path in gffFileList:
        if self.speciesNumber in gff_path and 'PAC' in gff_path:
            self.gffFile = gff_path
def main():
    """Print the leading run of ``A`` for every record of the 26 bp FASTA.

    For each record, the run of ``A`` characters at the start of the
    sequence and its length are printed tab-separated; records that do not
    start with ``A`` print ``N`` and ``0``.
    """
    fasta = Fasta("../data/pacbio/pacbio_new_gene_model.bam.down26.fasta",
                  duplicate_action="longest")
    leading_run = re.compile('^(' + "A" + '+)')
    for name in fasta.keys():
        hit = leading_run.search(str(fasta[name]))
        if hit is None:
            print("N" + "\t" + str(0))
            continue
        run = str(hit.group(1))
        print(run + "\t" + str(len(run)))