def compute_mes(interval, matrix5, matrix3, genome):
    """Attach splice-site sequences and MaxEnt scores to ``interval``.

    The 5' splice-site window is 9 nt (3 exonic + 6 intronic bases) and the
    3' splice-site window is 23 nt (20 intronic + 3 exonic bases). For any
    strand other than ``'+'`` the windows are taken from the opposite ends
    of the interval and reverse-complemented.

    :param interval: record exposing ``chrom``/``start``/``end`` attributes
        and item access for ``'strand'``; it is updated in place.
    :param matrix5: scoring matrix forwarded to ``maxent_fast.score5``.
    :param matrix3: scoring matrix forwarded to ``maxent_fast.score3``.
    :param genome: path of the FASTA file opened with ``pysam.FastaFile``.
    :return: the same ``interval`` with ``seq5``, ``mes5``, ``seq3`` and
        ``mes3`` set; both scores are ``'NA'`` when either window contains
        a character other than A, C, G or T.
    """
    # The context manager guarantees the FASTA handle is released even when
    # a fetch raises (the handle used to be left open).
    with pysam.FastaFile(genome) as fasta:
        if interval['strand'] == '+':
            seq5 = fasta.fetch(interval.chrom, interval.end - 3,
                               interval.end + 6).upper()
            seq3 = fasta.fetch(interval.chrom, interval.start - 20,
                               interval.start + 3).upper()
        else:
            seq5 = reverse_complement(
                fasta.fetch(interval.chrom, interval.start - 6,
                            interval.start + 3).upper())
            seq3 = reverse_complement(
                fasta.fetch(interval.chrom, interval.end - 3,
                            interval.end + 20).upper())

    # Only score windows made exclusively of unambiguous bases.
    if set(seq5).issubset('ACGT') and set(seq3).issubset('ACGT'):
        mes5 = maxent_fast.score5(seq5, matrix=matrix5)
        mes3 = maxent_fast.score3(seq3, matrix=matrix3)
    else:
        mes5 = 'NA'
        mes3 = 'NA'

    interval['seq5'] = seq5
    interval['mes5'] = mes5
    interval['seq3'] = seq3
    interval['mes3'] = mes3
    return interval
def calculate_gc_percentage(interval, genome, extend=0):
    """Annotate ``interval`` with the GC content of its sequence and flanks.

    :param interval: mapping with ``'chrom'``, ``'start'``, ``'end'`` and
        ``'strand'`` entries; it is updated in place.
    :param genome: path of the FASTA file opened with ``pysam.FastaFile``.
    :param extend: length of each flanking window; ``0`` skips the flanks.
    :return: the same ``interval`` with ``GC_exon`` set and, when ``extend``
        is non-zero, ``GC_up_intron`` and ``GC_dn_intron`` as well. For
        strands other than ``'+'``/``'.'`` the upstream flank is the one
        after ``end`` and sequences are reverse-complemented before being
        passed to ``seq_gc_content``.
    """
    chrom = interval['chrom']
    start = int(interval['start'])
    end = int(interval['end'])
    strand = interval['strand']

    # The context manager closes the FASTA handle on every exit path (the
    # handle used to be left open).
    with pysam.FastaFile(genome) as fasta:
        if strand == '+' or strand == '.':
            mid_region_seq = fasta.fetch(chrom, start, end).upper()
            interval['GC_exon'] = seq_gc_content(mid_region_seq)
            if extend != 0:
                up_intron_seq = fasta.fetch(chrom, start - extend,
                                            start).upper()
                dn_intron_seq = fasta.fetch(chrom, end,
                                            end + extend).upper()
                interval['GC_up_intron'] = seq_gc_content(up_intron_seq)
                interval['GC_dn_intron'] = seq_gc_content(dn_intron_seq)
        else:
            mid_region_seq = reverse_complement(
                fasta.fetch(chrom, start, end).upper())
            interval['GC_exon'] = seq_gc_content(mid_region_seq)
            if extend != 0:
                up_intron_seq = reverse_complement(
                    fasta.fetch(chrom, end, end + extend).upper())
                dn_intron_seq = reverse_complement(
                    fasta.fetch(chrom, start - extend, start).upper())
                interval['GC_up_intron'] = seq_gc_content(up_intron_seq)
                interval['GC_dn_intron'] = seq_gc_content(dn_intron_seq)
    return interval
def bind_chroseq(self, refdic, gap=0, intron=False):
    """
    Assemble the sequence of this record from the reference and store it
    in ``self.seq_chro``.

    With ``intron=False`` the exon sequences (upper case) are joined with
    ``gap`` "N" characters between consecutive exons, so ``gap=0`` gives
    the exons back to back. With a truthy ``intron`` each exon is followed
    by the matching intron in lower case and ``gap`` is not used.

    :param refdic: the reference genome handed to ``chr_select``
    :param gap: number of "N" inserted between exons when ``intron`` is False
    :param intron: whether to interleave intron sequences
    """
    # needs self.exon and self.strand
    # note carried over from the original: chr_select is 0 based while the
    # exon coordinates in bigg are 1 based
    if self.exon is None:
        self.get_exon()

    last_exon = len(self.exon) - 1
    pieces = []
    for idx, (exon_start, exon_end) in enumerate(self.exon):
        chrom = self.chrom
        _, exon_seq = chr_select(refdic, chrom, exon_start, exon_end)
        pieces.append(exon_seq.upper())  # upper case for exon

        if intron is False and idx < last_exon:
            pieces.append(gap * "N")

        if intron and idx < len(self.intron):
            intron_start, intron_end = self.intron[idx]
            _, intron_seq = chr_select(refdic, chrom, intron_start,
                                       intron_end)
            pieces.append(intron_seq.lower())  # lower case for intron

    joined = "".join(pieces)
    oriented = joined if self.strand == "+" else reverse_complement(joined)
    self.seq_chro = "".join(oriented)
def find_reverse_palindromes(sequence):
    """Locate every reverse palindrome of length 4 to 12 in a sequence.

    A window counts as a reverse palindrome when it equals its own
    reverse complement.

    Args:
        sequence: string, dna sequence

    Returns:
        list of (1-based start position, length) tuples, ordered by start
        position and then by length
    """
    total = len(sequence)
    hits = []
    for start in range(total):
        for size in range(4, 13):
            stop = start + size
            if stop > total:
                continue
            window = sequence[start:stop]
            if window == reverse_complement(window):
                hits.append((start + 1, size))
    return hits
def compute_possible_dna_origins(DNA, final_peptides):
    """
    List the substrings of DNA that could have encoded the given peptide
    (DNA -> RNA -> peptide).

    Every RNA encoder of the peptide is converted back to DNA; both that
    DNA string and its reverse complement are looked up in a frequency
    table of the ``len(final_peptides) * 3``-mers of ``DNA`` and emitted
    once per occurrence.

    :param DNA: String - The strand of DNA
    :param final_peptides: String - The peptide to explain
    :return: Array - possible DNA origins, repeated by their frequency
    """
    encoders = recursive_find_rna_encoders(set(), final_peptides)
    freq_dict = FrequencyDict(DNA, len(final_peptides) * 3)

    origins = []
    for rna in encoders:
        forward = rna_to_dna(rna)
        backward = reverse_complement(forward, as_string=True)
        # forward hits first, then reverse-complement hits
        for candidate in (forward, backward):
            origins.extend([candidate] * freq_dict.get(candidate, 0))
    return origins
def most_frequent_kmers(DNA, k, mutation_thresh=0, reverse=False):
    """
    Return the set of most frequent k-mers in DNA.

    Frequencies come from ``dictionaries.FrequencyDict``; the k-mers whose
    (possibly combined) frequency equals the maximum are returned.

    :param DNA: String - DNA
    :param k: Integer - Length of the K-mer
    :param mutation_thresh: Allowed number of mismatches, forwarded to
        ``FrequencyDict``
    :param reverse: when true, the frequency of a k-mer's reverse
        complement (if present in the table) is added to its own
    :return: Set - Set of most frequent K-mers
    """
    freq_dict = dictionaries.FrequencyDict(DNA, k, mutation_thresh)

    best_kmers = set()
    best_count = 0
    for kmer, count in freq_dict.items():
        mirror = reverse_complement(kmer, as_string=True)
        if reverse and mirror in freq_dict:
            count += freq_dict[mirror]

        if count > best_count:
            best_count, best_kmers = count, {kmer}
        elif count == best_count:
            best_kmers.add(kmer)
    return best_kmers
def _test_profile_split(self, sequences, length):
    """Check ``klib.Profile.split`` against a split derived from raw counts.

    Each counted k-mer is compared with its reverse complement: the
    smaller of the pair feeds the expected left half, the larger feeds
    the expected right half (indexed by its reverse complement), both
    with doubled counts; palindromes keep their count and appear in both.
    """
    counts = utils.counts(sequences, length)
    profile = klib.Profile(utils.as_array(counts, length))
    left, right = profile.split()

    assert len(left) == len(right)
    assert sum(left) + sum(right) == sum(counts.values()) * 2

    expected_left = {}
    expected_right = {}
    palindromes = {}
    for kmer, count in counts.items():
        mirror = utils.reverse_complement(kmer)
        if kmer < mirror:
            expected_left[utils.count_index(kmer)] = count * 2
        elif kmer > mirror:
            expected_right[utils.count_index(mirror)] = count * 2
        else:
            palindromes[utils.count_index(kmer)] = count

    def in_index_order(side):
        # Counts of one side plus the palindromes, sorted by k-mer index.
        merged = list(side.items()) + list(palindromes.items())
        return [count for _, count in sorted(merged)]

    assert [c for c in left if c > 0] == in_index_order(expected_left)
    assert [c for c in right if c > 0] == in_index_order(expected_right)
def main(input_str):
    """
    Score every backbone frame for the given input and return the best ones.

    Each result is a tuple of (score, sh-miR sequence, frame name, link to
    the 2D structure produced by mfold). Only results scoring above 60 are
    kept and at most the three highest are returned under the 'result'
    key. If the frame lookup or an mfold call reports an error, that error
    payload is returned unchanged instead.
    """
    seq1, seq2, shift_left, shift_right = check_input(input_str)
    # without a second strand, fall back to the reverse complement
    seq2 = seq2 or reverse_complement(seq1)

    all_frames = get_all()
    if 'error' in all_frames:  # database error handler
        return all_frames

    frames = get_frames(seq1, seq2, shift_left, shift_right, all_frames)
    originals = [Backbone(**elem) for elem in all_frames]

    scored = []
    for frame_tuple, original in zip(frames, originals):
        frame, insert1, insert2 = frame_tuple
        mfold_data = mfold(frame.template(insert1, insert2))
        if 'error' in mfold_data:
            return mfold_data
        pdf, ss = mfold_data[0], mfold_data[1]

        score = sum((
            score_frame(frame_tuple, ss, original),
            score_homogeneity(original),
            two_same_strands_score(seq1, original),
        ))
        scored.append(
            (score, frame.template(insert1, insert2), frame.name, pdf))

    # filtering first and then sorting stably gives the same order as
    # sorting first and filtering afterwards
    passing = [entry for entry in scored if entry[0] > 60]
    passing.sort(key=lambda entry: entry[0], reverse=True)
    return {'result': passing[:3]}
def main(fasta_block):
    """Find reverse-palindromic restriction sites in the first FASTA record.

    Windows of even length 4 to 12 are scanned; a window is reported when
    its first half equals the reverse complement of its second half.

    :param fasta_block: FASTA text handed to ``fasta_breakup``.
    :return: list of ``[1-based position, length]`` pairs, grouped by
        window length and, within a length, ordered by position. The list
        is also printed, as is a line for every hit.
    """
    fasta_dict = fasta_breakup(fasta_block)
    # list(...) keeps this working where dict.values() is a view
    dna = list(fasta_dict.values())[0]

    restriction_sites = []
    for block_len in range(4, 14, 2):
        half = block_len // 2  # floor division: must stay an int index
        for idx in range(len(dna) - block_len + 1):
            current = dna[idx:idx + block_len]
            # split current in half
            first_half = current[:half]
            second_half_rc = "".join(reverse_complement(current[half:]))
            if first_half == second_half_rc:
                # a single pre-formatted argument prints the same text
                # under the print statement and the print function
                print("palindrome, yo %s %s %s"
                      % (current, first_half, second_half_rc))
                restriction_sites.append([idx + 1, block_len])
    print(restriction_sites)
    return restriction_sites
def d_duval_(seq, alg, **kwargs):
    """Cut ``seq`` at the union of two factorizations' boundaries.

    ``alg`` factorizes ``seq`` and, separately, its reverse complement
    (whose factor list is reversed before use). The two lists of factor
    lengths are merged by repeatedly taking the shorter leading length
    and subtracting it from the other list's leading length; ``seq`` is
    then sliced, left to right, into pieces of the merged lengths.

    :param seq: sequence to factorize
    :param alg: factorization callable, invoked as ``alg(seq, **kwargs)``
    :param kwargs: forwarded to ``alg``
    :return: list of consecutive slices of ``seq``
    """
    forward = [len(factor) for factor in alg(seq, **kwargs)]
    mirrored = reverse_complement(seq)
    backward = [len(factor) for factor in reversed(alg(mirrored, **kwargs))]

    # First work out the piece sizes, then slice once at the end.
    sizes = []
    while forward and backward:
        if forward[0] < backward[0]:
            size = forward.pop(0)
            backward[0] -= size
            if backward[0] == 0:
                backward.pop(0)
        else:
            size = backward.pop(0)
            forward[0] -= size
            if forward[0] == 0:
                forward.pop(0)
        sizes.append(size)
    # whatever is left over on either side is taken as-is
    sizes.extend(forward)
    sizes.extend(backward)

    result = []
    offset = 0
    for size in sizes:
        result.append(seq[offset:offset + size])
        offset += size
    return result
def test_profile_reverse_complement_palindrome(self):
    """Profile and utils must agree on reverse complements (single 8-mer input)."""
    counts = utils.counts(['ACCTAGGT'], 8)
    profile = klib.Profile(utils.as_array(counts, 8))
    for index in range(profile.length):
        # reverse-complement in binary space, then decode ...
        via_profile = profile.binary_to_dna(profile.reverse_complement(index))
        # ... versus decode first, then reverse-complement the string
        via_utils = utils.reverse_complement(profile.binary_to_dna(index))
        assert via_profile == via_utils
def get_adenylation_domains(fasta, known=None, lagging_strand=False):
    """Find adenylation domains in every record of a FASTA file.

    Each record is translated with ``get_pepseq`` and the peptide is
    searched with hmmsearch against ``lib/AMP-binding.hmm``. Every HSP
    becomes an ``AdenylationDomain`` built from the nucleotide slice
    ``[hit_start * 3:hit_end * 3]`` of the record's sequence.

    :param fasta: path or handle accepted by ``SeqIO.parse``.
    :param known: forwarded to ``AdenylationDomain``.
    :param lagging_strand: see the NOTE(review) on the strand test below.
    :return: list of ``(AdenylationDomain, hit_start, hit_end)`` tuples.
    """
    fasta_seqs = []
    for record in SeqIO.parse(fasta, 'fasta'):
        seq = str(record.seq)
        pepseq, rf = get_pepseq(seq)
        # NOTE(review): the original condition was the chained comparison
        # `rf < 0 == lagging_strand`, which Python evaluates exactly as
        # written here; confirm whether `(rf < 0) == lagging_strand` was
        # what the author meant.
        revcom = bool(rf < 0 and 0 == lagging_strand)
        if revcom:
            seq = utils.reverse_complement(seq)
        # revcom is stored per record: the search loop below used to see
        # only the value left over from the last record.
        fasta_seqs.append({'id': record.id, 'seq': seq, 'pepseq': pepseq,
                           'rf': rf, 'revcom': revcom})

    adenylation_domains = []
    for fs in fasta_seqs:
        # Search this record's own peptide (previously the peptide of the
        # last parsed record was searched on every iteration).
        utils.run_cmd([hmmsearch, '--domtblout', 'dump',
                       os.path.abspath('lib/AMP-binding.hmm'), '-'],
                      '>header\n' + fs['pepseq'])
        # hmmsearch writes its table to the fixed file name 'dump' in the
        # current directory; read it and remove it straight away.
        with open('dump') as f:
            out = f.read()
        res_stream = StringIO(out)
        os.remove('dump')

        results = list(SearchIO.parse(res_stream, 'hmmsearch3-domtab'))
        for result in results:
            for i, hsp in enumerate(result.hsps, 1):
                s = hsp.hit_start
                e = hsp.hit_end
                domain = AdenylationDomain(fs['seq'][s * 3:e * 3], known,
                                           '{}_{}'.format(fs['id'], i),
                                           fs['revcom'])
                adenylation_domains.append((domain, s, e))
    return adenylation_domains
def test_profile_reverse_complement(self):
    """Profile and utils must agree on reverse complements (shared test sequences)."""
    counts = utils.counts(utils.SEQUENCES, 8)
    profile = klib.Profile(utils.as_array(counts, 8))
    for index in range(profile.length):
        # reverse-complement in binary space, then decode ...
        via_profile = profile.binary_to_dna(profile.reverse_complement(index))
        # ... versus decode first, then reverse-complement the string
        via_utils = utils.reverse_complement(profile.binary_to_dna(index))
        assert via_profile == via_utils
def _test_profile_split(self, sequences, length):
    """Check ``klib.Profile.split`` against a split derived from raw counts.

    Each counted k-mer is compared with its reverse complement: the
    smaller of the pair feeds the expected left half, the larger feeds
    the expected right half (indexed by its reverse complement), both
    with doubled counts; palindromes keep their count and appear in both.
    """
    counts = utils.counts(sequences, length)
    profile = klib.Profile(utils.as_array(counts, length))
    left, right = profile.split()

    assert len(left) == len(right)
    assert sum(left) + sum(right) == sum(counts.values()) * 2

    expected_left = {}
    expected_right = {}
    palindromes = {}
    for kmer, count in counts.items():
        mirror = utils.reverse_complement(kmer)
        if kmer < mirror:
            expected_left[utils.count_index(kmer)] = count * 2
        elif kmer > mirror:
            expected_right[utils.count_index(mirror)] = count * 2
        else:
            palindromes[utils.count_index(kmer)] = count

    def in_index_order(side):
        # Counts of one side plus the palindromes, sorted by k-mer index.
        merged = list(side.items()) + list(palindromes.items())
        return [count for _, count in sorted(merged)]

    assert [c for c in left if c > 0] == in_index_order(expected_left)
    assert [c for c in right if c > 0] == in_index_order(expected_right)
def main(input_str):
    """
    Score every backbone frame for the given input and return the best ones.

    Each result is a tuple of (score, sh-miR sequence, frame name, link to
    the 2D structure produced by mfold). Only results scoring above 60 are
    kept and at most the three highest are returned under the 'result'
    key. If the frame lookup or an mfold call reports an error, that error
    payload is returned unchanged instead.
    """
    seq1, seq2, shift_left, shift_right = check_input(input_str)
    # without a second strand, fall back to the reverse complement
    seq2 = seq2 or reverse_complement(seq1)

    all_frames = get_all()
    if 'error' in all_frames:  # database error handler
        return all_frames

    frames = get_frames(seq1, seq2, shift_left, shift_right, all_frames)
    originals = [Backbone(**elem) for elem in all_frames]

    scored = []
    for frame_tuple, original in zip(frames, originals):
        frame, insert1, insert2 = frame_tuple
        mfold_data = mfold(frame.template(insert1, insert2))
        if 'error' in mfold_data:
            return mfold_data
        pdf, ss = mfold_data[0], mfold_data[1]

        score = sum((
            score_frame(frame_tuple, ss, original),
            score_homogeneity(original),
            two_same_strands_score(seq1, original),
        ))
        scored.append(
            (score, frame.template(insert1, insert2), frame.name, pdf))

    # filtering first and then sorting stably gives the same order as
    # sorting first and filtering afterwards
    passing = [entry for entry in scored if entry[0] > 60]
    passing.sort(key=lambda entry: entry[0], reverse=True)
    return {'result': passing[:3]}
def test_profile_reverse_complement_palindrome(self):
    """Profile and utils must agree on reverse complements (single 8-mer input)."""
    counts = utils.counts(['ACCTAGGT'], 8)
    profile = klib.Profile(utils.as_array(counts, 8))
    for index in range(profile.length):
        # reverse-complement in binary space, then decode ...
        via_profile = profile.binary_to_dna(profile.reverse_complement(index))
        # ... versus decode first, then reverse-complement the string
        via_utils = utils.reverse_complement(profile.binary_to_dna(index))
        assert via_profile == via_utils
def test_profile_balance_palindrome(self):
    """Balancing a profile built from the 4-mer 'AATT' must match mirrored counts."""
    counts = utils.counts(['AATT'], 4)
    profile = klib.Profile(utils.as_array(counts, 4))
    profile.balance()

    # Build the reverse-complement view first, then merge it into counts.
    mirrored = {utils.reverse_complement(kmer): count
                for kmer, count in counts.items()}
    counts.update(mirrored)
    utils.test_profile(profile, counts, 4)
def test_profile_balance(self):
    """Balancing a profile of the shared test sequences must match mirrored counts."""
    counts = utils.counts(utils.SEQUENCES, 8)
    profile = klib.Profile(utils.as_array(counts, 8))
    profile.balance()

    # Build the reverse-complement view first, then merge it into counts.
    mirrored = {utils.reverse_complement(kmer): count
                for kmer, count in counts.items()}
    counts.update(mirrored)
    utils.test_profile(profile, counts, 8)
def test_profile_balance_palindrome(self):
    """Balancing a profile built from the 4-mer 'AATT' must match mirrored counts."""
    counts = utils.counts(['AATT'], 4)
    profile = klib.Profile(utils.as_array(counts, 4))
    profile.balance()

    # Build the reverse-complement view first, then merge it into counts.
    mirrored = {utils.reverse_complement(kmer): count
                for kmer, count in counts.items()}
    counts.update(mirrored)
    utils.test_profile(profile, counts, 4)
def test_profile_reverse_complement(self):
    """Profile and utils must agree on reverse complements (shared test sequences)."""
    counts = utils.counts(utils.SEQUENCES, 8)
    profile = klib.Profile(utils.as_array(counts, 8))
    for index in range(profile.length):
        # reverse-complement in binary space, then decode ...
        via_profile = profile.binary_to_dna(profile.reverse_complement(index))
        # ... versus decode first, then reverse-complement the string
        via_utils = utils.reverse_complement(profile.binary_to_dna(index))
        assert via_profile == via_utils
def test_profile_balance(self):
    """Balancing a profile of the shared test sequences must match mirrored counts."""
    counts = utils.counts(utils.SEQUENCES, 8)
    profile = klib.Profile(utils.as_array(counts, 8))
    profile.balance()

    # Build the reverse-complement view first, then merge it into counts.
    mirrored = {utils.reverse_complement(kmer): count
                for kmer, count in counts.items()}
    counts.update(mirrored)
    utils.test_profile(profile, counts, 8)
def split_assembled_genome(gtf_path, fasta_path, od='.', L=98):
    """Write processed transcripts and splice-junction windows as FASTA.

    For every transcript in the GTF whose scaffold is present in the FASTA
    file, two outputs are appended (files are opened in append mode):

    * ``{od}/processed_transcripts.fasta`` - the merged exon slices of the
      transcript, wrapped at 80 columns;
    * ``{od}/splice_junctions.fasta`` - one record per junction window,
      named ``{transcript}:{index}``.

    Transcripts on the '-' strand are reverse-complemented.

    :param gtf_path: annotation handed to ``parse_gtf``.
    :param fasta_path: scaffold sequences handed to ``parse_fasta``.
    :param od: output directory.
    :param L: window parameter; each exon is widened by ``L`` on the left
        and ``L - 2`` on the right before the widened intervals are merged.
    """
    trs = parse_gtf(gtf_path)
    print('GTF parsed')
    scaffolds = parse_fasta(fasta_path)
    print('Scaffolds parsed')

    wrong_scaffolds = 0
    for tr, data in trs.items():
        if data['scaffold'] not in scaffolds:
            print(f'{data["scaffold"]} not in FASTA file {fasta_path}')
            wrong_scaffolds += 1
            continue
        sequence = scaffolds[data['scaffold']]

        processed = []
        unprocessed = []
        for exon in data['exons']:
            processed.append((int(exon['start']) - 1, int(exon['end']) - 1))
            unprocessed.append(
                (int(exon['start']) - L, int(exon['end']) + L - 2))
        processed = list(merge_intervals(processed))
        unprocessed = merge_intervals(unprocessed)
        processed = ''.join(sequence[iv[0]:iv[1]] for iv in processed)

        splice_junctions = []
        for iv in unprocessed:
            if iv[1] - iv[0] < 3 * L - 3:
                # If the length of the exon is < L-1
                splice_junctions.append(iv)
            else:
                # long region: one window at each end
                splice_junctions.append((iv[0], iv[0] + 2 * L - 2))
                splice_junctions.append((iv[1] - (2 * L) + 2, iv[1]))

        # An exon closer than L to the scaffold start gives a negative
        # window start. A negative slice start would wrap around to the
        # END of the scaffold, so it is clamped to 0 here.
        splice_junctions = [
            collapse_N(sequence[max(iv[0], 0):iv[1]].upper())
            for iv in splice_junctions
        ]
        processed = collapse_N(processed).upper()

        if data['strand'] == '-':
            processed = reverse_complement(processed)
            splice_junctions = [
                reverse_complement(sj) for sj in splice_junctions
            ]

        with open(f'{od}/processed_transcripts.fasta', 'a') as fh:
            fh.write(f'>{tr}\n')
            fh.write('\n'.join(
                [processed[i:i + 80] for i in range(0, len(processed), 80)]))
            fh.write('\n')

        with open(f'{od}/splice_junctions.fasta', 'a') as fh:
            for i, sj in enumerate(splice_junctions):
                fh.write(f'>{tr}:{i}\n')
                fh.write(f'{sj}\n')

    print('DONE!')
    print(
        f'{wrong_scaffolds} scaffolds were not found, and the corresponding annotations were ignored.'
    )
def get_sequences(self, locations, width=200):
    """Return one ``width``-wide sequence per location.

    Each location is indexed as ``loc[0]`` (key into ``self.genome``),
    ``loc[1]`` / ``loc[2]`` (positions) and ``loc[3]`` (strand). For '+'
    the window is centred on ``loc[1]``; for any other strand it is
    centred on ``loc[2]``, shifted right by one and reverse-complemented.

    :param locations: iterable of location records as described above.
    :param width: total window width; half of it (floored) is taken on
        each side of the centre.
    :return: list of sequences, in the order of ``locations``.
    """
    # need to ensure that most locations on the forward
    # and reverse strands are mappable

    # Floor division keeps the slice bounds integral; true division would
    # produce float indices on Python 3.
    half = width // 2

    seqs = []
    for loc in locations:
        chrom_seq = self.genome[loc[0]]
        if loc[3] == '+':
            centre = int(loc[1])
            window = utils.makestr(chrom_seq[centre - half:centre + half])
        else:
            centre = int(loc[2])
            window = utils.reverse_complement(
                utils.makestr(chrom_seq[centre - half + 1:centre + half + 1]))
        seqs.append(window)
    return seqs
def test_balance(self):
    """``kmer.balance`` on a profile file must yield the mirrored counts."""
    counts = utils.counts(utils.SEQUENCES, 8)
    filename = self.empty()

    with utils.open_profile(self.profile(counts, 8)) as input_handle, \
            utils.open_profile(filename, 'w') as output_handle:
        kmer.balance(input_handle, output_handle)

    # Build the reverse-complement view first, then merge it into counts.
    mirrored = {utils.reverse_complement(kmer_seq): count
                for kmer_seq, count in counts.items()}
    counts.update(mirrored)
    utils.test_profile_file(filename, counts, 8)
def getSeq(self, g):
    """Spell out the sequence of this path through graph ``g``.

    Node ids in ``self.nodeIds`` are visited in order; a negative id means
    the node's sequence is reverse-complemented. Once the assembled
    sequence is non-empty, the first ``g.overlap`` characters of every
    further node sequence are dropped before it is appended.

    :param g: graph exposing ``nodes`` (indexable by absolute node id,
        items carry ``nodeSeq``) and ``overlap``.
    :return: the concatenated sequence ('' when there are no nodes).
    """
    assembled = ''
    for node_id in self.nodeIds:
        piece = g.nodes[abs(node_id)].nodeSeq
        if node_id < 0:
            piece = reverse_complement(piece)
        # skip the part already contributed by the previous node
        trimmed = piece[g.overlap:] if assembled != '' else piece
        assembled = assembled + trimmed
    return assembled
def frequent_words_with_mismatches_with_revcomp(text: str, k: int, d: int) -> Set[str]:
    """
    Return the k-mers with the highest combined frequency in ``text`` and
    in its reverse complement, as counted by ``_frequent_words_helper``
    with mismatch parameter ``d``. An empty set is returned when nothing
    was counted.

    >>> frequent_words_with_mismatches_with_revcomp("ACGTTGCATGTCGCATGATGCATGAGAGCT", 4, 1) == {'ATGT', 'ACAT'}
    True
    """
    freq_map = _frequent_words_helper(text, k, d)
    freq_map += _frequent_words_helper(reverse_complement(text), k, d)

    # most_common(1)[0] would raise IndexError on an empty map.
    if not freq_map:
        return set()

    _, maxval = freq_map.most_common(1)[0]
    # distinct loop names so the parameter `k` is not shadowed
    return {kmer for kmer, count in freq_map.items() if count == maxval}
def find_wells():
    """Match forward mismatch reporters against the loaded sequences.

    For every reporter whose name contains 'fwd', collect the
    ``(name, sequence)`` pairs from ``load_seqs()`` whose reverse
    complement contains the reporter with its first six and its last
    character removed. A sorted summary is printed after three blank
    lines.

    :return: dict mapping each forward reporter name to its list of
        matching ``(name, sequence)`` pairs.
    """
    mm = mismatch_reporters()
    ls = load_seqs()

    successes = {}
    # .items() is available on every Python version (iteritems is not)
    for name, primer in mm.items():
        if 'fwd' not in name:
            continue
        core = primer[6:-1]
        successes[name] = [
            (seq_name, seq) for seq_name, seq in ls.items()
            if core in zutils.reverse_complement(seq)
        ]

    # print('') emits the same single newline as a bare print statement
    print('')
    print('')
    print('')

    summary = []
    for name, hits in successes.items():
        hit_names = [hit[0] for hit in hits]
        summary.append(str((name, len(hit_names), hit_names)))
    print(' ' + '\n '.join(sorted(summary)))
    return successes
def get_key_factor(factors, normalize=False):
    """Pick the longest factor and shorten it to at most 20 characters.

    With more than three factors the first and last are ignored. With
    ``normalize`` set, the reverse complement replaces the factor when it
    compares smaller. A factor longer than 20 is reduced to its first 10
    plus its last 10 characters.

    :param factors: non-empty sequence of factors.
    :param normalize: whether to prefer the smaller of the factor and its
        reverse complement.
    :return: the selected (possibly shortened) factor.
    """
    candidates = factors[1:-1] if len(factors) > 3 else factors
    key_factor = max(candidates, key=len)

    if normalize:
        mirrored = reverse_complement(key_factor)
        if mirrored < key_factor:
            key_factor = mirrored

    size = len(key_factor)
    if 10 < size < 20:
        # head and tail together span the whole factor here, so the
        # content is unchanged
        key_factor = key_factor[:10] + key_factor[-(size - 10):]
    elif size > 20:
        key_factor = key_factor[:10] + key_factor[-10:]
    return key_factor
def main(fasta_block):
    """Translate every ORF candidate of the first FASTA record.

    The record and its reverse complement are converted to RNA; for every
    'AUG' found by ``find_all`` the result of ``translate`` is collected.
    ``None`` results are dropped and duplicates removed.

    :param fasta_block: FASTA text handed to ``fasta_breakup``.
    :return: the distinct protein strings joined by newlines (order is
        that of set iteration).
    """
    dna_dict = fasta_breakup(fasta_block)
    # list(...) keeps this working where dict.values() is a view
    dna_target = list(dna_dict.values())[0]

    rna_target = dna_to_rna(dna_target)
    rev_comp_dna = "".join(reverse_complement(dna_target))
    rev_comp_rna = dna_to_rna(rev_comp_dna)

    protein_strings = []
    # forward strand first, then the reverse complement
    for rna in (rna_target, rev_comp_rna):
        for start in find_all(rna, 'AUG'):
            protein_strings.append(translate(start, rna))

    # dedupe and remove Nones
    protein_strings = [prot for prot in protein_strings if prot is not None]
    cleaned_proteins = list(set(protein_strings))
    return "\n".join(cleaned_proteins)
def make_oligos(self):
    """Build the mutagenic oligos for every entry of ``self.library``.

    For each library key the selected codons are substituted into a copy
    of ``self.nt_seq``; then, for each entry of the key's ``'oligo_set'``,
    an oligo of ``oligo_length`` nucleotides is assembled from wild-type
    flanks and the lower-cased mutated stretch, reverse-complemented when
    ``self.revcom`` is set, and appended to ``self.oligos[key]``.

    Returns ``None``. The method stops early, after printing a message,
    when there is no library or when a mutated stretch runs past the
    start of the next oligo.
    """
    if not self.library:
        print('Must have mutation library before making oligos.')
        return

    for aa in self.library:
        entry = self.library[aa]

        # Substitute one concrete codon for each selected degenerate codon.
        seq = self.nt_seq
        selected = [(res[1], res[2], dgn)
                    for res, dgn, tf in zip(self.code, entry['dgn'],
                                            entry['cdns'])
                    if tf]
        for s, e, c in selected:
            # TODO: Codon usage..
            c = ''.join([next(iter(utils.dgn_to_nts[nt])) for nt in c])
            seq = seq[:s] + c + seq[e:]

        oligo_positions = []
        mut_positions = []
        self.oligos[aa] = []
        for s in entry['oligo_set']:
            mut_start = s[0] * 3
            mut_end = s[-1] * 3 + 3
            mut = seq[mut_start:mut_end]
            # Floor division keeps the slice bounds integral; true
            # division would yield float indices on Python 3.
            pre_len = (oligo_length - len(mut)) // 2
            oligo_start = mut_start - pre_len
            oligo_end = oligo_start + oligo_length
            # flanks come from the wild-type sequence, the mutated
            # stretch is marked in lower case
            oligo_seq = (self.nt_seq[oligo_start:mut_start] + mut.lower()
                         + self.nt_seq[mut_end:oligo_end])
            if self.revcom:
                oligo_seq = utils.reverse_complement(oligo_seq)
            self.oligos[aa].append(oligo_seq)
            oligo_positions.append([oligo_start, oligo_end])
            mut_positions.append([mut_start, mut_end])

        # NOTE(review): assumed to run once per library key; the source
        # this was recovered from had lost its indentation - confirm.
        for i, pos in enumerate(oligo_positions[1:]):
            # mutation i must end before oligo i + 1 starts
            if mut_positions[i][1] > pos[0]:
                print('Oligo clash detected..')
                return
def mismatch_reporters():
    """Build forward and reverse primers for every reporter/mismatch combination.

    For each reporter and each mismatch count in (0, 2, 3), the first
    ``m_ct`` 'N' placeholders are replaced by a mismatching nucleotide and
    the remaining ones by the reporter's own nucleotide ``ids[k]``. The
    forward primer wraps the result in 'CTAGA'...'G'; the reverse primer
    wraps its reverse complement in 'GATCC'...'T'.

    :return: dict mapping ``'{reporter}_{m_ct}mm={nt}_rev'`` and
        ``'..._fwd'`` names to primer sequences.
    """
    enzymes = {'fwd': ['CTAGA', 'G'], 'rev': ['GATCC', 'T']}
    fwd_head, fwd_tail = enzymes['fwd']
    rev_head, rev_tail = enzymes['rev']

    primers, names = [], []
    # .items() is available on every Python version (iteritems is not)
    for k, r in reporters.items():
        for m_ct in 0, 2, 3:
            if m_ct > 0:
                nts = [n for n in nt_list if n != ids[k]]
            else:
                nts = [ids[k]]
            for mm in nts:
                # count=0 means "replace all"; in that case mm is ids[k]
                # itself, so every N becomes the matching nucleotide.
                rep = re.sub('N', mm, r, count=m_ct)
                rep = re.sub('N', ids[k], rep)

                names.append('{0}_{1}mm={2}_rev'.format(k, m_ct, mm))
                primers.append(
                    rev_head + zutils.reverse_complement(rep) + rev_tail)
                names.append('{0}_{1}mm={2}_fwd'.format(k, m_ct, mm))
                primers.append(fwd_head + rep + fwd_tail)
    return dict(zip(names, primers))
def locate_breakpoint(input_file, output_file, reference_file, margin=200):
    """Refine breakpoint positions by aligning reference flanks to consensus.

    Each tab-separated input row is read as: ``row[0]`` cluster id,
    ``row[1]`` chromosome, ``row[2]``/``row[3]`` start/end, ``row[4]``
    direction and ``row[5]`` consensus sequence. The reference sequence
    around the breakpoint (extended by ``margin`` on the side given by
    the direction) is aligned to the first 1000 bases of the consensus
    with ``nanomonsv.long_read_validate.ssw_check``. For every row with
    a '+' alignment, one line is written: cluster id, chromosome, refined
    reference position, direction and the consensus from the breakpoint
    onwards.

    :param input_file: tab-separated input path.
    :param output_file: path of the tab-separated result file.
    :param reference_file: FASTA path opened with ``pysam.FastaFile``.
    :param margin: number of reference bases added beyond the interval.
    """

    class Breakpoint_locator(object):
        """Holds one cluster at a time and a temp directory for alignment files."""

        def __init__(self):
            self.cluster_id = None
            self.consensus = None
            self.seq_around_bp = None
            self.seq_start = None
            self.seq_end = None
            self.seq_dir = None
            self.bp_pos_consensus = None
            self.bp_pos_reference = None
            self.tmp_dir = tempfile.mkdtemp()

        def __del__(self):
            # the temp directory lives as long as the locator
            shutil.rmtree(self.tmp_dir)

        def initialize(self, cluster_id, consensus, seq_around_bp, seq_start,
                       seq_end, seq_dir):
            """Load a new cluster and reset the previous result."""
            self.cluster_id = cluster_id
            self.consensus = consensus
            self.seq_around_bp = seq_around_bp
            self.seq_start = seq_start
            self.seq_end = seq_end
            self.seq_dir = seq_dir
            self.bp_pos_consensus = None
            self.bp_pos_reference = None
            # only the first 1000 bases of the consensus are aligned
            if len(self.consensus) >= 1000:
                self.consensus = self.consensus[:1000]

        def locate_by_alignment(self):
            """Align the reference flank to the consensus and set the positions.

            Leaves both positions as ``None`` when no alignment is
            reported for the query or when it is not on the '+' strand.
            """
            query_path = self.tmp_dir + '/' + self.cluster_id + ".query.fa"
            target_path = self.tmp_dir + '/' + self.cluster_id + ".target.fa"
            with open(query_path, 'w') as hout:
                print(">query_%s\n%s" % (self.cluster_id, self.seq_around_bp),
                      file=hout)
            with open(target_path, 'w') as hout:
                print(">target_%s\n%s" % (self.cluster_id, self.consensus),
                      file=hout)

            alignment_info = nanomonsv.long_read_validate.ssw_check(
                target_path, query_path)

            query_key = "query_" + self.cluster_id
            if query_key not in alignment_info:
                return
            _, tstart_a, tend_a, qstart_a, qend_a, strand_a = \
                alignment_info[query_key]
            if strand_a != '+':
                return

            self.bp_pos_consensus = tend_a
            # query bases left unaligned at the end of the flank
            unaligned_tail = len(self.seq_around_bp) - qend_a
            if self.seq_dir == '+':
                self.bp_pos_reference = self.seq_end - unaligned_tail
            else:
                self.bp_pos_reference = self.seq_start + unaligned_tail

    bp_loc = Breakpoint_locator()
    try:
        # The FASTA handle and both files are closed on every exit path;
        # previously the FASTA handle stayed open if a row raised.
        with pysam.FastaFile(reference_file) as fasta_file, \
                open(input_file, 'r') as hin, \
                open(output_file, 'w') as hout:
            for row in csv.reader(hin, delimiter='\t'):
                if row[4] == '+':
                    seq_around_bp = fasta_file.fetch(
                        row[1], int(row[2]) - margin, int(row[3]))
                    bp_loc.initialize(row[0], row[5], seq_around_bp,
                                      int(row[2]) - margin + 1, int(row[3]),
                                      '+')
                else:
                    seq_around_bp = fasta_file.fetch(
                        row[1], int(row[2]), int(row[3]) + margin)
                    seq_around_bp = reverse_complement(seq_around_bp)
                    bp_loc.initialize(row[0], row[5], seq_around_bp,
                                      int(row[2]) + 1, int(row[3]) + margin,
                                      '-')

                bp_loc.locate_by_alignment()
                if bp_loc.bp_pos_reference is not None:
                    print("%s\t%s\t%d\t%s\t%s" %
                          (row[0], row[1], bp_loc.bp_pos_reference, row[4],
                           row[5][bp_loc.bp_pos_consensus:]),
                          file=hout)
    finally:
        # drop the locator so its temp directory is removed via __del__
        del bp_loc
def get_node_seq(self, nodeId):
    """Return the stripped sequence of a node, oriented by the sign of its id.

    :param nodeId: node id; a negative value selects node ``-nodeId`` and
        returns the reverse complement of its sequence.
    :return: the node sequence with surrounding whitespace removed.
    """
    if nodeId >= 0:
        return self.nodes[nodeId].nodeSeq.strip()
    return reverse_complement(self.nodes[-nodeId].nodeSeq.strip())
def _classify_snv_outcomes(chr_pos, t_allele, n_allele, pos_to_codon_dict):
    """Classify each SNV as synonymous / non-synonymous / splice_site.

    ``chr_pos``, ``t_allele`` and ``n_allele`` are parallel sequences.
    Every entry registered for a position in ``pos_to_codon_dict`` has
    the form ``'pos_in_codon;codon;strand'`` and yields one label; the
    labels of a position are joined with ':'.
    """
    outcomes = []
    for idx in range(len(chr_pos)):
        labels = []
        # there are genes with different reading frames
        for item in pos_to_codon_dict[chr_pos[idx]]:
            fields = item.split(';')
            pos_in_codon = fields[0]
            codon_seq = fields[1]
            if pos_in_codon == 'splice_site':
                labels.append('splice_site')
                continue

            # check if base change causes amino acid change
            offset = int(pos_in_codon)
            codon_seq_list = list(codon_seq)
            if codon_seq_list[offset] == n_allele[idx]:
                codon_seq_list[offset] = t_allele[idx]
            elif codon_seq_list[offset] == utils.reverse_complement(
                    n_allele[idx]):
                codon_seq_list[offset] = utils.reverse_complement(
                    t_allele[idx])
            else:
                # The entry list is stringified explicitly: concatenating
                # it to a str raised TypeError before.
                # NOTE(review): the codon is left unchanged here, so the
                # mutation is reported as 'synonymous' below - confirm
                # that this is the intended fallback.
                print("error: " + str(chr_pos[idx]) +
                      str(pos_to_codon_dict[chr_pos[idx]]))

            new_codon_seq = ''.join(codon_seq_list)
            if codon_table[codon_seq] == codon_table[new_codon_seq]:
                labels.append('synonymous')
            else:
                labels.append('non-synonymous')
        outcomes.append(':'.join(labels))
    return outcomes


def SNV_main(args, mut_df=None, frameshift_df=None):
    """Classify the SNVs of a MAF file and of randomised placements of them.

    Steps: load the position/trinucleotide tables and the cancer-gene
    codon table from ``db/``, read and filter the MAF file, classify the
    observed coding SNVs, then repeat the classification
    ``opts['simulation_number']`` times with positions redrawn by
    ``weighted_choice`` from the positions sharing the same trinucleotide
    context. Observed and simulated rows go to
    ``<output_prefix>.final_snv_result.csv`` (simulations are appended
    without a header).

    :param args: namespace; the keys read are ``gene``, ``maf_file``,
        ``unique``, ``output_prefix`` and ``simulation_number``.
    :param mut_df: ignored - it is replaced by the MAF file contents.
    :param frameshift_df: not used by this function.
    """
    opts = vars(args)

    ############################
    # read in necessary files
    ############################
    # read in position to trinucleotide file
    logger.info('reading pos_to_nuc_dictionary...')
    pos_to_nuc_df = pd.read_table('db/merged_pos_to_context_class_final.txt',
                                  sep='\t',
                                  names=('pos', 'trinucleotide',
                                         'coefficient'))
    # every context label is expected to be exactly 4 characters long
    context_len = pos_to_nuc_df['trinucleotide'].astype(str).str.len()
    if len(pos_to_nuc_df[context_len != 4]) != 0:
        logger.info('something is wrong with reading pos_to_nuc_dictionary...')
        sys.exit()

    # make dictionary
    pos_to_nuc = pos_to_nuc_df.set_index('pos')['trinucleotide'].to_dict()
    pos_to_nuc_keys = pos_to_nuc.keys()

    # read in trinucleotide to position file
    logger.info('reading nuc_to_pos_dictionary...')
    nuc_to_pos_dict = {}
    nuc_to_cumsum_dict = {}
    for k in strand_trinucs:
        df = pd.read_table('db/' + k + '_data.txt',
                           sep='\t',
                           names=('pos', 'trinucleotide', 'coefficient'))
        nuc_to_pos_dict[k] = df['pos'].values
        p = df['coefficient'].values
        # normalised cumulative weights for weighted sampling
        cdf = np.cumsum(p)
        cdf /= cdf[-1]
        nuc_to_cumsum_dict[k] = cdf

    # read in cancer gene file
    logger.info('reading cancer_gene_dictionary...')
    pos_to_codon_dict = {}
    pos_to_gene_dict = {}
    with open('db/cancergene_pos_list_final.txt', 'r') as hin:
        for line in hin:
            F = line.rstrip('\n').split(' ')
            # NOTE(review): F[4] is read below, so a row with exactly
            # four fields still raises IndexError - confirm the minimum.
            if len(F) < 4:
                continue
            if opts['gene'] and F[0] != opts['gene']:
                continue
            # The original test `len(F[3]) != 3 | len(F[3]) != 11` bound
            # `|` tighter than `!=` and let some other lengths through;
            # the field must be 3 or 11 characters long.
            if len(F[3]) not in (3, 11):
                logger.info("codon frame error...: {0}, {1}, {2}".format(
                    F[0], F[1], F[3]))
                sys.exit()
            pos_to_gene_dict.setdefault(F[1], []).append(F[0])
            pos_to_codon_dict.setdefault(F[1], []).append(';'.join(
                [F[2], F[3], F[4]]))

    #############################
    # modify mutation dataframe
    #############################
    # get mutation df
    mut_df = pd.read_csv(opts['maf_file'], sep='\t')
    orig_num_mut = len(mut_df)

    # rename columns to fit my internal column names
    rename_dict = {
        'Hugo_Symbol': 'Gene',
        'Tumor_Sample_Barcode': 'Tumor_Sample',
        'Tumor_Seq_Allele2': 'Tumor_Allele',
        'Tumor_Seq_Allele': 'Tumor_Allele',
    }
    mut_df.rename(columns=rename_dict, inplace=True)

    # drop rows with missing info
    na_cols = [
        'Gene', 'Reference_Allele', 'Tumor_Allele', 'Start_Position',
        'Chromosome'
    ]
    mut_df = mut_df.dropna(subset=na_cols)
    logger.info('Kept {0} mutations after droping mutations with missing '
                'information (Droped: {1})'.format(len(mut_df),
                                                   orig_num_mut - len(mut_df)))

    if opts['gene']:
        mut_df = gene_analysis(mut_df, opts['gene'], pos_to_nuc_keys,
                               opts['maf_file'])

    #############################
    # SNV dataframe
    #############################
    # select valid single nucleotide variants only and corrects for
    # 1-based coordinates!! (important)
    snv_df = filtering.snv_mutation_df(mut_df, opts['unique'])

    # get chromosome-position
    snv_df['Chrom_Pos'] = snv_df['Chromosome'] + ':' + snv_df[
        'Start_Position'].astype(str)

    # remove SNVs of non-coding regions
    orig_len = len(snv_df)
    snv_df = snv_df[snv_df['Chrom_Pos'].isin(pos_to_nuc_keys)]
    after_len = len(snv_df)
    log_msg = ('Dropped {num_dropped} non-coding SNV mutations.'.format(
        num_dropped=orig_len - after_len))
    logger.info(log_msg)

    #############################
    # SNV check
    #############################
    # get trinucleotide context
    snv_df['trinucleotide'] = snv_df['Chrom_Pos'].apply(
        lambda x: pos_to_nuc[x])
    snv_df['trinucleotide'] = snv_df['trinucleotide'].astype('category')
    snv_df['Chrom_Pos'] = snv_df['Chrom_Pos'].astype('category')

    # check if the mutation is in the gene_list
    snv_df['gene'] = snv_df['Chrom_Pos'].map(pos_to_gene_dict)
    tmp_snv_df = snv_df.dropna(subset=['gene'])

    # check if the mutation is synonymous / non-synonymous / splice site
    tmp_snv_df['outcome'] = _classify_snv_outcomes(
        tmp_snv_df.Chrom_Pos.values, tmp_snv_df.Tumor_Allele.values,
        tmp_snv_df.Reference_Allele.values, pos_to_codon_dict)
    tmp_snv_df['original'] = 'original'
    # NOTE(review): 'Chrom:Pos' is requested here although the column
    # created above is named 'Chrom_Pos' - confirm the intended header.
    tmp_snv_df.to_csv(opts['output_prefix'] + '.final_snv_result.csv',
                      columns=[
                          'Tumor_Sample', 'original', 'Gene', 'Chromosome',
                          'Start_Position', 'End_Position',
                          'Reference_Allele', 'Tumor_Allele', 'Chrom:Pos',
                          'gene', 'outcome'
                      ],
                      index=False)

    #############################
    # SNV simulation
    #############################
    max_num_sim = opts['simulation_number']  # number of simulations
    for num_sim in range(max_num_sim):
        log_msg = ('Performing simulation {num_simulation}...'.format(
            num_simulation=num_sim + 1))
        logger.info(log_msg)

        # randomization: redraw each position among those sharing the
        # SNV's trinucleotide context
        trinuc = snv_df['trinucleotide'].values
        new_pos_list = []
        for idx in range(snv_df.shape[0]):
            wr = weighted_choice(nuc_to_pos_dict[trinuc[idx]],
                                 nuc_to_cumsum_dict[trinuc[idx]])
            new_pos_list.append(wr)
        snv_df['New_chr_pos'] = new_pos_list

        # check if new chr_pos is in gene_list
        snv_df['New_gene'] = snv_df['New_chr_pos'].map(pos_to_gene_dict)
        tmp_snv_df = snv_df.dropna(subset=['New_gene'])

        tmp_snv_df['New_outcome'] = _classify_snv_outcomes(
            tmp_snv_df.New_chr_pos.values, tmp_snv_df.Tumor_Allele.values,
            tmp_snv_df.Reference_Allele.values, pos_to_codon_dict)
        tmp_snv_df['simulation_num'] = 'simulation' + str(int(num_sim) + 1)
        tmp_snv_df.to_csv(opts['output_prefix'] + '.final_snv_result.csv',
                          columns=[
                              'Tumor_Sample', 'simulation_num', 'Gene',
                              'Chromosome', 'Start_Position', 'End_Position',
                              'Reference_Allele', 'Tumor_Allele',
                              'New_chr_pos', 'New_gene', 'New_outcome'
                          ],
                          mode='a',
                          header=False,
                          index=False)

    log_msg = ('Successfully finished. gene:{gene}, maf:{maf}'.format(
        gene=opts['gene'], maf=opts['maf_file']))
    logger.info(log_msg)
def context_generator(fa_file, chroms, min_length=3, max_length=5, padding=1):
    """
    Yield (chromosome, target k-mer, context k-mers) training samples from
    a fasta file.

    A window slides one base at a time along each chromosome. At every
    position ``2 * padding + 1`` random k-mer sizes between min and max
    length are drawn, and the i-th k-mer starts i bases into the window.
    The window is reverse-complemented at random, so both polarities are
    sampled.

    E.g. min_length=3, max_length=5, padding=1
         rnd_kmer_sizes = [4, 3, 5]
         CATATCA -> ['CATA', 'ATA', 'TATCA'] -> ('chr?', 'ATA', ['CATA', 'TATCA'])

    DNA sequences are converted into ints for the final result
    -> ('chr?', 12, [140, 1140])

    Args:
        fa_file (str):     Path to fasta file with accompanying Samtools index file (*.fai).
        chroms (list):     Ordered list of chromosome/parent ids to iterate over.
        min_length (int):  Minimal allowed kmer size (nt).
        max_length (int):  Maximum allowed kmer size (nt).
        padding (int):     Number of kmers, on each side, added to the context.

    Yields:
        chromosome_id (str), target_seq (int), array of context_seqs (ints)
    """
    kmer_sizes = np.arange(min_length, max_length + 1)
    n_kmers = padding * 2 + 1

    with pysam.FastaFile(fa_file) as ref:
        for chrom in chroms:
            chr_seq = ref.fetch(chrom)
            for subseq_pos in range(len(chr_seq)):
                # Create random kmer sizes.
                sizes = np.random.choice(kmer_sizes, n_kmers)
                window = sizes.size + sizes.max()

                # Extract sub-sequence; skip windows running off the end.
                subseq = chr_seq[subseq_pos:subseq_pos + window]
                if len(subseq) < window:
                    continue

                # Randomly use both strands for learning (Data Augmentation).
                if np.random.randint(2):
                    subseq = reverse_complement(subseq)

                try:
                    num_kmers = [
                        multisize_patten2number(
                            subseq[offset:offset + size], min_length,
                            max_length)
                        for offset, size in enumerate(sizes)
                    ]
                    context = np.array(num_kmers[:padding] +
                                       num_kmers[-padding:])
                    target = num_kmers[padding]
                    yield chrom, target, context
                except (KeyError, IndexError, ValueError):
                    # This window could not be converted; skip it.
                    pass
def gencode_codon_list(target_gene, output):
    """
    Append per-base codon annotations for `target_gene` to `output`.

    Reads 'db/gencode_coding.modified.bed' (tab separated; columns are
    chr, start, end, ID, type, strand, gene, order, sum) and relies on the
    module-level names `chroms` (allowed chromosome names without the 'chr'
    prefix), `fa` (object providing fetch(reference=, start=, end=)) and
    `utils.reverse_complement`.

    For every "coding" row of the gene whose ID is among the gene's IDs
    (rows on chrY excluded), one space-separated line is written per base:
        gene ID chrom:position position_in_codon codon strand
    For '-' strand rows the position in codon is reported as
    2 - position_in_codon and the codon is reverse complemented.
    For every "intron" row of the gene, four lines are written (first two
    and last two positions of the interval) with 'splice_site' in both the
    position-in-codon and codon columns.

    NOTE(review): codon offsets use the cumulative coding length in file
    order for both strands -- confirm this matches the row order of the
    bed file for minus-strand transcripts.

    Args:
        target_gene (str): Gene name matched against the 'gene' column.
        output (str): Path of the file the lines are appended to.
    """
    bed_path = 'db/gencode_coding.modified.bed'

    gencode_df = pd.read_table(
        bed_path,
        names=('chr', 'start', 'end', 'ID', 'type', 'strand', 'gene',
               'order', 'sum'))
    gencode_df = gencode_df[gencode_df['gene'] == target_gene]
    gencode_df = gencode_df[gencode_df['chr'] != 'chrY']
    # Unique IDs of the target gene; a set gives O(1) membership tests.
    transcript_ids = set(gencode_df['ID'].unique())

    # First pass: concatenate the coding sequence of each ID in file order.
    gene_seq_dict = {}
    with open(bed_path, 'r') as hin:
        for line in hin:
            F = line.rstrip('\n').split('\t')
            chrom = F[0].replace('chr', '')
            if chrom not in chroms:
                continue
            if F[4] != "coding":
                continue
            coding_start = int(F[1])
            coding_end = int(F[2])
            ID = F[3]
            if F[6] != target_gene:
                continue
            if ID in transcript_ids:
                exon_seq = fa.fetch(reference=chrom, start=coding_start,
                                    end=coding_end).upper()
                gene_seq_dict[ID] = gene_seq_dict.get(ID, '') + exon_seq

    # Second pass: emit codon information per base and splice sites.
    exon_length_dict = {}
    with open(output, 'a') as hout, open(bed_path, 'r') as hin:
        for line in hin:
            F = line.rstrip('\n').split('\t')
            chrom = F[0].replace('chr', '')
            if chrom not in chroms:
                continue

            feature_type = F[4]
            if feature_type not in ("coding", "intron"):
                continue

            start = int(F[1])
            end = int(F[2])
            ID = F[3]
            strand = F[5]
            gene = F[6]
            if gene != target_gene:
                continue

            if feature_type == "intron":
                for pos in (start, start + 1, end - 2, end - 1):
                    print(gene, ID, ':'.join([chrom, str(pos)]),
                          'splice_site', 'splice_site', strand, file=hout)
                continue

            if ID not in transcript_ids:
                continue

            # Number of coding bases of this ID already processed.
            offset = exon_length_dict.setdefault(ID, 0)
            if strand not in ('+', '-'):
                continue

            exon_length = end - start
            coding_seq = gene_seq_dict[ID]
            for pos in range(exon_length):
                # relative pos in the gene
                pos2 = pos + offset
                codon_start = (pos2 // 3) * 3
                pos_in_codon = pos2 % 3
                codon = coding_seq[codon_start:(codon_start + 3)]
                if strand == '-':
                    pos_in_codon = 2 - pos_in_codon
                    codon = utils.reverse_complement(codon)
                print(gene, ID, ':'.join([chrom, str(start + pos)]),
                      pos_in_codon, codon, strand, file=hout)
            exon_length_dict[ID] = offset + exon_length
def count_read_by_alignment(input_file, bam_file, reference, output_file,
                            validate_sequence_length=200,
                            score_ratio_thres=1.4, start_pos_thres=0.2,
                            end_pos_thres=0.8):
    """
    Count, per input record, the reads near it and those matching its contig.

    For each tab-separated row of `input_file` the first four columns form
    the record key. Column 2 is used as a reference sequence name, column 3
    as a position, column 4 is compared against '+', and column 5 is a
    sequence that is appended (truncated to `validate_sequence_length`) to a
    reference segment fetched next to the position (reverse complemented
    unless column 4 is '+').

    Reads overlapping +/-100 bases around the position (secondary
    alignments ignored) are associated with the key. Their sequences are
    then taken from the primary alignments, in original read orientation,
    and aligned against the contig with edlib (mode "HW"). A read supports
    the record when its edit distance is below 25% of the contig length.

    Writes one line per key that has at least one read to `output_file`:
        <key (4 columns)>\\t<number of reads>\\t<number of supporting reads>

    Two temporary files, `output_file` + ".tmp.long_read_seq.unsorted" and
    ".tmp.long_read_seq.sorted", are created and removed; the external
    `sort` command must be available.

    Args:
        input_file (str): Path of the tab-separated record file.
        bam_file (str): Path of the BAM file (fetched by region, so an
            index is required).
        reference (str): Path of the reference FASTA file.
        output_file (str): Path of the result file (overwritten).
        validate_sequence_length (int): Length of the reference segment and
            maximal length of the appended sequence.
        score_ratio_thres (float): Kept for interface compatibility; not
            used by the edlib based check.
        start_pos_thres (float): Kept for interface compatibility; unused.
        end_pos_thres (float): Kept for interface compatibility; unused.
    """

    class Alignment_counter(object):
        """Collects the reads of one key and counts the supporting ones."""

        def __init__(self, score_ratio_thres, start_pos_thres, end_pos_thres):
            self.key = ''
            self.query_seq = None
            self.target_seq_list = []
            # Stored but not consulted by count_alignment.
            self.score_ratio_thres = score_ratio_thres
            self.start_pos_thres = start_pos_thres
            self.end_pos_thres = end_pos_thres

        def initialize(self, key, query_seq):
            """Start collecting reads for a new key / contig."""
            self.key = key
            self.query_seq = query_seq
            self.target_seq_list = []

        def add_query_seq(self, read_name, read_seq):
            """Register one read sequence for the current key."""
            self.target_seq_list.append((read_name, read_seq))

        def count_alignment(self):
            """Return (total reads, supporting reads), or None without reads."""
            if not self.target_seq_list:
                return None

            max_edit_distance = len(self.query_seq) * 0.25
            num_supporting = 0
            for _, read_seq in self.target_seq_list:
                tres = edlib.align(self.query_seq, read_seq,
                                   mode="HW", task="locations")
                if tres["editDistance"] < max_edit_distance:
                    num_supporting += 1

            return (len(self.target_seq_list), num_supporting)

    unsorted_path = output_file + ".tmp.long_read_seq.unsorted"
    sorted_path = output_file + ".tmp.long_read_seq.sorted"

    rname2key = {}
    key2contig = {}
    with pysam.AlignmentFile(bam_file, 'rb') as bam_hin, \
            pysam.FastaFile(reference) as reference_fasta:

        with open(input_file, 'r') as hin:
            for row in csv.reader(hin, delimiter='\t'):
                key = '\t'.join(row[:4])
                pos = int(row[2])

                for read in bam_hin.fetch(row[1], max(pos - 100, 0),
                                          pos + 100):
                    if read.is_secondary:
                        continue
                    rname2key.setdefault(read.qname, []).append(key)

                if row[3] == '+':
                    contig = reference_fasta.fetch(
                        row[1],
                        max(pos - validate_sequence_length - 1, 0), pos)
                else:
                    contig = reference_fasta.fetch(
                        row[1], pos - 1, pos + validate_sequence_length - 1)
                    contig = reverse_complement(contig)

                key2contig[key] = contig + row[4][:validate_sequence_length]

        # A read may hit the same key through several alignments.
        rname2key = {rname: sorted(set(keys))
                     for rname, keys in rname2key.items()}

        # Dump (key, read name, read sequence) for every primary alignment
        # of an associated read; sequences are restored to read orientation.
        with open(unsorted_path, 'w') as hout:
            for read in bam_hin.fetch():
                if read.is_secondary or read.is_supplementary:
                    continue
                if read.qname not in rname2key:
                    continue

                if read.is_reverse:
                    read_seq = reverse_complement(read.query_sequence)
                else:
                    read_seq = read.query_sequence

                for key in rname2key[read.qname]:
                    print("%s\t%s\t%s" % (key, read.qname, read_seq),
                          file=hout)

    # Sorting makes the lines of one key contiguous for the grouping below.
    with open(sorted_path, 'w') as hout:
        subprocess.check_call(["sort", "-k1,1", unsorted_path], stdout=hout)
    os.remove(unsorted_path)

    alignment_counter = Alignment_counter(score_ratio_thres, start_pos_thres,
                                          end_pos_thres)

    def write_current_count(hout):
        # Emit the counts of the key collected so far, if it has any reads.
        acount = alignment_counter.count_alignment()
        if acount is not None:
            print("%s\t%d\t%d" % (alignment_counter.key, acount[0], acount[1]),
                  file=hout)

    with open(sorted_path, 'r') as hin, open(output_file, 'w') as hout:
        for row in csv.reader(hin, delimiter='\t'):
            key = '\t'.join(row[:4])
            if key != alignment_counter.key:
                write_current_count(hout)
                alignment_counter.initialize(key, key2contig[key])
            alignment_counter.add_query_seq(row[4], row[5])

        # Flush the last key.
        write_current_count(hout)

    os.remove(sorted_path)
import neural_network as nn
import utils as util
from random import shuffle
import numpy as np
import random

# Read in sites.
posfile = '/Users/student/Documents/Algorithms/Alg_final_project/data/rap1-lieb-positives.txt'
negfile = '/Users/student/Documents/Algorithms/Alg_final_project/data/yeast-upstream-1k-negative.fa'
testfile = '/Users/student/Documents/Algorithms/Alg_final_project/data/rap1-lieb-test.txt'

poslist = util.read_pos(posfile)
finaltestlist = util.read_pos(testfile)

# Add the reverse complement of every positive sequence.
posreversecomp = [util.reverse_complement(seq) for seq in poslist]
poslist = poslist + posreversecomp

# Same augmentation for the negative sequences.
neglist = util.read_fasta(negfile)
negreversecomp = [util.reverse_complement(seq) for seq in neglist]
neglist = neglist + negreversecomp

# Drop every negative that also occurs among the positives. Build a new list
# instead of calling remove() while iterating, which skips the element that
# follows each removal; the set is built once for O(1) lookups.
positive_set = set(poslist)
neglist = [seq for seq in neglist if seq not in positive_set]

print('neg', len(neglist))