def test_read_fasta_protein(protein: Path, protein_gzip: Path):
    """Check that the plain and the gzip protein fixtures parse identically.

    Each fixture must hold exactly one record whose defline, ID, description
    and sequence match the expected values below.
    """
    # Adjacent literals give the single-line residue string directly, so no
    # newline stripping is needed.
    expected = (
        "QIKDLLVSSSTDLDTTLVLVNAIYFKGMWKTAFNAEDTREMPFHVTKQESKPVQMMCMNNSFNVATLPAE"
        "KMKILELPFASGDLSMLVLLPDEVSDLERIEKTINFEKLTEWTNPNTMEKRRVKVYLPQMKIEEKYNLTS"
        "VLMALGMTDLFIPSANLTGISSAESLKISQAVHGAFMELSEDGIEMAGSTGVIEDIKHSPESEQFRADHP"
        "FLFLIKHNPTNTIVYFGRYWSP"
    )

    # Same checks for both fixtures. The gzip half used to count the records
    # of the uncompressed file, so the compressed file was never counted.
    for path in (protein, protein_gzip):
        with read_fasta(path) as file:
            item = file.read_item()
            assert item.defline == "P01013 GENE X PROTEIN (OVALBUMIN-RELATED)"
            assert item.has_desc
            assert item.desc == "GENE X PROTEIN (OVALBUMIN-RELATED)"
            assert item.id == "P01013"
            assert item.sequence == expected

        # Fresh reader so the count starts from the first record; the context
        # manager closes it even when the assertion fails.
        with read_fasta(path) as file:
            assert len(list(file)) == 1
def test_write_fasta(tmp_path: Path):
    """Round-trip two records through ``write_fasta`` and ``read_fasta``.

    The round trip is done once for ``output.faa`` and once for
    ``output.faa.xz``, both created inside *tmp_path*.
    """
    defline = ["defline1", "defline2 description"]
    sequence = ["ABCD", "ABCD" * 100]

    # Address the files through tmp_path instead of os.chdir(): a chdir is
    # process-wide and would leak into every test that runs afterwards.
    for name in ("output.faa", "output.faa.xz"):
        filepath = str(tmp_path / name)

        with write_fasta(filepath) as writer:
            for expected_defline, expected_sequence in zip(defline, sequence):
                writer.write_item(expected_defline, expected_sequence)

        with read_fasta(filepath) as reader:
            for expected_defline, expected_sequence in zip(defline, sequence):
                item = reader.read_item()
                assert item.defline == expected_defline
                assert item.sequence == expected_sequence
def extract_sequences_2(path, out_folder, input_beds):
    """Write one FASTA file per BED file, holding the sequence of each interval.

    Args:
        path: Either a directory with ``<species>_hard_50.fa`` genome files
            (e.g. ``'data/genomes/'``) or the path of a single genome FASTA
            whose basename starts with ``<species>_``.
        out_folder: Directory that receives one ``<bed name>.fa`` per BED file
            (e.g. ``'sdregions8'``).
        input_beds: Directory whose entries are processed one by one
            (e.g. ``'same8'``).

    Intervals whose key ends in ``'+'`` are written as-is; all others are
    written reverse-complemented. A summary is printed at the end.
    """
    # Genome sequences keyed by '<species>#<defline>'.
    main_fa_dict = dict()
    if os.path.isdir(path):
        for name in os.listdir(path):
            parts = name.split('_')
            if len(parts) > 2 and parts[1] == 'hard' and parts[2] == '50.fa':
                specie = parts[0]  # e.g. 'hg19'
                # os.path.join keeps this working whether or not `path` ends
                # with a separator (plain concatenation required one).
                fasta_path = os.path.join(path, f'{specie}_hard_50.fa')
                for item in read_fasta(fasta_path):
                    main_fa_dict[f'{specie}#{item.defline}'] = item.sequence
    else:
        specie = path.split('/')[-1].split('_')[0]
        for item in read_fasta(f"{path}"):
            main_fa_dict[f'{specie}#{item.defline}'] = item.sequence

    count_array = []
    num_ov_neg = 0
    for bed_name in os.listdir(input_beds):
        bed_path = f'{input_beds}/{bed_name}'
        modify_sequences(main_fa_dict, bed_path)

        # `with` guarantees the output file is closed even if building the
        # interval dictionary or slicing a sequence raises.
        with open(f'{out_folder}/{bed_name}.fa', 'w') as new_fa:
            intervals_by_chr, n_seqs, coverage = build_dict_2(bed_path, dict())
            count_array.append((n_seqs, coverage))

            for chr_ in intervals_by_chr:
                # The last character of the key is the strand; the rest is
                # the key into the genome dictionary.
                genome_seq = main_fa_dict[chr_[:-1]]
                for interval in intervals_by_chr[chr_]:
                    piece = genome_seq[interval.begin : interval.end]
                    if chr_[-1] == '+':
                        new_fa.write(
                            f'>{chr_}-{interval.begin}-{interval.end}\n{piece}\n'
                        )
                    else:
                        num_ov_neg += 1
                        new_fa.write(
                            f'>{chr_}-{interval.begin}-{interval.end}\n{Seq(piece).reverse_complement()}\n'
                        )

    count_array.sort(reverse=True)
    print(f'Num of rev comp: {num_ov_neg}')
    print('10 biggest clusters: (#of sequences, coverage)')
    for entry in count_array[:10]:
        print(entry)
def _test_read_fasta_correct(filepath: Path):
    """Exercise every reading style of ``read_fasta`` on a four-record file.

    Covers item-by-item reading (ending in ``StopIteration``), iteration,
    ``read_items()``, iteration inside a ``with`` block, and an explicit
    ``close()`` inside a ``with`` block.
    """
    deflines = ["ID1", "ID2", "ID3", "ID4"]
    sequences = ["GAGUUA", "CAUAACAAATT", "AAGAA", "AAGAA"]

    # One record at a time; the deflines carry no description, so id == defline.
    reader = read_fasta(filepath)
    for expected_id, expected_seq in zip(deflines, sequences):
        record = reader.read_item()
        assert record.defline == expected_id
        assert record.id == expected_id
        assert not record.has_desc
        assert record.sequence == expected_seq
    with pytest.raises(StopIteration):
        reader.read_item()
    reader.close()

    # Plain iteration over the reader.
    reader = read_fasta(filepath)
    for pos, record in enumerate(reader):
        assert record.defline == deflines[pos]
        assert record.sequence == sequences[pos]
    reader.close()

    # All records at once.
    reader = read_fasta(filepath)
    records = reader.read_items()
    for pos in range(len(deflines)):
        assert records[pos].defline == deflines[pos]
        assert records[pos].sequence == sequences[pos]
    reader.close()

    # Iteration under a context manager.
    with read_fasta(filepath) as reader:
        for pos, record in enumerate(reader):
            assert record.defline == deflines[pos]
            assert record.sequence == sequences[pos]

    # Closing explicitly inside the context manager must not fail on exit.
    with read_fasta(filepath) as reader:
        reader.close()
def get_data(filename: str):
    """Load the HMM, the FASTA targets and the expected results for *filename*.

    Reads ``<filename>.fasta.gz`` and ``<filename>.json.gz`` from the
    ``dcp.test`` package resources.

    Returns:
        A dict with keys ``"hmm"``, ``"targets"`` (parsed FASTA items) and
        ``"desired"``, which maps ``(multiple_hits, hmmer3_compat, target,
        profile)`` to the alt/null loglik, path and stream values of that
        JSON record.
    """
    fasta_text = gzip.decompress(
        pkg_resources.read_binary(dcp.test, f"{filename}.fasta.gz")
    ).decode()
    records = json.loads(
        gzip.decompress(pkg_resources.read_binary(dcp.test, f"{filename}.json.gz"))
    )

    key_fields = ("multiple_hits", "hmmer3_compat", "target", "profile")
    value_fields = (
        "alt_loglik",
        "alt_path",
        "alt_codon_stream",
        "alt_amino_stream",
        "null_loglik",
        "null_path",
        "null_codon_stream",
        "null_amino_stream",
    )
    desired = {
        tuple(rec[name] for name in key_fields): {
            name: rec[name] for name in value_fields
        }
        for rec in records
    }

    return {
        "hmm": dcp.example.get(f"{filename}.hmm"),
        "targets": fr.read_fasta(StringIO(fasta_text)).read_items(),
        "desired": desired,
    }
def extract_sequences(path, out_folder, input_beds):
    """Write ``SD_regions_<species>.fa`` for every genome found in *path*.

    Args:
        path: Directory with ``<species>_hard_50.fa`` genome files
            (e.g. ``'data/genomes/'``).
        out_folder: Directory that receives the output FASTA files
            (e.g. ``'sdregions8'``).
        input_beds: Directory that holds ``<species>_<species>/aligned/``
            sub-directories (e.g. ``'same8'``).

    For each genome, the intervals returned by ``build_dict`` are sliced out
    of the genome sequences and written as ``>key-begin-end`` records.
    """
    for name in os.listdir(path):
        parts = name.split('_')
        if not (len(parts) > 2 and parts[1] == 'hard' and parts[2] == '50.fa'):
            continue

        specie = parts[0]  # e.g. 'hg19'
        print(specie)

        # `with` closes the output even if reading the genome or slicing a
        # sequence raises part-way through.
        with open(f'{out_folder}/SD_regions_{specie}.fa', 'w') as new_fa:
            # Sequences of this genome keyed by '<species>#<defline>'.
            main_fa_dict = dict()
            # os.path.join keeps this working whether or not `path` ends
            # with a separator (plain concatenation required one).
            genome_path = os.path.join(path, f'{specie}_hard_50.fa')
            for item in read_fasta(genome_path):
                main_fa_dict[f'{specie}#{item.defline}'] = item.sequence
            print(len(main_fa_dict))

            intervals_by_key, _, _ = build_dict(
                f'{input_beds}/{specie}_{specie}/aligned/', dict())

            # A separate loop name avoids clobbering the directory-entry
            # variable of the outer loop.
            for key in intervals_by_key:
                for interval in intervals_by_key[key]:
                    new_fa.write(
                        f'>{key}-{interval.begin}-{interval.end}\n{main_fa_dict[key][interval.begin : interval.end]}\n'
                    )
def open_read_frames(fasta_file, format_print = False):
    """Return the distinct candidate proteins found in the first strand of a FASTA file.

    Both the strand and its reverse complement are scanned for stretches
    that begin with the start codon (the codon translated to ``'M'``), run
    in whole triplets, and are followed by a stop codon.

    Args:
        fasta_file: Passed to ``read_fasta``; only the first sequence is used.
        format_print: When true, print the candidate DNA stretches, the
            proteins and their lengths.

    Returns:
        A list of distinct translated candidates, in no particular order.
    """
    strands_by_name = read_fasta(fasta_file)
    forward = next(iter(strands_by_name.values()))
    # The reverse complement can hold reading frames as well.
    backward = reverse_complement(forward)

    stop_codons = [codon for codon, amino in dna_codon.items() if amino == 'Stop']
    # (M)ethionine marks the start of a frame.
    start_codon = [codon for codon, amino in dna_codon.items() if amino == 'M'][0]
    # Between the start and the stop codon only whole triplets are allowed.
    pattern = f'(({start_codon})(...)*?)({"|".join(stop_codons)})'

    all_cand_prot = []
    for strand in (forward, backward):
        matches = re.findall(pattern, strand, overlapped = True)
        # Group 0 of each match is the stretch from the start codon up to,
        # but excluding, the stop codon.
        candidate_dna = [groups[0] for groups in matches]
        if format_print:
            print(candidate_dna)
        candidate_prot = [translate_dna_to_prot(dna) for dna in candidate_dna]
        all_cand_prot = list(set(all_cand_prot + candidate_prot))

    if format_print:
        print('\n'.join(all_cand_prot))
        for prot in all_cand_prot:
            print(len(prot))
    return all_cand_prot
def consensus_profile(fasta_file, format_print=False):
    """Build the per-position base counts and the consensus string of a FASTA file.

    Args:
        fasta_file: Passed to ``read_fasta``; every sequence may contain only
            ``A``, ``C``, ``G`` and ``T`` and must not be longer than the
            first one.
        format_print: When true, print the consensus followed by one
            ``<base>: <counts>`` line per base.

    Returns:
        ``(profile_dict, consensus)`` where ``profile_dict`` maps each base to
        its list of per-position counts and ``consensus`` holds the most
        frequent base of each position (ties go to the earlier base in
        ``ACGT`` order).
    """
    dna_dict = read_fasta(fasta_file)
    dna_strands = list(dna_dict.values())

    bases = 'ACGT'
    strand_length = len(dna_strands[0])
    profile_dict = {base: [0] * strand_length for base in bases}
    for strand in dna_strands:
        for position, nucleotide in enumerate(strand):
            profile_dict[nucleotide][position] += 1

    # A plain ndarray replaces np.matrix, which NumPy no longer recommends.
    # argmax along axis 0 gives the row (base) index of each column and, like
    # the per-column argmax before, picks the first maximum on ties.
    profile_matrix = np.array([profile_dict[base] for base in bases])
    consensus = ''.join(bases[row] for row in profile_matrix.argmax(axis=0))

    if format_print:
        print(consensus)
        for base in bases:
            print(base + ': ' + ' '.join(str(count) for count in profile_dict[base]))
    return profile_dict, consensus
def _load_fasta(rna_fasta):
    """Collect the RNA ids of a FASTA file into a frozenset.

    Each record is handed to ``_load_fasta_rec`` together with the set being
    filled.

    Raises:
        LrgaspException: If no id was collected.
    """
    collected = set()
    with read_fasta(rna_fasta) as reader:
        for record in reader:
            _load_fasta_rec(collected, record)
    if not collected:
        raise LrgaspException("no RNA sequences in FASTA")
    return frozenset(collected)
def correct_cdr(cdr_stream, seqs_stream, threshold, out_stream):
    """Correct every cdr cluster and write the resulting clusters to *out_stream*.

    Each cluster read from *cdr_stream* is passed to ``correct_cluster``;
    every cluster it returns is written with ``out_cluster`` under the
    running name ``Cluster_<n>``.
    """
    full_seqs = fr.read_fasta(seqs_stream)
    clusters = fr.read_cluster(cdr_stream)
    logger.info("Cdr correction started, {0} cdr clusters".format(len(clusters)))

    cl_id = 0
    # Cluster names are not needed; output clusters are renumbered.
    for _, c_seqs in clusters.iteritems():
        corrected = correct_cluster(c_seqs, full_seqs, threshold)
        for _, newc_seqs in corrected.iteritems():
            cluster_name = "Cluster_{0}".format(cl_id)
            out_cluster(cluster_name, newc_seqs, full_seqs, out_stream)
            cl_id += 1

    logger.info("Cdr correction finished, {0} cdr clusters".format(cl_id))
def test_read_fasta_damaged(damaged1, damaged2, damaged3):
    """Check that damaged files raise ``ParsingError`` at the expected line."""
    # (fixture, records that still parse before the failure, failing line)
    cases = (
        (damaged1, 0, 1),
        (damaged2, 0, 2),
        (damaged3, 1, 4),
    )
    for path, readable_items, bad_line in cases:
        with read_fasta(path) as f:
            for _ in range(readable_items):
                f.read_item()
            with pytest.raises(ParsingError) as excinfo:
                f.read_item()
            e: ParsingError = excinfo.value
            assert e.line_number == bad_line
def remove_dups(stream_in, stream_out):
    """Write each distinct sequence of *stream_in* once to *stream_out*.

    The output headers are ``Seq<index>_<count>``, where ``count`` is how
    many input records carried that sequence.
    """
    seqs = fr.read_fasta(stream_in)
    logger.info("Removing duplicates from {0} sequences".format(len(seqs)))

    # How often each sequence occurs in the input.
    seq_count = {}
    for header in seqs:
        sequence = seqs[header]
        seq_count[sequence] = seq_count.get(sequence, 0) + 1

    new_seqs = {}
    for index, sequence in enumerate(seq_count):
        new_header = "Seq{0}_{1}".format(index, seq_count[sequence])
        new_seqs[new_header] = sequence

    fr.write_fasta(new_seqs, stream_out)
    logger.info("Done, {0} sequences left".format(len(new_seqs)))
def rna_splice(fasta_file):
    """Remove the introns from a DNA strand, then transcribe and translate it.

    The first sequence of the FASTA file is the DNA strand; every later
    sequence is an intron that is deleted from it, in file order. The
    resulting protein is printed and returned.
    """
    strands = iter(read_fasta(fasta_file).values())
    # First value is the DNA, the remaining ones are the introns.
    spliced = next(strands)
    for intron in strands:
        spliced = spliced.replace(intron, '')

    prot = translate_rna_into_protein(transcribe_dna_to_rna(spliced))
    print(prot)
    return prot
def muscle(fasta_dict):
    """Run the MUSCLE executable on *fasta_dict* and return the parsed output.

    The sequences are written as FASTA to a temporary file that becomes the
    process's stdin; its stdout is captured in a second temporary file and
    parsed with ``fr.read_fasta``.
    """
    logger.debug("Running muscle for {0} seqs".format(len(fasta_dict)))

    query = tempfile.SpooledTemporaryFile()
    aligned = tempfile.SpooledTemporaryFile()

    # Write the input and rewind so the child process reads from the start.
    fr.write_fasta(fasta_dict, query)
    query.flush()
    query.seek(0)

    command = [MUSCLE_EXEC, "-diags", "-maxiters", "2", "-quiet"]
    subprocess.Popen(command, stdin=query, stdout=aligned).wait()

    # Rewind the captured output before parsing it.
    aligned.flush()
    aligned.seek(0)
    logger.debug("Muscle finished")
    return fr.read_fasta(aligned)
def overlap_graphs(fasta_file, k, format_print = False):
    """Return the adjacency list of the overlap graph of a FASTA file.

    An edge ``(s, t)`` is reported for every ordered pair of distinct
    records where the last *k* characters of ``s`` equal the first *k*
    characters of ``t``.

    Args:
        fasta_file: Passed to ``read_fasta``, which must return a mapping of
            name to sequence.
        k: Length of the compared suffix/prefix.
        format_print: When true, print each edge as ``"<s> <t>"``.

    Returns:
        A list of ``(source name, target name)`` tuples.
    """
    dna_strands = read_fasta(fasta_file)
    adj_list = []
    for k_s, s in dna_strands.items():
        # The suffix depends only on the source strand, so slice it once.
        suffix = s[-k:]
        for k_t, t in dna_strands.items():
            # Skipping the strand itself replaces copying the whole mapping
            # on every outer iteration just to drop one key.
            if k_t == k_s:
                continue
            if suffix == t[:k]:
                if format_print:
                    print(k_s + ' ' + k_t)
                adj_list.append((k_s, k_t))
    return adj_list
def locate_restriction_sites(fasta_file, format_print=False):
    """Find the reverse palindromes of length 4 to 12 in the first strand of a FASTA file.

    For every start position the longest window (12 down to 4) that equals
    its own reverse complement is reported, at most one per position.

    Returns:
        A list of ``"<1-based position> <length>"`` strings.
    """
    strands_by_name = read_fasta(fasta_file)
    dna_strand = next(iter(strands_by_name.values()))
    rev_comp = reverse_complement(dna_strand)

    locations = []
    for idx in range(len(dna_strand)):
        # Reverse palindromes can be between 4 and 12 characters long;
        # try the longest first.
        for n in range(12, 3, -1):
            forw = dna_strand[idx:idx + n]
            if len(forw) != n:
                # Window runs past the end of the strand.
                continue
            # The same window taken from the reverse complement, read back
            # in forward order.
            window = slice(-(idx + 1), -(idx + n + 1), -1)
            reve = rev_comp[window][::-1]
            if forw != reve:
                continue
            hit = f'{idx+1} {n}'
            locations.append(hit)
            if format_print:
                print(hit)
            break
    return locations
def find_cdr3(in_stream, start_seqs, end_seqs, out_stream):
    """Locate a cdr region in every sequence of *in_stream* and write it as FASTA.

    Each sequence is aligned against all *start_seqs* and *end_seqs* with
    ``ext_tools.xalign``. A (start, end) pair is a candidate when the
    distance between their begin positions lies in
    ``[MIN_CDR_LEN, MAX_CDR_LEN]`` and the start lies past the middle of the
    sequence. The candidate with the highest combined score is written to
    *out_stream*; sequences without a candidate are skipped.
    """
    MIN_CDR_LEN = 30
    MAX_CDR_LEN = 90
    MATCH = 2
    MISSMATCH = -2
    INDEL = -1

    logger.info("Finding cdr regions started")
    seqs = fr.read_fasta(in_stream)

    # Alignments of all start / end queries, grouped by sequence header.
    start_align = defaultdict(list)
    end_align = defaultdict(list)
    for qry in start_seqs:
        for h, alns in ext_tools.xalign(seqs, qry, MATCH, MISSMATCH, INDEL).iteritems():
            start_align[h].extend(alns)
    for qry in end_seqs:
        for h, alns in ext_tools.xalign(seqs, qry, MATCH, MISSMATCH, INDEL).iteritems():
            end_align[h].extend(alns)

    for h, seq in seqs.iteritems():
        candidates = []
        for start in start_align[h]:
            for end in end_align[h]:
                dist = end.begin - start.begin
                if (MIN_CDR_LEN <= dist <= MAX_CDR_LEN
                        and start.begin > len(seq) / 2):
                    candidates.append(AlignInfo(start.begin, end.begin - 1,
                                                start.score + end.score))

        if not candidates:
            logger.debug("{0} : no cdr found".format(h))
            continue

        # max() returns the first best-scoring candidate. The earlier manual
        # search started from a score of 0, so it selected nothing (and then
        # failed on `None.begin`) when no candidate scored above 0.
        cand = max(candidates, key=lambda c: c.score)
        cdr = seq[cand.begin : cand.end + 1]
        out_stream.write(">{0}\n{1}\n".format(h, cdr))
        logger.debug("{0} : cdr found with score {1}".format(h, cand.score))

    logger.info("Finding cdr regions finished")
def find_longest_shared_motif(fasta_file, format_print = False):
    """Return the longest substring shared by all sequences of a FASTA file.

    Substrings of the shortest sequence are tried from the longest length
    down to length 2; the first one that ``check_for_pattern`` reports for
    the remaining sequences is returned.

    Returns:
        The shared substring, or ``'No shared motif found'``.
    """
    strands_by_name = read_fasta(fasta_file)

    # Every shared motif must be part of the shortest strand, so only its
    # substrings have to be checked against the others.
    remaining = list(strands_by_name.values())
    shortest = min(remaining, key = len)
    remaining.remove(shortest)

    # NOTE: takes about 3 minutes; could be improved by checking each
    # substring without building the whole list of combinations first.
    for n_chars in reversed(range(2, len(shortest) + 1)):
        for substring in ordered_combinations(shortest, n_chars):
            if check_for_pattern(remaining, substring):
                return substring
    return 'No shared motif found'
def main():
    """Build a graph from an MSA FASTA file and write it as GFA.

    Reads its options from the module-level ``args``: ``log_file``,
    ``in_msa``, ``seq_names``, ``compact``, ``out_gfa`` and ``nodes_dict``.
    """
    if len(sys.argv) < 2:
        print("You need to provide inputs. try -h or --help for help")
        sys.exit()

    # Logging goes to the requested file, or to a default one.
    log_file = args.log_file or "out_log.log"
    logging.basicConfig(filename=log_file, filemode='w',
                        format='[%(asctime)s] %(message)s',
                        level=logging.INFO)

    # The input alignment is mandatory.
    if not args.in_msa:
        logging.error("You did not provide input msa file, -f, check -h for help")
        sys.exit()
    sequences = read_fasta(args.in_msa)

    # Sequence names: taken from the optional tsv, otherwise each sequence
    # is named after its own key.
    seq_names = dict()
    if not args.seq_names:
        for key in sequences.keys():
            seq_names[key] = key
    elif not os.path.exists(args.seq_names):
        logging.error("File {} provided as sequence names tsv does not exist".format(args.seq_names))
    else:
        with open(args.seq_names) as in_file:
            for raw_l in in_file:
                fields = raw_l.strip().split("\t")
                if fields[0] in sequences:
                    seq_names[fields[0]] = fields[1]
                else:
                    logging.error("The sequence {} in file {} does not exist in the "
                                  "fasta file {} provided".format(fields[0], args.seq_names, args.in_msa))

    logging.info("constructing graph...")
    graph = msa_graph(sequences, seq_names)
    graph.colors = seq_names

    if args.compact:
        logging.info("compacting linear paths in graph...")
        # Compacting merges stretches of single nodes together.
        graph.compact()

    logging.info("sorting the graph toplogocially...")
    # Topological order is what the paths are written in.
    graph.sort()

    logging.info("adding paths to graph...")
    graph.add_paths()

    logging.info("writing graph...")
    write_gfa(graph, args.out_gfa)

    if args.nodes_dict:
        logging.info("writing nodes info json file...")
        graph.nodes_info(args.nodes_dict)

    logging.info("finished...")
from fasta_reader import read_fasta


def LCS(text):
    """Return the longest substring shared by every string in *text*.

    Substrings of ``text[0]`` are tried from the longest to the shortest and
    from left to right, so the leftmost of the longest shared substrings is
    returned.

    Args:
        text: Non-empty sequence of strings.

    Returns:
        The longest common substring, or ``None`` when the strings share no
        character at all.
    """
    reference = text[0]
    others = text[1:]
    # Stop at 0 (exclusive) so that single-character matches are tried too;
    # stopping at 1 skipped them.
    for k in range(len(reference), 0, -1):
        for start in range(len(reference) - k + 1):
            curr = reference[start:start + k]
            if all(curr in other for other in others):
                return curr
    return None


if __name__ == '__main__':
    _, sequences = read_fasta(r'D:/Downloads/rosalind_lcsm.txt')
    print(LCS(sequences))
def hmmer_filter(db_filepath):
    """Keep only the hits that an HMMER search confirms, and renumber them.

    Reads ``oamino.fasta.bak``, ``ocodon.fasta.bak`` and ``output.gff.bak``
    from the working directory, searches the amino targets of every profile
    accession with HMMER, and writes ``output.gff``, ``oamino.fasta`` and
    ``ocodon.fasta`` with the surviving items renamed ``item1``, ``item2``, …
    in GFF order.
    """
    hmmer = HMMER(db_filepath)
    hmmer.timeout = 60
    if not hmmer.is_indexed:
        hmmer.index()

    targets = fr.read_fasta("oamino.fasta.bak").read_items()
    # Items are expected to be numbered item1, item2, … in file order.
    for number, tgt in enumerate(targets, start=1):
        assert tgt.id == f"item{number}"

    gffs = gff_io.read_gff("output.gff.bak").read_items()

    # Group the amino targets by the profile accession of their GFF record.
    acc_targets = {}
    for gff in gffs:
        di = gff.attributes_asdict()
        acc = di["Profile_acc"]
        # IDs look like "item<N>"; N is 1-based.
        index = int(di["ID"][4:]) - 1
        acc_targets.setdefault(acc, []).append(targets[index])

    # (e-value, score, bias) of every target that HMMER reports.
    scores = {}
    for acc, tgts in acc_targets.items():
        fasta_text = "\n".join(">" + t.defline + "\n" + t.sequence for t in tgts)
        result = hmmer.search(
            StringIO(fasta_text),
            "/dev/null",
            tblout=True,
            heuristic=True,
            cut_ga=True,
            hmmkey=acc,
            Z=1,
        )
        for row in result.tbl:
            full = row.full_sequence
            if full.score.lower() == "nan":
                continue
            scores[row.target.name] = (full.e_value, full.score, full.bias)

    # Rewrite the GFF with the surviving records renumbered from 1.
    itemid_convert = {}
    next_number = 1
    with open("output.gff", "w") as out2:
        out2.write("##gff-version 3\n")
        for gff in gffs:
            di = gff.attributes_asdict()
            old_id = di["ID"]
            score = scores.get(old_id)
            if score is None:
                continue

            new_id = f"item{next_number}"
            itemid_convert[old_id] = new_id
            attrs = gff.attributes.replace(old_id, new_id)
            e_value, bit_score, bias = score
            cols = [
                gff.seqid,
                gff.source,
                gff.type,
                str(gff.start),
                str(gff.end),
                str(gff.score),
                gff.strand,
                str(gff.phase),
                attrs + f";Bias={bias};E-value={e_value};Score={bit_score}",
            ]
            out2.write("\t".join(cols) + "\n")
            next_number += 1

    # Amino sequences of the surviving items, under their new names.
    with fr.write_fasta("oamino.fasta") as f:
        for tgt in targets:
            if tgt.defline in itemid_convert:
                f.write_item(itemid_convert[tgt.defline], tgt.sequence)

    ctargets = fr.read_fasta("ocodon.fasta.bak").read_items()
    for number, tgt in enumerate(ctargets, start=1):
        assert tgt.id == f"item{number}"

    # Codon sequences of the surviving items, under their new names.
    with fr.write_fasta("ocodon.fasta") as f:
        for tgt in ctargets:
            if tgt.defline in itemid_convert:
                f.write_item(itemid_convert[tgt.defline], tgt.sequence)
from trace_back import traceback

# The script takes exactly five arguments: two FASTA files and three scores.
if len(sys.argv) != 6:
    print("\n\nERROR: incorrect arguments given to script."
          "\n\nUSAGE: sh SDS_BLAST.sh <sequence_1.fasta> <sequence_2.fasta> mismatch_score gap_open_Penalty gap_extension_penalty"
          "\n\nEXAMPLE: sh SDS_BLAST.sh Data/seq_11.fasta Data/seq_22.fasta -20 40 2\n\n")
    sys.exit(1)

seqs1_inputs = sys.argv[1]
seqs2_inputs = sys.argv[2]

# Scoring scheme: the match score is fixed, the rest comes from the command line.
match = 10
mismatch = int(sys.argv[3])
gap_open = int(sys.argv[4])
gap_extend = int(sys.argv[5])
gap_init_penalty = gap_open + gap_extend

seqs1_headers, seqs1 = read_fasta(seqs1_inputs)
seqs2_headers, seqs2 = read_fasta(seqs2_inputs)

# Only the first sequence of each file is aligned.
seq_a = seqs1[0]
seq_b = seqs2[0]

S, D, I, score, start_A, start_B = populate_matrices(
    seq_a, seq_b, match, mismatch, gap_open, gap_extend)
(align_1, align_2, end_A, end_B,
 num_matches, num_mismatches, num_gaps, middle_array) = traceback(
    S, D, I, seq_a, seq_b, gap_init_penalty, start_A, start_B)

print("\n" + "SUCCESSFULLY ALIGNED SEQUENCES:".center(40)
      + "\n\n(A) %s by (B) %s \n" % (seqs1_headers[0], seqs2_headers[0]))
print("LENGTH_OF_SEQUENCE_A = %s\nLENGTH_OF_SEQUENCE_B = %s\nLENGTH_OF_ALIGNMENT = %s" % (
    len(seq_a), len(seq_b), str(len(align_1))))
print("START_POS_A = %s\nSTART_POS_B = %s\nEND_POS_A = %s\nEND_POS_B = %s\n" % (
    start_A, start_B, end_A, end_B))

a_1 = []
from w_code import wcode
from id_seq import seq_id

# The script takes exactly three arguments: FASTA file, word model file, wlcut.
if len(sys.argv) != 4:
    print("\n\nERROR: incorrect arguments given to script."
          "\n\nUSAGE: python <MSA.py> <sequence.fasta> <word_model.txt> wlcut\n\n")
    sys.exit(1)

# The word model is the first line of the model file.
with open(sys.argv[2], 'r') as wm:
    word_model = wm.readline().strip()
wlcut = int(sys.argv[3])

w = len(word_model)
if w < 1:
    print("\n\nERROR: word_model must be of legnth >= 1")
    sys.exit(1)

seqs_headers, seqs_inputs = read_fasta(sys.argv[1])
k = len(seqs_inputs)

# C is every sequence followed by '#'; st holds the offset at which each
# sequence starts inside C and length holds each sequence's length.
st = []
length = []
pieces = []
p = -1
for seq in seqs_inputs:
    st.append(p + 1)
    length.append(len(seq))
    p += length[-1] + 1
    pieces.append(seq + '#')
C = ''.join(pieces)

word_code = wrd_code(C, word_model)
positions = range(len(C))
sup_wrd = sw_code(positions, word_code, w, wlcut)