def make_query_mapping(query_fasta):
    """Map each identifier from ``parse_fasta(query_fasta)`` to its prefix.

    The prefix is everything before the first ``'|'`` in the identifier.
    Every prefix is also printed to stdout as it is computed.

    Args:
        query_fasta: Passed straight to ``parse_fasta`` (presumably a path to
            a FASTA file -- confirm against ``parse_fasta``).

    Returns:
        dict mapping the full identifier to its ``'|'``-delimited prefix.
    """
    prefix_by_id = {}
    for full_id in parse_fasta(query_fasta):
        prefix = full_id.split('|', 1)[0]
        print(prefix)
        prefix_by_id[full_id] = prefix
    return prefix_by_id
def separate_ref_from_nonref(fasta_dir):
    """Split parsed FASTA records into reference and non-reference groups.

    Each identifier returned by ``parse_fasta(fasta_dir)`` is wrapped in
    ``LongFastaID``; a record counts as reference when the wrapper's
    ``genome_acc`` equals the module constant ``REFACC``.

    Args:
        fasta_dir: Passed straight to ``parse_fasta``.

    Returns:
        tuple ``(ref, nonref)`` of dicts, both keyed by the wrapper's
        ``protein_id`` and holding the record's sequence.
    """
    reference, others = {}, {}
    for raw_id, seq in parse_fasta(fasta_dir).items():
        parsed = LongFastaID(raw_id)
        # Pick the destination first, then store under the protein id.
        bucket = reference if parsed.genome_acc == REFACC else others
        bucket[parsed.protein_id] = seq
    return reference, others
query_res += 1 if ref_aa != '-': ref_res += 1 if aa != '-' and ref_aa != '-': mapping[query_res] = ref_res return mapping if __name__ == "__main__": blast_results = parse_blast_output(BLAST_OUTPUT) id_to_sequence = parse_fasta(PDB_SEQS_STRUCTURE) ref_to_sequence = parse_fasta(REFERENCE_PROTEOME) ordered_blast_results = order_hits(blast_results) combined_blast_results = combine_all_hits(ordered_blast_results) best_hits = get_best_hits(combined_blast_results) for query, best_hit in best_hits.items(): fasta_dict = {} ref_id = best_hit[1] fasta_dict[query] = id_to_sequence[query] fasta_dict[ref_id] = ref_to_sequence[ref_id] temp_fasta = f'{TEMP}{query}.fasta' write_fasta(fasta_dict, temp_fasta) temp_aligned = f'{TEMP}{query}_aligned.fasta'
""" if sys.argv[1] == '-p' or sys.argv[1] == '--preprocess': """ Preprocess only. """ state = 'preprocess' genome_file = sys.argv[2] print('will preprocess', genome_file) #out_file = '.'.join(genome_file.split('/')[-1].split('.')[0:-1]) + '.pickle' # isolate file name from path and extension. out_file = 'preprocessed_sequences_bw.pickle' # Use the same file name. dictionary = {} # Collects all the objects. for _i, genome in enumerate(parse_fasta(genome_file)): print('\t', _i, ': preprocessing ', genome['title'], sep = '') o = bwt.search_bwt(genome['title'], genome['sequence']) # One object for each genome. o.main_preprocess() dictionary[_i] = o # Save dictionary with objects of all sequences to disk with pickle. with open(out_file, 'wb') as file: pickle.dump(dictionary, file) print() print('Successfully saved to:') print() print('\t' + out_file) print()
if ref_aa != '-': ref_res += 1 if aa != '-' and ref_aa != '-': mapping[query_res] = ref_res return mapping if __name__ == "__main__": blast_results = parse_blast_output(BLAST_OUTPUT) id_to_sequence = parse_fasta(UNIQUE_SEQS) ref_to_sequence = parse_fasta(REFERENCE_PROTEOME) ordered_blast_results = order_hits(blast_results) combined_blast_results = combine_all_hits(ordered_blast_results) best_hits = get_best_hits(combined_blast_results) for query, best_hit in best_hits.items(): fasta_dict = {} ref_id = best_hit[1] fasta_dict[query] = id_to_sequence[query] fasta_dict[ref_id] = ref_to_sequence[ref_id] temp_fasta = f'{TEMP}{query}.fasta' write_fasta(fasta_dict, temp_fasta) temp_aligned = f'{TEMP}{query}_aligned.fasta'
from st import suffixtree
from parsers import parse_fasta, parse_fastq
import sys

# Usage: <script> GENOME_FASTA READS_FASTQ
genome_file = sys.argv[1]
reads_file = sys.argv[2]

for genome in parse_fasta(genome_file):
    # The tree depends only on the genome, so build it once per genome
    # instead of once per read.
    st = suffixtree(genome['sequence'])
    for read in parse_fastq(reads_file):
        read_seq = read['sequence']
        for match in st.find_positions(read_seq):
            # One tab-separated record per match: read sequence, 0, genome
            # title, match + 1, 0, "<read length>M", "*", 0, 0, read sequence,
            # and a run of '~' as long as the read.
            # NOTE(review): the column layout looks like a SAM alignment
            # line -- confirm with the consumer of this output.
            print(
                f"{read_seq}\t0\t{genome['title']}\t{match + 1}\t0\t"
                f"{len(read_seq)}M\t*\t0\t0\t{read_seq}\t{len(read_seq) * '~'}"
            )
prot_ids = set([]) for id in fasta_dict: prot_ids.add(parse_id_from_prot_file(id)) return prot_ids def extract_protids(fasta_dict): prot_ids = set([]) for id, protdata in fasta_dict.items(): prot_ids.add(protdata.protein_id.split('.')[0]) return prot_ids if __name__ == "__main__": used_fasta = argv[1] other_fasta = argv[2] fasta_dict_1 = parse_fasta(used_fasta) fasta_dict_2 = parse_fasta_simple(other_fasta) prot_ids_1 = extract_protids(fasta_dict_1) prot_ids_2 = extract_protids_simple(fasta_dict_2) print("Extra in used", prot_ids_1 - prot_ids_2) for ID in (prot_ids_2 - prot_ids_1): print(ID)
from writers import write_fasta
from sys import argv


def get_refseqs(refseq_to_uniprot):
    """Return the identifiers in *refseq_to_uniprot* with the suffix removed.

    Each identifier is cut at its first ``'.'``; only the leading part is
    kept, and the results are collected into a set.

    Args:
        refseq_to_uniprot: Iterable of identifier strings (iterating a dict
            yields its keys).

    Returns:
        set of the truncated identifiers.
    """
    return {accession.split('.')[0] for accession in refseq_to_uniprot}


if __name__ == "__main__":
    # Usage: <script> FASTA MAPPING_FILE
    fasta, refseqs = argv[1], argv[2]
    refseq_to_uniprot = parse_mapping(refseqs)
    # From here on `refseqs` holds the set of truncated identifiers, not the
    # command-line argument.
    refseqs = get_refseqs(refseq_to_uniprot)
    fasta_dict = parse_fasta(fasta)

    refseq_to_seq = {}
    for fasta_id, sequence in fasta_dict.items():
        # Keep only the part of the identifier before the first '|'.
        fasta_id = fasta_id.split('|')[0]
        print(fasta_id)
        fasta_id = fasta_id.strip()
        if fasta_id not in refseqs:
            continue
        refseq_to_seq[fasta_id] = sequence

    write_fasta(refseq_to_seq, 'reference_proteome_complete.fasta')
sequence_to_id = {} for fasta_id, sequence in fasta_dict.items(): fasta_id = parse_fasta_id(fasta_id) if not sequence in sequence_to_id: sequence_to_id[sequence] = [] sequence_to_id[sequence].append(fasta_id) return sequence_to_id def assign_code(sequence_to_id): code_to_sequence = {} code_to_accession = {} for i, (sequence, accessions) in enumerate(sequence_to_id.items()): code = 'seq_%.4d' % i code_to_sequence[code] = sequence code_to_accession[code] = accessions return code_to_sequence, code_to_accession if __name__ == "__main__": fasta = argv[1] id_to_sequence = parse_fasta(fasta) sequence_to_id = reverse_fasta_dict(id_to_sequence) code_to_sequence, code_to_accession = assign_code(sequence_to_id) write_fasta(code_to_sequence, UNIQUE_SEQ_DIR) write_code_to_accession(code_to_accession, CODE_DIR)
def make_new_fasta_dict(seq_to_id):
    """Invert a sequence -> identifier mapping into identifier -> sequence.

    When several sequences share one identifier, the sequence that comes last
    in *seq_to_id*'s iteration order wins.

    Args:
        seq_to_id: dict mapping sequence to a hashable identifier.

    Returns:
        dict mapping identifier to sequence.
    """
    return {seq_id: seq for seq, seq_id in seq_to_id.items()}


if __name__ == "__main__":
    # Usage: <script> BLAST_OUTPUT FASTA ORF_MAPPING
    blast_output = argv[1]
    covid19_fasta = argv[2]
    orf_mapping = argv[3]

    covid19_fasta_dict = parse_fasta(covid19_fasta)
    # `orf_mapping` is rebound from the argument to the parsed mapping.
    orf_mapping = parse_mapping(orf_mapping)

    queryid_to_hits = parse_blast_output(blast_output)
    sorted_hit_dict = order_hits(queryid_to_hits)

    # Combine the hits of every subject, one query at a time.
    combined_hit_dicts = {}
    for query, subject_to_hits in sorted_hit_dict.items():
        combined_hit_dict = combine_hits(subject_to_hits)
        combined_hit_dicts[query] = combined_hit_dict

    filtered_hits, identical_hits, rejected_hits = filter_hits(
        combined_hit_dicts, covid19_fasta_dict
    )
    subject_to_queries = map_subject_to_queries(filtered_hits)
    fasta_dicts = make_fasta_dicts(
        subject_to_queries, covid19_fasta_dict, orf_mapping
    )
def run_blastp(in_file):
    """Run ``blastp`` with *in_file* as the query set.

    The module constant ``REFERENCE_PROTEOME`` is passed as ``-subject`` and
    the tabular (format 6) result is written to the module constant
    ``TEMP_BLAST``.

    Args:
        in_file: Value passed to ``blastp -query``.

    Raises:
        subprocess.CalledProcessError: if ``blastp`` exits with a non-zero
            status (raised by ``subprocess.check_call``).
    """
    # Adjacent string literals are concatenated at compile time, so the column
    # list stays single-space separated however this source is indented
    # (a backslash continuation inside the literal would embed the
    # indentation in the argument).
    outfmt = (
        "6 qseqid sseqid pident length mismatch qstart qend qlen sstart "
        "send slen evalue bitscore qcovs"
    )
    command = [
        'blastp',
        '-query', in_file,
        '-subject', REFERENCE_PROTEOME,
        '-out', TEMP_BLAST,
        '-outfmt', outfmt,
    ]
    subprocess.check_call(command)


# Was `if __name__ = "__main__":`, which is a SyntaxError; a comparison is
# required here.
if __name__ == "__main__":
    fasta = argv[1]
    unique_id_to_seq = parse_fasta(UNIQUE_SEQ_DIR)
    unique_seq_to_id = reverse_fasta_dict(unique_id_to_seq)
    new_id_to_seq = parse_fasta(fasta)
    new_seq_to_id = reverse_fasta_dict(new_id_to_seq)
    for sequence in new_seq_to_id:
        if sequence in unique_seq_to_id:
            # NOTE(review): sequences that are already known are detected
            # here, but nothing is done with them in the code shown.
            pass