def parse(seq_path):
    """
    Detect the flavour of a sequence file and dispatch to the right parser.

    Only the first header line (a line starting with ">") is inspected. If
    the text directly after ">" starts with "UniRef" the file is handed to
    parse_uniref(), otherwise to parse_uniprot().

    :param seq_path: path to the sequence file
    :return: the value returned by the selected parser
    :raises Exception: if no line in the file starts with ">"
    """
    generic.quit_if_missing(seq_path)
    with open(seq_path) as handle:
        # Lazily scan for the first header; nothing after it is read.
        first_header = next(
            (row for row in handle if row.startswith(">")), None)
    if first_header is None:
        logging.error(f"Input Seq-file in {seq_path} is invalid, no header "
                      f"lines with > found.")
        raise Exception
    if first_header[1:].startswith("UniRef"):
        return parse_uniref(seq_path)
    return parse_uniprot(seq_path)
def find_motifs_mast(pname_cid_path, seq_file, ref_meme_txt, motif_len, output,
                     meme_folder=paths.MEME_MAST_FOLDER):
    """
    Load the pickled pname=>cid map, locate motifs with
    motif_finder.find_mast() and pickle the result to ``output``.

    :param pname_cid_path: pickle holding the pname=>cid map
        (e.g. paths.PNAME_CID)
    :param seq_file: sequence file (e.g. paths.FULL_SEQS)
    :param ref_meme_txt: reference meme file (e.g. paths.REF_MEME_TXT)
    :param motif_len: motif length, an int >= 1 (e.g. 13)
    :param output: destination pickle (e.g. paths.MOTIF_POS)
    :param meme_folder: folder forwarded to motif_finder.find_mast()
    """
    assert motif_len >= 1
    assert isinstance(motif_len, int)
    generic.quit_if_missing(pname_cid_path)
    with open(pname_cid_path, 'rb') as pkl_in:
        pname_cid_map = pickle.load(pkl_in)
    _test_seq_cid_map(pname_cid_map)

    motif_pos = motif_finder.find_mast(
        pname_cid_map, seq_file, ref_meme_txt, motif_len,
        meme_folder=meme_folder)

    generic.warn_if_exist(output)
    with open(output, 'wb') as pkl_out:
        pickle.dump(motif_pos, pkl_out, protocol=pickle.HIGHEST_PROTOCOL)
def build(seq_path, output):
    """
    Run paths.FASTA_2_MARKOV_EXEC (with -protein) on ``seq_path`` and write
    its standard output to ``output``.

    Any pre-existing ``output`` file is removed first. The sequence file is
    fed on stdin and the result captured through shell redirection.

    :param seq_path: input sequence file, must exist
    :param output: file to (re)create with the executable's output
    :raises subprocess.CalledProcessError: if the command exits non-zero
    """
    generic.quit_if_missing(seq_path)
    generic.warn_if_exist(output)
    if os.path.isfile(output):
        os.remove(output)
    command = f"{paths.FASTA_2_MARKOV_EXEC} -protein < {seq_path} > {output}"
    # The shell creates the redirect target even when the executable fails,
    # so the existence check below cannot detect a failed run on its own;
    # check=True makes a non-zero exit status raise instead of passing
    # silently with an empty or partial output file.
    subprocess.run(command, shell=True, check=True)
    generic.quit_if_missing(output)
def parse_extract_ioncom(input_file, pname_cid_path):
    """
    Parse an ioncom extract with extract_parser.parse_ioncom() and pickle the
    resulting pname=>cid map.

    :param input_file: extract to parse, must exist
        (e.g. paths.IONCOM_EXTRACT)
    :param pname_cid_path: destination pickle (e.g. paths.PNAME_CID)
    """
    generic.quit_if_missing(input_file)
    parsed_map = extract_parser.parse_ioncom(input_file)
    _test_seq_cid_map(parsed_map)

    generic.warn_if_exist(pname_cid_path)
    with open(pname_cid_path, 'wb') as pkl_out:
        pickle.dump(parsed_map, pkl_out, protocol=pickle.HIGHEST_PROTOCOL)
def run_mast(meme_txt, fasta_filename, mast_output, mast_exec=paths.MAST_EXEC):
    """
    Invoke the mast executable through the shell.

    The command has the form
    ``<mast_exec> -oc <mast_output> -mt 0.0001 <meme_txt> <fasta_filename>``.

    :param meme_txt: meme file passed to mast, must exist
    :param fasta_filename: sequence file passed to mast, must exist
    :param mast_output: output folder given to ``-oc``
    :param mast_exec: executable to run
    :raises Exception: if the command returns a non-zero exit code
    """
    generic.quit_if_missing(meme_txt)
    generic.quit_if_missing(fasta_filename)
    generic.warn_if_exist(mast_output, filetype='folder')

    command = (f"{mast_exec} -oc {mast_output} -mt 0.0001 {meme_txt} "
               f"{fasta_filename}")
    completed = subprocess.run(command, shell=True)
    if completed.returncode == 0:
        return
    logging.error("run_mast() failed.")
    logging.error(f"Command: <{command}>")
    raise Exception
def trim_pnames_based_on_pdb(pname_cid_path, pdb_folder=paths.PDB_FOLDER):
    """
    Load the pickled pname=>cid map, pass it through
    download_pdb_files.trim_pname_cid() and write it back to the same path.

    The helper's return value is not used, so it is expected to modify the
    map in place -- NOTE(review): confirm against download_pdb_files.

    :param pname_cid_path: pickle holding the pname=>cid map; read and then
        overwritten (e.g. paths.PNAME_CID)
    :param pdb_folder: folder forwarded to trim_pname_cid()
    """
    # Validate the input before opening it, as the sibling loaders do.
    generic.quit_if_missing(pname_cid_path)
    with open(pname_cid_path, 'rb') as file:
        pname_cid_map = pickle.load(file)
    _test_seq_cid_map(pname_cid_map)

    download_pdb_files.trim_pname_cid(pname_cid_map, pdb_folder)
    # Re-validate: the map may have changed.
    _test_seq_cid_map(pname_cid_map)

    with open(pname_cid_path, 'wb') as file:
        pickle.dump(pname_cid_map, file, -1)
    # Post-write sanity check retained from the original implementation.
    generic.quit_if_missing(pname_cid_path)
def parse_extract_prosite(input_file, pname_cid_path):
    """
    Parse a prosite extract with extract_parser.parse_prosite(), using
    prosite_pdb_list.pdb_list, and pickle the resulting pname=>cid map.

    :param input_file: extract to parse, must exist
        (e.g. paths.PROSITE_EXTRACT)
    :param pname_cid_path: destination pickle (e.g. paths.PNAME_CID)
    """
    generic.quit_if_missing(input_file)
    parsed_map = extract_parser.parse_prosite(
        input_file, prosite_pdb_list.pdb_list)
    _test_seq_cid_map(parsed_map)

    generic.warn_if_exist(pname_cid_path)
    with open(pname_cid_path, 'wb') as pkl_out:
        pickle.dump(parsed_map, pkl_out, protocol=pickle.HIGHEST_PROTOCOL)
def download_pdb(pname_cid_path, pdb_folder=paths.PDB_FOLDER):
    """
    Load the pickled pname=>cid map and hand it to
    download_pdb_files.download(), creating ``pdb_folder`` first if it does
    not exist yet.

    :param pname_cid_path: pickle holding the pname=>cid map, must exist
        (e.g. paths.PNAME_CID)
    :param pdb_folder: folder passed to download_pdb_files.download()
    """
    generic.quit_if_missing(pname_cid_path)
    with open(pname_cid_path, 'rb') as pkl_in:
        pname_cid_map = pickle.load(pkl_in)
    _test_seq_cid_map(pname_cid_map)

    folder_present = os.path.isdir(pdb_folder)
    if not folder_present:
        logging.warning(f"PDB_folder in <{pdb_folder}> not found. Downloading "
                        f"from scratch takes a while.")
        os.mkdir(pdb_folder)

    download_pdb_files.download(pname_cid_map, pdb_folder)
def create_seq(pname_cid_path, seq_path, pdb_folder=paths.PDB_FOLDER):
    """
    Write a sequence file built by create_seq_file.extract_sequence() and
    then run create_seq_file.test_fasta_match_pdb() on it.

    :param pname_cid_path: pickle holding the pname=>cid map, must exist
        (e.g. paths.PNAME_CID)
    :param seq_path: sequence file to write (e.g. paths.FULL_SEQS)
    :param pdb_folder: folder passed to both create_seq_file helpers
    """
    generic.quit_if_missing(pname_cid_path)
    with open(pname_cid_path, 'rb') as pkl_in:
        pname_cid_map = pickle.load(pkl_in)
    _test_seq_cid_map(pname_cid_map)

    fasta_lines = create_seq_file.extract_sequence(
        pdb_folder, pname_cid_map, AA3_to_AA1=generic.AA3_to_AA1)
    with open(seq_path, 'w') as fasta_out:
        fasta_out.writelines(fasta_lines)

    # Cross-check the freshly written file via the create_seq_file helper.
    create_seq_file.test_fasta_match_pdb(
        seq_path, pdb_folder, pname_cid_map, generic.AA3_to_AA1)
def run_meme_single(fasta_filename, motif_len, meme_output, num_p=1,
                    meme_exec=paths.MEME_EXEC):
    """
    Invoke the meme executable through the shell for a single motif.

    The command has the form
    ``<meme_exec> -w <motif_len> -p <num_p> -protein -nmotifs 1 -mod oops
    -oc <meme_output> <fasta_filename>``.

    :param fasta_filename: sequence file passed to meme, must exist
    :param motif_len: value for ``-w``, an int >= 1
    :param meme_output: output folder given to ``-oc``
    :param num_p: value for ``-p``
    :param meme_exec: executable to run
    :raises Exception: if the command returns a non-zero exit code
    """
    assert motif_len >= 1
    assert isinstance(motif_len, int)
    generic.quit_if_missing(fasta_filename)
    generic.warn_if_exist(meme_output, filetype='folder')

    command = (f"{meme_exec} -w {motif_len} -p {num_p} -protein -nmotifs 1 "
               f"-mod oops -oc {meme_output} {fasta_filename}")
    completed = subprocess.run(command, shell=True)
    if completed.returncode == 0:
        return
    logging.error("run_meme_single() failed.")
    logging.error(f"Command: <{command}>")
    raise Exception
def _get_id_seq_map(seq_path, line_extractor):
    """
    Read a FASTA-style file into an id => sequence dict.

    Every line starting with ">" opens a new record whose id is
    ``line_extractor(line)``; the stripped lines that follow, up to the next
    header, are concatenated into that record's sequence. Records with no
    sequence lines are skipped, and lines before the first header are
    ignored.

    :param seq_path: path to the sequence file, must exist
    :param line_extractor: callable mapping a raw header line to the id used
        as dict key
    :return: dict of id => sequence string
    """
    generic.quit_if_missing(seq_path)
    id_seq_map = dict()
    seen_header = False
    current_id = None
    current_seq = []
    with open(seq_path, 'r') as file:
        for line in file:
            if line.startswith(">"):
                # Store the finished record under ITS OWN id before
                # switching to the id of the header just read.
                if seen_header and current_seq:
                    id_seq_map[current_id] = "".join(current_seq)
                current_id = line_extractor(line)
                current_seq = []
                seen_header = True
            elif seen_header:
                current_seq.append(line.strip())
    # Flush the last record, joining its own lines rather than reusing a
    # previously joined sequence.
    if seen_header and current_seq:
        id_seq_map[current_id] = "".join(current_seq)
    return id_seq_map
def parse_raw(seq_path):
    """
    Read a uniprot-style sequence file into an id => sequence dict.

    Only uniprot headers are supported: the id is the second "|"-separated
    field of each header line (a line starting with ">"). The stripped lines
    that follow, up to the next header, are concatenated into the record's
    sequence. Records with no sequence lines are skipped, and lines before
    the first header are ignored.

    :param seq_path: path to the sequence file, must exist
    :return: dict of id => sequence string
    :raises IndexError: if a header line contains no "|"
    """
    generic.quit_if_missing(seq_path)
    id_seq_map = dict()
    current_id = None
    current_seq = []
    with open(seq_path, 'r') as file:
        for line in file:
            if line.startswith(">"):
                if current_id is not None and current_seq:
                    id_seq_map[current_id] = "".join(current_seq)
                current_id = line.strip().split("|")[1]
                current_seq = []
            elif current_id is not None:
                current_seq.append(line.strip())
    # Flush the last record, joining its own lines rather than reusing the
    # sequence of the record before it.
    if current_id is not None and current_seq:
        id_seq_map[current_id] = "".join(current_seq)
    return id_seq_map
def run_prosite_meme(extract_path, motif_len, output, num_p=7,
                     pdb_folder=paths.PDB_FOLDER, storage_path=None):
    """
    Run the prosite => meme pipeline: parse the extract, download and trim
    against the PDB files, build and filter the sequence file, then call
    find_motifs_meme().

    When ``storage_path`` is None the intermediate files live at the default
    locations in ``paths`` and are moved to paths.TRASH afterwards;
    otherwise they are created inside ``storage_path`` and left there.

    :param extract_path: prosite extract, must exist
        (e.g. paths.PROSITE_EXTRACT)
    :param motif_len: motif length forwarded to find_motifs_meme() (e.g. 13)
    :param output: output path forwarded to find_motifs_meme()
        (e.g. paths.PID_PDB_MAP)
    :param num_p: int >= 1, forwarded to find_motifs_meme()
    :param pdb_folder: folder holding the PDB files
    :param storage_path: optional existing folder for intermediate files
    """
    generic.quit_if_missing(extract_path)
    generic.warn_if_exist(output)
    assert isinstance(num_p, int)
    assert num_p >= 1

    use_default_paths = storage_path is None
    if use_default_paths:
        pname_cid_path, seq_path, meme_folder = (
            paths.PNAME_CID, paths.FULL_SEQS, paths.MEME_MAST_FOLDER)
    else:
        generic.quit_if_missing(storage_path, filetype='folder')
        pname_cid_path, seq_path, meme_folder = (
            os.path.join(storage_path, leaf)
            for leaf in ('pname_cid_map.pkl', 'seqs.fasta', 'meme_folder'))

    parse_extract_prosite(extract_path, pname_cid_path)
    download_pdb(pname_cid_path, pdb_folder)
    trim_pnames_based_on_pdb(pname_cid_path, pdb_folder)
    create_seq(pname_cid_path, seq_path, pdb_folder)
    filter_seq_file(seq_path, threshold=31)
    find_motifs_meme(pname_cid_path, seq_path, motif_len, output, meme_folder,
                     num_p)

    if use_default_paths:
        # Default-location intermediates are scratch data: bin them.
        for leftover in (pname_cid_path, seq_path, meme_folder):
            shutil.move(leftover, paths.TRASH)
def run_prosite_mast(extract_path, motif_len, ref_meme_txt, output,
                     pdb_folder=paths.PDB_FOLDER, storage_path=None):
    """
    Run the prosite => mast pipeline: parse the extract, download and trim
    against the PDB files, build and filter the sequence file, then call
    find_motifs_mast() with a reference meme file.

    When ``storage_path`` is None the intermediate files live at the default
    locations in ``paths`` and are moved to paths.TRASH afterwards;
    otherwise they are created inside ``storage_path`` and left there.

    :param extract_path: extract file, must exist; it is parsed with
        parse_extract_prosite(). NOTE(review): the previous docstring named
        paths.IONCOM_EXTRACT here -- confirm which extract is intended.
    :param motif_len: motif length forwarded to find_motifs_mast() (e.g. 13)
    :param ref_meme_txt: reference meme file, must exist
        (e.g. paths.REF_MEME_TXT)
    :param output: output path forwarded to find_motifs_mast()
        (e.g. paths.PID_PDB_MAP)
    :param pdb_folder: folder holding the PDB files
    :param storage_path: optional existing folder for intermediate files
    :return: None
    """
    generic.quit_if_missing(extract_path)
    generic.quit_if_missing(ref_meme_txt)
    generic.warn_if_exist(output)

    use_default_paths = storage_path is None
    if use_default_paths:
        pname_cid_path, seq_path, meme_folder = (
            paths.PNAME_CID, paths.FULL_SEQS, paths.MEME_MAST_FOLDER)
    else:
        generic.quit_if_missing(storage_path, filetype='folder')
        pname_cid_path, seq_path, meme_folder = (
            os.path.join(storage_path, leaf)
            for leaf in ('pname_cid_map.pkl', 'seqs.fasta', 'meme_folder'))

    parse_extract_prosite(extract_path, pname_cid_path)
    download_pdb(pname_cid_path, pdb_folder)
    trim_pnames_based_on_pdb(pname_cid_path, pdb_folder)
    create_seq(pname_cid_path, seq_path, pdb_folder)
    filter_seq_file(seq_path, threshold=31)
    find_motifs_mast(pname_cid_path, seq_path, ref_meme_txt, motif_len, output,
                     meme_folder)

    if use_default_paths:
        # Default-location intermediates are scratch data: bin them.
        for leftover in (pname_cid_path, seq_path, meme_folder):
            shutil.move(leftover, paths.TRASH)
def run_prosite_aligned(seq_file, aligned_seq_file, output, storage_path=None):
    """
    Derive motif positions from a sequence file plus an aligned sequence
    file, and pickle them keyed by pdb id.

    Steps:
      1. Build a composition file from seq_file and a meme.txt from
         aligned_seq_file (the latter also uses the composition file).
      2. Parse seq_file into acc => seq, convert the acc ids to pdb ids and
         look up a cid per pdb, giving acc => (pdb, cid).
      3. Write a cropped sequence file holding only the accs for which a
         (pdb, cid) pair was found.
      4. Run mast on the cropped file, extract acc => motif_pos and drop
         gapped motifs.
      5. Re-key by pdb id and pickle
         {pdb_id: {'sno_markers': motif_pos, 'cid': cid}} to ``output``.

    Intermediate files: composition.txt, meme_from_aligned.txt, the mast
    output folder and cropped_seqs.fasta. With ``storage_path`` None they
    are placed at temporary/default locations from ``paths`` and moved to
    paths.TRASH at the end; otherwise they are kept inside ``storage_path``.

    :param seq_file: sequence file, must exist
    :param aligned_seq_file: aligned sequence file, must exist
    :param output: destination pickle
    :param storage_path: optional existing folder for intermediate files
    """
    generic.quit_if_missing(seq_file)
    generic.quit_if_missing(aligned_seq_file)
    generic.warn_if_exist(output)

    use_tmp_paths = storage_path is None
    if use_tmp_paths:
        composition_file = paths.TMP_FILE_TEMPLATE.format('composition.txt')
        meme_txt = paths.TMP_FILE_TEMPLATE.format('meme_from_aligned.txt')
        meme_folder = paths.MEME_MAST_FOLDER
        cropped_seq_file = paths.TMP_FILE_TEMPLATE.format('cropped_seqs.fasta')
    else:
        generic.quit_if_missing(storage_path, filetype='folder')
        composition_file = os.path.join(storage_path, 'composition.txt')
        meme_txt = os.path.join(storage_path, 'meme_from_aligned.txt')
        meme_folder = os.path.join(storage_path, 'meme_mast_folder')
        cropped_seq_file = os.path.join(storage_path, 'cropped_seqs.fasta')

    # Step 1: composition + meme.txt.
    if os.path.isfile(composition_file):
        os.remove(composition_file)
    build_composition.build(seq_file, composition_file)
    build_meme_from_aligned.build(aligned_seq_file, meme_txt, composition_file)

    # Step 2: acc => (pdb, cid).
    acc_seq_map = get_pname_seq.parse_raw(seq_file)
    acc_pdb_map = uniprot_id_converter.convert(
        "ACC", "PDB_ID", list(acc_seq_map.keys()))
    # The pdb => acc mapping may not be 1-1, so remember which acc each pdb
    # id was reached from (a later acc overwrites an earlier one).
    pdb_to_acc = dict()
    pdb_seq_map = dict()
    for acc_id, seq in acc_seq_map.items():
        if acc_id not in acc_pdb_map:
            continue
        pdb_id = acc_pdb_map[acc_id]
        pdb_to_acc[pdb_id] = acc_id
        pdb_seq_map[pdb_id] = seq
    pdb_cid_map = find_cid_from_pname.find(pdb_seq_map)
    acc_pdb_cid_map = dict()
    for pdb, cid in pdb_cid_map.items():
        acc_pdb_cid_map[pdb_to_acc[pdb]] = (pdb, cid)

    # Step 3: keep only sequences with a known (pdb, cid).
    keep_only_acc(list(acc_pdb_cid_map.keys()), seq_file, cropped_seq_file)

    # Step 4: mast + motif extraction.
    meme_interface.run_mast(meme_txt, cropped_seq_file, meme_folder)
    mast_txt_path = os.path.join(meme_folder, 'mast.txt')
    acc_motif_map = meme_interface.extract_motifs_mast_uniprot(
        mast_txt_path, 14)
    acc_motif_map = motif_finder._delete_gapped_motifs_uniprot(
        acc_motif_map, cropped_seq_file)

    # Step 5: re-key by pdb id and persist.
    pdb_motif_pos = defaultdict(dict)
    for acc, motif_pos in acc_motif_map.items():
        pdb_id, cid = acc_pdb_cid_map[acc]
        record = pdb_motif_pos[pdb_id]
        record['sno_markers'] = motif_pos
        record['cid'] = cid
    with open(output, 'wb') as pkl_out:
        pickle.dump(pdb_motif_pos, pkl_out, protocol=pickle.HIGHEST_PROTOCOL)

    if use_tmp_paths:
        for leftover in (composition_file, meme_txt, meme_folder,
                         cropped_seq_file):
            shutil.move(leftover, paths.TRASH)
def filter_seq_file(seq_path, threshold=50):
    """
    Run filter_seqs.delete_short_seqs() on an existing sequence file.

    :param seq_path: sequence file to filter, must exist
        (e.g. paths.FULL_SEQS)
    :param threshold: value forwarded unchanged to delete_short_seqs()
    """
    generic.quit_if_missing(seq_path)
    filter_seqs.delete_short_seqs(
        seq_path,
        threshold,
    )