def main0():
    """Run the SpectraST-based library build inside ``output_directory``.

    In order: create the output directory, save the planned commands to
    ``cmds.txt``, obtain the peptide-ion probability cutoff from
    ``get_pep_ion_minprob``, run the command built by ``spectrast_cmd``
    (saved to ``cmds2.txt``), call ``filter_proteins``, optionally call
    ``modify_splib``, then run the shell command groups
    ``spectrast_cmds_part1`` .. ``spectrast_cmds_part3``.

    Every input is a module-level name; nothing is returned.
    """
    output_directory.mkdir(exist_ok=overwrite)
    workdir = os_fspath(output_directory)

    def run_shell(commands, check):
        # All shell stages run from the output directory.
        return subprocess.run(adjust_command(commands), shell=True,
                              cwd=workdir, check=check)

    print("running:\n" + allcmds)
    (output_directory / "cmds.txt").write_text(allcmds)

    # Alternative cutoff choice: Filter_option.by_2D_filtering
    pep_ion_minprob = get_pep_ion_minprob(Filter_option.all)
    import_argv = spectrast_cmd(pep_ion_minprob)
    cmd2 = " ".join(import_argv)
    print(f"running:\n{cmd2}\n…")
    (output_directory / "cmds2.txt").write_text(cmd2)
    subprocess.run(import_argv, cwd=workdir, check=True)

    print("take only proteins from philosopher’s proteins.fas…")
    filter_proteins(fasta, decoy_prefix)
    # swathwindowssetup_file_path.write_text(txt, "ascii")

    if is_DIA_Umpire_output:
        print("modifying splib file to combine Q[123] from DIA-umpire…")
        modify_splib()

    run_shell(spectrast_cmds_part1, True)

    # The part-2 stage is allowed to fail; when it fails, or when it is not
    # requested at all, the '...con001' library is renamed to the name the
    # last stage is given.
    if align_with_iRT:
        use_unaligned = run_shell(spectrast_cmds_part2, False).returncode != 0
    else:
        use_unaligned = True
    if use_unaligned:
        shutil.move(output_directory / 'output_file_irt_con001.splib',
                    output_directory / 'output_file_irt_con.splib')

    run_shell(spectrast_cmds_part3, True)
def main0():
    """Run the SpectraST-based library build inside ``output_directory``.

    In order: create the output directory, save the planned commands to
    ``cmds.txt``, obtain the peptide-ion probability cutoff (from the saved
    philosopher filter log when ``skip_philosopher_filter`` is set), write
    ``spectrast.usermods``, run the command built by ``spectrast_cmd``
    (saved to ``cmds2.txt``), call ``filter_proteins``, optionally call
    ``modify_splib``, then run the shell command groups
    ``spectrast_cmds_part1`` .. ``spectrast_cmds_part3``.

    Every input is a module-level name; nothing is returned.
    """
    output_directory.mkdir(exist_ok=overwrite)
    workdir = os_fspath(output_directory)
    unaligned_lib = output_directory / 'output_file_irt_con001.splib'
    final_lib = output_directory / 'output_file_irt_con.splib'

    print(f'''Spectral library building Commands to execute: {allcmds} {'~' * 69}''', flush=True)
    (output_directory / "cmds.txt").write_text(allcmds)

    if skip_philosopher_filter:
        filter_log = philosopher_filter_log_path.read_text()
    else:
        filter_log = None
    # Alternative cutoff choice: Filter_option.by_2D_filtering
    pep_ion_minprob = get_pep_ion_minprob(Filter_option.all, filter_log)

    # http://tools.proteomecenter.org/wiki/index.php?title=Software:SpectraST#User-defined_Modifications
    # http://tools.proteomecenter.org/wiki/index.php?title=Spectrast.usermods
    # NOTE(review): this literal is reproduced exactly as it appears in the
    # (whitespace-collapsed) source; confirm whether the entries were
    # originally separated by line breaks.
    (output_directory / 'spectrast.usermods').write_text(
        r'''M|+16| C|+57| n|+42| C|119.004099|Cysteinyl c|-0.02|AmidatedCorrected ''')
    # Entries kept for reference, not written:
    #   c|-0.984016|Amidated
    #   c[c[17]]|-0.984016|Amidated

    import_argv = spectrast_cmd(pep_ion_minprob)
    cmd2 = " ".join(import_argv)
    print(f'Executing:{cmd2}\n')
    (output_directory / "cmds2.txt").write_text(cmd2)
    subprocess.run(import_argv, cwd=workdir, check=True)

    # print("take only proteins from philosopher’s proteins.fas")
    filter_proteins(fasta, decoy_prefix)
    # swathwindowssetup_file_path.write_text(txt, "ascii")

    if is_DIA_Umpire_output:
        print("modifying splib file to combine Q[123] from DIA-umpire")
        modify_splib()

    print(f'Executing:{spectrast_cmds_part1}\n')
    subprocess.run(adjust_command(spectrast_cmds_part1), shell=True,
                   cwd=workdir, check=True)

    # The part-2 stage may fail: its combined output is then saved to a log
    # file and the '...con001' library is used under the final name instead.
    use_unaligned = True
    if align_with_iRT:
        print(f'Executing:{spectrast_cmds_part2}\n')
        cp = subprocess.run(adjust_command(spectrast_cmds_part2), shell=True,
                            cwd=workdir, check=False,
                            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        if cp.returncode == 0:
            print(cp.stdout.decode())
            print('iRT alignment done\n')
            use_unaligned = False
        else:
            print('Skipping iRT alignment\n')
            (output_directory / 'spectrast2spectrast_irt.log').write_bytes(cp.stdout)
    if use_unaligned:
        shutil.move(unaligned_lib, final_lib)

    spectrast2tsv_additional_mods_path.write_text(spectrast2tsv_additional_mods_tsv_txt)
    print(f'Executing:{spectrast_cmds_part3}\n')
    subprocess.run(adjust_command(spectrast_cmds_part3), shell=True,
                   cwd=workdir, check=True)
def main_easypqp():
    """Build the spectral library with EasyPQP inside ``output_directory``.

    In order: create the output directory, prepare ``irt_file`` according to
    ``irt_choice``, save the planned commands to ``cmds.txt``, print the
    EasyPQP version, run every command of ``easypqp_convert_cmds`` with at
    most ``nproc`` running at once (each one logging to
    ``easypqp_convert_<i>.log``), and finally run the library command.

    Every input is a module-level name; nothing is returned.

    Raises:
        AssertionError: if any convert command exits with a non-zero status
            (its log is printed first).
        SystemExit: if the library command fails.
    """
    output_directory.mkdir(exist_ok=overwrite)
    if irt_choice is Irt_choice.iRT:
        # NOTE(review): pandas 1.5 renamed this keyword to `lineterminator`
        # and 2.0 removed the old spelling - confirm the pandas version in use.
        irt_df.to_csv(irt_file, index=False, sep='\t', line_terminator='\n')
    elif irt_choice is Irt_choice.ciRT:
        shutil.copyfile(script_dir / 'hela_irtkit.tsv', irt_file)
    elif irt_choice is Irt_choice.userRT:
        shutil.copyfile(userRT_file, irt_file)
    print(f'''Spectral library building Commands to execute: {allcmds} {'~' * 69}''', flush=True)
    (output_directory / "cmds.txt").write_text(allcmds)
    subprocess.run([os.fspath(easypqp), '--version'], check=True)

    procs = []
    for i, e in enumerate(easypqp_convert_cmds):
        # Throttle: wait until fewer than `nproc` children are still running.
        while sum(p.poll() is None for p in procs) >= nproc:
            time.sleep(1)
        # The child receives its own copy of the log handle, so the parent's
        # handle can be closed as soon as the process has been started.
        with open(output_directory / f'easypqp_convert_{i}.log', 'w') as log_file:
            procs.append(
                subprocess.Popen(e, cwd=os_fspath(output_directory),
                                 stdout=log_file, stderr=subprocess.STDOUT))
        print(f'Executing {e}')
    for p in procs:
        p.wait()

    for i, p in enumerate(procs):
        if p.returncode != 0:
            print("EasyPQP convert error BEGIN")
            try:
                with open(output_directory / f'easypqp_convert_{i}.log') as log_file:
                    print(log_file.read(), end="")
            except OSError as err:
                print(err)
            print("EasyPQP convert error END")
    assert all(p.returncode == 0 for p in procs)

    try:
        subprocess.run(easypqp_library_cmd(use_iRT),
                       cwd=os_fspath(output_directory), check=True)
    except subprocess.CalledProcessError:
        print(
            '''Library not generated, not enough peptides could be found for alignment. Please try using other options for alignment (e.g. ciRT if used other options)'''
        )
        # NOTE(review): this exits with status 0 although no library was
        # produced - confirm that callers do not rely on the exit status.
        sys.exit()
def spectrast_cmd(prob):
    """Return the SpectraST argument list for the given probability cutoff.

    The list starts with the executable at ``SPECTRAST_PATH``, followed by
    fixed options, ``-cP<prob>``, an ``-cN`` option pointing at ``input000``
    inside ``output_directory``, and finally every path in
    ``iproph_pep_xmls``.
    """
    library_stem = output_directory / 'input000'
    argv = [
        os_fspath(SPECTRAST_PATH),
        "-c_BIN!",
        f"-cP{prob}",
        "-cIHCD",
        f"-cN{library_stem}",
    ]
    argv.extend(os_fspath(pep_xml) for pep_xml in iproph_pep_xmls)
    return argv
def table_from_pep_xml(infile: pathlib.Path):
    """Return a DataFrame of confident identifications from one pepXML file.

    The mzXML file is located from the ``base_name`` attribute of the
    ``msms_run_summary`` elements. If that file does not exist, the path is
    taken from the ``spectrum, path`` search parameter instead and every
    ``base_name="..." `` attribute in ``infile`` is rewritten in place to
    point at it.

    Rows are the distinct (peptide name, charge, retention time) triples
    produced by ``get_pep`` whose probability exceeds ``PEPTIDE_PROB``; the
    columns are ``FullUniModPeptideName``, ``PrecursorCharge`` and
    ``Tr_recalibrated``. Row order is not defined (rows come from a set).

    Raises:
        ValueError: if the run summaries do not resolve to exactly one mzXML
            file, or the fallback parameter is not found exactly once.
        FileNotFoundError: if the fallback path does not exist either.
    """
    tree = lxml.etree.parse(os_fspath(infile))
    run_summaries = tree.findall(
        "/{http://regis-web.systemsbiology.net/pepXML}msms_run_summary")
    try:
        (msms_file, ) = {
            pathlib.Path(run.get("base_name")).with_suffix(".mzXML").resolve(strict=True)
            for run in run_summaries
        }
    except FileNotFoundError:
        [spectrum_path] = tree.findall(
            # "/{http://regis-web.systemsbiology.net/pepXML}msms_pipeline_analysis"
            "/{http://regis-web.systemsbiology.net/pepXML}msms_run_summary"
            "/{http://regis-web.systemsbiology.net/pepXML}search_summary"
            "/{*}parameter[@name='spectrum, path']")
        msms_file = pathlib.Path(
            spectrum_path.get("value")).resolve(strict=True)
        import re
        new_attr = f'base_name="{msms_file}" '
        # A callable replacement is used so that backslashes in the path
        # (e.g. Windows paths) are inserted literally rather than being
        # parsed as escapes of a replacement template. The file is written
        # back with the same encoding it was read with.
        infile.write_text(
            re.compile('base_name="(.+?)" ').sub(
                lambda _match: new_attr, infile.read_text('utf-8')),
            encoding='utf-8')
    scannum_to_rt = get_scannum_to_rt(msms_file)
    gen = (get_pep(ee, scannum_to_rt)
           for ee in tree.findall("/{*}msms_run_summary/{*}spectrum_query"))
    p = {
        (FullUniModPeptideName, PrecursorCharge, rt)
        for probability, FullUniModPeptideName, PrecursorCharge, rt in gen
        if probability > PEPTIDE_PROB
    }
    colnames = ["FullUniModPeptideName", "PrecursorCharge", "Tr_recalibrated"]
    # zip(colnames, *p) transposes the row tuples into one column per name.
    return pd.DataFrame({colname: e for colname, *e in zip(colnames, *p)})
def get_prot_group_infos(p: pathlib.Path):
    """Summarize every ``protein`` element found under a ``protein_group``.

    Parses the XML file at ``p`` and returns a list with one tuple per
    protein: ``(protein_name, [names of its indistinguishable_protein
    children], (number of peptide children, sum of their
    nsp_adjusted_probability values))``.
    """
    import lxml.etree
    root = lxml.etree.parse(os_fspath(p)).getroot()

    infos = []
    for protein in root.iterfind(".//{*}protein_group/{*}protein"):
        peptides = protein.findall("{*}peptide")
        prob_sum = sum(
            float(peptide.get("nsp_adjusted_probability"))
            for peptide in peptides)
        indistinguishable = [
            alt.get('protein_name')
            for alt in protein.findall("{*}indistinguishable_protein")
        ]
        infos.append((protein.get('protein_name'),
                      indistinguishable,
                      (len(peptides), prob_sum)))
    return infos
def get_pep_ion_minprob(opt: Filter_option, philosopher_filter_log: str = None):
    """Return the ion-level probability threshold selected by ``opt``.

    When ``philosopher_filter_log`` is given, the work is delegated to
    ``get_pep_ion_minprob_from_log``. Otherwise ``phi_cmd_part1`` is run in
    ``output_directory``; its output is echoed to stdout, copied to
    ``phi_log`` and scanned for " Ions ... threshold ... =<number>" entries.
    ``opt.value`` indexes the list of numbers found.

    If ``use_philosopher_fo`` is set and the command exits with status 1,
    placeholder ``proteins.fas`` and ``psm.tsv`` files are generated from
    ``.meta/pep_pro_mappings.tsv``; otherwise the command must have
    succeeded and ``phi_cmd_part2`` is run.

    Raises:
        NotImplementedError: on platforms other than linux and win32
            (previously this surfaced as an UnboundLocalError).
        AssertionError: if ``phi_cmd_part1`` fails in the normal path.
        subprocess.CalledProcessError: if ``phi_cmd_part2`` fails.
        IndexError: if fewer thresholds were found than ``opt.value`` needs.
    """
    if philosopher_filter_log is not None:
        return get_pep_ion_minprob_from_log(opt, philosopher_filter_log)

    # The log lines are read from stderr on linux and from stdout on win32.
    if sys.platform == 'linux':
        stream_name = 'stderr'
    elif sys.platform == 'win32':
        stream_name = 'stdout'
    else:
        raise NotImplementedError(f'unsupported platform: {sys.platform}')

    outl = []
    console = sys.stdout.buffer
    ### get 2D FDR
    with subprocess.Popen(adjust_command(phi_cmd_part1), shell=True,
                          cwd=os_fspath(output_directory),
                          **{stream_name: subprocess.PIPE}) as proc1, \
            phi_log.open('wb') as f2:
        for line in getattr(proc1, stream_name):
            # Echo and log every line as it arrives, and keep it for parsing.
            console.write(line)
            console.flush()
            f2.write(line)
            f2.flush()
            outl.append(line)

    if use_philosopher_fo and proc1.returncode == 1:
        mappings_txt = (output_directory / '.meta' / 'pep_pro_mappings.tsv').read_text()
        # Split in front of every 'sp|' so each chunk starts with one entry;
        # the first tab-separated field is written out as the protein and
        # the remaining fields as its peptides.
        rows = [chunk.split('\t') for chunk in re.compile('(?=sp\\|)').split(mappings_txt)]
        pep_to_prot = {pep: prot for prot, *peps in rows for pep in peps}
        ## create dummy fasta ('x' mode: fails if the file already exists)
        with (output_directory / 'proteins.fas').open('x') as fasta_out:
            for prot in sorted(set(pep_to_prot.values())):
                fasta_out.write(f'>{prot}\nDUMMY\n')
        # create dummy psm.tsv
        (output_directory / 'psm.tsv').write_text(
            'Peptide\tProtein\n' +
            '\n'.join(f'{pep}\t{prot}' for prot, *peps in rows for pep in peps)
        )
    else:
        assert proc1.returncode == 0, [proc1.args, proc1.returncode]
        ## filter original fasta file
        # NOTE(review): nesting reconstructed from a whitespace-collapsed
        # source; this step is assumed to belong to the normal path only.
        subprocess.run(phi_cmd_part2, shell=True, stderr=subprocess.STDOUT,
                       cwd=os_fspath(output_directory), check=True)

    # Lines starting with "+" are dropped before parsing.
    out = b"".join(line for line in outl if not line.startswith(b"+"))
    outtxt = out.decode("ascii")
    res2 = [float(e) for e in re.compile(' Ions.+threshold.*?=([0-9.]+)').findall(outtxt)]
    return res2[opt.value]
def get_pep_ion_minprob(opt: Filter_option):
    """Return the ion-level probability threshold selected by ``opt``.

    Runs ``phi_cmd_part1`` in ``output_directory``, echoing its output to
    stdout and copying it to ``phi_log``, then runs ``phi_cmd_part2``. The
    captured output is scanned for " Ions ... threshold ... =<number>"
    entries and ``opt.value`` indexes the list of numbers found.

    Raises:
        NotImplementedError: on platforms other than linux and win32
            (previously this surfaced as an UnboundLocalError).
        AssertionError: if ``phi_cmd_part1`` exits with a non-zero status.
        subprocess.CalledProcessError: if ``phi_cmd_part2`` fails.
        IndexError: if fewer thresholds were found than ``opt.value`` needs.
    """
    # The log lines are read from stderr on linux and from stdout on win32.
    if sys.platform == 'linux':
        stream_name = 'stderr'
    elif sys.platform == 'win32':
        stream_name = 'stdout'
    else:
        raise NotImplementedError(f'unsupported platform: {sys.platform}')

    outl = []
    console = sys.stdout.buffer
    ### get 2D FDR
    with subprocess.Popen(adjust_command(phi_cmd_part1), shell=True,
                          cwd=os_fspath(output_directory),
                          **{stream_name: subprocess.PIPE}) as proc1, \
            phi_log.open('wb') as f2:
        for line in getattr(proc1, stream_name):
            # Echo and log every line as it arrives, and keep it for parsing.
            console.write(line)
            console.flush()
            f2.write(line)
            f2.flush()
            outl.append(line)
    assert proc1.returncode == 0, [proc1.args, proc1.returncode]

    ## filter original fasta file
    subprocess.run(phi_cmd_part2, shell=True, stderr=subprocess.STDOUT,
                   cwd=os_fspath(output_directory), check=True)

    # Lines starting with "+" are dropped before parsing.
    out = b"".join(line for line in outl if not line.startswith(b"+"))
    outtxt = out.decode("ascii")
    res2 = [
        float(e)
        for e in re.compile(' Ions.+threshold.*?=([0-9.]+)').findall(outtxt)
    ]
    return res2[opt.value]
def get_scannum_to_rt(msms_file: pathlib.Path):
    """Return a float32 array mapping scan number to retention time.

    Streams through the mzXML file and, for every ``scan`` element, reads
    its ``num`` attribute and its ``retentionTime`` attribute, which must
    have the form ``PT<number>S``. The result has ``max(num) + 1`` entries;
    index ``num`` holds the parsed number and every index without a scan
    holds NaN.

    Raises:
        AssertionError: if the file suffix is not ``.mzXML`` (compared
            case-insensitively) or a retention time is not ``PT...S``.
        ValueError: if the file contains no ``scan`` element.
    """
    assert msms_file.suffix.casefold() == '.mzXML'.casefold()
    from xml.dom import pulldom
    doc = pulldom.parse(os_fspath(msms_file))
    scannums, rts = [], []
    try:
        for event, node in doc:
            if event is pulldom.START_ELEMENT and node.tagName == "scan":
                scannum = int(node.getAttribute("num"))
                rt_str = node.getAttribute("retentionTime")
                assert rt_str.startswith("PT") and rt_str.endswith("S")
                scannums.append(scannum)
                rts.append(float(rt_str[2:-1]))
    finally:
        # Release the file handle even when parsing fails part-way.
        doc.stream.close()
    if not scannums:
        raise ValueError(f'no <scan> elements found in {msms_file}')
    scannum_to_rt = np.full(max(scannums) + 1, np.nan, dtype=np.float32)
    # With repeated indices the last assigned value remains, which matches
    # an element-by-element assignment loop.
    scannum_to_rt[scannums] = rts
    return scannum_to_rt
return m[1] import pandas as pd, pathlib t = pd.read_table("output_irt_con.tsv") philosopher_psm_tsv = pd.read_table('psm.tsv') pep_to_razor_prot = { pep: razor_prot for _, pep, razor_prot in philosopher_psm_tsv[["Peptide", "Protein" ]].itertuples() } t["razor_Protein"] = t["PeptideSequence"].map(pep_to_razor_prot.get) pep_init_prob = get_pep_init_prob(p) pathlib.Path('con_lib_not_in_psm_tsv.tsv').write_text( t[t["razor_Protein"].isnull()].assign( init_prob=t["PeptideSequence"].map(pep_init_prob.get).map( lambda x: "" if x is None else ','.join(x))).to_csv( sep='\t', index=False).replace('(UniMod:5)', '(UniMod:1)')) fout = pathlib.Path('con_lib.tsv') print(f'writing {fout.resolve()}') fout.write_text(t[t["razor_Protein"].notnull()].to_csv( sep='\t', index=False).replace('(UniMod:5)', '(UniMod:1)')) main0() os.chdir(os_fspath(output_directory)) edit_raw_con_lib() os.chdir(CWD) # if __name__=='__main__': # main()
def write_RT_aligned_pepxml(iproph_pep_xml, rt_aligned_pepxml, reg_obj):
    """Write a copy of a pepXML file, and of its mzXML file, with remapped RTs.

    Every ``retention_time_sec="..."`` value in ``iproph_pep_xml`` and every
    ``retentionTime="PT...S"`` value in the mzXML file it refers to is
    replaced by ``str(predict(value, reg_obj))``. The rewritten pepXML is
    written to ``rt_aligned_pepxml`` and the rewritten mzXML next to it
    (same file name as the source mzXML), after which the ``indexmzXML``
    tool from ``TPP_BIN`` is run on the new mzXML and its ``.new`` output
    replaces that file.

    Args:
        iproph_pep_xml: path object of the input pepXML file.
        rt_aligned_pepxml: path object of the pepXML file to write.
        reg_obj: passed through to ``predict``; the string ``"ref run"``
            first triggers a plain copy of the input.

    Raises:
        AssertionError: if the mzXML candidates differ in content, a line
            holds more than one retention time, no retention time was
            replaced at all, a run-summary line has no ``base_name``, or
            ``indexmzXML`` exits with a non-zero status.
        StopIteration: if none of the referenced mzXML files exists.
    """
    # reg_obj_max_idx = len(reg_obj) - 1
    # def repl(x):
    # 	idx = round(float(x.group()) * 10)
    # 	return str(reg_obj[min(idx,reg_obj_max_idx)])
    if reg_obj == "ref run":
        # NOTE(review): no `return` is visible after this copy, so the code
        # below appears to run as well and rewrite the same output file -
        # confirm against the properly formatted source.
        with rt_aligned_pepxml.open("wt") as newf, \
                iproph_pep_xml.open("rt") as origf:
            import shutil
            shutil.copyfileobj(origf, newf)

    def repl(x):
        # Regex callback: the whole match is the old retention time.
        return str(predict(float(x.group()), reg_obj))
        ## spectrast will fail reading mzXML file if the replacement is not of the same length
        # The two returns below are unreachable.
        return format(predict(float(x.group()), reg_obj), ".6f")[:len(x.group())]
        return format(predict(float(x.group()), reg_obj), ".2f")

    import pathlib, re, filecmp

    def get_msms_from_pep_xml(p):
        # Collect the base_name of every msms_run_summary and keep the
        # corresponding '.mzXML' paths that exist on disk.
        t = p.read_text()
        paths = [
            pathlib.Path(p) for p in re.compile(
                '<msms_run_summary base_name="(.+?)"').findall(t)
        ]
        paths1 = [(p.parent / p.stem).with_suffix('.mzXML') for p in paths]
        # (msms_file,) = set(filter(pathlib.Path.exists, paths1))
        msms_files = set(filter(pathlib.Path.exists, paths1))
        msms_file = next(iter(msms_files))
        # Several candidates are accepted only if they compare equal
        # according to filecmp.cmp.
        assert all([filecmp.cmp(f, msms_file) for f in msms_files]), msms_files
        return msms_file

    msms_file = get_msms_from_pep_xml(iproph_pep_xml)
    recomp = re.compile('(?<=retention_time_sec=")(.+?)(?=")')
    recomp_base_name = re.compile('base_name="(.+?)"')
    with rt_aligned_pepxml.open("wt") as newf, \
            iproph_pep_xml.open("rt") as origf:
        recomp2 = re.compile('(?<=retentionTime="PT)(.+?)(?=S")')
        new_msms_file = rt_aligned_pepxml.parent / msms_file.name
        # Rewrite the mzXML first, one line at a time.
        with msms_file.open("rt") as msms_file_obj, \
                new_msms_file.open("wt") as new_msms_file_obj:
            total_repl = 0
            for line_1 in msms_file_obj:
                line_new, count = recomp2.subn(repl, line_1)
                assert count in (0, 1)
                total_repl += count
                new_msms_file_obj.write(line_new)
        assert total_repl > 0
        # The indexer is started here and runs while the pepXML is rewritten.
        proc = subprocess.Popen(
            [os_fspath(TPP_BIN / 'indexmzXML'), os_fspath(new_msms_file)],
            stdout=subprocess.DEVNULL)
        for line in origf:
            if '<msms_run_summary' in line:
                # NOTE(review): the path is used as a regex replacement
                # template here, so backslashes in it would be interpreted
                # as escapes - verify on Windows paths.
                newline, count = recomp_base_name.subn(
                    f'''base_name="{msms_file.with_suffix('')}"''', line, 1)
                assert count == 1
                newf.write(newline)
            else:
                newf.write(recomp.sub(repl, line))
    proc.wait()
    assert proc.returncode == 0, [proc.args, proc.returncode]
    # Replace the rewritten mzXML with the '<name>.new' file.
    new_msms_file.unlink()
    (new_msms_file.parent / (new_msms_file.name + ".new")).rename(new_msms_file)
# Set up the working directory and the input file lists for RT alignment.
# Statement order matters: paths are resolved before the working directory
# is changed.

# exist_ok=False: an already existing directory makes this raise.
rtalign_data_directory.mkdir(parents=True, exist_ok=False)
rt_dicts_file = rtalign_data_directory / "RT_dicts.pickle"
# Probability cutoff.
PEPTIDE_PROB = 0.9
# Absolute forms of the inputs, taken before os.chdir below; None entries
# are passed through unchanged.
abs_paths = [
    None if e is None else e.resolve() for e in [
        rtalign_data_directory, dia_pepxml_directory, dda_pepxml_directory,
        TPP_BIN / 'indexmzXML'
    ]
]
# strict=True makes this fail if the indexmzXML path does not exist.
print((TPP_BIN / 'indexmzXML').resolve(strict=True))
print("\n".join(str(e) for e in abs_paths))
# Remember the starting directory, then work inside the alignment directory.
CWD = os.getcwd()
os.chdir(os_fspath(rtalign_data_directory))
# NOTE(review): the path variables below are used as given, not via
# abs_paths; if any of them is relative it is now interpreted relative to
# the new working directory - confirm they are absolute.
# dia_pepxml_directory = rtalign_data_directory / "iproph"
assert dia_pepxml_directory.exists()
pep_xml_rt_aligned_dir = rtalign_data_directory / "RT_aligned"
if has_DDA is True:
    assert dda_pepxml_directory.exists()
    dda_iproph_pep_xmls = sorted(
        fn.resolve() for fn in dda_pepxml_directory.glob("*.pep.xml"))
    assert len(dda_iproph_pep_xmls) > 0
    dda_pep_xml_rt_aligned_dir = rtalign_data_directory / "RT_aligned_DDA"
# rt_align_dir = data_directory / "RTalign"
iproph_pep_xmls = sorted(fn.resolve()
                         for fn in dia_pepxml_directory.glob("*.pep.xml"))
import pandas as pd
import numpy as np
import pathlib
import pickle

# Run ProteinProphet over the DIA (and, when given, DDA) iProphet pepXML
# files. Arguments: argv[1] = output directory for the combined result,
# argv[2] = DIA pepXML directory, argv[3] = DDA pepXML directory or the
# literal "none", argv[4] = directory containing the ProteinProphet binary.

if not True:
    # Disabled developer overrides of the command line.
    # NOTE(review): nesting reconstructed from a whitespace-collapsed
    # source; both assignments are assumed to belong to this disabled block.
    sys.argv = ["%(prog)s", "./workdir/libgen/combined_prots", "./workdir/iproph",
                "/data/dattam/PROJECTS/CoreFacility/PDLC/dda-lib-atcc-mm-R1/workdir/iproph",
                "/data/teog/tpp5/bin/"]
    sys.argv = ["%(prog)s", "./DIA/workdir/libgen/combined_prots", "./DIA/workdir/iproph",
                "./DDA/workdir/iproph", "/data/teog/tpp5/bin/"]

has_DDA = sys.argv[3] != "none"
combined_prot_data_directory = str_to_path(sys.argv[1])
dia_pepxml_directory = str_to_path(sys.argv[2])
dda_pepxml_directory = str_to_path(sys.argv[3]) if has_DDA else None
TPP_BIN = str_to_path(sys.argv[4])

dia_pep_xmls = list(map(os_fspath, dia_pepxml_directory.glob("*.iproph.pep.xml")))
# Without DDA input there is no directory to scan: dda_pepxml_directory is
# None, so it must not be dereferenced.
if has_DDA:
    dda_pep_xmls = list(map(os_fspath, dda_pepxml_directory.glob("*.iproph.pep.xml")))
else:
    dda_pep_xmls = []

raise_if_not(dia_pepxml_directory.exists(), "nonexistant DIA pep xml directory")
if has_DDA:
    raise_if_not(dda_pepxml_directory.exists(), "nonexistant DDA pep xml directory")
raise_if_not(len(dia_pep_xmls) > 0, "no DIA pep xml found")
if has_DDA:
    raise_if_not(len(dda_pep_xmls) > 0, "no DDA pep xml found")

combined_prot_data_directory.mkdir(exist_ok=True)
# NOTE(review): the exit status of ProteinProphet is not checked here.
subprocess.run([os_fspath(TPP_BIN / "ProteinProphet")] + dia_pep_xmls + dda_pep_xmls +
               [os_fspath(combined_prot_data_directory / "interact.prot.xml")] +
               ["IPROPHET", "MINPROB0.9"],
               cwd=combined_prot_data_directory)