def test_identify_identified(self):
    """When only identified spectra receive ids, 3151 references result."""
    transferer = id_transferer.IdTransferer(add_to_identified=True,
                                            add_to_unidentified=False)
    for cluster in clustering_parser.ClusteringParser(self.testfile):
        transferer.process_cluster(cluster)

    self.assertEqual(3151, len(transferer.identification_references))
def extract_clusters(cluster_ids, clustering_file):
    """
    Extract the clusters with the passed ids from a .clustering file.

    :param cluster_ids: Iterable of cluster ids to extract.
    :param clustering_file: Path to the .clustering file to process.
    :return: A list of the matching cluster objects in file order.
    """
    # use a set for O(1) membership tests — the original scanned the
    # id list once per cluster in the (potentially large) file
    wanted_ids = set(cluster_ids)

    clusters = list()
    for cluster in clustering_parser.ClusteringParser(clustering_file):
        if cluster.id in wanted_ids:
            clusters.append(cluster)
            # stop reading as soon as every requested cluster was found
            if len(clusters) == len(wanted_ids):
                break

    return clusters
def test_wrong_spec_counts(self):
    """The last cluster in testfile3 reports its spectrum counts correctly."""
    all_clusters = list(clustering_parser.ClusteringParser(self.testfile3))

    self.assertEqual(27, len(all_clusters))
    last_cluster = all_clusters[26]
    self.assertEqual(13, last_cluster.n_spectra)
    self.assertEqual(5, last_cluster.identified_spectra)
def test_parse_clustering_file(self):
    """
    Parse the main test file, making sure no cluster id occurs twice, and
    spot-check the first, second, and last cluster of the 838 expected.
    """
    parser = clustering_parser.ClusteringParser(self.testfile)
    n_clusters = 0
    seen_clusters = set()

    for cluster in parser:
        n_clusters += 1

        if cluster.id in seen_clusters:
            self.fail(cluster.id + " was encountered twice.")
        seen_clusters.add(cluster.id)

        if n_clusters == 1:
            self.assertEqual("1cc813a1-4e75-4c1d-99aa-752312fbe554",
                             cluster.id)
            self.assertEqual(359.155, cluster.precursor_mz)

            # fetch the spectra once instead of calling get_spectra()
            # for every assertion
            spectra = cluster.get_spectra()
            self.assertEqual(2, len(spectra))
            self.assertEqual(1, len(spectra[0].psms))
            # next(iter(...)) is the idiomatic replacement for calling
            # __iter__().__next__() directly
            self.assertEqual("RPHFFFPK",
                             next(iter(spectra[0].psms)).sequence)
            self.assertEqual(1, len(cluster.max_sequences))
            self.assertEqual("RPHFFFPK", cluster.max_sequences[0])
            self.assertEqual(1, cluster.max_ratio)
            self.assertEqual(1, cluster.max_il_ratio)

            # the single modified PSM must render its PTM in the string form
            for spectrum in spectra:
                for psm in spectrum.psms:
                    if len(psm.ptms) > 0:
                        self.assertEqual("R[MOD:1234]PHFFFPK", str(psm))

        if n_clusters == 2:
            self.assertEqual("9a582e74-e8b1-451d-a007-cadc362aa2ce",
                             cluster.id)
            self.assertEqual(2 / 3, cluster.max_ratio)
            self.assertEqual(2 / 3, cluster.max_il_ratio)
            self.assertEqual(1, len(cluster.max_sequences))
            self.assertEqual("MEGIGLK", cluster.max_sequences[0])

        # make sure the last cluster is read correctly
        if n_clusters == 838:
            self.assertEqual("25ed3015-f2d8-4df1-ac96-c23076c96bfe",
                             cluster.id)
            self.assertEqual(1, cluster.max_ratio)
            self.assertEqual(1, cluster.max_il_ratio)
            self.assertEqual(1, len(cluster.max_sequences))
            self.assertEqual("MQEAMTQEVSDVFSDTTTPIK",
                             cluster.max_sequences[0])

    self.assertEqual(838, n_clusters)
def main():
    """
    Primary entry function for the CLI.
    :return:
    """
    arguments = docopt(__doc__, version='cluster_filter.py 1.0 BETA')

    # make sure the input file exists
    if not os.path.isfile(arguments['--input']):
        print("Error: Cannot find input file '" + arguments["--input"] + "'")
        sys.exit(1)

    # make sure the output file does not exist
    if os.path.isfile(arguments["--output"]):
        print("Error: Output file exists '" + arguments["--output"] + "'")
        sys.exit(1)

    project_ids = None
    cluster_ids = None

    if "--project_ids" in arguments and arguments["--project_ids"] is not None:
        with open(arguments["--project_ids"], "r") as IN:
            project_ids = [line.strip() for line in IN]

    if "--cluster_ids" in arguments and arguments["--cluster_ids"] is not None:
        # use a set: the filter below tests membership once per cluster,
        # which is O(n) per test on a list
        with open(arguments["--cluster_ids"], "r") as IN:
            cluster_ids = {line.strip() for line in IN}

    # write to a ".part" file first so an aborted run does not leave a
    # seemingly complete output file behind
    with open(arguments["--output"] + ".part", "w") as OUT:
        # create the id transferer based on the settings
        analyser = create_analyser(arguments, OUT)

        # process all clusters
        parser = clustering_parser.ClusteringParser(arguments["--input"])
        print("Parsing input .clustering file...")

        for cluster in parser:
            # filter based on cluster ids if set
            if cluster_ids is not None and cluster.id not in cluster_ids:
                continue

            if project_ids is not None:
                analyser.remove_spectra_by_project(cluster, project_ids)

            analyser.process_cluster(cluster)

    os.rename(arguments["--output"] + ".part", arguments["--output"])
    print("Results written to " + arguments["--output"])
def main():
    """
    Primary entry function for the CLI.
    :return:
    """
    arguments = docopt(__doc__, version='cluster_parameter_extractor 1.0 BETA')

    input_file = arguments['--input']
    output_file = arguments["--output"]
    process_synthetic = arguments["--synthetic_peptides"]

    # make sure the input file exists
    if not os.path.isfile(input_file):
        print("Error: Cannot find input file '" + input_file + "'")
        sys.exit(1)

    # make sure the output file does not exist
    if os.path.isfile(output_file):
        print("Error: Output file exists '" + output_file + "'")
        sys.exit(1)

    # column names of the tab-separated result file
    header_columns = [
        "id", "precursor_mz", "av_charge", "size", "identified_spec_count",
        "unidentified_spec_count", "max_ratio", "max_il_ratio",
        "precursor_mz_range", "sequences",
        "max_sequence", "max_sequence_count", "max_sequence_mods",
        "second_max_sequence", "second_max_sequence_count",
        "second_max_sequence_mods", "n_input_files",
        "max_consensus_peak_rel_tic", "max_consensus_peak_mz"]

    with open(output_file, "w") as OUT:
        # write the header
        OUT.write("\t".join(header_columns))
        if process_synthetic:
            OUT.write("\tsynth_count\tsynth_ratio\tsynth_max_sequence")
        OUT.write("\n")

        # process the file cluster by cluster
        for cluster in clustering_parser.ClusteringParser(input_file):
            OUT.write(process_cluster(cluster))

            # process synthetic peptides
            if process_synthetic:
                OUT.write("\t" + process_synthetic_peptides(cluster))

            OUT.write("\n")

    print("Results written to " + output_file)
def testClusterAsFeatures(self):
    """Clusters are converted into per-sample feature counts."""
    feature_analyser = cluster_features.ClusterAsFeatures(
        sample_name_extractor=ClusterAsFeaturesTest.pride_project_extractor)

    for cluster in clustering_parser.ClusteringParser(self.testfile):
        feature_analyser.process_cluster(cluster)

    self.assertEqual(838, len(feature_analyser.features))

    first_feature = feature_analyser.features[0]
    self.assertEqual(1, len(first_feature))
    self.assertEqual(2, first_feature["PRD000001"])

    self.assertEqual(1, len(feature_analyser.samples))
    self.assertTrue("PRD000001" in feature_analyser.samples)
def test_retention_time(self):
    """
    Test parsing .clustering files that contain the retention time as
    additional parameter
    """
    test_file = os.path.join(os.path.dirname(__file__), "testfiles",
                             "retention_time_test.clustering")

    for cluster in clustering_parser.ClusteringParser(test_file):
        self.assertIsNotNone(cluster)

        # every spectrum must carry the retention time property
        for spec in cluster.get_spectra():
            self.assertIsNotNone(spec.get_property("RT"))
def test_identifiy_all(self):
    """Ids are transferred to identified and unidentified spectra alike."""
    transferer = id_transferer.IdTransferer(True, True)
    for cluster in clustering_parser.ClusteringParser(self.testfile):
        transferer.process_cluster(cluster)

    references = transferer.identification_references
    self.assertEqual(3149, len(references))

    ref11 = references[11]
    self.assertEqual(1, len(ref11.psms))
    self.assertEqual("HQGVMVGMGQK", ref11.psms[0].sequence)
    self.assertEqual(
        "/home/jg/Projects/ebi-pride/pride-cluster-2/chimeric-spectra-generator/src/test/resources/PRD000001.st.id.mgf",
        ref11.filename)

    ref4 = references[4]
    self.assertEqual(1, len(ref4.psms))
    self.assertEqual("MEGIGLK", ref4.psms[0].sequence)
def load_spectra_to_cluster(result_file: str, before_cluster_id: str = None):
    """
    Creates a dict holding the spectra ids as keys and the cluster ids the
    spectrum belongs to as value.

    :param result_file: Path to the .clustering file to process
    :param before_cluster_id: If set this string is prepended to every
        cluster id.
    :return: A dict with the spectrum id as key and the cluster id as value
    """
    # resolve the optional prefix once instead of testing it per spectrum
    prefix = "" if before_cluster_id is None else before_cluster_id

    spec_to_cluster = dict()
    for cluster in clustering_parser.ClusteringParser(result_file):
        full_cluster_id = prefix + cluster.id
        for spectrum in cluster.get_spectra():
            spec_to_cluster[spectrum.get_id()] = full_cluster_id

    return spec_to_cluster
def main():
    """
    Primary entry function for the CLI.
    :return:
    """
    arguments = docopt(__doc__, version='id_transferer_cli 1.0 BETA')

    # make sure the input file exists
    if not os.path.isfile(arguments['--input']):
        print("Error: Cannot find input file '" + arguments["--input"] + "'")
        sys.exit(1)

    # make sure the output file does not exist
    if os.path.isfile(arguments["--output"]):
        print("Error: Output file exists '" + arguments["--output"] + "'")
        sys.exit(1)

    with open(arguments["--output"], "w") as OUT:
        # create the id transferer based on the settings
        analyser = create_analyser(arguments, OUT)

        # process all clusters
        parser = clustering_parser.ClusteringParser(arguments["--input"])
        print("Parsing input .clustering file...", end="", flush=True)

        # print a progress dot for every 1,000 clusters processed
        cluster_counter = 0
        for cluster in parser:
            analyser.process_cluster(cluster)
            cluster_counter += 1
            if cluster_counter % 1000 == 0:
                print(".", end="", flush=True)

    # add the header to the output file
    print("Adding header line...")
    analyser.add_resultfile_header(arguments["--output"])

    print("Results written to " + arguments["--output"])
def main():
    """
    Primary entry function for the CLI.
    """
    arguments = docopt(__doc__, version='id_transferer_cli 1.0 BETA')

    # make sure the input file exists
    input_file = arguments['--input']
    if not os.path.isfile(input_file):
        print("Error: Cannot find input file '" + input_file + "'")
        sys.exit(1)

    # make sure the output file does not exist
    output_file = arguments["--output"]
    if os.path.isfile(output_file):
        print("Error: Output file exists '" + output_file + "'")
        sys.exit(1)

    # the FASTA file is optional but must exist when given
    fasta_file = arguments["--fasta"]
    if fasta_file is not None and not os.path.isfile(fasta_file):
        print("Error: Cannot find FASTA file '" + fasta_file + "'")
        sys.exit(1)

    # create the id transferer based on the settings
    analyser = create_analyser(arguments)

    # process all clusters
    print("Parsing input .clustering file...")
    for cluster in clustering_parser.ClusteringParser(input_file):
        analyser.process_cluster(cluster)

    # perform protein inference
    peptide_mappings = None
    if fasta_file is not None:
        print("Doing protein inference...")
        all_peptides = {psm.sequence
                        for id_ref in analyser.identification_references
                        for psm in id_ref.psms}
        peptide_mappings = peptide_mapping.the_magic_mapping_function(
            all_peptides, fasta_file)

    # create the output file in the requested format
    if arguments["--moff_compatible"]:
        result_writer = write_moff_results
    else:
        result_writer = write_results

    result_writer(
        identification_references=analyser.identification_references,
        peptide_mappings=peptide_mappings,
        output_filename=output_file)

    print("Results written to " + output_file)
def test_psi_clustering(self):
    """Every cluster in the PSI test file holds exactly two spectra."""
    for cluster in clustering_parser.ClusteringParser(self.testfile2):
        self.assertEqual(2, len(cluster.get_spectra()))