def init(query_size=1000, min_identical_matches=1, spec2vec_decimal_places=2,
         folder_name='C:\\Users\\Gosia\\Desktop'):
    global documents_query
    global documents_lib
    global spectrums_lib
    global spectrums_query

    spec_with_precursor = load_from_json(
        r'C:\Users\Gosia\Desktop\gnps_positive_ionmode_cleaned_by_matchms_and_lookups.json')

    # Apply post-processing steps to the data
    spec_with_precursor = [post_process(s) for s in spec_with_precursor
                           if s.metadata.get('inchikey')]

    # Omit spectrums that didn't qualify for analysis
    spec_with_precursor = [s for s in spec_with_precursor if s is not None]

    # Group spectra by the first block of their InChIKey
    inchi_dict = {}
    for s in spec_with_precursor:
        ik = s.metadata['inchikey']
        init_ik = ik.split('-')[0]
        if init_ik not in inchi_dict:
            inchi_dict[init_ik] = [s]
        else:
            inchi_dict[init_ik].append(s)

    multis = set(i for i, v in inchi_dict.items() if len(v) > min_identical_matches)
    matching_keys = np.random.choice(list(multis), size=query_size, replace=False)

    query_spec = {}
    library_spec = []

    # Select `query_size` queries that have at least `min_identical_matches`
    # matching spectra in the library,
    for q in matching_keys:
        spec_to_add = np.random.choice(inchi_dict[q], size=1, replace=False)
        query_spec[spec_to_add[0].metadata['spectrum_id']] = spec_to_add[0]

    # And everything else goes into the library
    for s in spec_with_precursor:
        if s.metadata['spectrum_id'] not in query_spec:
            library_spec.append(s)

    spectrums_lib = library_spec
    spectrums_query = list(query_spec.values())

    documents_lib = [SpectrumDocument(s, n_decimals=spec2vec_decimal_places)
                     for s in spectrums_lib]
    documents_query = [SpectrumDocument(s, n_decimals=spec2vec_decimal_places)
                       for s in spectrums_query]
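# Usage sketch (illustrative): init() mutates the module-level globals
# declared above and relies on the hard-coded JSON path as well as on np,
# load_from_json, post_process and SpectrumDocument being imported:
#
#     init(query_size=500, min_identical_matches=2)
#     print(len(documents_lib), "library documents,",
#           len(documents_query), "query documents")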
def test_train_new_word2vec_model_with_logger_and_saving(tmp_path):
    """Test training of a dummy model and saving it."""
    # Create fake corpus
    documents = []
    for i in range(100):
        spectrum = Spectrum(mz=numpy.linspace(i, 9 + i, 10),
                            intensities=numpy.ones((10)).astype("float"),
                            metadata={})
        documents.append(SpectrumDocument(spectrum, n_decimals=1))

    # Train model and write to file
    filename = os.path.join(tmp_path, "test.model")
    model = train_new_word2vec_model(documents, iterations=20, filename=filename,
                                     size=20, progress_logger=True)

    # Test if file exists
    assert os.path.isfile(filename), "Could not find saved model file."

    # Test if saved model seems to be correct
    model = gensim.models.Word2Vec.load(filename)
    assert model.sg == 0, "Expected different default value."
    assert model.negative == 5, "Expected different default value."
    assert model.window == 500, "Expected different default value."
    assert model.alpha == 0.025, "Expected different default value."
    assert model.min_alpha == 0.02, "Expected different default value."
    assert model.epochs == 20, "Expected different number of epochs."
    assert model.wv.vector_size == 20, "Expected different vector size."
    assert len(model.wv.vocab) == 109, "Expected different number of words in vocab."
    assert model.wv.get_vector(documents[0].words[1]).shape[0] == 20, \
        "Expected different vector size."
def create_md_spectrum_documents(
        md_documents: List[List[Tuple[str, List[float], int]]],
        spectrums_processed: List[SpectrumType],
        set_white_listed_mds: set,
        set_chosen_mds: set,
        c_multiply: bool,
        punish_intensities: bool,
        require_in_count: int) -> List[SpectrumDocument]:
    """Make SpectrumDocuments for spectra with MDs

    Parameters
    ----------
    md_documents:
        List of 'documents' which are tuples of (md, [intensities], count)
    spectrums_processed:
        List of normally processed spectra for Spec2Vec
    set_white_listed_mds:
        Set of MDs to always use, without additional filtering such as require_in_count
    set_chosen_mds:
        Set of MDs to use
    c_multiply:
        Multiply intensities with sqrt of count
    punish_intensities:
        Divide MD intensities by 2
    require_in_count:
        Require X MDs to be present in a spectrum for it to count, e.g. 2
    """
    md_spectrum_documents = []
    for md_doc, spec in zip(md_documents, spectrums_processed):
        new_doc = SpectrumDocument(spec.clone(), n_decimals=2)
        processed_mds = []
        for md in md_doc:
            proc_md = False
            if md[0] in set_white_listed_mds:
                # If an MD is present in both sets, this branch takes precedence
                proc_md = convert_md_tup(md,
                                         count_multiplier=c_multiply,
                                         punish=punish_intensities,
                                         in_count_cutoff=1)
            elif md[0] in set_chosen_mds:
                proc_md = convert_md_tup(md,
                                         count_multiplier=c_multiply,
                                         punish=punish_intensities,
                                         in_count_cutoff=require_in_count)
            if proc_md:
                processed_mds.append(proc_md)

        if processed_mds:
            md_words, md_intensities = zip(*processed_mds)
            new_doc.words.extend(md_words)
            new_doc.weights.extend(md_intensities)
        assert len(new_doc.words) == len(new_doc.weights)
        md_spectrum_documents.append(new_doc)
    return md_spectrum_documents
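# Usage sketch (hypothetical data; 'example_spectrum' stands in for one
# already-processed matchms Spectrum, and the MD word "18.0106" is made up).
# Each MD 'document' entry is a tuple of (md, [intensities], count):
#
#     example_md_documents = [[("18.0106", [0.35, 0.80], 2)]]
#     md_docs = create_md_spectrum_documents(
#         example_md_documents, [example_spectrum],
#         set_white_listed_mds={"18.0106"}, set_chosen_mds=set(),
#         c_multiply=True, punish_intensities=False, require_in_count=2)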
def test_spec2vec_pair_method():
    """Test if a pair of two SpectrumDocuments is handled correctly."""
    spectrum_1 = Spectrum(mz=numpy.array([100, 150, 200.]),
                          intensities=numpy.array([0.7, 0.2, 0.1]),
                          metadata={'id': 'spectrum1'})
    spectrum_2 = Spectrum(mz=numpy.array([100, 140, 190.]),
                          intensities=numpy.array([0.4, 0.2, 0.1]),
                          metadata={'id': 'spectrum2'})
    documents = [SpectrumDocument(s) for s in [spectrum_1, spectrum_2]]
    model = load_test_model()
    spec2vec = Spec2Vec(model=model, intensity_weighting_power=0.5)

    score01 = spec2vec.pair(documents[0], documents[1])
    assert score01 == pytest.approx(0.9936808, 1e-6)
    score11 = spec2vec.pair(documents[1], documents[1])
    assert score11 == pytest.approx(1.0, 1e-9)
def test_train_new_word2vec_model_wrong_entry():
    """Test training of a dummy model with a gensim argument that is not accepted."""
    # Create fake corpus
    documents = []
    for i in range(10):
        spectrum = Spectrum(mz=numpy.linspace(i, 9 + i, 10),
                            intensities=numpy.ones((10)).astype("float"),
                            metadata={})
        documents.append(SpectrumDocument(spectrum, n_decimals=1))

    with pytest.raises(AssertionError) as msg:
        _ = train_new_word2vec_model(documents, iterations=20, alpha=0.01,
                                     progress_logger=False)

    expected_message_part = "Expect 'learning_rate_initial' instead of 'alpha'."
    assert expected_message_part in str(msg.value), "Expected particular error message."
def test_calc_vector():
    """Test deriving a document vector using a pretrained network."""
    spectrum = Spectrum(mz=numpy.array([100, 150, 200, 250], dtype="float"),
                        intensities=numpy.array([0.1, 0.1, 0.1, 1.0], dtype="float"),
                        metadata={})
    document = SpectrumDocument(spectrum, n_decimals=1)
    model = import_pretrained_model()
    vector = calc_vector(model, document,
                         intensity_weighting_power=0.5,
                         allowed_missing_percentage=1.0)
    expected_vector = numpy.array([
        0.08982063, -1.43037023, -0.17572929, -0.45750666, 0.44942236,
        1.35530729, -1.8305029, -0.36850534, -0.28393048, -0.34192028
    ])
    assert numpy.all(vector == pytest.approx(expected_vector, 1e-5)), \
        "Expected different document vector."
def test_calc_vector_higher_than_allowed_missing_percentage():
    """Test using a pretrained network and a missing word percentage above allowed."""
    spectrum = Spectrum(mz=numpy.array([11.1, 100, 200, 250], dtype="float"),
                        intensities=numpy.array([0.1, 0.1, 0.1, 1.0], dtype="float"),
                        metadata={})
    document = SpectrumDocument(spectrum, n_decimals=1)
    model = import_pretrained_model()
    assert document.words[0] not in model.wv.vocab, \
        "Expected word to be missing from given model."

    with pytest.raises(AssertionError) as msg:
        calc_vector(model, document,
                    intensity_weighting_power=0.5,
                    allowed_missing_percentage=16.0)
    expected_message_part = "Missing percentage is larger than set maximum."
    assert expected_message_part in str(msg.value), "Expected particular error message."
def test_train_new_word2vec_model():
    """Test training of a dummy model."""
    # Create fake corpus
    documents = []
    for i in range(100):
        spectrum = Spectrum(mz=numpy.linspace(i, 9 + i, 10),
                            intensities=numpy.ones((10)).astype("float"),
                            metadata={})
        documents.append(SpectrumDocument(spectrum, n_decimals=1))

    model = train_new_word2vec_model(documents, iterations=20, size=20,
                                     progress_logger=False)

    assert model.sg == 0, "Expected different default value."
    assert model.negative == 5, "Expected different default value."
    assert model.window == 500, "Expected different default value."
    assert model.alpha == 0.025, "Expected different default value."
    assert model.min_alpha == 0.02, "Expected different default value."
    assert model.epochs == 20, "Expected different number of epochs."
    assert model.wv.vector_size == 20, "Expected different vector size."
    assert len(model.wv.vocab) == 109, "Expected different number of words in vocab."
    assert model.wv.get_vector(documents[0].words[1]).shape[0] == 20, \
        "Expected different vector size."
def test_calc_vector_within_allowed_missing_percentage():
    """Test using a pretrained network and a missing word percentage within allowed."""
    spectrum = Spectrum(mz=numpy.array([11.1, 100, 200, 250], dtype="float"),
                        intensities=numpy.array([0.1, 0.1, 0.1, 1.0], dtype="float"),
                        metadata={})
    document = SpectrumDocument(spectrum, n_decimals=1)
    model = import_pretrained_model()
    vector = calc_vector(model, document,
                         intensity_weighting_power=0.5,
                         allowed_missing_percentage=17.0)
    expected_vector = numpy.array([
        0.12775915, -1.17673617, -0.14598507, -0.40189132, 0.36908966,
        1.11608575, -1.46774333, -0.31442554, -0.23168877, -0.29420064
    ])
    assert document.words[0] not in model.wv.vocab, \
        "Expected word to be missing from given model."
    assert numpy.all(vector == pytest.approx(expected_vector, 1e-5)), \
        "Expected different document vector."
def test_spec2vec_matrix_method():
    """Test if a 2x2 matrix of SpectrumDocuments is handled correctly."""
    spectrum_1 = Spectrum(mz=numpy.array([100, 150, 200.]),
                          intensities=numpy.array([0.7, 0.2, 0.1]),
                          metadata={'id': 'spectrum1'})
    spectrum_2 = Spectrum(mz=numpy.array([100, 140, 190.]),
                          intensities=numpy.array([0.4, 0.2, 0.1]),
                          metadata={'id': 'spectrum2'})
    documents = [SpectrumDocument(s) for s in [spectrum_1, spectrum_2]]
    model = load_test_model()
    spec2vec = Spec2Vec(model=model, intensity_weighting_power=0.5)

    scores = spec2vec.matrix(documents, documents)
    assert scores[0, 0] == pytest.approx(1.0, 1e-9), "Expected different score."
    assert scores[1, 1] == pytest.approx(1.0, 1e-9), "Expected different score."
    assert scores[1, 0] == pytest.approx(0.9936808, 1e-6), "Expected different score."
    assert scores[0, 1] == pytest.approx(0.9936808, 1e-6), "Expected different score."
def create_spectrum_documents(query_spectra: List[Spectrum],
                              progress_bar: bool = False,
                              nr_of_decimals: int = 2) -> List[SpectrumDocument]:
    """Transforms a list of Spectrum objects into a list of SpectrumDocuments

    Parameters
    ----------
    query_spectra:
        List of Spectrum objects that are transformed to SpectrumDocument
    progress_bar:
        When True, a progress bar is shown. Default = False
    nr_of_decimals:
        The number of decimals used for binning the peaks.
    """
    spectrum_documents = []
    for spectrum in tqdm(query_spectra,
                         desc="Converting Spectrum to SpectrumDocument",
                         disable=not progress_bar):
        post_process_spectrum = spectrum_processing_s2v(spectrum)
        spectrum_documents.append(
            SpectrumDocument(post_process_spectrum, n_decimals=nr_of_decimals))
    return spectrum_documents
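# Minimal usage sketch (illustrative; assumes matchms and spec2vec are
# installed and that spectrum_processing_s2v keeps this simple spectrum —
# the real filter may discard spectra with too few peaks):
#
#     import numpy as np
#     from matchms import Spectrum
#     example = Spectrum(mz=np.array([100.0, 150.0, 200.0]),
#                        intensities=np.array([0.7, 0.2, 0.1]),
#                        metadata={"id": "example"})
#     docs = create_spectrum_documents([example], progress_bar=False)
#     print(docs[0].words)  # e.g. ['peak@100.00', 'peak@150.00', 'peak@200.00']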
def plot_spectra_comparison(spectrum1_in, spectrum2_in, model,
                            intensity_weighting_power=0.5,
                            num_decimals=2,
                            min_mz=5,
                            max_mz=500,
                            intensity_threshold=0.01,
                            method='cosine',
                            tolerance=0.005,
                            wordsim_cutoff=0.5,
                            circle_size=5,
                            circle_scaling='wordsim',
                            padding=10,
                            display_molecules=False,
                            figsize=(12, 12),
                            filename=None):
    """In-depth visual comparison of spectral similarity scores,
    calculated based on cosine/modified cosine and Spec2Vec.

    Parameters
    ----------
    method: str
        'cosine' or 'modcos' (modified cosine score)
    circle_scaling: str
        Scale circles based on 'wordsim' or 'peak_product'
    """
    def apply_filters(s):
        s = normalize_intensities(s)
        s = select_by_mz(s, mz_from=min_mz, mz_to=max_mz)
        s = select_by_relative_intensity(s, intensity_from=intensity_threshold)
        s.losses = None
        return s

    spectrum1 = apply_filters(spectrum1_in)
    spectrum2 = apply_filters(spectrum2_in)

    plt.style.use("seaborn-white")
    plot_colors = ['darkcyan', 'purple']

    # Definitions for the axes
    left, width = 0.1, 0.6
    bottom, height = 0.1, 0.6
    spacing = 0.01

    rect_wordsim = [left, bottom, width, height]
    rect_specx = [left, bottom + height + spacing, width, 0.2]
    rect_specy = [left + width + spacing, bottom, 0.25, height]

    document1 = SpectrumDocument(spectrum1, n_decimals=num_decimals)
    document2 = SpectrumDocument(spectrum2, n_decimals=num_decimals)

    # Remove words/peaks that are not in the model's vocabulary
    select1 = np.asarray([i for i, word in enumerate(document1.words)
                          if word in model.wv.vocab])
    select2 = np.asarray([i for i, word in enumerate(document2.words)
                          if word in model.wv.vocab])

    peaks1 = np.asarray(spectrum1.peaks[:]).T
    peaks2 = np.asarray(spectrum2.peaks[:]).T
    peaks1 = peaks1[select1, :]
    peaks2 = peaks2[select2, :]

    min_peaks1 = np.min(peaks1[:, 0])
    min_peaks2 = np.min(peaks2[:, 0])
    max_peaks1 = np.max(peaks1[:, 0])
    max_peaks2 = np.max(peaks2[:, 0])
    possible_grid_points = np.arange(0, 2000, 50)
    grid_points1 = possible_grid_points[(possible_grid_points > min_peaks1 - padding)
                                        & (possible_grid_points < max_peaks1 + padding)]
    grid_points2 = possible_grid_points[(possible_grid_points > min_peaks2 - padding)
                                        & (possible_grid_points < max_peaks2 + padding)]

    word_vectors1 = model.wv[[document1.words[x] for x in select1]]
    word_vectors2 = model.wv[[document2.words[x] for x in select2]]

    csim_words = 1 - spatial.distance.cdist(word_vectors1, word_vectors2, 'cosine')
    csim_words[csim_words < wordsim_cutoff] = 0  # Remove values below cutoff
    print(np.min(csim_words), np.max(csim_words))

    # Plot spectra
    # -------------------------------------------------------------------------
    fig = plt.figure(figsize=figsize)
    # Word similarity plot (central)
    ax_wordsim = plt.axes(rect_wordsim)
    ax_wordsim.tick_params(direction='in', top=True, right=True)
    # Spectra plot (top)
    ax_specx = plt.axes(rect_specx)
    ax_specx.tick_params(direction='in', labelbottom=False)
    # Spectra plot 2 (right)
    ax_specy = plt.axes(rect_specy)
    ax_specy.tick_params(direction='in', labelleft=False)

    # Spec2Vec similarity plot:
    # -------------------------------------------------------------------------
    data_x = []
    data_y = []
    data_z = []
    data_peak_product = []
    for i in range(len(select1)):
        for j in range(len(select2)):
            data_x.append(peaks1[i, 0])
            data_y.append(peaks2[j, 0])
            data_z.append(csim_words[i, j])
            data_peak_product.append(peaks1[i, 1] * peaks2[j, 1])

    # Sort by word similarity
    data_x = np.array(data_x)
    data_y = np.array(data_y)
    data_z = np.array(data_z)
    data_peak_product = np.array(data_peak_product)
    idx = np.lexsort((data_x, data_y, data_z))

    cm = plt.cm.get_cmap('RdYlBu_r')  # Alternatives: 'YlOrRd', 'RdBu_r'

    # Plot word similarities
    if circle_scaling == 'peak_product':
        wordsimplot = ax_wordsim.scatter(data_x[idx],
                                         data_y[idx],
                                         s=100 * circle_size * (0.01 + data_peak_product[idx]**2),
                                         marker="o",
                                         c=data_z[idx],
                                         cmap=cm,
                                         alpha=0.6)
    elif circle_scaling == 'wordsim':
        wordsimplot = ax_wordsim.scatter(data_x[idx],
                                         data_y[idx],
                                         s=100 * circle_size * (0.01 + data_z[idx]**2),
                                         marker="o",
                                         c=data_z[idx],
                                         cmap=cm,
                                         alpha=0.6)

    # (Modified) cosine similarity plot:
    # -------------------------------------------------------------------------
    if method == 'cosine':
        score_classical, used_matches = cosine_score(spectrum1, spectrum2,
                                                     tolerance, modified_cosine=False)
    elif method == 'modcos':
        score_classical, used_matches = cosine_score(spectrum1, spectrum2,
                                                     tolerance, modified_cosine=True)
    else:
        raise ValueError("Given method unknown.")

    idx1, idx2, _ = zip(*used_matches)
    cosine_x = []
    cosine_y = []
    for i in range(len(idx1)):
        if idx1[i] in select1 and idx2[i] in select2:
            cosine_x.append(peaks1[idx1[i], 0])
            cosine_y.append(peaks2[idx2[i], 0])

    # Plot (modified) cosine matches
    ax_wordsim.scatter(cosine_x, cosine_y, s=100, c='black', marker=(5, 2))
    ax_wordsim.set_xlim(min_peaks1 - padding, max_peaks1 + padding)
    ax_wordsim.set_ylim(min_peaks2 - padding, max_peaks2 + padding)
    ax_wordsim.set_xlabel('spectrum 1 - fragment mz', fontsize=16)
    ax_wordsim.set_ylabel('spectrum 2 - fragment mz', fontsize=16)
    ax_wordsim.tick_params(labelsize=13)
    ax_wordsim.set_xticks(grid_points1)
    ax_wordsim.set_yticks(grid_points2)
    ax_wordsim.grid(True)

    # Plot spectrum 1
    ax_specx.vlines(peaks1[:, 0], [0], peaks1[:, 1], color=plot_colors[0])
    ax_specx.plot(peaks1[:, 0], peaks1[:, 1], '.')  # Stem ends
    ax_specx.plot([peaks1[:, 0].max(), peaks1[:, 0].min()], [0, 0], '--')  # Middle bar
    ax_specx.set_xlim(min_peaks1 - padding, max_peaks1 + padding)
    ax_specx.set_yticks([0, 0.25, 0.5, 0.75, 1])
    ax_specx.set_xticks(grid_points1)
    ax_specx.set_ylabel('peak intensity (relative)', fontsize=16)
    ax_specx.tick_params(labelsize=13)
    ax_specx.grid(True)

    # Plot spectrum 2
    ax_specy.hlines(peaks2[:, 0], [0], peaks2[:, 1], color=plot_colors[1])
    ax_specy.plot(peaks2[:, 1], peaks2[:, 0], '.')  # Stem ends
    ax_specy.plot([0, 0], [peaks2[:, 0].min(), peaks2[:, 0].max()], '--')  # Middle bar
    ax_specy.set_ylim(min_peaks2 - padding, max_peaks2 + padding)
    ax_specy.set_xticks([0, 0.25, 0.5, 0.75, 1])
    ax_specy.set_yticks(grid_points2)
    ax_specy.set_xlabel('peak intensity (relative)', fontsize=16)
    ax_specy.tick_params(labelsize=13)
    ax_specy.grid(True)

    fig.colorbar(wordsimplot, ax=ax_specy)

    if filename is not None:
        plt.savefig(filename)
    plt.show()

    # Plot molecules
    # -------------------------------------------------------------------------
    if display_molecules:
        smiles = [spectrum1.get("smiles"), spectrum2.get("smiles")]
        molecules = [Chem.MolFromSmiles(x) for x in smiles]
        display(Draw.MolsToGridImage(molecules, molsPerRow=2,
                                     subImgSize=(400, 400)))
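# Usage sketch for plot_spectra_comparison (illustrative; 'spectrum_a' and
# 'spectrum_b' stand in for two matchms Spectrum objects and the model path
# is hypothetical):
#
#     model = gensim.models.Word2Vec.load("spec2vec_model.model")
#     plot_spectra_comparison(spectrum_a, spectrum_b, model,
#                             method='modcos', circle_scaling='peak_product',
#                             filename='comparison.png')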
all_unfiltered_mds = set()
for md_doc in md_documents:
    for md in md_doc:
        all_unfiltered_mds.add(md[0])
print(f"\n{len(all_unfiltered_mds)} unfiltered MDs present")
print(f"{len(md_documents)} remaining MD documents (spectra).")
print("An example:", md_documents[-1])

if not mds_to_use:
    mds_to_use = all_unfiltered_mds

# Validation pipeline
spectrums_top30, spectrums_processed, spectrums_classical = processing_res

# Make SpectrumDocuments
documents_processed = [SpectrumDocument(s, n_decimals=2)
                       for s in spectrums_processed]
documents_classical = [SpectrumDocument(s, n_decimals=2)
                       for s in spectrums_classical]

# Create MD SpectrumDocuments
set_white_listed_mds = set(white_listed_mds)
set_chosen_mds = set(mds_to_use)
c_multiply = cmd.no_count_benefit  # multiply intensities with sqrt(count)
md_spectrum_documents = create_md_spectrum_documents(
    md_documents, spectrums_processed, set_white_listed_mds, set_chosen_mds,
    c_multiply, cmd.punish_intensities, cmd.require_in_count)
spec_docs_mds_file = os.path.join(cmd.output_dir,
def main():
    parser = argparse.ArgumentParser(description='Creating Spec2Vec Pairs')
    parser.add_argument('input_mgf', help='input_mgf')
    parser.add_argument('output_pairs', help='output_pairs')
    parser.add_argument('model_file', help='model_file')
    parser.add_argument('--min_score', type=float, default=0.7,
                        help='minimum score to keep a pair')
    args = parser.parse_args()

    spectra = load_from_mgf(args.input_mgf)
    filtered_spectra = [post_process(s) for s in spectra]

    # Omit spectrums that didn't qualify for analysis
    filtered_spectra = [s for s in filtered_spectra if s is not None]

    # Create spectrum documents
    query_documents = [SpectrumDocument(s, n_decimals=2) for s in filtered_spectra]

    # DEBUG
    # query_documents = query_documents[:100]

    # Loading the model
    model = gensim.models.Word2Vec.load(args.model_file)

    # Define similarity_function
    spec2vec = Spec2Vec(model=model, intensity_weighting_power=0.5,
                        allowed_missing_percentage=80.0)

    print("total documents", len(query_documents))
    scores = calculate_scores(query_documents, query_documents, spec2vec).scores

    number_of_spectra = len(query_documents)
    output_scores_list = []
    for i in range(number_of_spectra):
        for j in range(number_of_spectra):
            if i <= j:
                continue
            i_spectrum = filtered_spectra[i]
            j_spectrum = filtered_spectra[j]
            sim = scores[i][j]
            if sim < args.min_score:
                continue
            score_dict = {}
            score_dict["filename"] = args.input_mgf
            score_dict["CLUSTERID1"] = i_spectrum.metadata["scans"]
            score_dict["CLUSTERID2"] = j_spectrum.metadata["scans"]
            score_dict["Cosine"] = sim
            score_dict["mz1"] = i_spectrum.metadata["pepmass"][0]
            score_dict["mz2"] = j_spectrum.metadata["pepmass"][0]
            score_dict["DeltaMZ"] = score_dict["mz2"] - score_dict["mz1"]
            score_dict["EdgeAnnotation"] = "Spec2Vec"
            output_scores_list.append(score_dict)

    # Saving data out
    pd.DataFrame(output_scores_list).to_csv(args.output_pairs, sep="\t", index=False)
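# Entry point and an example invocation (the script filename is hypothetical):
#   python spec2vec_pairs.py spectra.mgf pairs.tsv spec2vec.model --min_score 0.7
if __name__ == "__main__":
    main()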
def test_user_workflow_spec2vec_parallel():
    """Test typical user workflow to get from mass spectra to spec2vec similarities.

    This test will run a typical workflow example using a small dataset and a
    pretrained word2vec model. One main aspect of this is to test if users will
    get exactly the same spec2vec similarity scores when starting from a word2vec
    model that was trained and saved elsewhere.
    """
    def apply_my_filters(s):
        """This is how a user would typically design their own pre- and post-
        processing pipeline."""
        s = default_filters(s)
        s = add_parent_mass(s)
        s = normalize_intensities(s)
        s = reduce_to_number_of_peaks(s, n_required=10, ratio_desired=0.5)
        s = select_by_mz(s, mz_from=0, mz_to=1000)
        s = add_losses(s, loss_mz_from=10.0, loss_mz_to=200.0)
        s = require_minimum_number_of_peaks(s, n_required=5)
        return s

    repository_root = os.path.join(os.path.dirname(__file__), "..")
    spectrums_file = os.path.join(repository_root, "tests", "pesticides.mgf")

    # Apply my filters to the data
    spectrums = [apply_my_filters(s) for s in load_from_mgf(spectrums_file)]

    # Omit spectrums that didn't qualify for analysis
    spectrums = [s for s in spectrums if s is not None]

    documents = [SpectrumDocument(s) for s in spectrums]

    model_file = os.path.join(repository_root, "integration-tests",
                              "test_user_workflow_spec2vec.model")
    if os.path.isfile(model_file):
        model = gensim.models.Word2Vec.load(model_file)
    else:
        # Create and train model
        model = gensim.models.Word2Vec([d.words for d in documents],
                                       size=10, min_count=1)
        model.train([d.words for d in documents],
                    total_examples=len(documents), epochs=20)
        model.save(model_file)

    # Define similarity_function
    spec2vec = Spec2VecParallel(model=model, intensity_weighting_power=0.5)

    references = documents[:26]
    queries = documents[25:]

    # Calculate scores on all combinations of references and queries
    scores = list(calculate_scores_parallel(references, queries, spec2vec))

    # Filter out self-comparisons
    filtered = [(reference, query, score) for (reference, query, score) in scores
                if reference != query]

    sorted_by_score = sorted(filtered, key=lambda elem: elem[2], reverse=True)
    actual_top10 = sorted_by_score[:10]

    expected_top10 = [
        (documents[19], documents[25], pytest.approx(0.9999121928249473, rel=1e-9)),
        (documents[20], documents[25], pytest.approx(0.9998846890269892, rel=1e-9)),
        (documents[20], documents[45], pytest.approx(0.9998756073673759, rel=1e-9)),
        (documents[25], documents[45], pytest.approx(0.9998750427994474, rel=1e-9)),
        (documents[19], documents[27], pytest.approx(0.9998722768460854, rel=1e-9)),
        (documents[22], documents[27], pytest.approx(0.9998633023352553, rel=1e-9)),
        (documents[18], documents[27], pytest.approx(0.9998616961532616, rel=1e-9)),
        (documents[19], documents[45], pytest.approx(0.9998528723697396, rel=1e-9)),
        (documents[14], documents[71], pytest.approx(0.9998404364805897, rel=1e-9)),
        (documents[20], documents[27], pytest.approx(0.9998336807761137, rel=1e-9)),
    ]
    assert actual_top10 == expected_top10, "Expected different top 10 table."
    s = add_losses(s, loss_mz_from=10.0, loss_mz_to=200.0)
    s = require_minimum_number_of_peaks(s, n_required=5)
    return s

# path_root = os.path.dirname(os.getcwd())
# path_data = os.path.join(os.path.dirname(path_root), "data/gnps_15_12_2021/")
path_data = "C:\\HSD\\OneDrive - Hochschule Düsseldorf\\Data\\ms2query"

training_spectra_annotated = load_pickled_file(
    os.path.join(path_data, "GNPS_15_12_2021_pos_train.pickle"))
training_spectra_not_annotated = load_pickled_file(
    os.path.join(path_data, "ALL_GNPS_15_12_2021_positive_not_annotated.pickle"))
all_spectra = training_spectra_annotated + training_spectra_not_annotated

# Load data from pickled file and apply filters
cleaned_spectra = [spectrum_processing(s) for s in all_spectra]

# Omit spectrums that didn't qualify for analysis
cleaned_spectra = [s for s in cleaned_spectra if s is not None]

# Create spectrum documents
reference_documents = [SpectrumDocument(s, n_decimals=2) for s in cleaned_spectra]

model_file = os.path.join(path_data, "trained_models",
                          "spec2vec_model_GNPS_15_12_2021.model")
model = train_new_word2vec_model(reference_documents, iterations=[10, 20, 30],
                                 filename=model_file, workers=4,
                                 progress_logger=True)
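# Sketch of how the freshly trained model could be used for scoring
# (assumes matchms' calculate_scores and spec2vec's Spec2Vec are imported;
# parameter values are illustrative):
#
#     spec2vec_similarity = Spec2Vec(model=model,
#                                    intensity_weighting_power=0.5,
#                                    allowed_missing_percentage=5.0)
#     scores = calculate_scores(reference_documents, reference_documents,
#                               spec2vec_similarity, is_symmetric=True)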