def init(query_size=1000,
         min_identical_matches=1,
         spec2vec_decimal_places=2,
         folder_name='C:\\Users\\Gosia\\Desktop'):
    global documents_query
    global documents_lib
    global spectrums_lib
    global spectrums_query

    spec_with_precursor = load_from_json(
        r'C:\Users\Gosia\Desktop\gnps_positive_ionmode_cleaned_by_matchms_and_lookups.json'
    )
    # apply post processing steps to the data
    spec_with_precursor = [
        post_process(s) for s in spec_with_precursor
        if s.metadata.get('inchikey')
    ]
    # omit spectrums that didn't qualify for analysis
    spec_with_precursor = [s for s in spec_with_precursor if s is not None]

    inchi_dict = {}
    for s in spec_with_precursor:
        ik = s.metadata['inchikey']
        init_ik = ik.split('-')[0]
        if init_ik not in inchi_dict:
            inchi_dict[init_ik] = [s]
        else:
            inchi_dict[init_ik].append(s)

    multis = {i for i, v in inchi_dict.items()
              if len(v) > min_identical_matches}

    matching_keys = np.random.choice(list(multis),
                                     size=query_size,
                                     replace=False)

    query_spec = {}
    library_spec = []
    # We select `query_size` queries that each have at least `min_identical_matches` matching spectra remaining in the library.
    for q in matching_keys:
        spec_to_add = np.random.choice(inchi_dict[q], size=1, replace=False)
        query_spec[spec_to_add[0].metadata['spectrum_id']] = spec_to_add[0]

    # And everything else goes into the library
    for s in spec_with_precursor:
        if s.metadata['spectrum_id'] not in query_spec:
            library_spec.append(s)

    spectrums_lib = library_spec
    spectrums_query = list(query_spec.values())
    documents_lib = [
        SpectrumDocument(s, n_decimals=spec2vec_decimal_places)
        for s in spectrums_lib
    ]
    documents_query = [
        SpectrumDocument(s, n_decimals=spec2vec_decimal_places)
        for s in spectrums_query
    ]
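
A minimal usage sketch (parameter values are illustrative, and the module above with its imports and globals is assumed to be loaded): call `init()` once, then read the populated globals.

init(query_size=500, min_identical_matches=2)  # illustrative values
print(len(documents_query), "query documents")
print(len(documents_lib), "library documents")
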
Example #2
def test_train_new_word2vec_model_with_logger_and_saving(tmp_path):
    """Test training of a dummy model and save it."""
    # Create fake corpus
    documents = []
    for i in range(100):
        spectrum = Spectrum(mz=numpy.linspace(i, 9+i, 10),
                            intensities=numpy.ones((10)).astype("float"),
                            metadata={})
        documents.append(SpectrumDocument(spectrum, n_decimals=1))
    # Train model and write to file
    filename = os.path.join(tmp_path, "test.model")
    model = train_new_word2vec_model(documents, iterations=20, filename=filename,
                                     size=20, progress_logger=True)

    # Test if file exists
    assert os.path.isfile(filename), "Could not find saved model file."

    # Test if saved model seems to be correct
    model = gensim.models.Word2Vec.load(filename)
    assert model.sg == 0, "Expected different default value."
    assert model.negative == 5, "Expected different default value."
    assert model.window == 500, "Expected different default value."
    assert model.alpha == 0.025, "Expected different default value."
    assert model.min_alpha == 0.02, "Expected different default value."
    assert model.epochs == 20, "Expected different number of epochs."
    assert model.wv.vector_size == 20, "Expected different vector size."
    assert len(model.wv.vocab) == 109, "Expected different number of words in vocab."
    assert model.wv.get_vector(documents[0].words[1]).shape[0] == 20, "Expected different vector size."
Example #3
def create_md_spectrum_documents(
        md_documents: List[List[Tuple[str, List[float], int]]],
        spectrums_processed: List[SpectrumType], set_white_listed_mds: set,
        set_chosen_mds: set, c_multiply: bool, punish_intensities: bool,
        require_in_count: int) -> List[SpectrumDocument]:
    """Make SpectrumDocuments for spectra with MDs

    Parameters
    ----------
    md_documents:
        List of 'documents', each a list of (md, [intensities], count) tuples
    spectrums_processed:
        List of normally processed spectra for Spec2Vec
    set_white_listed_mds:
        Set of MDs to always use without additional filtering like
        require_in_count
    set_chosen_mds:
        Set of MDs to use
    c_multiply:
        Multiply intensities with sqrt of count
    punish_intensities:
        Divide MD intensities by 2
    require_in_count:
        Require X MDs to be present in spectrum for it to count, e.g. 2
    """
    md_spectrum_documents = []
    for md_doc, spec in zip(md_documents, spectrums_processed):
        new_doc = SpectrumDocument(spec.clone(), n_decimals=2)

        processed_mds = []
        for md in md_doc:
            proc_md = False
            if md[0] in set_white_listed_mds:
                # if md present in both sets, this will happen first
                proc_md = convert_md_tup(md,
                                         count_multiplier=c_multiply,
                                         punish=punish_intensities,
                                         in_count_cutoff=1)
            elif md[0] in set_chosen_mds:
                proc_md = convert_md_tup(md,
                                         count_multiplier=c_multiply,
                                         punish=punish_intensities,
                                         in_count_cutoff=require_in_count)

            if proc_md:
                processed_mds.append(proc_md)

        if processed_mds:
            md_words, md_intensities = zip(*processed_mds)
            new_doc.words.extend(md_words)
            new_doc.weights.extend(md_intensities)
        assert len(new_doc.words) == len(new_doc.weights)
        md_spectrum_documents.append(new_doc)
    return md_spectrum_documents
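
For illustration, a toy `md_documents` entry following the documented (md, [intensities], count) shape could look like this (values are made up; the MD strings happen to match water and ammonia losses):

md_documents = [
    [("18.0106", [0.25, 0.50], 3),   # (MD word, its peak intensities, occurrence count)
     ("17.0265", [0.10], 1)],
]
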
Example #4
def test_spec2vec_pair_method():
    """Test if pair of two SpectrumDocuments is handled correctly"""
    spectrum_1 = Spectrum(mz=numpy.array([100, 150, 200.]),
                          intensities=numpy.array([0.7, 0.2, 0.1]),
                          metadata={'id': 'spectrum1'})
    spectrum_2 = Spectrum(mz=numpy.array([100, 140, 190.]),
                          intensities=numpy.array([0.4, 0.2, 0.1]),
                          metadata={'id': 'spectrum2'})

    documents = [SpectrumDocument(s) for s in [spectrum_1, spectrum_2]]
    model = load_test_model()
    spec2vec = Spec2Vec(model=model, intensity_weighting_power=0.5)
    score01 = spec2vec.pair(documents[0], documents[1])
    assert score01 == pytest.approx(0.9936808, 1e-6)
    score11 = spec2vec.pair(documents[1], documents[1])
    assert score11 == pytest.approx(1.0, 1e-9)
Example #5
def test_train_new_word2vec_model_wrong_entry():
    """Test training of a dummy model with not-accepted gensim argument entry."""
    # Create fake corpus
    documents = []
    for i in range(10):
        spectrum = Spectrum(mz=numpy.linspace(i, 9+i, 10),
                            intensities=numpy.ones((10)).astype("float"),
                            metadata={})
        documents.append(SpectrumDocument(spectrum, n_decimals=1))

    with pytest.raises(AssertionError) as msg:
        _ = train_new_word2vec_model(documents, iterations=20, alpha=0.01,
                                     progress_logger=False)

    expected_message_part = "Expect 'learning_rate_initial' instead of 'alpha'."
    assert expected_message_part in str(msg.value), "Expected particular error message."
Example #6
def test_calc_vector():
    """Test deriving a document vector using a pretrained network."""
    spectrum = Spectrum(mz=numpy.array([100, 150, 200, 250], dtype="float"),
                        intensities=numpy.array([0.1, 0.1, 0.1, 1.0],
                                                dtype="float"),
                        metadata={})

    document = SpectrumDocument(spectrum, n_decimals=1)
    model = import_pretrained_model()
    vector = calc_vector(model,
                         document,
                         intensity_weighting_power=0.5,
                         allowed_missing_percentage=1.0)
    expected_vector = numpy.array([
        0.08982063, -1.43037023, -0.17572929, -0.45750666, 0.44942236,
        1.35530729, -1.8305029, -0.36850534, -0.28393048, -0.34192028
    ])
    assert numpy.all(vector == pytest.approx(
        expected_vector, 1e-5)), "Expected different document vector."
Example #7
def test_calc_vector_higher_than_allowed_missing_percentage():
    """Test using a pretrained network and a missing word percentage above allowed."""
    spectrum = Spectrum(mz=numpy.array([11.1, 100, 200, 250], dtype="float"),
                        intensities=numpy.array([0.1, 0.1, 0.1, 1.0],
                                                dtype="float"),
                        metadata={})

    document = SpectrumDocument(spectrum, n_decimals=1)
    model = import_pretrained_model()
    assert document.words[
        0] not in model.wv.vocab, "Expected word to be missing from given model."
    with pytest.raises(AssertionError) as msg:
        calc_vector(model,
                    document,
                    intensity_weighting_power=0.5,
                    allowed_missing_percentage=16.0)

    expected_message_part = "Missing percentage is larger than set maximum."
    assert expected_message_part in str(
        msg.value), "Expected particular error message."
Example #8
def test_train_new_word2vec_model():
    """Test training of a dummy model."""
    # Create fake corpus
    documents = []
    for i in range(100):
        spectrum = Spectrum(mz=numpy.linspace(i, 9+i, 10),
                            intensities=numpy.ones((10)).astype("float"),
                            metadata={})
        documents.append(SpectrumDocument(spectrum, n_decimals=1))
    model = train_new_word2vec_model(documents, iterations=20, size=20,
                                     progress_logger=False)
    assert model.sg == 0, "Expected different default value."
    assert model.negative == 5, "Expected different default value."
    assert model.window == 500, "Expected different default value."
    assert model.alpha == 0.025, "Expected different default value."
    assert model.min_alpha == 0.02, "Expected different default value."
    assert model.epochs == 20, "Expected different number of epochs."
    assert model.wv.vector_size == 20, "Expected different vector size."
    assert len(model.wv.vocab) == 109, "Expected different number of words in vocab."
    assert model.wv.get_vector(documents[0].words[1]).shape[0] == 20, "Expected different vector size."
Example #9
def test_calc_vector_within_allowed_missing_percentage():
    """Test using a pretrained network and a missing word percentage within allowed."""
    spectrum = Spectrum(mz=numpy.array([11.1, 100, 200, 250], dtype="float"),
                        intensities=numpy.array([0.1, 0.1, 0.1, 1.0],
                                                dtype="float"),
                        metadata={})

    document = SpectrumDocument(spectrum, n_decimals=1)
    model = import_pretrained_model()
    vector = calc_vector(model,
                         document,
                         intensity_weighting_power=0.5,
                         allowed_missing_percentage=17.0)
    expected_vector = numpy.array([
        0.12775915, -1.17673617, -0.14598507, -0.40189132, 0.36908966,
        1.11608575, -1.46774333, -0.31442554, -0.23168877, -0.29420064
    ])
    assert document.words[
        0] not in model.wv.vocab, "Expected word to be missing from given model."
    assert numpy.all(vector == pytest.approx(
        expected_vector, 1e-5)), "Expected different document vector."
Example #10
def test_spec2vec_matrix_method():
    """Test if matrix of 2x2 SpectrumDocuments is handled correctly"""
    spectrum_1 = Spectrum(mz=numpy.array([100, 150, 200.]),
                          intensities=numpy.array([0.7, 0.2, 0.1]),
                          metadata={'id': 'spectrum1'})
    spectrum_2 = Spectrum(mz=numpy.array([100, 140, 190.]),
                          intensities=numpy.array([0.4, 0.2, 0.1]),
                          metadata={'id': 'spectrum2'})

    documents = [SpectrumDocument(s) for s in [spectrum_1, spectrum_2]]
    model = load_test_model()
    spec2vec = Spec2Vec(model=model, intensity_weighting_power=0.5)
    scores = spec2vec.matrix(documents, documents)
    assert scores[0, 0] == pytest.approx(1.0,
                                         1e-9), "Expected different score."
    assert scores[1, 1] == pytest.approx(1.0,
                                         1e-9), "Expected different score."
    assert scores[1, 0] == pytest.approx(0.9936808,
                                         1e-6), "Expected different score."
    assert scores[0, 1] == pytest.approx(0.9936808,
                                         1e-6), "Expected different score."
Example #11
def create_spectrum_documents(
        query_spectra: List[Spectrum],
        progress_bar: bool = False,
        nr_of_decimals: int = 2) -> List[SpectrumDocument]:
    """Transforms list of Spectrum to List of SpectrumDocument

    Parameters
    ----------
    query_spectra:
        List of Spectrum objects that are transformed to SpectrumDocument
    progress_bar:
        When true a progress bar is shown. Default = False
    nr_of_decimals:
        The number of decimals used for binning the peaks.
    """
    spectrum_documents = []
    for spectrum in tqdm(query_spectra,
                         desc="Converting Spectrum to Spectrum_document",
                         disable=not progress_bar):
        post_process_spectrum = spectrum_processing_s2v(spectrum)
        spectrum_documents.append(
            SpectrumDocument(post_process_spectrum, n_decimals=nr_of_decimals))
    return spectrum_documents
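
A usage sketch, assuming `create_spectrum_documents` and its helper `spectrum_processing_s2v` are importable; "spectra.mgf" is a hypothetical input file:

from matchms.importing import load_from_mgf

spectra = list(load_from_mgf("spectra.mgf"))  # hypothetical input file
documents = create_spectrum_documents(spectra, progress_bar=True, nr_of_decimals=2)
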
Example #12
def plot_spectra_comparison(spectrum1_in, spectrum2_in,
                            model,
                            intensity_weighting_power=0.5,
                            num_decimals=2,
                            min_mz=5,
                            max_mz=500,
                            intensity_threshold=0.01,
                            method='cosine',
                            tolerance=0.005,
                            wordsim_cutoff=0.5,
                            circle_size=5,
                            circle_scaling='wordsim',
                            padding=10,
                            display_molecules=False,
                            figsize=(12, 12),
                            filename=None):
    """ In-depth visual comparison of spectral similarity scores,
    calculated based on cosine/modified cosine and Spec2Vec.

    Parameters
    ----------
    method: str
        'cosine' or 'modcos' (modified cosine score)
    circle_scaling: str
        Scale circles based on 'wordsim' or 'peak_product'
    """

    def apply_filters(s):
        s = normalize_intensities(s)
        s = select_by_mz(s, mz_from=min_mz, mz_to=max_mz)
        s = select_by_relative_intensity(s, intensity_from=intensity_threshold)
        s.losses = None
        return s

    spectrum1 = apply_filters(spectrum1_in)
    spectrum2 = apply_filters(spectrum2_in)

    plt.style.use("seaborn-white")
    plot_colors = ['darkcyan', 'purple']

    # Definitions for the axes
    left, width = 0.1, 0.6
    bottom, height = 0.1, 0.6
    spacing = 0.01

    rect_wordsim = [left, bottom, width, height]
    rect_specx = [left, bottom + height + spacing, width, 0.2]
    rect_specy = [left + width + spacing, bottom, 0.25, height]

    document1 = SpectrumDocument(spectrum1, n_decimals=num_decimals)
    document2 = SpectrumDocument(spectrum2, n_decimals=num_decimals)

    # Remove words/peaks that are not in dictionary
    select1 = np.asarray([i for i, word in enumerate(document1.words) if word in model.wv.vocab])
    select2 = np.asarray([i for i, word in enumerate(document2.words) if word in model.wv.vocab])
    peaks1 = np.asarray(spectrum1.peaks[:]).T
    peaks2 = np.asarray(spectrum2.peaks[:]).T
    peaks1 = peaks1[select1, :]
    peaks2 = peaks2[select2, :]
    min_peaks1 = np.min(peaks1[:, 0])
    min_peaks2 = np.min(peaks2[:, 0])
    max_peaks1 = np.max(peaks1[:, 0])
    max_peaks2 = np.max(peaks2[:, 0])
    possible_grid_points = np.arange(0, 2000, 50)
    grid_points1 = possible_grid_points[(possible_grid_points > min_peaks1 - padding) \
                                        & (possible_grid_points < max_peaks1 + padding)]
    grid_points2 = possible_grid_points[(possible_grid_points > min_peaks2 - padding) \
                                        & (possible_grid_points < max_peaks2 + padding)]

    word_vectors1 = model.wv[[document1.words[x] for x in select1]]
    word_vectors2 = model.wv[[document2.words[x] for x in select2]]

    csim_words = 1 - spatial.distance.cdist(word_vectors1, word_vectors2, 'cosine')
    csim_words[csim_words < wordsim_cutoff] = 0  # Remove values below cutoff
    print(np.min(csim_words), np.max(csim_words))

    # Plot spectra
    # -------------------------------------------------------------------------
    fig = plt.figure(figsize=figsize)
    # Word similarity plot (central)
    ax_wordsim = plt.axes(rect_wordsim)
    ax_wordsim.tick_params(direction='in', top=True, right=True)
    # Spectra plot (top)
    ax_specx = plt.axes(rect_specx)
    ax_specx.tick_params(direction='in', labelbottom=False)
    # Spectra plot 2 (right)
    ax_specy = plt.axes(rect_specy)
    ax_specy.tick_params(direction='in', labelleft=False)

    # Spec2Vec similarity plot:
    # -------------------------------------------------------------------------
    data_x = []
    data_y = []
    data_z = []
    data_peak_product = []
    for i in range(len(select1)):
        for j in range(len(select2)):
            data_x.append(peaks1[i, 0])
            data_y.append(peaks2[j, 0])
            data_z.append(csim_words[i, j])
            data_peak_product.append(peaks1[i, 1] * peaks2[j, 1])

    # Sort by word similarity
    data_x = np.array(data_x)
    data_y = np.array(data_y)
    data_z = np.array(data_z)
    data_peak_product = np.array(data_peak_product)
    idx = np.lexsort((data_x, data_y, data_z))

    cm = plt.cm.get_cmap('RdYlBu_r')

    # Plot word similarities
    if circle_scaling == 'peak_product':
        wordsimplot = ax_wordsim.scatter(data_x[idx],
                                         data_y[idx],
                                         s=100 * circle_size *
                                         (0.01 + data_peak_product[idx]**2),
                                         marker="o",
                                         c=data_z[idx],
                                         cmap=cm,
                                         alpha=0.6)
    elif circle_scaling == 'wordsim':
        wordsimplot = ax_wordsim.scatter(data_x[idx],
                                         data_y[idx],
                                         s=100 * circle_size *
                                         (0.01 + data_z[idx]**2),
                                         marker="o",
                                         c=data_z[idx],
                                         cmap=cm,
                                         alpha=0.6)
    else:
        raise ValueError("Unknown circle_scaling. Use 'wordsim' or 'peak_product'.")

    # (Modified) Cosine similarity plot:
    # -------------------------------------------------------------------------
    if method == 'cosine':
        score_classical, used_matches = cosine_score(spectrum1, spectrum2, tolerance, modified_cosine=False)
    elif method == 'modcos':
        score_classical, used_matches = cosine_score(spectrum1, spectrum2, tolerance, modified_cosine=True)
    else:
        raise ValueError("Given method unknown. Use 'cosine' or 'modcos'.")

    idx1, idx2, _ = zip(*used_matches)
    cosine_x = []
    cosine_y = []
    for i in range(len(idx1)):
        if idx1[i] in select1 and idx2[i] in select2:
            cosine_x.append(peaks1[idx1[i], 0])
            cosine_y.append(peaks2[idx2[i], 0])

    # Plot (mod.) cosine similarities
    ax_wordsim.scatter(cosine_x, cosine_y, s=100, c='black', marker=(5, 2))
    ax_wordsim.set_xlim(min_peaks1 - padding, max_peaks1 + padding)
    ax_wordsim.set_ylim(min_peaks2 - padding, max_peaks2 + padding)
    ax_wordsim.set_xlabel('spectrum 1 - fragment mz', fontsize=16)
    ax_wordsim.set_ylabel('spectrum 2 - fragment mz', fontsize=16)
    ax_wordsim.tick_params(labelsize=13)
    ax_wordsim.set_xticks(grid_points1)
    ax_wordsim.set_yticks(grid_points2)
    ax_wordsim.grid(True)

    # Plot spectra 1
    ax_specx.vlines(peaks1[:, 0], [0], peaks1[:, 1], color=plot_colors[0])
    ax_specx.plot(peaks1[:, 0], peaks1[:, 1], '.')  # Stem ends
    ax_specx.plot([peaks1[:, 0].max(), peaks1[:, 0].min()], [0, 0],
                  '--')  # Middle bar
    ax_specx.set_xlim(min_peaks1 - padding, max_peaks1 + padding)
    ax_specx.set_yticks([0,0.25,0.5,0.75,1])
    ax_specx.set_xticks(grid_points1)
    ax_specx.set_ylabel('peak intensity (relative)', fontsize=16)
    ax_specx.tick_params(labelsize=13)

    ax_specx.grid(True)

    # Plot spectra 2
    ax_specy.hlines(peaks2[:, 0], [0], peaks2[:, 1], color=plot_colors[1])
    ax_specy.plot(peaks2[:, 1], peaks2[:, 0], '.')  # Stem ends
    ax_specy.plot([0, 0], [peaks2[:, 0].min(), peaks2[:, 0].max()],
                  '--')  # Middle bar
    ax_specy.set_ylim(min_peaks2 - padding, max_peaks2 + padding)
    ax_specy.set_xticks([0,0.25,0.5,0.75,1])
    ax_specy.set_yticks(grid_points2)
    ax_specy.set_xlabel('peak intensity (relative)', fontsize=16)
    ax_specy.tick_params(labelsize=13)

    ax_specy.grid(True)

    fig.colorbar(wordsimplot, ax=ax_specy)
    if filename is not None:
        plt.savefig(filename)
    plt.show()

    # Plot molecules
    # -------------------------------------------------------------------------
    if display_molecules:
        smiles = [spectrum1.get("smiles"), spectrum2.get("smiles")]
        molecules = [Chem.MolFromSmiles(x) for x in smiles]
        display(Draw.MolsToGridImage(molecules, molsPerRow=2, subImgSize=(400, 400)))
Example #13
    all_unfiltered_mds = set()
    for md_doc in md_documents:
        for md in md_doc:
            all_unfiltered_mds.add(md[0])
    print(f"\n{len(all_unfiltered_mds)} unfiltered MDs present")
    print(f"{len(md_documents)} remaining MD documents (spectra).")
    print("An example:", md_documents[-1])
    if not mds_to_use:
        mds_to_use = all_unfiltered_mds

    # validation pipeline
    spectrums_top30, spectrums_processed, spectrums_classical = processing_res

    # make SpectrumDocuments
    documents_processed = [
        SpectrumDocument(s, n_decimals=2) for s in spectrums_processed
    ]
    documents_classical = [
        SpectrumDocument(s, n_decimals=2) for s in spectrums_classical
    ]
    # create md SpectrumDocuments
    set_white_listed_mds = set(white_listed_mds)
    set_chosen_mds = set(mds_to_use)
    c_multiply = cmd.no_count_benefit  # multiply intensities with sqrt(count)
    md_spectrum_documents = create_md_spectrum_documents(
        md_documents, spectrums_processed, set_white_listed_mds,
        set_chosen_mds, c_multiply, cmd.punish_intensities,
        cmd.require_in_count)
    spec_docs_mds_file = os.path.join(cmd.output_dir,
Example #14
def main():
    parser = argparse.ArgumentParser(description='Creating Spec2Vec Pairs')
    parser.add_argument('input_mgf', help='input_mgf')
    parser.add_argument('output_pairs', help='output_pairs')
    parser.add_argument('model_file', help='model_file')
    parser.add_argument('--min_score',
                        type=float,
                        default=0.7,
                        help='minimum Spec2Vec score to report a pair')
    args = parser.parse_args()

    spectra = load_from_mgf(args.input_mgf)

    filtered_spectra = [post_process(s) for s in spectra]

    # Omit spectrums that didn't qualify for analysis
    filtered_spectra = [s for s in filtered_spectra if s is not None]

    # Create spectrum documents
    query_documents = [
        SpectrumDocument(s, n_decimals=2) for s in filtered_spectra
    ]

    # DEBUG: uncomment to limit the number of documents for quick testing
    # query_documents = query_documents[:100]

    # Loading the model
    model = gensim.models.Word2Vec.load(args.model_file)

    # Define similarity_function
    spec2vec = Spec2Vec(model=model,
                        intensity_weighting_power=0.5,
                        allowed_missing_percentage=80.0)

    print("total documents", len(query_documents))
    scores = calculate_scores(query_documents, query_documents,
                              spec2vec).scores

    number_of_spectra = len(query_documents)

    output_scores_list = []
    for i in range(number_of_spectra):
        for j in range(number_of_spectra):
            if i <= j:
                continue

            i_spectrum = filtered_spectra[i]
            j_spectrum = filtered_spectra[j]

            sim = scores[i][j]

            if sim < args.min_score:
                continue

            score_dict = {}
            score_dict["filename"] = args.input_mgf
            score_dict["CLUSTERID1"] = i_spectrum.metadata["scans"]
            score_dict["CLUSTERID2"] = j_spectrum.metadata["scans"]
            score_dict["Cosine"] = sim
            score_dict["mz1"] = i_spectrum.metadata["pepmass"][0]
            score_dict["mz2"] = j_spectrum.metadata["pepmass"][0]
            score_dict["DeltaMZ"] = score_dict["mz2"] - score_dict["mz1"]
            score_dict["EdgeAnnotation"] = "Spec2Vec"

            output_scores_list.append(score_dict)

    # Saving Data Out
    pd.DataFrame(output_scores_list).to_csv(args.output_pairs,
                                            sep="\t",
                                            index=False)
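
Assuming the script above is saved as, say, create_spec2vec_pairs.py, it could be invoked as:

python create_spec2vec_pairs.py input.mgf pairs.tsv model.model --min_score 0.7
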
Example #15
def test_user_workflow_spec2vec_parallel():
    """Test typical user workflow to get from mass spectra to spec2vec similarities.

    This test will run a typical workflow example using a small dataset and a
    pretrained word2vec model. One main aspect of this is to test if users will
    get exactly the same spec2vec similarity scores when starting from a word2vec
    model that was trained and saved elsewhere.
    """
    def apply_my_filters(s):
        """This is how a user would typically design his own pre- and post-
        processing pipeline."""
        s = default_filters(s)
        s = add_parent_mass(s)
        s = normalize_intensities(s)
        s = reduce_to_number_of_peaks(s, n_required=10, ratio_desired=0.5)
        s = select_by_mz(s, mz_from=0, mz_to=1000)
        s = add_losses(s, loss_mz_from=10.0, loss_mz_to=200.0)
        s = require_minimum_number_of_peaks(s, n_required=5)
        return s

    repository_root = os.path.join(os.path.dirname(__file__), "..")
    spectrums_file = os.path.join(repository_root, "tests", "pesticides.mgf")

    # apply my filters to the data
    spectrums = [apply_my_filters(s) for s in load_from_mgf(spectrums_file)]

    # omit spectrums that didn't qualify for analysis
    spectrums = [s for s in spectrums if s is not None]

    documents = [SpectrumDocument(s) for s in spectrums]

    model_file = os.path.join(repository_root, "integration-tests",
                              "test_user_workflow_spec2vec.model")
    if os.path.isfile(model_file):
        model = gensim.models.Word2Vec.load(model_file)
    else:
        # create and train model
        model = gensim.models.Word2Vec([d.words for d in documents],
                                       size=10,
                                       min_count=1)
        model.train([d.words for d in documents],
                    total_examples=len(documents),
                    epochs=20)
        model.save(model_file)

    # define similarity_function
    spec2vec = Spec2VecParallel(model=model, intensity_weighting_power=0.5)

    references = documents[:26]
    queries = documents[25:]

    # calculate scores on all combinations of references and queries
    scores = list(calculate_scores_parallel(references, queries, spec2vec))

    # filter out self-comparisons
    filtered = [(reference, query, score)
                for (reference, query, score) in scores if reference != query]

    sorted_by_score = sorted(filtered, key=lambda elem: elem[2], reverse=True)

    actual_top10 = sorted_by_score[:10]

    expected_top10 = [(documents[19], documents[25],
                       pytest.approx(0.9999121928249473, rel=1e-9)),
                      (documents[20], documents[25],
                       pytest.approx(0.9998846890269892, rel=1e-9)),
                      (documents[20], documents[45],
                       pytest.approx(0.9998756073673759, rel=1e-9)),
                      (documents[25], documents[45],
                       pytest.approx(0.9998750427994474, rel=1e-9)),
                      (documents[19], documents[27],
                       pytest.approx(0.9998722768460854, rel=1e-9)),
                      (documents[22], documents[27],
                       pytest.approx(0.9998633023352553, rel=1e-9)),
                      (documents[18], documents[27],
                       pytest.approx(0.9998616961532616, rel=1e-9)),
                      (documents[19], documents[45],
                       pytest.approx(0.9998528723697396, rel=1e-9)),
                      (documents[14], documents[71],
                       pytest.approx(0.9998404364805897, rel=1e-9)),
                      (documents[20], documents[27],
                       pytest.approx(0.9998336807761137, rel=1e-9))]

    assert actual_top10 == expected_top10, "Expected different top 10 table."
Example #16
    s = add_losses(s, loss_mz_from=10.0, loss_mz_to=200.0)
    s = require_minimum_number_of_peaks(s, n_required=5)
    return s


# path_root = os.path.dirname(os.getcwd())
# path_data = os.path.join(os.path.dirname(path_root), "data/gnps_15_12_2021/")
path_data = "C:\\HSD\\OneDrive - Hochschule Düsseldorf\\Data\\ms2query"

training_spectra_annotated = load_pickled_file(os.path.join(path_data,
                                                            "GNPS_15_12_2021_pos_train.pickle"))
training_spectra_not_annotated = load_pickled_file(os.path.join(path_data,
                                                                "ALL_GNPS_15_12_2021_positive_not_annotated.pickle"))
all_spectra = training_spectra_annotated + training_spectra_not_annotated
# Load data from pickled file and apply filters
cleaned_spectra = [spectrum_processing(s) for s in all_spectra]

# Omit spectrums that didn't qualify for analysis
cleaned_spectra = [s for s in cleaned_spectra if s is not None]

# Create spectrum documents
reference_documents = [SpectrumDocument(s, n_decimals=2) for s in cleaned_spectra]

model_file = os.path.join(path_data, "trained_models",
                          "spec2vec_model_GNPS_15_12_2021.model")
model = train_new_word2vec_model(reference_documents,
                                 iterations=[10, 20, 30],
                                 filename=model_file,
                                 workers=4,
                                 progress_logger=True)
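
As a follow-up sketch (reusing the assumed paths above), the saved model can be reloaded and used to score two of the reference documents against each other, analogous to the pair method in Example #4:

import gensim
from spec2vec import Spec2Vec

model = gensim.models.Word2Vec.load(model_file)
spec2vec_similarity = Spec2Vec(model=model, intensity_weighting_power=0.5)
score = spec2vec_similarity.pair(reference_documents[0], reference_documents[1])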