Example 1
def find_matches_in_dataset(dataset_id, input_spectrum_collection):
    dataset_match_list = []
    path_to_clustered_mgf = os.path.join(PATH_TO_DATASET_UPLOADS, dataset_id,
                                         "clustered",
                                         dataset_id + "_specs_ms.mgf")
    relative_user_path_to_clustered = os.path.join(
        dataset_id, "clustered", dataset_id + "_specs_ms.mgf")

    if not ming_fileio_library.is_path_present(path_to_clustered_mgf):
        return dataset_match_list

    #Load the dataset's clustered spectra so we can compare them against the input spectra
    dataset_clustered_spectra = ming_spectrum_library.SpectrumCollection(
        path_to_clustered_mgf)
    dataset_clustered_spectra.load_from_file()

    for myspectrum in input_spectrum_collection.spectrum_list:
        #pm tolerance 1.0, fragment tolerance 1.0, min matched peaks 6, min cosine 0.7
        match_list = dataset_clustered_spectra.search_spectrum(
            myspectrum, 1.0, 1.0, 6, 0.7, 1)
        for match in match_list:
            match.filename = relative_user_path_to_clustered
        dataset_match_list += match_list

    print "Dataset matches: " + str(len(dataset_match_list))

    return dataset_match_list
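
A minimal usage sketch for the function above, assuming the module-level names (ming_spectrum_library, PATH_TO_DATASET_UPLOADS) are configured as in this example; the dataset ID and query filename are hypothetical:

#Hypothetical driver; the match attributes used here (filename, score) follow their usage in these examples
query_collection = ming_spectrum_library.SpectrumCollection("query_spectra.mgf")
query_collection.load_from_file()

matches = find_matches_in_dataset("MSV000000001", query_collection)
for match in matches:
    print(match.filename, match.score)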
Example 2
def filtering_out_high_scoring_decoys(input_decoy_psms, input_target_psms, target_filename, other_filename):
    input_decoy_psms = sorted(input_decoy_psms, key=lambda psm: psm.sorting_value(), reverse=True)

    print(target_filename, other_filename)

    target_collection = ming_spectrum_library.SpectrumCollection(target_filename)
    target_collection.load_from_mzXML(drop_ms1=True)
    decoy_collection = ming_spectrum_library.SpectrumCollection(other_filename)
    decoy_collection.load_from_mzXML(drop_ms1=True)

    #Keep only the top-scoring target PSM for each (annotation, charge) precursor key
    top_scoring_precursor_target = {}
    for psm in input_target_psms:
        key = psm.annotation + "." + str(psm.charge)
        if key not in top_scoring_precursor_target or psm.score > top_scoring_precursor_target[key].score:
            top_scoring_precursor_target[key] = psm

    #Only vet the 200 highest-scoring decoys; lower-scoring decoys pass through unchanged below
    output_decoys_list = []
    for psm in input_decoy_psms[:200]:
        annotation = psm.annotation
        charge = psm.charge
        key = annotation + "." + str(charge)
        if key in top_scoring_precursor_target:
            print(key, psm.score, psm.scan)
            print(annotation, annotation[:-2])
            decoy_spectrum = decoy_collection.scandict[int(psm.scan)]
            target_spectrum = target_collection.scandict[int(top_scoring_precursor_target[key].scan)]
            cosine_score = spectrum_alignment.score_alignment_annotated_ion_peaks(decoy_spectrum.peaks, target_spectrum.peaks, 0, 0, 0.1, annotation, annotation, min(3, charge))
            print(cosine_score)
            if cosine_score < 0.7:
                output_decoys_list.append(psm)
        else:
            output_decoys_list.append(psm)

    for psm in input_decoy_psms[200:]:
        output_decoys_list.append(psm)

    return output_decoys_list
Example 3
def main():
    input_folder = sys.argv[1]

    input_protein_fdr_filename = sys.argv[2]
    input_peptide_protein_mapping_filename = sys.argv[3]

    precursor_to_protein_map = load_precursor_to_protein_mapping(
        input_peptide_protein_mapping_filename)
    included_proteins = proteins_to_include(input_protein_fdr_filename)

    output_mgf_folder = sys.argv[4]
    output_tsv_folder = sys.argv[5]
    output_sptxt_folder = sys.argv[6]

    output_filename_prefix = sys.argv[7]

    input_files = ming_fileio_library.list_files_in_dir(input_folder)

    all_library_spectra = []
    for input_filename in input_files:
        temp_spectra = ming_spectrum_library.load_mgf_peptide_library(
            input_filename)
        print("loaded ", len(temp_spectra), "from", input_filename)
        for spectrum in temp_spectra:
            peptide = spectrum.peptide
            protein = spectrum.protein

            #Keep the spectrum only if at least one mapped protein passed the protein-level FDR
            new_proteins_set = set(
                precursor_to_protein_map[peptide].split(";"))
            if len(new_proteins_set.intersection(included_proteins)) == 0:
                continue

            if protein != "CREATION_FALSE_PROTEIN":
                spectrum.protein = precursor_to_protein_map[peptide]

            all_library_spectra.append(spectrum)

    library_spectrum_collection_split = ming_spectrum_library.SpectrumCollection(
        "library spectra")
    library_spectrum_collection_split.spectrum_list = all_library_spectra

    output_tsv_filename = os.path.join(output_tsv_folder,
                                       output_filename_prefix + ".tsv")
    output_mgf_filename = os.path.join(output_mgf_folder,
                                       output_filename_prefix + ".mgf")
    output_sptxt_filename = os.path.join(output_sptxt_folder,
                                         output_filename_prefix + ".sptxt")

    library_spectrum_collection_split.save_to_mgf(
        open(output_mgf_filename, "w"))
    library_spectrum_collection_split.save_to_tsv(
        open(output_tsv_filename, "w"), output_mgf_filename)

    library_spectrum_collection_split.save_to_sptxt(
        open(output_sptxt_filename, "w"))
Example 4
def finding_matches_in_public_data(input_spectra_filename, all_datasets):

    all_matches_to_datasets_map = {}

    input_spectrum_collection = ming_spectrum_library.SpectrumCollection(
        input_spectra_filename)
    input_spectrum_collection.load_from_file()

    total_matches = 0

    search_parameters = []
    for dataset in all_datasets:
        #Only search datasets whose title mentions GNPS
        if dataset["title"].upper().find("GNPS") == -1:
            continue
        dataset_id = dataset["dataset"]
        search_parameters.append({
            "dataset_id": dataset_id,
            "input_spectrum_collection": input_spectrum_collection
        })

    #Doing the search serially; skip the remaining datasets once any matches are found
    search_results = []
    for search_param in search_parameters:
        if total_matches > 0:
            continue

        dataset_matches = find_matches_in_dataset_wrapper(search_param)
        search_results.append(dataset_matches)

        total_matches += len(dataset_matches)

        print("SEARCHING " + str(search_param))

    print("datasets to consider: " + str(len(search_parameters)))

    #Parallel
    #search_results = ming_parallel_library.run_parallel_job(find_matches_in_dataset_wrapper, search_parameters, 10)

    #formatting output
    for i in range(len(search_results)):
        dataset_matches = search_results[i]
        dataset_id = search_parameters[i]["dataset_id"]

        print "outputting: " + str(search_parameters[i])

        all_matches_to_datasets_map[dataset_id] = {"matches": dataset_matches}

    return all_matches_to_datasets_map
Example 5
def find_matches_in_file(input_spectrum_collection,
                         dataset_filepath,
                         relative_dataset_filepath,
                         match_parameters,
                         top_k=1):
    dataset_match_list = []

    if not ming_fileio_library.is_path_present(dataset_filepath):
        print("Cant find", dataset_filepath)
        return dataset_match_list

    dataset_query_spectra = ming_spectrum_library.SpectrumCollection(
        dataset_filepath)
    try:
        dataset_query_spectra.load_from_file()
    except KeyboardInterrupt:
        raise
    except:
        #Skip files that fail to parse
        return dataset_match_list

    for repo_spectrum in dataset_query_spectra.spectrum_list:
        if match_parameters["FILTER_WINDOW"]:
            repo_spectrum.window_filter_peaks(50, 6)
        if match_parameters["FILTER_PRECURSOR"]:
            repo_spectrum.filter_precursor_peaks()

    for myspectrum in input_spectrum_collection.spectrum_list:
        if match_parameters["FILTER_WINDOW"]:
            myspectrum.window_filter_peaks(50, 6)
        if match_parameters["FILTER_PRECURSOR"]:
            myspectrum.filter_precursor_peaks()

        try:
            match_list = dataset_query_spectra.search_spectrum(
                myspectrum,
                match_parameters["PM_TOLERANCE"],
                match_parameters["FRAGMENT_TOLERANCE"],
                match_parameters["MIN_MATCHED_PEAKS"],
                match_parameters["MIN_COSINE"],
                analog_search=match_parameters["ANALOG_SEARCH"],
                top_k=top_k)
            for match in match_list:
                match["filename"] = relative_dataset_filepath
            dataset_match_list += match_list
        except:
            print("Error in Matching")

    print("Dataset matches: " + str(len(dataset_match_list)))

    return dataset_match_list
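
find_matches_in_file reads a fixed set of keys from match_parameters; a sketch of the expected dict follows, with illustrative placeholder values (not recommended settings), hypothetical file paths, and query_collection as in the earlier sketch:

#Keys mirror the reads inside find_matches_in_file; values here are placeholders
match_parameters = {
    "FILTER_WINDOW": True,      #apply window_filter_peaks(50, 6)
    "FILTER_PRECURSOR": True,   #apply filter_precursor_peaks()
    "PM_TOLERANCE": 1.0,
    "FRAGMENT_TOLERANCE": 0.5,
    "MIN_MATCHED_PEAKS": 6,
    "MIN_COSINE": 0.7,
    "ANALOG_SEARCH": False
}
matches = find_matches_in_file(query_collection,
                               "/data/uploads/MSV000000001/peak/sample.mzXML",
                               "MSV000000001/peak/sample.mzXML",
                               match_parameters,
                               top_k=5)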
Example 6
def find_matches_in_dataset(dataset_id, input_spectrum_collection,
                            identification_map):
    dataset_match_list = []
    path_to_peak_collection = os.path.join(PATH_TO_DATASET_UPLOADS, dataset_id,
                                           "peak")
    peak_files = ming_fileio_library.list_files_in_dir(path_to_peak_collection)

    for input_file in peak_files:
        print(input_file)
        relative_user_path_to_file = os.path.relpath(input_file,
                                                     PATH_TO_DATASET_UPLOADS)
        reference_spectra = ming_spectrum_library.SpectrumCollection(
            input_file)
        reference_spectra.load_from_mzXML(drop_ms1=True)

        is_blank = 0
        if input_file.find("blank") != -1:
            is_blank = 1

        for myspectrum in input_spectrum_collection.spectrum_list:

            match_list = reference_spectra.search_spectrum(
                myspectrum, 1.0, 1.0, 4, 0.7, 1)
            for match in match_list:
                match_obj = {}
                match_obj["filename"] = relative_user_path_to_file
                match_obj["scan"] = match.scan
                match_obj["score"] = match.score
                match_obj["query_filename"] = match.query_filename
                match_obj["query_scan"] = match.query_scan
                match_obj["ppm_error"] = match.ppm_error
                match_obj["is_blank"] = is_blank
                match_obj["dataset_id"] = dataset_id

                #compound identification
                if match.scan in identification_map:
                    match_obj["identification"] = identification_map[
                        match.scan]["identification"]
                    match_obj["spectrum_id"] = identification_map[
                        match.scan]["spectrum_id"]
                else:
                    match_obj["identification"] = ""
                    match_obj["spectrum_id"] = ""

                dataset_match_list.append(match_obj)

    return dataset_match_list
Example 7
def get_spectrum_collection_from_param_obj(param_obj):
    precursor_mz = float(param_obj["precursor_mz"][0])
    spectrum_string = param_obj["spectrum_string"][0]
    peaks_lines = spectrum_string.split("\n")
    peak_list = []
    for peak_line in peaks_lines:
        splits = peak_line.split()
        #Skip blank or malformed peak lines
        if len(splits) < 2:
            continue
        mass = float(splits[0])
        intensity = float(splits[1])
        peak_list.append([mass, intensity])

    peak_list = sorted(peak_list, key=lambda peak: peak[0])

    spectrum_obj = ming_spectrum_library.Spectrum("search_spectrum.mgf", 1, 0, peak_list, precursor_mz, 1, 2)
    spectrum_collection = ming_spectrum_library.SpectrumCollection("search_spectrum.mgf")

    spectrum_collection.spectrum_list = [spectrum_obj]

    return spectrum_collection
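
get_spectrum_collection_from_param_obj indexes each parameter at [0], so values must be wrapped in lists; a sketch with a hypothetical two-peak spectrum:

#Single-element lists because the function reads param_obj[key][0]
param_obj = {
    "precursor_mz": ["500.25"],
    "spectrum_string": ["100.1 200.0\n150.2 350.0"]
}
collection = get_spectrum_collection_from_param_obj(param_obj)
print(collection.spectrum_list[0].peaks)  #[[100.1, 200.0], [150.2, 350.0]]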
Example 8
def main():
    input_intermediate_file = sys.argv[1]
    output_tsv_folder = sys.argv[2]
    output_mgf_folder = sys.argv[3]
    output_sptxt_folder = sys.argv[4]

    library_spectrum_collection = ming_spectrum_library.SpectrumCollection("library spectra")

    all_json_spectra_list = json.load(open(input_intermediate_file))
    print("Loaded", input_intermediate_file, len(all_json_spectra_list))
    for library_spectrum in all_json_spectra_list:
        lib_spec = ming_spectrum_library.PeptideLibrarySpectrum("", 0, 0, library_spectrum["peaks"], library_spectrum["mz"], library_spectrum["charge"], library_spectrum["annotation"], library_spectrum["protein"])
        if "score" in library_spectrum:
            lib_spec.score = library_spectrum["score"]
        if "variant_score" in library_spectrum:
            lib_spec.variant_score = library_spectrum["variant_score"]
        if "spectra_to_consider" in library_spectrum:
            lib_spec.num_spectra = library_spectrum["spectra_to_consider"]
        if "ranking" in library_spectrum:
            lib_spec.spectrum_ranking = library_spectrum["ranking"]
        if "proteosafe_task" in library_spectrum:
            lib_spec.proteosafe_task = library_spectrum["proteosafe_task"]
        if "originalspectrum_filename" in library_spectrum:
            lib_spec.originalfile_filename = library_spectrum["originalspectrum_filename"]
        if "originalspectrum_scan" in library_spectrum:
            lib_spec.originalfile_scan = str(library_spectrum["originalspectrum_scan"])

        library_spectrum_collection.spectrum_list.append(lib_spec)

    output_mgf_filename = os.path.join(output_mgf_folder, os.path.splitext(os.path.basename(input_intermediate_file))[0] + ".mgf")
    output_tsv_filename = os.path.join(output_tsv_folder, os.path.splitext(os.path.basename(input_intermediate_file))[0] + ".tsv")
    output_sptxt_filename = os.path.join(output_sptxt_folder, os.path.splitext(os.path.basename(input_intermediate_file))[0] + ".sptxt")

    library_spectrum_collection.save_to_mgf(open(output_mgf_filename, "w"))
    library_spectrum_collection.save_to_tsv(open(output_tsv_filename, "w"), output_mgf_filename)

    try:
        library_spectrum_collection.save_to_sptxt(open(output_sptxt_filename, "w"))
    except:
        traceback.print_exc(file=sys.stdout)
        print("MEH")
Example 9
def main():
    input_folder = sys.argv[1]

    input_protein_fdr_filename = sys.argv[2]
    input_peptide_protein_mapping_filename = sys.argv[3]

    precursor_to_protein_map = load_precursor_to_protein_mapping(
        input_peptide_protein_mapping_filename)

    output_mgf_folder = sys.argv[4]
    output_tsv_folder = sys.argv[5]

    output_filename_prefix = sys.argv[6]

    input_files = ming_fileio_library.list_files_in_dir(input_folder)

    all_library_spectra = []
    for input_filename in input_files:
        temp_spectra = ming_spectrum_library.load_mgf_peptide_library(
            input_filename)
        print("loaded ", len(temp_spectra), "from", input_filename)
        for spectrum in temp_spectra:
            peptide = spectrum.peptide
            protein = spectrum.protein
            if protein == "CREATION_FALSE_PROTEIN":
                continue
            spectrum.protein = precursor_to_protein_map[peptide]
        all_library_spectra += temp_spectra

    library_spectrum_collection_split = ming_spectrum_library.SpectrumCollection(
        "library spectra")
    library_spectrum_collection_split.spectrum_list = all_library_spectra

    output_tsv_filename = os.path.join(output_tsv_folder,
                                       output_filename_prefix + ".tsv")
    output_mgf_filename = os.path.join(output_mgf_folder,
                                       output_filename_prefix + ".mgf")

    library_spectrum_collection_split.save_to_mgf(
        open(output_mgf_filename, "w"))
    library_spectrum_collection_split.save_to_tsv(
        open(output_tsv_filename, "w"), output_mgf_filename)
Example 10
def finding_matches_in_public_data(input_spectra_filename, all_datasets,
                                   identification_dict):
    input_spectrum_collection = ming_spectrum_library.SpectrumCollection(
        input_spectra_filename)
    input_spectrum_collection.load_from_file()

    total_matches = 0

    search_parameters = []
    for dataset in all_datasets:
        dataset_id = dataset["dataset"]
        search_parameters.append({
            "dataset_id": dataset_id,
            "input_spectrum_collection": input_spectrum_collection,
            "identification_dict": identification_dict
        })

    #Doing the search serially
    search_results = []
    for search_param in search_parameters:
        #if total_matches > 0:
        #    continue
        print("SEARCHING " + str(search_param))
        dataset_matches = find_matches_in_dataset_wrapper(search_param)
        search_results.append(dataset_matches)
        total_matches += len(dataset_matches)

    print("datasets to consider: " + str(len(search_parameters)))

    #Parallel
    #search_results = ming_parallel_library.run_parallel_job(find_matches_in_dataset_wrapper, search_parameters, 10)

    #formatting output
    all_matches = []
    for i in range(len(search_results)):
        dataset_matches = search_results[i]

        print "outputting: " + str(search_parameters[i])

        all_matches += dataset_matches
    return all_matches
Example 11
def get_file_stats(input_filename):
    output_list = []

    spectrum_collection = ming_spectrum_library.SpectrumCollection(
        input_filename)

    try:
        spectrum_collection.load_from_file(drop_ms1=True)
    except KeyboardInterrupt:
        raise
    except:
        print("Cannot load", input_filename)
        return output_list

    for spectrum in spectrum_collection.spectrum_list:
        output_dict = {}
        output_dict["fragmentation"] = spectrum.fragmenation_method
        output_dict["filename"] = os.path.basename(input_filename)
        output_dict["collision_energy"] = spectrum.collision_energy
        output_dict["scan"] = spectrum.scan
        output_list.append(output_dict)

    return output_list
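
A sketch of driving get_file_stats over a folder and writing one TSV, reusing ming_fileio_library.list_files_in_dir and the pandas TSV pattern from the cluster-info example below; the folder and output names are placeholders:

import pandas as pd

#Hypothetical driver: one row per scan across every file in the folder
all_stats = []
for input_filename in ming_fileio_library.list_files_in_dir("spectra_folder"):
    all_stats += get_file_stats(input_filename)
pd.DataFrame(all_stats).to_csv("file_stats.tsv", sep="\t", index=False)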
Example 12
import sys
import argparse
import ming_spectrum_library

parser = argparse.ArgumentParser(description='Filter Spectra')
parser.add_argument('input_mgf_filename', help='input_mgf_filename')
parser.add_argument('output_mgf_filename', help='output_mgf_filename')
parser.add_argument('output_filtered_mgf_filename',
                    help='output_filtered_mgf_filename')
parser.add_argument('--FILTER_PRECURSOR_WINDOW', help='0', default=None)
parser.add_argument('--WINDOW_FILTER', help='0', default=None)
args = parser.parse_args()

# Load the spectra
spectrum_collection = ming_spectrum_library.SpectrumCollection(
    args.input_mgf_filename)
spectrum_collection.load_from_mgf()

# Making sure to renumber
spectrum_list = sorted(spectrum_collection.spectrum_list,
                       key=lambda spectrum: int(spectrum.scan))

included_scans = set()
spectrum_dict = {}
for spectrum in spectrum_list:
    spectrum_dict[int(spectrum.scan)] = spectrum
    included_scans.add(int(spectrum.scan))

max_scan = max(included_scans)

output_spectrum_list = []
Example 13
def main():
    parser = argparse.ArgumentParser(
        description='Creating Clustering Info Summary')
    parser.add_argument('params_xml', help='params_xml')
    parser.add_argument('consensus_feature_file',
                        help='Consensus Quantification File')
    parser.add_argument('metadata_folder', help='metadata metadata_folder')
    parser.add_argument('mgf_filename', help='mgf_filename')
    parser.add_argument('output_clusterinfo_summary', help='output file')
    args = parser.parse_args()

    param_obj = ming_proteosafe_library.parse_xml_file(open(args.params_xml))

    task_id = param_obj["task"][0]

    group_to_files_mapping = defaultdict(list)
    attributes_to_groups_mapping = defaultdict(set)

    metadata_files = glob.glob(os.path.join(args.metadata_folder, "*"))
    if len(metadata_files) == 1:
        group_to_files_mapping, attributes_to_groups_mapping = load_group_attribute_mappings(
            metadata_files[0])

    ROW_NORMALIZATION = "None"
    try:
        ROW_NORMALIZATION = param_obj["QUANT_FILE_NORM"][0]
    except:
        ROW_NORMALIZATION = "None"

    GROUP_COUNT_AGGREGATE_METHOD = "Mean"
    try:
        GROUP_COUNT_AGGREGATE_METHOD = param_obj[
            "GROUP_COUNT_AGGREGATE_METHOD"][0]
    except:
        GROUP_COUNT_AGGREGATE_METHOD = "Mean"

    quantification_df = pd.read_csv(args.consensus_feature_file)
    quantification_list = quantification_df.to_dict(orient="records")

    input_filenames, input_filename_headers = determine_input_files(
        quantification_list[0].keys())

    ### Filling in Quantification table if it is missing values
    for quantification_object in quantification_list:
        ###Handling empty quantification
        for filename in input_filename_headers:
            try:
                if len(quantification_object[filename]) == 0:
                    #print(filename, quantification_object[filename], quantification_object["row ID"])
                    quantification_object[filename] = 0
            except:
                #Value is already numeric; len() only applies to empty-string cells
                pass

    print("Number of Features", len(quantification_list))

    #Doing row sum normalization
    if ROW_NORMALIZATION == "RowSum":
        print("ROW SUM NORM")
        for filename_header in input_filename_headers:
            file_quants = [
                float(quantification_object[filename_header])
                for quantification_object in quantification_list
            ]
            summed_file_quants = sum(file_quants)
            #Handling zero column
            if summed_file_quants > 0:
                for quantification_object in quantification_list:
                    quantification_object[filename_header] = float(
                        quantification_object[filename_header]
                    ) / summed_file_quants * 1000000
    """Loading MS2 Spectra"""
    mgf_collection = ming_spectrum_library.SpectrumCollection(
        args.mgf_filename)
    mgf_collection.load_from_file()

    clusters_list = []
    for quantification_object in quantification_list:

        cluster_obj = {}
        cluster_obj["cluster index"] = str(int(
            quantification_object["row ID"]))
        cluster_obj["precursor mass"] = "{0:.4f}".format(
            float(quantification_object["row m/z"]))
        cluster_obj["RTConsensus"] = "{0:.4f}".format(
            float(quantification_object["row retention time"]))

        """Checking about the charge of this cluster"""
        try:
            spectrum_object = mgf_collection.scandict[int(
                cluster_obj["cluster index"])]
            charge = int(spectrum_object.charge)
        except:
            charge = 0
        """Checking if this spectrum has no peaks"""
        # try:
        #     spectrum_object = mgf_collection.scandict[int(cluster_obj["cluster index"])]
        #
        # except:
        #     continue

        all_files = [
            os.path.basename(filename) for filename in input_filename_headers
            if float(quantification_object[filename]) > 0
        ]
        abundance_per_file = [(os.path.basename(filename),
                               float(quantification_object[filename]))
                              for filename in input_filename_headers]
        all_abundances = [
            float(quantification_object[filename])
            for filename in input_filename_headers
        ]

        if charge != 0:
            #Parent mass as the singly protonated species: mz*z - (z-1) proton masses, with the proton mass approximated as 1
            cluster_obj["parent mass"] = "{0:.4f}".format(
                float(quantification_object["row m/z"]) * charge - charge + 1)
        else:
            cluster_obj["parent mass"] = "{0:.4f}".format(
                float(quantification_object["row m/z"]))
        cluster_obj["precursor charge"] = charge

        try:
            cluster_obj["RTMean"] = statistics.mean(all_retention_times)
            cluster_obj["RTStdErr"] = statistics.stdev(all_retention_times)
        except:
            #all_retention_times is not populated in this workflow, so this always
            #falls back to the consensus retention time
            cluster_obj["RTMean"] = cluster_obj["RTConsensus"]
            cluster_obj["RTStdErr"] = 0

        cluster_obj[
            "GNPSLinkout_Cluster"] = 'https://gnps.ucsd.edu/ProteoSAFe/result.jsp?task=%s&view=view_all_clusters_withID&show=true#{"main.cluster index_lowerinput":"%s","main.cluster index_upperinput":"%s"}' % (
                task_id, quantification_object["row ID"],
                quantification_object["row ID"])
        cluster_obj["sum(precursor intensity)"] = sum(all_abundances)
        cluster_obj["SumPeakIntensity"] = sum(all_abundances)
        cluster_obj["number of spectra"] = len(all_files)
        cluster_obj["UniqueFileSourcesCount"] = len(all_files)

        group_abundances = determine_group_abundances(
            group_to_files_mapping,
            abundance_per_file,
            operation=GROUP_COUNT_AGGREGATE_METHOD)

        default_groups = ["G1", "G2", "G3", "G4", "G5", "G6"]
        for group in group_to_files_mapping:
            group_header = "GNPSGROUP:" + group
            if group in default_groups:
                continue
            cluster_obj[group_header] = group_abundances[group]

        for group in default_groups:
            cluster_obj[group] = group_abundances[group]

        #Writing attributes
        for attribute in attributes_to_groups_mapping:
            groups_to_include = []
            for group in attributes_to_groups_mapping[attribute]:
                if group_abundances[group] > 0.0:
                    groups_to_include.append(group)
            if len(groups_to_include) == 0:
                cluster_obj[attribute] = ""
            else:
                cluster_obj[attribute] = ",".join(groups_to_include)
        """
        Enriching the cluster info with adduct collapsing information
        """
        enrich_adduct_annotations(cluster_obj, quantification_object)

        clusters_list.append(cluster_obj)

    pd.DataFrame(clusters_list).to_csv(args.output_clusterinfo_summary,
                                       sep="\t",
                                       index=False)
Example 14
def main():
    param_filename = sys.argv[1]
    choose_consensus_params_filename = sys.argv[2]
    filtered_peptide_list_filename = sys.argv[3]
    length_score_cutoff_filename = sys.argv[4]
    provenance_json_filename = sys.argv[5]
    merged_library_spectra_folder = sys.argv[6]

    output_library_json_folder = sys.argv[7]
    output_candidate_spectra_tsv_folder = sys.argv[8]

    filtered_peptide_set = load_filtered_peptide_set(
        filtered_peptide_list_filename)
    score_cutoff_by_length = load_score_cutoff_by_length(
        length_score_cutoff_filename)
    variant_to_score = load_variant_to_score(filtered_peptide_list_filename)

    #Deciding on how to create consensus
    params_obj = ming_proteosafe_library.parse_xml_file(open(param_filename))

    parallel_params = json.loads(open(choose_consensus_params_filename).read())
    total_node_count = parallel_params["total_paritions"]
    my_node_number = parallel_params["node_partition"]

    consensus_selection_method = params_obj["ConsensusChoice"][0]

    #output dict for listing all candidates
    library_candidates_output_dict = defaultdict(list)

    #determine filenames
    merged_library_filename, my_position_for_file, total_nodes_for_file = determine_filenames_to_load(
        my_node_number, total_node_count, merged_library_spectra_folder)

    print(merged_library_filename, my_position_for_file, total_nodes_for_file)

    library_spectra = []

    input_spectrum_file_handle = open(merged_library_filename)
    line_count = 0
    for line in input_spectrum_file_handle:
        line_count += 1
        #Modulo partitioning: this node only processes lines where
        #line_count % total_nodes_for_file == my_position_for_file
        if line_count % total_nodes_for_file != my_position_for_file:
            continue

        all_spectra = json.loads(line)

        if len(all_spectra) == 0:
            continue

        annotation = all_spectra[0]["annotation"] + "." + str(
            all_spectra[0]["charge"])
        print(annotation, len(all_spectra))

        if annotation not in filtered_peptide_set:
            continue

        library_spectrum = create_library_spectrum(
            all_spectra, consensus_selection_method, score_cutoff_by_length,
            variant_to_score, library_candidates_output_dict)
        library_spectra.append(library_spectrum)

    json.dump(
        library_spectra,
        open(
            os.path.join(output_library_json_folder,
                         str(my_node_number) + ".json"), "w"))

    #Provenance Records
    provenance_records = json.loads(open(provenance_json_filename).read())

    #Modifying the output candidate file
    for i in range(len(library_candidates_output_dict["filename"])):
        proteosafe_task = library_candidates_output_dict["proteosafe_task"][i]
        if proteosafe_task in provenance_records["search_task_to_augment"]:
            library_candidates_output_dict["augment_task"].append(
                provenance_records["search_task_to_augment"][proteosafe_task])
        else:
            library_candidates_output_dict["augment_task"].append("")

        if proteosafe_task in provenance_records["search_task_to_extraction"]:
            library_candidates_output_dict["extract_task"].append(
                provenance_records["search_task_to_extraction"]
                [proteosafe_task])
        else:
            library_candidates_output_dict["extract_task"].append("")

    #Outputting
    output_candidate_spectra_tsv_filename = os.path.join(
        output_candidate_spectra_tsv_folder,
        str(my_node_number) + ".tsv")
    ming_fileio_library.write_dictionary_table_data(
        library_candidates_output_dict, output_candidate_spectra_tsv_filename)
    """Converted Output"""
    output_tsv_folder = sys.argv[9]
    output_mgf_folder = sys.argv[10]
    output_sptxt_folder = sys.argv[11]

    library_spectrum_collection = ming_spectrum_library.SpectrumCollection(
        "library spectra")

    for library_spectrum in library_spectra:
        lib_spec = ming_spectrum_library.PeptideLibrarySpectrum(
            "", 0, 0, library_spectrum["peaks"], library_spectrum["mz"],
            library_spectrum["charge"], library_spectrum["annotation"],
            library_spectrum["protein"])
        if "score" in library_spectrum:
            lib_spec.score = library_spectrum["score"]
        if "variant_score" in library_spectrum:
            lib_spec.variant_score = library_spectrum["variant_score"]
        if "spectra_to_consider" in library_spectrum:
            lib_spec.num_spectra = library_spectrum["spectra_to_consider"]
        if "ranking" in library_spectrum:
            lib_spec.spectrum_ranking = library_spectrum["ranking"]
        if "proteosafe_task" in library_spectrum:
            lib_spec.proteosafe_task = library_spectrum["proteosafe_task"]
        if "originalspectrum_filename" in library_spectrum:
            lib_spec.originalfile_filename = library_spectrum[
                "originalspectrum_filename"]
        if "originalspectrum_scan" in library_spectrum:
            lib_spec.originalfile_scan = str(
                library_spectrum["originalspectrum_scan"])

        library_spectrum_collection.spectrum_list.append(lib_spec)

    output_mgf_filename = os.path.join(output_mgf_folder,
                                       str(my_node_number) + ".mgf")
    output_tsv_filename = os.path.join(output_tsv_folder,
                                       str(my_node_number) + ".tsv")
    output_sptxt_filename = os.path.join(output_sptxt_folder,
                                         str(my_node_number) + ".sptxt")

    library_spectrum_collection.save_to_mgf(open(output_mgf_filename, "w"))
    library_spectrum_collection.save_to_tsv(open(output_tsv_filename, "w"),
                                            output_mgf_filename)

    try:
        library_spectrum_collection.save_to_sptxt(
            open(output_sptxt_filename, "w"))
    except:
        traceback.print_exc(file=sys.stdout)
        print("MEH")
Example 15
files_to_filter = []
for line in open(args.groupsfilename):
    if line.split("=")[0] == "GROUP_G6":
        files_to_filter = line.split("=")[1].split(";")
        files_to_filter = [filename.rstrip() for filename in files_to_filter]

print("files_to_filter", files_to_filter)

filtered_clusterinfo_df = clusterinfo_df[clusterinfo_df["#Filename"].isin(
    files_to_filter)]
clusters_to_filter = set(list(filtered_clusterinfo_df["#ClusterIdx"]))

print("clusters_to_filter", clusters_to_filter)

#Loading the spectra
spectrum_collection = ming_spectrum_library.SpectrumCollection(input_specs_ms)
spectrum_collection.load_from_mgf()

filtered_spectrum_list = []
for spectrum in spectrum_collection.spectrum_list:
    if spectrum is None:
        continue
    if spectrum.scan in clusters_to_filter:
        continue
    filtered_spectrum_list.append(spectrum)

#Renumbering to make sure empty ones are still there
included_scans = set()
Example 16
def calculated_ambiguity(parameter_map, peak_tolerance):
    filename = parameter_map["filename"]
    scan_mapping = parameter_map["scan_mapping"]

    spectrum_collection = ming_spectrum_library.SpectrumCollection(filename)
    spectrum_collection.load_from_file()

    return_ambiguity_mapping = defaultdict(lambda: {})

    for scan in scan_mapping:
        spectrum_obj = spectrum_collection.scandict[int(scan)]
        #Let's determine if the strings are actually ambiguous
        ambiguous_list = ming_ambiguity_library.collapse_ambiguous_from_annotations_list(
            scan_mapping[scan])
        #print(ambiguous_list)
        if len(ambiguous_list) == 1:
            score_summary = {}
            score_summary["ambiguity_total_score"] = -1
            score_summary["first_unique_count"] = -1
            score_summary["second_unique_count"] = -1
            score_summary["first_unique_intensity"] = -1
            score_summary["second_unique_intensity"] = -1
            score_summary["first_second_unique_ratio"] = -1

            return_ambiguity_mapping[scan] = score_summary

            continue

        if len(ambiguous_list) > 2:
            score_summary = {}
            score_summary["ambiguity_total_score"] = 10
            score_summary["first_unique_count"] = 10
            score_summary["second_unique_count"] = 10
            score_summary["first_unique_intensity"] = 10
            score_summary["second_unique_intensity"] = 10
            score_summary["first_second_unique_ratio"] = -1

            return_ambiguity_mapping[scan] = score_summary
            continue

        peptide_to_extracted_peaks_mapping = {}
        for peptide in ambiguous_list:
            theoretical_peaks = ming_psm_library.create_theoretical_peak_map(
                peptide, ["b", "y"])
            original_peaks = spectrum_obj.peaks
            extracted_peaks = extract_annotated_peaks(theoretical_peaks,
                                                      original_peaks,
                                                      peak_tolerance)
            peptide_to_extracted_peaks_mapping[peptide] = extracted_peaks

            #print("Original:\t%d\tExtracted:\t%d" % (len(original_peaks), len(extracted_peaks)))
            #print(original_peaks)
            #print(extracted_peaks)
            #print(theoretical_peaks)

        #Check the overlap between the two extracted peak sets
        first_peaks = peptide_to_extracted_peaks_mapping[list(
            peptide_to_extracted_peaks_mapping.keys())[0]]
        second_peaks = peptide_to_extracted_peaks_mapping[list(
            peptide_to_extracted_peaks_mapping.keys())[1]]
        total_score, reported_alignments = spectrum_alignment.score_alignment(
            first_peaks, second_peaks, spectrum_obj.mz, spectrum_obj.mz,
            peak_tolerance)

        first_total = len(first_peaks)
        second_total = len(second_peaks)
        intersection_total = len(reported_alignments)
        first_unique_count = first_total - intersection_total
        second_unique_count = second_total - intersection_total

        #Calculating the explained intensity in each of these
        peaks_1_normed = spectrum_alignment.sqrt_normalize_spectrum(
            spectrum_alignment.convert_to_peaks(first_peaks))
        peaks_2_normed = spectrum_alignment.sqrt_normalize_spectrum(
            spectrum_alignment.convert_to_peaks(second_peaks))

        first_aligned_index = []
        second_aligned_index = []

        for alignment in reported_alignments:
            first_aligned_index.append(alignment.peak1)
            second_aligned_index.append(alignment.peak2)

        #intensity values
        first_unique = []
        second_unique = []

        for i in range(len(peaks_1_normed)):
            if i not in first_aligned_index:
                first_unique.append(peaks_1_normed[i][1])

        for i in range(len(peaks_2_normed)):
            if i not in second_aligned_index:
                second_unique.append(peaks_2_normed[i][1])

        #Summed squared intensity of the unaligned peaks; because the spectra are
        #sqrt-normalized, this is the fraction of total intensity unique to each peptide
        first_unique_intensity = sum(x * x for x in first_unique)
        second_unique_intensity = sum(x * x for x in second_unique)

        first_second_unique_ratio = 0
        try:
            first_second_unique_ratio = min(
                first_unique_intensity, second_unique_intensity) / max(
                    first_unique_intensity, second_unique_intensity)
        except ZeroDivisionError:
            #Both unique-intensity sums are zero (every peak is shared); use the sentinel value 10
            first_second_unique_ratio = 10

        if first_second_unique_ratio > 10:
            first_second_unique_ratio = 10

        #print(reported_alignments)
        #print(peaks_1_normed)
        #print("FirstCount\t%d\tSecondCount\t%d\tFirstInt\t%f\tSecondInt\t%f" % (first_unique_count, second_unique_count, first_unique_intensity, second_unique_intensity))

        score_summary = {}
        score_summary["ambiguity_total_score"] = total_score
        score_summary["first_unique_count"] = first_unique_count
        score_summary["second_unique_count"] = second_unique_count
        score_summary["first_unique_intensity"] = first_unique_intensity
        score_summary["second_unique_intensity"] = second_unique_intensity
        score_summary["first_second_unique_ratio"] = first_second_unique_ratio

        return_ambiguity_mapping[scan] = score_summary

    return return_ambiguity_mapping
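
calculated_ambiguity reads only two keys from parameter_map; a sketch of the expected shape (the file path and peptide annotations are hypothetical):

#scan_mapping maps a scan to its list of candidate peptide annotations,
#as consumed by collapse_ambiguous_from_annotations_list above
parameter_map = {
    "filename": "spectra/run01.mzXML",
    "scan_mapping": {
        "1200": ["PEPTIDEK", "PEPTLDEK"]
    }
}
ambiguity_by_scan = calculated_ambiguity(parameter_map, peak_tolerance=0.1)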
Example 17
def extract_psms_from_filename(filename, psms_list, snr_threshold,
                               minimum_explained_intensity, min_signal_peaks,
                               min_number_of_peaks_within_1_percent_of_max,
                               min_number_of_annotated_ions, max_ppm_error):
    full_path = os.path.join(
        ming_proteosafe_library.PROTEOSAFE_USER_UPLOADS_DIR,
        filename,
    )
    print("loading ", full_path)
    spectrum_collection = ming_spectrum_library.SpectrumCollection(full_path)
    spectrum_collection.load_from_file(drop_ms1=True)
    spectrum_list = []
    for psm in psms_list:
        scan = psm.scan
        protein = "PROTEIN"
        if psm.decoy == 1:
            protein = "CREATION_FALSE_PROTEIN"

        loaded_spectrum = spectrum_collection.scandict[scan]
        loaded_spectrum.filter_precursor_peaks()
        number_of_signal_peaks = loaded_spectrum.get_number_of_signal_peaks(
            SNR_Threshold=3)
        number_of_peaks_within_1_percent_of_max = loaded_spectrum.get_number_of_peaks_within_percent_of_max(
            percent=1.0)
        number_of_peaks_within_5_percent_of_max = loaded_spectrum.get_number_of_peaks_within_percent_of_max(
            percent=5.0)
        annotated_peak_count = ming_psm_library.calculated_number_annotated_peaks(
            loaded_spectrum.peaks, loaded_spectrum.charge,
            psm.get_annotation_without_charge(), 0.1)
        explained_intensity = ming_psm_library.calculated_explained_intensity(
            loaded_spectrum.peaks, loaded_spectrum.charge,
            psm.get_annotation_without_charge(), 0.1)
        number_of_ions_annotated_above_SNR = ming_spectrum_library.calculated_number_unique_ions_annotated_in_signal(
            loaded_spectrum.peaks,
            min(loaded_spectrum.charge, 3),
            psm.get_annotation_without_charge(),
            0.1,
            SNR=3.0)

        theoretical_mz = ming_psm_library.calculate_theoretical_peptide_mass(
            psm.get_annotation_without_charge(), psm.charge)
        mass_difference = abs(theoretical_mz - loaded_spectrum.mz)
        ppm_error = (mass_difference / theoretical_mz) * 1000000
        parent_mass_error = mass_difference * psm.charge

        if snr_threshold > 0.9:
            loaded_spectrum.filter_noise_peaks(snr_threshold)

        output_spectrum_dict = {}
        output_spectrum_dict["filename"] = filename
        output_spectrum_dict["protein"] = protein
        output_spectrum_dict["scan"] = loaded_spectrum.scan
        output_spectrum_dict["peaks"] = json.dumps(loaded_spectrum.peaks)
        output_spectrum_dict["mz"] = loaded_spectrum.mz
        output_spectrum_dict["charge"] = psm.charge
        output_spectrum_dict["score"] = psm.score
        output_spectrum_dict["kl_score"] = float(
            psm.extra_metadata["kl_strict"])
        output_spectrum_dict["annotation"] = psm.get_annotation_without_charge(
        )
        output_spectrum_dict[
            "collision_energy"] = loaded_spectrum.collision_energy
        output_spectrum_dict[
            "precursor_intensity"] = loaded_spectrum.precursor_intensity
        output_spectrum_dict["signal_peaks"] = number_of_signal_peaks
        output_spectrum_dict[
            "number_of_peaks_within_1_percent_of_max"] = number_of_peaks_within_1_percent_of_max
        output_spectrum_dict[
            "number_of_peaks_within_5_percent_of_max"] = number_of_peaks_within_5_percent_of_max
        output_spectrum_dict["annotated_peak_count"] = annotated_peak_count
        output_spectrum_dict[
            "number_of_ions_annotated_above_SNR"] = number_of_ions_annotated_above_SNR
        output_spectrum_dict["explained_intensity"] = explained_intensity
        output_spectrum_dict["ppm_error"] = ppm_error
        output_spectrum_dict["parent_mass_error"] = parent_mass_error
        if "proteosafe_task" in psm.extra_metadata:
            output_spectrum_dict["proteosafe_task"] = psm.extra_metadata[
                "proteosafe_task"]
        else:
            output_spectrum_dict["proteosafe_task"] = ""

        #Filter out spectra that fail the quality thresholds
        if output_spectrum_dict["signal_peaks"] < min_signal_peaks:
            continue
        if output_spectrum_dict[
                "number_of_peaks_within_1_percent_of_max"] < min_number_of_peaks_within_1_percent_of_max:
            continue
        if output_spectrum_dict[
                "explained_intensity"] < minimum_explained_intensity:
            continue
        if output_spectrum_dict[
                "number_of_ions_annotated_above_SNR"] < min_number_of_annotated_ions:
            continue
        if ppm_error > max_ppm_error:
            continue

        spectrum_list.append(output_spectrum_dict)

    return spectrum_list
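
The ppm error above follows the standard definition, error relative to the theoretical m/z scaled by one million; a quick worked example with made-up masses:

#ppm error = |theoretical - observed| / theoretical * 1e6
theoretical_mz = 500.2500
observed_mz = 500.2525
ppm_error = abs(theoretical_mz - observed_mz) / theoretical_mz * 1000000
print(round(ppm_error, 2))  #~5.0 ppm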