Example #1
def simple_presence_of_merged_spectra_processing(input_integrals_filename, output_clusterinfo_filename, mangled_mapping):
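    # Melt the wide integrals CSV (one column per retention-time header) into
    # a long-format table with one row per (sample, scan) pair.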
    extension_stripped_mangled_mapping = {}
    for key in mangled_mapping:
        without_ext = ming_fileio_library.get_filename_without_extension(key)
        extension_stripped_mangled_mapping[without_ext] = mangled_mapping[key]


    with open(input_integrals_filename) as header_file:
        header_order = header_file.readline().rstrip().split(",")[1:]

    table_list = ming_fileio_library.parse_table_with_headers_object_list(input_integrals_filename, delimiter=",")
    #Removing the other header information rows (first two entries)
    table_list = table_list[2:]

    output_dict = defaultdict(list)

    print("for zheng's sanity print the wholetable ----")
    print(table_list)
    for result_object in table_list:
        try:
            sample_name = result_object["RTS:"]
        except KeyError:
            sample_name = "unknown"
        scan_number = 0
        for header in header_order:
            scan_number += 1
            abundance = result_object[header]
            output_dict["filename"].append( sample_name )
            output_dict["abundance"].append( abundance )
            output_dict["scan_number"].append( scan_number )
            output_dict["RT"].append( header )

    ming_fileio_library.write_dictionary_table_data(output_dict, output_clusterinfo_filename)
def main():
    params = ming_proteosafe_library.parse_xml_file(open(sys.argv[1]))
    proteome = ming_protein_library.parse_fasta_proteome_file(sys.argv[2])

    row_count, table_data = ming_fileio_library.parse_table_with_headers(sys.argv[3])
    decoy_marker = sys.argv[5]

    add_decoy_to_results(table_data, row_count, decoy_marker)
    psm_results = add_fdr_to_results(table_data, row_count)

    output_table = defaultdict(list)

    #Performing filters
    filter_type = params["filter.filter"][0]
    if filter_type == "FDR":
        fdr_threshold = float(params["FDR.FDR"][0])
        for psm in psm_results:
            if psm["QValue"] < fdr_threshold:
                for key in psm:
                    output_table[key].append(psm[key])
    if filter_type == "PepFDR":
        fdr_threshold = float(params["PepFDR.PepFDR"][0])
        for psm in psm_results:
            if psm["PepQValue"] < fdr_threshold and psm["QValue"] < fdr_threshold:
                for key in psm:
                    output_table[key].append(psm[key])
    if filter_type == "FPR":
        print("Lets do nothing, don't know what this is")

    ming_fileio_library.write_dictionary_table_data(output_table, sys.argv[4])
def main():
    input_param = ming_proteosafe_library.parse_xml_file(open(sys.argv[1]))
    input_folder = sys.argv[2]
    output_file = sys.argv[3]
    scratch_folder = sys.argv[4]
    path_to_executable = sys.argv[5]
    path_to_isotopes_table = sys.argv[6]

    #parent_mass_tolerance = input_param[]
    parent_mass_tolerance = 0.05
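    # Hard-coded default; the commented-out line above suggests this was meant
    # to come from the workflow parameters.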

    all_input_file_paths = ming_fileio_library.list_files_in_dir(input_folder)

    output_kl_intermediates = []
    for input_file in all_input_file_paths:
        output_kl_file = os.path.join(scratch_folder,
                                      os.path.basename(input_file) + ".kl")
        # Build the argument list explicitly so paths with spaces are handled
        # safely, and silence the tool's output (requires: import subprocess)
        cmd = [
            path_to_executable,
            "--input", input_file,
            "--output_summary", output_kl_file,
            "--peak_tolerance", str(parent_mass_tolerance),
            "--isotope_file", path_to_isotopes_table,
        ]
        print(" ".join(cmd))
        subprocess.call(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        output_kl_intermediates.append(output_kl_file)

    combined_table = defaultdict(list)
    for output_kl_file in output_kl_intermediates:
        row_count, table_data = ming_fileio_library.parse_table_with_headers(
            output_kl_file)
        for key in table_data:
            combined_table[key] += table_data[key]

    ming_fileio_library.write_dictionary_table_data(combined_table,
                                                    output_file)
def main():
    paramxml_filename = sys.argv[1]
    psms_input_file = sys.argv[2]
    kl_input_file = sys.argv[3]
    output_psms_file = sys.argv[4]

    parameters_obj = ming_proteosafe_library.parse_xml_file(
        open(paramxml_filename))

    row_count, kl_data = ming_fileio_library.parse_table_with_headers(
        kl_input_file)
    kl_dict = {}
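    # Index the KL metrics by "basename:scan" so they can be joined onto PSMs below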
    for i in range(row_count):
        filename = os.path.basename(kl_data["Filename"][i])
        scan = kl_data["Scan"][i]
        kl_strict = kl_data["KL Strict"][i]
        kl_unstrict = kl_data["KL"][i]
        interpeak_intensity = kl_data["Interpeak intensity"][i]
        key = filename + ":" + str(scan)
        kl_dict[key] = {
            "kl_strict": kl_strict,
            "kl_unstrict": kl_unstrict,
            "kl_interpeak": interpeak_intensity
        }

    #Since we don't support more fields in the psm object, we're going to read this file in again as a tsv file and add the columns as necessary
    psm_rows, psm_table_data = ming_fileio_library.parse_table_with_headers(
        psms_input_file)
    psm_table_data["kl_strict"] = []
    psm_table_data["kl_unstrict"] = []
    psm_table_data["kl_interpeak"] = []
    for i in range(psm_rows):
        key = psm_table_data["filename"][i] + ":" + psm_table_data["scan"][i]
        if key in kl_dict:
            psm_table_data["kl_strict"].append(kl_dict[key]["kl_strict"])
            psm_table_data["kl_unstrict"].append(kl_dict[key]["kl_unstrict"])
            psm_table_data["kl_interpeak"].append(kl_dict[key]["kl_interpeak"])
        else:
            psm_table_data["kl_strict"].append(-1)
            psm_table_data["kl_unstrict"].append(-1)
            psm_table_data["kl_interpeak"].append(-1)

    #Change C to C+57
    #if "cysteine_protease.cysteine" in parameters_obj:
    #    if parameters_obj["cysteine_protease.cysteine"][0] == "c57":
    #        #Lets replace all the cysteines
    #        for i in range(psm_rows):
    #            psm_table_data["sequence"][i] = psm_table_data["sequence"][i].replace("C", "C+57")

    ming_fileio_library.write_dictionary_table_data(psm_table_data,
                                                    output_psms_file)
def grab_all_results(task_list, output_peptide_directory, output_psm_directory,
                     output_summary_filename):
    results_list = []
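    # Collect one result dict per task, then pivot them into columns for the summary table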
    for task in task_list:
        ret_value = grab_single_result(task, output_peptide_directory,
                                       output_psm_directory)
        results_list.append(ret_value)

    summary_dictionary = defaultdict(list)
    for result in results_list:
        for key in result.keys():
            summary_dictionary[key].append(result[key])

    ming_fileio_library.write_dictionary_table_data(summary_dictionary,
                                                    output_summary_filename)
Example #6
def main():
    input_folder_path = sys.argv[1]
    output_tsv = sys.argv[2]

    files = ming_fileio_library.list_files_in_dir(input_folder_path)

    merged_dict = defaultdict(list)
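    # Column-wise merge: extend each header's column with the rows from every file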

    for input_file in files:
        print("loading", input_file)
        row_count, table_data = ming_fileio_library.parse_table_with_headers(input_file)
        for key in table_data:
            merged_dict[key] += table_data[key]

    ming_fileio_library.write_dictionary_table_data(merged_dict, output_tsv)
def main():
    input_intermediate_folder = sys.argv[1]
    output_filename = sys.argv[2]

    all_protein_stats = {}

    #Creating a command line for each partition
    all_intermediate_files = ming_fileio_library.list_files_in_dir(input_intermediate_folder)
    output_map = defaultdict(list)
    for parallel_output_filename in all_intermediate_files:
        row_count, table_data = ming_fileio_library.parse_table_with_headers(parallel_output_filename)
        for key in table_data:
            output_map[key] += table_data[key]

    ming_fileio_library.write_dictionary_table_data(output_map, output_filename)
def main():
    input_folder_path = sys.argv[1]
    param_xml_filename = sys.argv[2]
    output_tsv = sys.argv[3]

    files = ming_fileio_library.list_files_in_dir(input_folder_path)
    params_obj = ming_proteosafe_library.parse_xml_file(open(param_xml_filename))

    top_k = 1
    try:
        top_k = int(params_obj["TOP_K_RESULTS"][0])
    except (KeyError, IndexError, ValueError):
        top_k = 1

    #merged_dict = defaultdict(list)
    merged_results = []

    for input_file in files:
        print("loading", input_file)
        row_count, table_data = ming_fileio_library.parse_table_with_headers(input_file)
        for i in range(row_count):
            result_dict = {}
            for key in table_data:
                result_dict[key] = table_data[key][i]
            merged_results.append(result_dict)


    results_per_spectrum = defaultdict(list)
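    # Group hits by (SpectrumFile, scan) so each spectrum is ranked independently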

    for result_obj in merged_results:
        spectrum_unique_key = result_obj["SpectrumFile"] + "___" + result_obj["#Scan#"]

        results_per_spectrum[spectrum_unique_key].append(result_obj)

    output_results = []
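    # Keep only the top_k hits per spectrum, ranked by MQScore descending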
    for spectrum_unique_key in results_per_spectrum:
        sorted_results = sorted(results_per_spectrum[spectrum_unique_key], key=lambda spectrum_obj: float(spectrum_obj["MQScore"]), reverse=True)
        filtered_results = sorted_results[:top_k]
        output_results += filtered_results

    output_dict = defaultdict(list)

    for result_obj in output_results:
        for key in result_obj:
            output_dict[key].append(result_obj[key])


    ming_fileio_library.write_dictionary_table_data(output_dict, output_tsv)
Example #9
def main():
    input_folder_path = sys.argv[1]
    output_tsv = sys.argv[2]

    files = ming_fileio_library.list_files_in_dir(input_folder_path)

    merged_dict = defaultdict(list)

    for input_file in files:
        print("loading", input_file)
        row_count, table_data = ming_fileio_library.parse_table_with_headers(
            input_file)
        for key in table_data:
            merged_dict[key] += table_data[key]

    ming_fileio_library.write_dictionary_table_data(merged_dict, output_tsv)
def main():
    input_folder = sys.argv[1]
    input_tsvfile = sys.argv[2]
    output_tsvfile = sys.argv[3]

    allowed_passthrough_extensions = []
    extension_conversion_mapping = {}
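    # Each extra CLI argument is "from_ext:to_ext"; identical pairs mark
    # extensions allowed to pass through unconverted.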

    for i in range(4, len(sys.argv)):
        conversion_parameter = sys.argv[i]
        print("conversion parameter:", conversion_parameter)
        from_extension = conversion_parameter.split(":")[0]
        to_extension = conversion_parameter.split(":")[1]
        extension_conversion_mapping[from_extension] = to_extension

        if from_extension == to_extension:
            allowed_passthrough_extensions.append(from_extension)

    file_renaming_reverse_mapping = {}
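    # Maps each converted filename back to its original so occurrences in the
    # TSV can be rewritten to the pre-conversion names.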

    all_input_files = [
        os.path.join(input_folder, f) for f in os.listdir(input_folder)
        if os.path.isfile(os.path.join(input_folder, f))
    ]
    for input_file in all_input_files:
        input_extension = os.path.splitext(input_file)[1][1:]
        if input_extension in extension_conversion_mapping:
            renamed = os.path.splitext(
                os.path.basename(input_file)
            )[0] + "." + extension_conversion_mapping[input_extension]
            file_renaming_reverse_mapping[renamed] = os.path.basename(
                input_file)

    row_count, table_data = ming_fileio_library.parse_table_with_headers(
        input_tsvfile)

    for header in table_data:
        for i in range(row_count):
            for find_to_replace in file_renaming_reverse_mapping:
                table_data[header][i] = table_data[header][i].replace(
                    find_to_replace,
                    file_renaming_reverse_mapping[find_to_replace])

    ming_fileio_library.write_dictionary_table_data(table_data, output_tsvfile)
def main():
    input_intermediate_folder = sys.argv[1]
    output_filename = sys.argv[2]

    all_protein_stats = {}

    #Creating a command line for each partition
    all_intermediate_files = ming_fileio_library.list_files_in_dir(
        input_intermediate_folder)
    output_map = defaultdict(list)
    for parallel_output_filename in all_intermediate_files:
        row_count, table_data = ming_fileio_library.parse_table_with_headers(
            parallel_output_filename)
        for key in table_data:
            output_map[key] += table_data[key]

    ming_fileio_library.write_dictionary_table_data(output_map,
                                                    output_filename)
def main():
    paramxml_input_filename = sys.argv[1]
    parallel_param_filename = sys.argv[2]
    output_matches_filename = sys.argv[3]
    output_filename_unique_files = sys.argv[4]
    output_filename_all_matches = sys.argv[5]

    params_obj = ming_proteosafe_library.parse_xml_file(open(paramxml_input_filename))

    output_map = {"specs_filename" : [],"specs_scan" : [], "dataset_filename" : [], "dataset_scan" : [], "score" : [], "dataset_id" : [], "dataset_title" : [], "dataset_description" : [], "matchedpeaks" : [], "mzerror" : []}

    match_parameters = get_parameters(params_obj)

    try:
        if params_obj["FIND_MATCHES_IN_PUBLIC_DATA"][0] != "1":
            ming_fileio_library.write_dictionary_table_data(output_map, output_matches_filename)
            exit(0)
    except KeyError:
        ming_fileio_library.write_dictionary_table_data(output_map, output_matches_filename)
        exit(0)

    #If we are doing parallel
    partition_total = 1
    partition_of_node = 0
    params_map = json.loads(open(parallel_param_filename).read())
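    # Note: "total_paritions" (sic) is the key this pipeline's parallel-params
    # JSON actually uses; the misspelling must be preserved when reading it.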
    partition_total = params_map["total_paritions"]
    partition_of_node = params_map["node_partition"]

    dataset_dict = params_map["dataset_dict"]
    all_datasets = params_map["all_datasets"]

    SEARCH_RAW = False
    try:
        if params_obj["SEARCH_RAW"][0] == "1":
            SEARCH_RAW = True
    except KeyError:
        print("Param Not Found", "SEARCH_RAW")

    """Matchign Clustered Data"""
    if SEARCH_RAW:
        match_unclustered(match_parameters, get_spectrum_collection_from_param_obj(params_obj), dataset_dict, all_datasets, output_matches_filename, output_filename_unique_files, output_filename_all_matches)
    else:
        match_clustered(match_parameters, get_spectrum_collection_from_param_obj(params_obj), dataset_dict, all_datasets, output_matches_filename, output_filename_unique_files, output_filename_all_matches)
def name_demangle_filenames(input_file, output_file, path_to_param,
                            old_filename_header, new_filename_header):
    row_count, table_data = ming_fileio_library.parse_table_with_headers(
        input_file)
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(
        ming_proteosafe_library.parse_xml_file(open(path_to_param)))

    if old_filename_header == new_filename_header:
        for i in range(row_count):
            mangled_name = table_data[old_filename_header][i]
            unmangled_name = mangled_mapping[mangled_name]
            table_data[new_filename_header][i] = unmangled_name
    else:
        table_data[new_filename_header] = []
        for i in range(row_count):
            mangled_name = table_data[old_filename_header][i]
            unmangled_name = mangled_mapping[mangled_name]
            table_data[new_filename_header].append(unmangled_name)

    ming_fileio_library.write_dictionary_table_data(table_data, output_file)
def main():
    input_file_of_tsv_results = sys.argv[1]
    input_params_xml_filename = sys.argv[2]
    input_library_identifications_filename = sys.argv[3]
    input_cutoff_scores = sys.argv[4]
    output_folder = sys.argv[5]

    output_filename = os.path.join(output_folder, os.path.basename(input_file_of_tsv_results))

    params_object = ming_proteosafe_library.parse_xml_file(open(input_params_xml_filename))
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(params_object)

    library_scans_to_identification = library_scans_to_identification_info(input_library_identifications_filename)

    cutoff_dict = json.loads(open(input_cutoff_scores).read())

    psm_list = ming_psm_library.parse_MSGFPlus_tsvfile(input_file_of_tsv_results)
    output_results_dict = process_ambiguity(psm_list, mangled_mapping, library_scans_to_identification, cutoff_dict)

    ming_fileio_library.write_dictionary_table_data(output_results_dict, output_filename)
Example #15
def main():
    input_intermediate_folder = sys.argv[1]
    output_file = sys.argv[2]

    output_dict = defaultdict(list)

    total_rows = 0
    input_filenames = ming_fileio_library.list_files_in_dir(
        input_intermediate_folder)
    for input_filename in input_filenames:
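        # Stop accumulating once the merged output exceeds ten million rows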
        if total_rows > 10000000:
            continue

        row_count, table_data = ming_fileio_library.parse_table_with_headers(
            input_filename)
        total_rows += row_count
        for i in range(row_count):
            for key in table_data:
                output_dict[key].append(table_data[key][i])

    ming_fileio_library.write_dictionary_table_data(output_dict, output_file)
Example #16
def name_demangle_filenames_and_instrument_collision(input_file, output_file,
                                                     path_to_param,
                                                     path_to_original_results,
                                                     old_filename_header,
                                                     new_filename_header):
    row_count, table_data = ming_fileio_library.parse_table_with_headers(
        input_file, skip_incomplete_lines=True)
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(
        ming_proteosafe_library.parse_xml_file(open(path_to_param)))

    if not "FragMethod" in table_data:
        print("Demangling", path_to_original_results, input_file)
        collision_mapping = get_scan_mapping_for_collision_method(
            path_to_original_results)

        #Adding collision column
        table_data["FragMethod"] = []
        print(len(table_data["filename"]), len(table_data["scan"]))
        for i in range(row_count):
            key = table_data["filename"][i] + "_" + table_data["scan"][i]
            if key in collision_mapping:
                table_data["FragMethod"].append(collision_mapping[key])
            else:
                table_data["FragMethod"].append("NO_COLLISION")

    if old_filename_header == new_filename_header:
        for i in range(row_count):
            mangled_name = table_data[old_filename_header][i]
            unmangled_name = mangled_mapping[mangled_name]
            table_data[new_filename_header][i] = unmangled_name
    else:
        table_data[new_filename_header] = []
        for i in range(row_count):
            mangled_name = table_data[old_filename_header][i]
            unmangled_name = mangled_mapping[mangled_name]
            table_data[new_filename_header].append(unmangled_name)

    ming_fileio_library.write_dictionary_table_data(table_data, output_file)
Example #17
def main():
    input_filename = sys.argv[1]
    output_filename = sys.argv[2]

    row_count, table_data = ming_fileio_library.parse_table_with_headers(input_filename)

    output_dict = defaultdict(list)

    max_fdr = 0.01


    for i in range(row_count):
        sequence = table_data["sequence"][i]
        modified_sequence = sequence[:-2]
        fdr = float(table_data["FDR"][i])
        if fdr > max_fdr:
            continue

        for key in table_data:
            output_dict[key].append(table_data[key][i])
        output_dict["modified_sequence"].append(modified_sequence)

    ming_fileio_library.write_dictionary_table_data(output_dict, output_filename)
Example #18
def main():
    paramxml_input_filename = sys.argv[1]
    parallel_param_filename = sys.argv[2]
    input_spectra_folder = sys.argv[3]
    library_search_results_filename = sys.argv[4]
    output_matches_filename = sys.argv[5]

    params_obj = ming_proteosafe_library.parse_xml_file(
        open(paramxml_input_filename))

    try:
        if params_obj["MATCH_REFERENCE_DATASETS"][0] != "1":
            output_map = {"EMPTY": []}
            ming_fileio_library.write_dictionary_table_data(
                output_map, output_matches_filename)
            exit(0)
    except KeyError:
        output_map = {"EMPTY": []}
        ming_fileio_library.write_dictionary_table_data(
            output_map, output_matches_filename)
        exit(0)

    #Loading a dict of identifications
    identifications_map = load_identification_file_as_map(
        library_search_results_filename)

    #If we are doing parallel
    partition_total = 1
    partition_of_node = 0
    params_map = json.loads(open(parallel_param_filename).read())
    partition_total = params_map["total_paritions"]
    partition_of_node = params_map["node_partition"]

    all_datasets = params_map["all_datasets"]

    all_matches = finding_matches_in_public_data(
        os.path.join(input_spectra_folder, "specs_ms.mgf"), all_datasets,
        identifications_map)

    output_map = defaultdict(list)
    for match in all_matches:
        for key in match:
            output_map[key].append(match[key])

    ming_fileio_library.write_dictionary_table_data(output_map,
                                                    output_matches_filename)
def main():
    paramxml_input_filename = sys.argv[1]
    parallel_param_filename = sys.argv[2]
    input_spectra_folder = sys.argv[3]
    library_search_results_filename = sys.argv[4]
    output_matches_filename = sys.argv[5]

    params_obj = ming_proteosafe_library.parse_xml_file(open(paramxml_input_filename))

    output_map = {"specs_filename" : [],"specs_scan" : [], "dataset_filename" : [], "dataset_scan" : [], "score" : [], "dataset_id" : [], "dataset_title" : [], "dataset_neighbors" : [], "Compound_Name" : [], "SpectrumID" : []}

    try:
        if params_obj["FIND_MATCHES_IN_PUBLIC_DATA"][0] != "1":
            ming_fileio_library.write_dictionary_table_data(output_map, output_matches_filename)
            exit(0)
    except KeyError:
        ming_fileio_library.write_dictionary_table_data(output_map, output_matches_filename)
        exit(0)


    #If we are doing parallel
    partition_total = 1
    partition_of_node = 0
    params_map = json.loads(open(parallel_param_filename).read())
    partition_total = params_map["total_paritions"]
    partition_of_node = params_map["node_partition"]

    dataset_dict = params_map["dataset_dict"]
    all_datasets = params_map["all_datasets"]

    #print(len(all_datasets))
    #print(partition_of_node)
    #print(partition_total)

    #all_datasets = all_datasets[partition_of_node::partition_total]

    all_matches = finding_matches_in_public_data(os.path.join(input_spectra_folder, "specs_ms.mgf"), all_datasets)

    #Parse the library search results and map each scan to its identification
    library_search_result_count, library_search_data = ming_fileio_library.parse_table_with_headers(library_search_results_filename)
    scan_to_library_map = {}
    for i in range(library_search_result_count):
        scan = library_search_data["Scan"][i]
        scan_to_library_map[scan] = {"Compound_Name" : library_search_data["Compound_Name"][i], "SpectrumID" : library_search_data["SpectrumID"][i]}

    for dataset in all_matches:
        #For each dataset, try to find its clustering information
        if len(all_matches[dataset]["matches"]) == 0:
            continue

        most_recent_molecular_networking_job = ming_gnps_library.get_most_recent_continuous_networking_of_dataset(dataset_dict[dataset]["task"])
        molecular_network = get_molecular_network_obj(most_recent_molecular_networking_job)

        for match in all_matches[dataset]["matches"]:
            output_map['specs_filename'].append("specs_ms.mgf")
            output_map['specs_scan'].append(match.query_scan)
            output_map['dataset_id'].append(dataset_dict[dataset]["dataset"])
            output_map['dataset_title'].append(dataset_dict[dataset]["title"])
            output_map['dataset_filename'].append(match.filename)
            output_map['dataset_scan'].append(match.scan)
            output_map['score'].append(match.score)

            #List the library identifications
            if str(match.query_scan) in scan_to_library_map:
                output_map['Compound_Name'].append(scan_to_library_map[str(match.query_scan)]["Compound_Name"])
                output_map['SpectrumID'].append(scan_to_library_map[str(match.query_scan)]["SpectrumID"])
            else:
                output_map['Compound_Name'].append("")
                output_map['SpectrumID'].append("")

            #Find all the available analogs via the molecular network
            if molecular_network is not None:
                neighbors_in_dataset = molecular_network.get_node_neighbors(match.scan)
                output_map['dataset_neighbors'].append(len(neighbors_in_dataset))
            else:
                output_map['dataset_neighbors'].append(0)



    ming_fileio_library.write_dictionary_table_data(output_map, output_matches_filename)
def main():
    param_filename = sys.argv[1]
    choose_consensus_params_filename = sys.argv[2]
    filtered_peptide_list_filename = sys.argv[3]
    length_score_cutoff_filename = sys.argv[4]
    existing_library_spectra_folder = sys.argv[5]
    new_library_spectra_folder = sys.argv[6]

    output_library_json_folder = sys.argv[7]
    output_library_all_spectra_json_folder = sys.argv[8]
    output_candidate_spectra_tsv_folder = sys.argv[9]

    filtered_peptide_set = load_filtered_peptide_set(
        filtered_peptide_list_filename)
    score_cutoff_by_length = load_score_cutoff_by_length(
        filtered_peptide_list_filename)
    variant_to_score = load_variant_to_score(filtered_peptide_list_filename)

    #Deciding on how to create consensus
    params_obj = ming_proteosafe_library.parse_xml_file(open(param_filename))

    parallel_params = json.loads(open(choose_consensus_params_filename).read())
    total_node_count = parallel_params["total_paritions"]
    my_node_number = parallel_params["node_partition"]

    consensus_selection_method = params_obj["ConsensusChoice"][0]

    #output dict for listing all candidates
    library_candidates_output_dict = defaultdict(list)

    #determine filenames
    existing_library_filename, new_library_filename = determine_filenames_to_load(
        my_node_number, params_obj, existing_library_spectra_folder,
        new_library_spectra_folder)

    output_library_all_spectra_json_filename = os.path.join(
        output_library_all_spectra_json_folder,
        str(my_node_number) + ".json")
    output_library_all_spectra_json_file_handle = open(
        output_library_all_spectra_json_filename, "w")

    print(existing_library_filename, new_library_filename,
          output_library_all_spectra_json_folder,
          output_library_all_spectra_json_filename)

    print(len(existing_library_filename), existing_library_filename)
    print(len(new_library_filename), new_library_filename)

    library_spectra = []
    top_scoring_to_keep = 100
    top_per_dataset = 20

    #If we are starting from scratch, there is no existing library
    if len(existing_library_filename) == 0 and len(new_library_filename) == 0:
        print("no files to load")
        exit(0)

    if len(existing_library_filename) == 0 and len(new_library_filename) != 0:
        print("New Only")
        input_spectrum_file_handle = open(new_library_filename)
        for line in input_spectrum_file_handle:
            all_spectra = json.loads(line)

            if len(all_spectra) == 0:
                continue

            #Filter to only top K scoring psms
            #all_spectra = sorted(all_spectra, key=lambda spectrum: spectrum["score"], reverse=True)
            #all_spectra = all_spectra[:top_scoring_to_keep]
            """Filtering intelligently"""
            all_spectra = filter_out_spectra_to_top(all_spectra,
                                                    top_scoring_to_keep,
                                                    top_per_dataset)

            annotation = all_spectra[0]["annotation"] + "." + str(
                all_spectra[0]["charge"])
            print(annotation, len(all_spectra))

            # Always record the full spectrum set
            output_library_all_spectra_json_file_handle.write(
                json.dumps(all_spectra))
            output_library_all_spectra_json_file_handle.write("\n")

            # Only peptides that passed filtering become library spectra
            if annotation not in filtered_peptide_set:
                continue

            #Filter out spectra that do not pass minimum score by length
            #TODO

            library_spectrum = create_library_spectrum(
                all_spectra, consensus_selection_method,
                score_cutoff_by_length, variant_to_score,
                library_candidates_output_dict)
            library_spectra.append(library_spectrum)

    if len(existing_library_filename) != 0 and len(new_library_filename) != 0:
        print("New and Old")

        #load both files and iterate through them
        new_library_file_handle = open(new_library_filename)
        existing_library_file_handle = open(existing_library_filename)

        new_library_current_spectra_string = new_library_file_handle.readline()
        existing_library_current_spectra_string = existing_library_file_handle.readline(
        )

        new_library_current_spectra = []
        existing_library_current_spectra = []

        new_library_precursor = ""
        existing_library_precursor = ""

        parse_new_spectra = True
        parse_existing_spectra = True

        new_spectra_ended = False
        existing_spectra_ended = False

        #new_library_current_spectra = json.loads(new_library_file_handle.readline())
        #existing_library_current_spectra = json.loads(existing_library_file_handle.readline())

        #new_library_precursor = new_library_current_spectra[0]["annotation"] + "." + str(new_library_current_spectra[0]["charge"])
        #existing_library_precursor = existing_library_current_spectra[0]["annotation"] + "." + str(existing_library_current_spectra[0]["charge"])

        #print(new_library_precursor, existing_library_precursor)
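        # Merge-join over the two annotation-sorted files: advance whichever
        # side has the smaller precursor key; combine both sides when keys match.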

        while True:
            print(len(existing_library_current_spectra_string),
                  len(new_library_current_spectra_string))

            if len(new_library_current_spectra_string) == 0:
                new_spectra_ended = True
                parse_new_spectra = False
            if len(existing_library_current_spectra_string) == 0:
                existing_spectra_ended = True
                parse_existing_spectra = False

            if existing_spectra_ended and new_spectra_ended:
                break

            if parse_new_spectra:
                new_library_current_spectra = json.loads(
                    new_library_current_spectra_string)
                new_library_precursor = new_library_current_spectra[0][
                    "annotation"] + "." + str(
                        new_library_current_spectra[0]["charge"])
            if parse_existing_spectra:
                existing_library_current_spectra = json.loads(
                    existing_library_current_spectra_string)
                existing_library_precursor = existing_library_current_spectra[
                    0]["annotation"] + "." + str(
                        existing_library_current_spectra[0]["charge"])

            if new_library_precursor == existing_library_precursor:
                print("FOUND BOTH")

                all_spectra = []
                all_spectra += new_library_current_spectra
                all_spectra += existing_library_current_spectra
                #Filter to only top K scoring psms
                #all_spectra = sorted(all_spectra, key=lambda spectrum: spectrum["score"], reverse=True)
                #all_spectra = all_spectra[:top_scoring_to_keep]
                """Filtering intelligently"""
                all_spectra = filter_out_spectra_to_top(
                    all_spectra, top_scoring_to_keep, top_per_dataset)

                annotation = all_spectra[0]["annotation"] + "." + str(
                    all_spectra[0]["charge"])
                print(annotation, len(all_spectra))

                #Get new spectra
                new_library_current_spectra_string = new_library_file_handle.readline(
                )
                existing_library_current_spectra_string = existing_library_file_handle.readline(
                )
                parse_new_spectra = True
                parse_existing_spectra = True

                #Determining library membership: always record the full spectrum
                #set, but only filtered peptides become library spectra
                output_library_all_spectra_json_file_handle.write(
                    json.dumps(all_spectra))
                output_library_all_spectra_json_file_handle.write("\n")

                if annotation not in filtered_peptide_set:
                    continue

                library_spectrum = create_library_spectrum(
                    all_spectra, consensus_selection_method,
                    score_cutoff_by_length, variant_to_score,
                    library_candidates_output_dict)
                library_spectra.append(library_spectrum)

            elif (new_library_precursor < existing_library_precursor
                  and new_spectra_ended
                  == False) or existing_spectra_ended == True:
                print("FOUND NEW", existing_spectra_ended, new_spectra_ended)

                all_spectra = new_library_current_spectra
                #Filter to only top K scoring psms
                #all_spectra = sorted(all_spectra, key=lambda spectrum: spectrum["score"], reverse=True)
                #all_spectra = all_spectra[:top_scoring_to_keep]
                """Filtering intelligently"""
                all_spectra = filter_out_spectra_to_top(
                    all_spectra, top_scoring_to_keep, top_per_dataset)

                annotation = all_spectra[0]["annotation"] + "." + str(
                    all_spectra[0]["charge"])
                print(annotation, len(all_spectra))

                #Get new spectra
                new_library_current_spectra_string = new_library_file_handle.readline(
                )
                parse_new_spectra = True

                #Determining library membership: always record the full spectrum
                #set, but only filtered peptides become library spectra
                output_library_all_spectra_json_file_handle.write(
                    json.dumps(all_spectra))
                output_library_all_spectra_json_file_handle.write("\n")

                if annotation not in filtered_peptide_set:
                    continue

                library_spectrum = create_library_spectrum(
                    all_spectra, consensus_selection_method,
                    score_cutoff_by_length, variant_to_score,
                    library_candidates_output_dict)
                library_spectra.append(library_spectrum)

            elif (new_library_precursor > existing_library_precursor
                  and existing_spectra_ended
                  == False) or new_spectra_ended == True:
                print("FOUND EXISTING")

                all_spectra = existing_library_current_spectra
                #Filter to only top K scoring psms
                #all_spectra = sorted(all_spectra, key=lambda spectrum: spectrum["score"], reverse=True)
                #all_spectra = all_spectra[:top_scoring_to_keep]
                """Filtering intelligently"""
                all_spectra = filter_out_spectra_to_top(
                    all_spectra, top_scoring_to_keep, top_per_dataset)

                annotation = all_spectra[0]["annotation"] + "." + str(
                    all_spectra[0]["charge"])
                print(annotation, len(all_spectra))

                #Get new spectra
                existing_library_current_spectra_string = existing_library_file_handle.readline(
                )
                parse_existing_spectra = True

                #Determining library membership: always record the full spectrum
                #set, but only filtered peptides become library spectra
                output_library_all_spectra_json_file_handle.write(
                    json.dumps(all_spectra))
                output_library_all_spectra_json_file_handle.write("\n")

                if annotation not in filtered_peptide_set:
                    continue

                library_spectrum = create_library_spectrum(
                    all_spectra, consensus_selection_method,
                    score_cutoff_by_length, variant_to_score,
                    library_candidates_output_dict)
                library_spectra.append(library_spectrum)
            else:
                # Should be unreachable; break to avoid looping forever
                print("Problem with Ordering")
                break

    json.dump(
        library_spectra,
        open(
            os.path.join(output_library_json_folder,
                         str(my_node_number) + ".json"), "w"))
    output_library_all_spectra_json_file_handle.close()

    #Outputting
    output_candidate_spectra_tsv_filename = os.path.join(
        output_candidate_spectra_tsv_folder,
        str(my_node_number) + ".tsv")
    ming_fileio_library.write_dictionary_table_data(
        library_candidates_output_dict, output_candidate_spectra_tsv_filename)
def main():
    input_result_filename = sys.argv[1]
    output_result_filename = sys.argv[2]

    spectrum_id_cache = {}
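    # Cache fetched library spectra so each SpectrumID is retrieved only once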


    input_rows, input_table = ming_fileio_library.parse_table_with_headers(input_result_filename)

    output_table = defaultdict(list)

    output_headers = ["SpectrumID", "Compound_Name", "Ion_Source", "Instrument", "Compound_Source", "PI", "Data_Collector", "Adduct"]
    output_headers += ["Precursor_MZ", "ExactMass", "Charge", "CAS_Number", "Pubmed_ID", "Smiles", "INCHI", "INCHI_AUX", "Library_Class"]
    output_headers += ["IonMode", "UpdateWorkflowName", "LibraryQualityString", "#Scan#", "SpectrumFile", "MQScore", "Organism"]
    output_headers += ["TIC_Query", "RT_Query", "MZErrorPPM", "SharedPeaks", "MassDiff", "LibMZ", "SpecMZ", "SpecCharge"]

    for header in output_headers:
        output_table[header] = []

    number_hits_per_query = defaultdict(int)

    for i in range(input_rows):
        number_hits_per_query[input_table["FileScanUniqueID"][i]] += 1


    for i in range(input_rows):
        spectrum_id = input_table["LibrarySpectrumID"][i]
        score = input_table["MQScore"][i]
        filename = input_table["SpectrumFile"][i]
        libfilename = input_table["LibraryName"][i]
        scan = input_table["#Scan#"][i]
        TIC_Query = input_table["UnstrictEvelopeScore"][i]
        RT_Query = input_table["p-value"][i]
        SpecCharge = input_table["Charge"][i]
        SpecMZ = input_table["SpecMZ"][i]
        MZErrorPPM = input_table["mzErrorPPM"][i]
        SharedPeaks = input_table["LibSearchSharedPeaks"][i]
        MassDiff = input_table["ParentMassDiff"][i]

        print(spectrum_id)
        gnps_library_spectrum = None
        try:
            gnps_library_spectrum = None
            if spectrum_id in spectrum_id_cache:
                gnps_library_spectrum = spectrum_id_cache[spectrum_id]
            else:
                gnps_library_spectrum = ming_gnps_library.get_library_spectrum(spectrum_id)
                spectrum_id_cache[spectrum_id] = gnps_library_spectrum
        except KeyboardInterrupt:
            raise
        except Exception:
            continue

        gnps_library_spectrum["annotations"] = sorted(gnps_library_spectrum["annotations"], key=lambda annotation: annotation["create_time"], reverse=True)

        output_table["SpectrumID"].append(spectrum_id)
        output_table["Compound_Name"].append(gnps_library_spectrum["annotations"][0]["Compound_Name"].replace("\t", ""))
        output_table["Ion_Source"].append(gnps_library_spectrum["annotations"][0]["Ion_Source"].replace("\t", ""))
        output_table["Instrument"].append(gnps_library_spectrum["annotations"][0]["Instrument"].replace("\t", ""))
        output_table["Compound_Source"].append(gnps_library_spectrum["annotations"][0]["Compound_Source"].replace("\t", ""))
        output_table["PI"].append(gnps_library_spectrum["annotations"][0]["PI"].replace("\t", ""))
        output_table["Data_Collector"].append(gnps_library_spectrum["annotations"][0]["Data_Collector"].replace("\t", ""))
        output_table["Adduct"].append(gnps_library_spectrum["annotations"][0]["Adduct"].replace("\t", ""))
        output_table["Precursor_MZ"].append(gnps_library_spectrum["annotations"][0]["Precursor_MZ"].replace("\t", ""))
        output_table["ExactMass"].append(gnps_library_spectrum["annotations"][0]["ExactMass"].replace("\t", ""))
        output_table["Charge"].append(gnps_library_spectrum["annotations"][0]["Charge"].replace("\t", ""))
        output_table["CAS_Number"].append(gnps_library_spectrum["annotations"][0]["CAS_Number"].replace("\t", ""))
        output_table["Pubmed_ID"].append(gnps_library_spectrum["annotations"][0]["Pubmed_ID"].replace("\t", ""))
        output_table["Smiles"].append(gnps_library_spectrum["annotations"][0]["Smiles"].replace("\t", ""))
        output_table["INCHI"].append(gnps_library_spectrum["annotations"][0]["INCHI"].replace("\t", ""))
        output_table["INCHI_AUX"].append(gnps_library_spectrum["annotations"][0]["INCHI_AUX"].replace("\t", ""))
        output_table["Library_Class"].append(gnps_library_spectrum["annotations"][0]["Library_Class"].replace("\t", ""))
        output_table["IonMode"].append(gnps_library_spectrum["annotations"][0]["Ion_Mode"].replace("\t", ""))

        if gnps_library_spectrum["annotations"][0]["Library_Class"] == "1":
            output_table["UpdateWorkflowName"].append("UPDATE-SINGLE-ANNOTATED-GOLD")
            output_table["LibraryQualityString"].append("Gold")
        elif gnps_library_spectrum["annotations"][0]["Library_Class"] == "2":
            output_table["UpdateWorkflowName"].append("UPDATE-SINGLE-ANNOTATED-SILVER")
            output_table["LibraryQualityString"].append("Silver")
        elif gnps_library_spectrum["annotations"][0]["Library_Class"] == "3":
            output_table["UpdateWorkflowName"].append("UPDATE-SINGLE-ANNOTATED-BRONZE")
            output_table["LibraryQualityString"].append("Bronze")
        elif gnps_library_spectrum["annotations"][0]["Library_Class"] == "4":
            output_table["UpdateWorkflowName"].append("UPDATE-SINGLE-ANNOTATED-BRONZE")
            output_table["LibraryQualityString"].append("Insilico")
        elif gnps_library_spectrum["annotations"][0]["Library_Class"] == "5":
            output_table["UpdateWorkflowName"].append("UPDATE-SINGLE-ANNOTATED-BRONZE")
            output_table["LibraryQualityString"].append("Insilico")
        elif gnps_library_spectrum["annotations"][0]["Library_Class"] == "10":
            output_table["UpdateWorkflowName"].append("UPDATE-SINGLE-ANNOTATED-BRONZE")
            output_table["LibraryQualityString"].append("Challenge")
        else:
            print("Unknown Library_Class",
                  gnps_library_spectrum["annotations"][0]["Library_Class"])
            # Append placeholders so the output columns stay aligned
            output_table["UpdateWorkflowName"].append("")
            output_table["LibraryQualityString"].append("Unknown")

        output_table["#Scan#"].append(scan)
        output_table["SpectrumFile"].append(filename)
        output_table["LibraryName"].append(libfilename)
        output_table["MQScore"].append(score)
        output_table["Organism"].append(gnps_library_spectrum["spectruminfo"]["library_membership"])
        output_table["TIC_Query"].append(TIC_Query)
        output_table["RT_Query"].append(RT_Query)
        output_table["MZErrorPPM"].append(MZErrorPPM)
        output_table["SharedPeaks"].append(SharedPeaks)
        output_table["MassDiff"].append(MassDiff)
        output_table["LibMZ"].append(gnps_library_spectrum["annotations"][0]["Precursor_MZ"])
        output_table["SpecMZ"].append(SpecMZ)
        output_table["SpecCharge"].append(SpecCharge)
        output_table["FileScanUniqueID"].append(input_table["FileScanUniqueID"][i])
        output_table["NumberHits"].append(number_hits_per_query[input_table["FileScanUniqueID"][i]])


        tag_list = [ (tag["tag_desc"] + "[" + tag["tag_type"] + "]") for tag in gnps_library_spectrum["spectrum_tags"]]
        tag_string = "||".join(tag_list).replace("\t", "")

        output_table["tags"].append(tag_string)

    ming_fileio_library.write_dictionary_table_data(output_table, output_result_filename)
Example #22
def main():
    paramxml_input_filename = sys.argv[1]
    parallel_param_filename = sys.argv[2]
    input_spectra_folder = sys.argv[3]
    library_search_results_filename = sys.argv[4]
    output_matches_filename = sys.argv[5]

    params_obj = ming_proteosafe_library.parse_xml_file(
        open(paramxml_input_filename))

    output_map = {
        "specs_filename": [],
        "specs_scan": [],
        "dataset_filename": [],
        "dataset_scan": [],
        "score": [],
        "dataset_id": [],
        "dataset_title": [],
        "dataset_neighbors": [],
        "Compound_Name": [],
        "SpectrumID": []
    }

    try:
        if params_obj["FIND_MATCHES_IN_PUBLIC_DATA"][0] != "1":
            ming_fileio_library.write_dictionary_table_data(
                output_map, output_matches_filename)
            exit(0)
    except KeyError:
        ming_fileio_library.write_dictionary_table_data(
            output_map, output_matches_filename)
        exit(0)

    #If we are doing parallel
    partition_total = 1
    partition_of_node = 0
    params_map = json.loads(open(parallel_param_filename).read())
    partition_total = params_map["total_paritions"]
    partition_of_node = params_map["node_partition"]

    dataset_dict = params_map["dataset_dict"]
    all_datasets = params_map["all_datasets"]

    #print(len(all_datasets))
    #print(partition_of_node)
    #print(partition_total)

    #all_datasets = all_datasets[partition_of_node::partition_total]

    all_matches = finding_matches_in_public_data(
        os.path.join(input_spectra_folder, "specs_ms.mgf"), all_datasets)

    #Parse the library search results and map each scan to its identification
    library_search_result_count, library_search_data = ming_fileio_library.parse_table_with_headers(
        library_search_results_filename)
    scan_to_library_map = {}
    for i in range(library_search_result_count):
        scan = library_search_data["Scan"][i]
        scan_to_library_map[scan] = {
            "Compound_Name": library_search_data["Compound_Name"][i],
            "SpectrumID": library_search_data["SpectrumID"][i]
        }

    for dataset in all_matches:
        #For each dataset, try to find its clustering information
        if len(all_matches[dataset]["matches"]) == 0:
            continue

        most_recent_molecular_networking_job = ming_gnps_library.get_most_recent_continuous_networking_of_dataset(
            dataset_dict[dataset]["task"])
        molecular_network = get_molecular_network_obj(
            most_recent_molecular_networking_job)

        for match in all_matches[dataset]["matches"]:
            output_map['specs_filename'].append("specs_ms.mgf")
            output_map['specs_scan'].append(match.query_scan)
            output_map['dataset_id'].append(dataset_dict[dataset]["dataset"])
            output_map['dataset_title'].append(dataset_dict[dataset]["title"])
            output_map['dataset_filename'].append(match.filename)
            output_map['dataset_scan'].append(match.scan)
            output_map['score'].append(match.score)

            #List the library identifications
            if str(match.query_scan) in scan_to_library_map:
                output_map['Compound_Name'].append(scan_to_library_map[str(
                    match.query_scan)]["Compound_Name"])
                output_map['SpectrumID'].append(scan_to_library_map[str(
                    match.query_scan)]["SpectrumID"])
            else:
                output_map['Compound_Name'].append("")
                output_map['SpectrumID'].append("")

            #Find all the available analogs via the molecular network
            if molecular_network is not None:
                neighbors_in_dataset = molecular_network.get_node_neighbors(
                    match.scan)
                output_map['dataset_neighbors'].append(
                    len(neighbors_in_dataset))
            else:
                output_map['dataset_neighbors'].append(0)

    ming_fileio_library.write_dictionary_table_data(output_map,
                                                    output_matches_filename)
    def write_summary(self, output_filename):
        ming_fileio_library.write_dictionary_table_data(self.produce_protein_dict(), output_filename)
def main():
    parser = argparse.ArgumentParser(
        description='Creating Clustering Info Summary')
    parser.add_argument('proteosafe_parameters', help='proteosafe_parameters')
    parser.add_argument('metadata_folder', help='metadata_folder')
    parser.add_argument('output_metadata_file', help='output_metadata_file')
    args = parser.parse_args()

    param_obj = ming_proteosafe_library.parse_xml_file(
        open(args.proteosafe_parameters))

    mangled_file_mapping = ming_proteosafe_library.get_mangled_file_mapping(
        param_obj)

    default_group_mapping = defaultdict(list)
    file_to_group_mapping = {}
    prefix_to_group = {
        "specone-": "G1",
        "spectwo-": "G2",
        "specthree-": "G3",
        "specfour-": "G4",
        "specfive-": "G5",
        "specsix-": "G6",
    }
    for mangled_name in mangled_file_mapping:
        for prefix, group in prefix_to_group.items():
            if prefix in mangled_name:
                default_group_mapping[group].append(
                    mangled_file_mapping[mangled_name])
                file_to_group_mapping[os.path.basename(
                    mangled_file_mapping[mangled_name])] = group

    metadata_files_in_folder = ming_fileio_library.list_files_in_dir(
        args.metadata_folder)

    row_count = 0
    table_data = defaultdict(list)
    if len(metadata_files_in_folder) == 1:
        row_count, table_data = ming_fileio_library.parse_table_with_headers(
            metadata_files_in_folder[0])

    print(table_data)
    for key in table_data:
        print(key, len(table_data[key]))

    for i in range(row_count):
        filename = table_data["filename"][i]
        if len(filename) < 2:
            continue

        if filename[0] == "\"":
            filename = filename[1:]
        if filename[-1] == "\"":
            filename = filename[:-1]

        table_data["filename"][i] = filename

        basename_filename = os.path.basename(filename)
        group_name = "NoDefaultGroup"
        if basename_filename in file_to_group_mapping:
            group_name = file_to_group_mapping[basename_filename]
        table_data["ATTRIBUTE_DefaultGroup"].append(group_name)

    for input_filename in file_to_group_mapping:
        if input_filename in table_data["filename"]:
            continue
        else:
            for key in table_data:
                if key != "ATTRIBUTE_DefaultGroup" and key != "filename":
                    table_data[key].append("N/A")

            table_data["ATTRIBUTE_DefaultGroup"].append(
                file_to_group_mapping[input_filename])
            table_data["filename"].append(input_filename)

    ming_fileio_library.write_dictionary_table_data(table_data,
                                                    args.output_metadata_file)
def main():
    param_filename = sys.argv[1]
    choose_consensus_params_filename = sys.argv[2]
    filtered_peptide_list_filename = sys.argv[3]
    length_score_cutoff_filename = sys.argv[4]
    provenance_json_filename = sys.argv[5]
    merged_library_spectra_folder = sys.argv[6]

    output_library_json_folder = sys.argv[7]
    output_candidate_spectra_tsv_folder = sys.argv[8]

    filtered_peptide_set = load_filtered_peptide_set(
        filtered_peptide_list_filename)
    score_cutoff_by_length = load_score_cutoff_by_length(
        filtered_peptide_list_filename)
    variant_to_score = load_variant_to_score(filtered_peptide_list_filename)

    #Deciding on how to create consensus
    params_obj = ming_proteosafe_library.parse_xml_file(open(param_filename))

    parallel_params = json.loads(open(choose_consensus_params_filename).read())
    total_node_count = parallel_params["total_paritions"]
    my_node_number = parallel_params["node_partition"]

    consensus_selection_method = params_obj["ConsensusChoice"][0]

    #output dict for listing all candidates
    library_candidates_output_dict = defaultdict(list)

    #determine filenames
    merged_library_filename, my_position_for_file, total_nodes_for_file = determine_filenames_to_load(
        my_node_number, total_node_count, merged_library_spectra_folder)

    print(merged_library_filename, my_position_for_file, total_nodes_for_file)

    library_spectra = []

    input_spectrum_file_handle = open(merged_library_filename)
    line_count = 0
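    # Round-robin partitioning: this node handles every total_nodes_for_file-th
    # line of the merged library file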
    for line in input_spectrum_file_handle:
        line_count += 1
        if line_count % total_nodes_for_file != my_position_for_file:
            #print("Should Skip")
            continue
        else:
            print("NOT SKIP")

        all_spectra = json.loads(line)

        if len(all_spectra) == 0:
            continue

        annotation = all_spectra[0]["annotation"] + "." + str(
            all_spectra[0]["charge"])
        print(annotation, len(all_spectra))

        if annotation not in filtered_peptide_set:
            continue

        library_spectrum = create_library_spectrum(
            all_spectra, consensus_selection_method, score_cutoff_by_length,
            variant_to_score, library_candidates_output_dict)
        library_spectra.append(library_spectrum)

    json.dump(
        library_spectra,
        open(
            os.path.join(output_library_json_folder,
                         str(my_node_number) + ".json"), "w"))

    #Provenance Records
    provenance_records = json.loads(open(provenance_json_filename).read())

    #Modifying the output candidate file
    for i in range(len(library_candidates_output_dict["filename"])):
        proteosafe_task = library_candidates_output_dict["proteosafe_task"][i]
        if proteosafe_task in provenance_records["search_task_to_augment"]:
            library_candidates_output_dict["augment_task"].append(
                provenance_records["search_task_to_augment"][proteosafe_task])
        else:
            library_candidates_output_dict["augment_task"].append("")

        if proteosafe_task in provenance_records["search_task_to_extraction"]:
            library_candidates_output_dict["extract_task"].append(
                provenance_records["search_task_to_extraction"]
                [proteosafe_task])
        else:
            library_candidates_output_dict["extract_task"].append("")

    #Outputting
    output_candidate_spectra_tsv_filename = os.path.join(
        output_candidate_spectra_tsv_folder,
        str(my_node_number) + ".tsv")
    ming_fileio_library.write_dictionary_table_data(
        library_candidates_output_dict, output_candidate_spectra_tsv_filename)
    """Converted Output"""
    output_tsv_folder = sys.argv[9]
    output_mgf_folder = sys.argv[10]
    output_sptxt_folder = sys.argv[11]

    library_spectrum_collection = ming_spectrum_library.SpectrumCollection(
        "library spectra")

    for library_spectrum in library_spectra:
        lib_spec = ming_spectrum_library.PeptideLibrarySpectrum(
            "", 0, 0, library_spectrum["peaks"], library_spectrum["mz"],
            library_spectrum["charge"], library_spectrum["annotation"],
            library_spectrum["protein"])
        if "score" in library_spectrum:
            lib_spec.score = library_spectrum["score"]
        if "variant_score" in library_spectrum:
            lib_spec.variant_score = library_spectrum["variant_score"]
        if "spectra_to_consider" in library_spectrum:
            lib_spec.num_spectra = library_spectrum["spectra_to_consider"]
        if "ranking" in library_spectrum:
            lib_spec.spectrum_ranking = library_spectrum["ranking"]
        if "proteosafe_task" in library_spectrum:
            lib_spec.proteosafe_task = library_spectrum["proteosafe_task"]
        if "originalspectrum_filename" in library_spectrum:
            lib_spec.originalfile_filename = library_spectrum[
                "originalspectrum_filename"]
        if "originalspectrum_scan" in library_spectrum:
            lib_spec.originalfile_scan = str(
                library_spectrum["originalspectrum_scan"])

        library_spectrum_collection.spectrum_list.append(lib_spec)

    output_mgf_filename = os.path.join(output_mgf_folder,
                                       str(my_node_number) + ".mgf")
    output_tsv_filename = os.path.join(output_tsv_folder,
                                       str(my_node_number) + ".tsv")
    output_sptxt_filename = os.path.join(output_sptxt_folder,
                                         str(my_node_number) + ".sptxt")

    library_spectrum_collection.save_to_mgf(open(output_mgf_filename, "w"))
    library_spectrum_collection.save_to_tsv(open(output_tsv_filename, "w"),
                                            output_mgf_filename)

    try:
        library_spectrum_collection.save_to_sptxt(
            open(output_sptxt_filename, "w"))
    except Exception:
        traceback.print_exc(file=sys.stdout)
        print("Failed to write sptxt output, continuing")
Example #26
def main():
    input_json = json.loads(open(sys.argv[1]).read())
    input_intermediate_folder = sys.argv[2]
    output_folder = sys.argv[3]
    output_peptide_list_folder = sys.argv[4]

    my_node = input_json["node_partition"]

    output_filename = os.path.join(output_folder, str(my_node) + ".json")
    output_file = open(output_filename, "w")
    number_of_spectra = 0

    input_json_files = ming_fileio_library.list_files_in_dir(
        input_intermediate_folder)
    input_json_files.sort()

    all_spectra = []

    for json_filename in input_json_files:
        #Skip files
        json_basename = os.path.basename(json_filename).split(".")[0]
        bin_peptide = int(json_basename.split("_")[2])
        if bin_peptide != my_node:
            continue

        print("Loading", json_filename)
        spectrum_list = json.load(open(json_filename))
        all_spectra += spectrum_list
        print("Total Spectra", len(spectrum_list), len(all_spectra))

    peptide_dict = defaultdict(list)
    print("Creating hash")
    for spectrum in all_spectra:
        annotation = spectrum["annotation"] + "." + str(spectrum["charge"])
        peptide_dict[annotation].append(spectrum)

    print("writing out strings")
    all_annotation = list(peptide_dict.keys())
    all_annotation.sort()
    for annotation in all_annotation:
        output_file.write(json.dumps(peptide_dict[annotation]))
        output_file.write("\n")

    output_file.close()

    #Write out all the peptides into a file
    output_peptide_dict = defaultdict(list)
    for annotation_key in peptide_dict:
        max_score = -10
        if len(peptide_dict[annotation_key]) > 0:
            for spectrum in peptide_dict[annotation_key]:
                max_score = max(spectrum["score"], max_score)
            #max score per peptide
            output_peptide_dict["score"].append(max_score)
            output_peptide_dict["annotation_key"].append(annotation_key)
            output_peptide_dict["annotation"].append(
                peptide_dict[annotation_key][0]["annotation"])
            output_peptide_dict["charge"].append(
                peptide_dict[annotation_key][0]["charge"])
            output_peptide_dict["protein"].append(
                peptide_dict[annotation_key][0]["protein"])

    #writing out file
    output_peptide_filename = os.path.join(output_peptide_list_folder,
                                           str(my_node) + ".tsv")
    ming_fileio_library.write_dictionary_table_data(output_peptide_dict,
                                                    output_peptide_filename)
Example #27
def main():
    input_result_filename = sys.argv[1]
    output_result_filename = sys.argv[2]


    input_rows, input_table = ming_fileio_library.parse_table_with_headers(input_result_filename)

    output_table = defaultdict(list)

    output_headers = ["SpectrumID", "Compound_Name", "Ion_Source", "Instrument", "Compound_Source", "PI", "Data_Collector", "Adduct"]
    output_headers += ["Precursor_MZ", "ExactMass", "Charge", "CAS_Number", "Pubmed_ID", "Smiles", "INCHI", "INCHI_AUX", "Library_Class"]
    output_headers += ["IonMode", "UpdateWorkflowName", "LibraryQualityString", "#Scan#", "SpectrumFile", "MQScore", "Organism"]
    output_headers += ["TIC_Query", "RT_Query", "MZErrorPPM", "SharedPeaks", "MassDiff", "LibMZ", "SpecMZ", "SpecCharge"]

    for header in output_headers:
        output_table[header] = []

    for i in range(input_rows):
        spectrum_id = input_table["LibrarySpectrumID"][i]
        score = input_table["MQScore"][i]
        filename = input_table["SpectrumFile"][i]
        libfilename = input_table["LibraryName"][i]
        scan = input_table["#Scan#"][i]
        TIC_Query = input_table["UnstrictEvelopeScore"][i]
        RT_Query = input_table["p-value"][i]
        SpecCharge = input_table["Charge"][i]
        SpecMZ = input_table["SpecMZ"][i]
        MZErrorPPM = input_table["mzErrorPPM"][i]
        SharedPeaks = input_table["LibSearchSharedPeaks"][i]
        MassDiff = input_table["ParentMassDiff"][i]

        print(spectrum_id)
        gnps_library_spectrum = None
        try:
            gnps_library_spectrum = ming_gnps_library.get_library_spectrum(spectrum_id)
        except KeyboardInterrupt:
            raise
        except:
            continue

        output_table["SpectrumID"].append(spectrum_id)
        output_table["Compound_Name"].append(gnps_library_spectrum["annotations"][0]["Compound_Name"])
        output_table["Ion_Source"].append(gnps_library_spectrum["annotations"][0]["Ion_Source"])
        output_table["Instrument"].append(gnps_library_spectrum["annotations"][0]["Instrument"])
        output_table["Compound_Source"].append(gnps_library_spectrum["annotations"][0]["Compound_Source"])
        output_table["PI"].append(gnps_library_spectrum["annotations"][0]["PI"])
        output_table["Data_Collector"].append(gnps_library_spectrum["annotations"][0]["Data_Collector"])
        output_table["Adduct"].append(gnps_library_spectrum["annotations"][0]["Adduct"])
        output_table["Precursor_MZ"].append(gnps_library_spectrum["annotations"][0]["Precursor_MZ"])
        output_table["ExactMass"].append(gnps_library_spectrum["annotations"][0]["ExactMass"])
        output_table["Charge"].append(gnps_library_spectrum["annotations"][0]["Charge"])
        output_table["CAS_Number"].append(gnps_library_spectrum["annotations"][0]["CAS_Number"])
        output_table["Pubmed_ID"].append(gnps_library_spectrum["annotations"][0]["Pubmed_ID"])
        output_table["Smiles"].append(gnps_library_spectrum["annotations"][0]["Smiles"])
        output_table["INCHI"].append(gnps_library_spectrum["annotations"][0]["INCHI"])
        output_table["INCHI_AUX"].append(gnps_library_spectrum["annotations"][0]["INCHI_AUX"])
        output_table["Library_Class"].append(gnps_library_spectrum["annotations"][0]["Library_Class"])
        output_table["IonMode"].append(gnps_library_spectrum["annotations"][0]["Ion_Mode"])

        library_class = gnps_library_spectrum["annotations"][0]["Library_Class"]
        if library_class == "1":
            output_table["UpdateWorkflowName"].append("UPDATE-SINGLE-ANNOTATED-GOLD")
            output_table["LibraryQualityString"].append("Gold")
        elif library_class == "2":
            output_table["UpdateWorkflowName"].append("UPDATE-SINGLE-ANNOTATED-SILVER")
            output_table["LibraryQualityString"].append("Silver")
        elif library_class == "3":
            output_table["UpdateWorkflowName"].append("UPDATE-SINGLE-ANNOTATED-BRONZE")
            output_table["LibraryQualityString"].append("Bronze")
        elif library_class == "4":
            output_table["UpdateWorkflowName"].append("UPDATE-SINGLE-ANNOTATED-BRONZE")
            output_table["LibraryQualityString"].append("Insilico")
        elif library_class == "10":
            output_table["UpdateWorkflowName"].append("UPDATE-SINGLE-ANNOTATED-BRONZE")
            output_table["LibraryQualityString"].append("Challenge")
        else:
            #Unknown class: append placeholders so every column stays the same length
            output_table["UpdateWorkflowName"].append("")
            output_table["LibraryQualityString"].append("")

        output_table["#Scan#"].append(scan)
        output_table["SpectrumFile"].append(filename)
        output_table["LibraryName"].append(libfilename)
        output_table["MQScore"].append(score)
        output_table["Organism"].append(gnps_library_spectrum["spectruminfo"]["library_membership"])
        output_table["TIC_Query"].append(TIC_Query)
        output_table["RT_Query"].append(RT_Query)
        output_table["MZErrorPPM"].append(MZErrorPPM)
        output_table["SharedPeaks"].append(SharedPeaks)
        output_table["MassDiff"].append(MassDiff)
        output_table["LibMZ"].append(gnps_library_spectrum["annotations"][0]["Precursor_MZ"])
        output_table["SpecMZ"].append(SpecMZ)
        output_table["SpecCharge"].append(SpecCharge)

        tag_string = ""
        for tag in gnps_library_spectrum["spectrum_tags"]:
            tag_string += tag["tag_desc"].replace("\t", "") + "||"

        if len(tag_string) > 3:
            tag_string = tag_string[:-2]


        output_table["tags"].append(tag_string)

    ming_fileio_library.write_dictionary_table_data(output_table, output_result_filename)
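
The Library_Class branching above can also be written as a single lookup table; this is only a sketch of the same mapping with an explicit fallback, using the class codes that appear in the code above.

LIBRARY_CLASS_MAP = {
    "1": ("UPDATE-SINGLE-ANNOTATED-GOLD", "Gold"),
    "2": ("UPDATE-SINGLE-ANNOTATED-SILVER", "Silver"),
    "3": ("UPDATE-SINGLE-ANNOTATED-BRONZE", "Bronze"),
    "4": ("UPDATE-SINGLE-ANNOTATED-BRONZE", "Insilico"),
    "10": ("UPDATE-SINGLE-ANNOTATED-BRONZE", "Challenge"),
}

def classify_library_spectrum(library_class):
    #Unknown classes fall back to empty strings so output columns stay aligned
    return LIBRARY_CLASS_MAP.get(library_class, ("", ""))
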
def match_clustered(match_parameters, spectrum_collection, dataset_dict,
                    all_datasets, output_matches_filename,
                    output_filename_unique_files, output_filename_all_matches):
    all_matches = finding_matches_in_public_data(spectrum_collection,
                                                 all_datasets,
                                                 match_parameters)
    """Resolving to File Level"""
    dataset_files_count = defaultdict(lambda: 0)
    output_source_list = []
    output_match_list = []

    MetaDataServerStatus = trace_to_single_file.test_metadata_server()

    for dataset in all_matches:
        for match_object in all_matches[dataset]["matches"]:
            dataset_accession = dataset_dict[dataset]["dataset"]
            dataset_scan = match_object["scan"]
            current_filelist, current_match_list = trace_to_single_file.trace_filename_filesystem(
                all_datasets,
                dataset_accession,
                dataset_scan,
                enrichmetadata=MetaDataServerStatus)
            output_source_list += current_filelist
            output_match_list += current_match_list

    seen_files = set()
    output_unique_source_list = []
    for output_file_object in output_source_list:
        dataset_accession = output_file_object["dataset_id"]
        dataset_filename = output_file_object["filename"]

        key = dataset_accession + ":" + dataset_filename
        if key in seen_files:
            continue

        dataset_files_count[dataset_accession] += 1

        seen_files.add(key)

        output_unique_source_list.append(output_file_object)

    ming_fileio_library.write_list_dict_table_data(
        output_unique_source_list, output_filename_unique_files)
    ming_fileio_library.write_list_dict_table_data(
        output_match_list, output_filename_all_matches)
    """ Summary """
    output_map = {
        "specs_filename": [],
        "specs_scan": [],
        "dataset_filename": [],
        "dataset_scan": [],
        "score": [],
        "dataset_id": [],
        "dataset_title": [],
        "dataset_description": [],
        "dataset_organisms": [],
        "matchedpeaks": [],
        "mzerror": [],
        "files_count": []
    }
    for dataset in all_matches:
        #For each dataset, lets try to find the clustering information
        if len(all_matches[dataset]["matches"]) == 0:
            continue

        match_object = None

        #If it is more than one match, we need to consolidate
        if len(all_matches[dataset]["matches"]) > 1:
            sorted_match_list = sorted(
                all_matches[dataset]["matches"],
                key=lambda match: float(match["cosine"]),
                reverse=True)
            match_object = sorted_match_list[0]
        else:
            match_object = all_matches[dataset]["matches"][0]

        output_map['specs_filename'].append("specs_ms.mgf")
        output_map['specs_scan'].append(match_object["queryscan"])
        output_map['dataset_id'].append(dataset_dict[dataset]["dataset"])
        output_map['dataset_title'].append(dataset_dict[dataset]["title"])
        output_map['dataset_description'].append(
            dataset_dict[dataset]["description"].replace("\n", "").replace(
                "\t", "").replace("\r", ""))
        output_map['dataset_organisms'].append(
            dataset_dict[dataset]["species"].replace(
                "<hr class='separator'\/>", "!"))
        output_map['dataset_filename'].append(match_object["filename"])
        output_map['dataset_scan'].append(match_object["scan"])
        output_map['score'].append(match_object["cosine"])
        output_map['matchedpeaks'].append(match_object["matchedpeaks"])
        output_map['mzerror'].append(match_object["mzerror"])
        output_map['files_count'].append(dataset_files_count[dataset])

    ming_fileio_library.write_dictionary_table_data(output_map,
                                                    output_matches_filename)
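
A compact sketch of the seen-set deduplication in match_clustered: file matches are unique per (dataset accession, filename) pair. Field names mirror the file objects above; unique_source_files is a hypothetical helper.

def unique_source_files(output_source_list):
    #Keep the first occurrence of each dataset_id:filename pair
    seen_files = set()
    unique_list = []
    for file_object in output_source_list:
        key = file_object["dataset_id"] + ":" + file_object["filename"]
        if key not in seen_files:
            seen_files.add(key)
            unique_list.append(file_object)
    return unique_list
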
def main():
    parser = argparse.ArgumentParser(description='Creating Clustering Info Summary')
    parser.add_argument('proteosafe_parameters', help='proteosafe_parameters')
    parser.add_argument('metadata_folder', help='metadata_folder')
    parser.add_argument('output_metadata_file', help='output_metadata_file')
    args = parser.parse_args()

    param_obj = ming_proteosafe_library.parse_xml_file(open(args.proteosafe_parameters))

    mangled_file_mapping = ming_proteosafe_library.get_mangled_file_mapping(param_obj)

    default_group_mapping = defaultdict(list)
    file_to_group_mapping = {}
    for mangled_name in mangled_file_mapping:
        if mangled_name.find("specone-") != -1:
            default_group_mapping["G1"].append(mangled_file_mapping[mangled_name])
            file_to_group_mapping[os.path.basename(mangled_file_mapping[mangled_name])] = "G1"
        if mangled_name.find("spectwo-") != -1:
            default_group_mapping["G2"].append(mangled_file_mapping[mangled_name])
            file_to_group_mapping[os.path.basename(mangled_file_mapping[mangled_name])] = "G2"
        if mangled_name.find("specthree-") != -1:
            default_group_mapping["G3"].append(mangled_file_mapping[mangled_name])
            file_to_group_mapping[os.path.basename(mangled_file_mapping[mangled_name])] = "G3"
        if mangled_name.find("specfour-") != -1:
            default_group_mapping["G4"].append(mangled_file_mapping[mangled_name])
            file_to_group_mapping[os.path.basename(mangled_file_mapping[mangled_name])] = "G4"
        if mangled_name.find("specfive-") != -1:
            default_group_mapping["G5"].append(mangled_file_mapping[mangled_name])
            file_to_group_mapping[os.path.basename(mangled_file_mapping[mangled_name])] = "G5"
        if mangled_name.find("specsix-") != -1:
            default_group_mapping["G6"].append(mangled_file_mapping[mangled_name])
            file_to_group_mapping[os.path.basename(mangled_file_mapping[mangled_name])] = "G6"

    metadata_files_in_folder = ming_fileio_library.list_files_in_dir(args.metadata_folder)

    row_count = 0
    table_data = defaultdict(list)
    if len(metadata_files_in_folder) == 1:
        row_count, table_data = ming_fileio_library.parse_table_with_headers(metadata_files_in_folder[0])

    print(table_data)
    for key in table_data:
        print(key, len(table_data[key]))

    for i in range(row_count):
        print(i)
        filename = table_data["filename"][i]
        if len(filename) < 2:
            continue
        print(filename, filename[0], filename[-1])

        if filename[0] == "\"":
            filename = filename[1:]
        if filename[-1] == "\"":
            filename = filename[:-1]

        table_data["filename"][i] = filename

        basename_filename = os.path.basename(filename)
        group_name = "NoDefaultGroup"
        if basename_filename in file_to_group_mapping:
            group_name = file_to_group_mapping[basename_filename]
        table_data["ATTRIBUTE_DefaultGroup"].append(group_name)
    for input_filename in file_to_group_mapping:
        if input_filename in table_data["filename"]:
            continue
        else:
            for key in table_data:
                if key != "ATTRIBUTE_DefaultGroup" and key != "filename":
                    table_data[key].append("N/A")

            table_data["ATTRIBUTE_DefaultGroup"].append(file_to_group_mapping[input_filename])
            table_data["filename"].append(input_filename)

    ming_fileio_library.write_dictionary_table_data(table_data, args.output_metadata_file)
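
The six near-identical branches above are equivalent to a data-driven loop; a minimal sketch, assuming the same "specone-" through "specsix-" mangled prefixes.

import os
from collections import defaultdict

GROUP_PREFIXES = [("specone-", "G1"), ("spectwo-", "G2"), ("specthree-", "G3"),
                  ("specfour-", "G4"), ("specfive-", "G5"), ("specsix-", "G6")]

def build_group_mappings(mangled_file_mapping):
    #Map each mangled upload prefix to its default group
    default_group_mapping = defaultdict(list)
    file_to_group_mapping = {}
    for mangled_name, real_path in mangled_file_mapping.items():
        for prefix, group in GROUP_PREFIXES:
            if prefix in mangled_name:
                default_group_mapping[group].append(real_path)
                file_to_group_mapping[os.path.basename(real_path)] = group
                break
    return default_group_mapping, file_to_group_mapping
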
Example #30
def main():
    print(sys.argv)
    paramxml_filename = sys.argv[1]
    psms_input_file = sys.argv[2]
    kl_input_file = sys.argv[3]

    output_psms_file = sys.argv[4]
    output_decoy_psms_file = sys.argv[5]

    parameters_obj = ming_proteosafe_library.parse_xml_file(
        open(paramxml_filename))

    target_filename_list, decoy_filename_list = determine_set_of_target_and_decoy_spectrum_files(
        parameters_obj)

    input_psm_set = ming_psm_library.PSMset("input psms")
    input_psm_set.load_PSM_tsvfile(psms_input_file, load_extra_metadata=True)

    decoy_psm_set = ming_psm_library.PSMset("decoy psms")
    decoy_psm_set.psms = input_psm_set.synthetic_psms_by_length_decoy_set(
        target_filename_list, decoy_filename_list)

    print("GETTING ALL SYNETHTIC with 0% FDR")
    input_psm_set.filter_synthetic_psms_by_length(target_filename_list,
                                                  decoy_filename_list)

    row_count, kl_data = ming_fileio_library.parse_table_with_headers(
        kl_input_file)
    kl_dict = {}
    for i in range(row_count):
        filename = os.path.basename(kl_data["Filename"][i])
        scan = kl_data["Scan"][i]
        kl_strict = kl_data["KL Strict"][i]
        kl_unstrict = kl_data["KL"][i]
        interpeak_intensity = kl_data["Interpeak intensity"][i]
        key = filename + ":" + str(scan)
        kl_dict[key] = {
            "kl_strict": kl_strict,
            "kl_unstrict": kl_unstrict,
            "kl_interpeak": interpeak_intensity
        }

    output_file = open(output_psms_file, "w")
    input_psm_set.write_output(output_file, write_extra_metadata=True)
    decoy_psm_set.write_output(open(output_decoy_psms_file, "w"),
                               write_extra_metadata=True)
    output_file.close()

    #Since we don't support more fields in the psm object, we're going to read this file in again as a tsv file and add the columns as necessary
    psm_rows, psm_table_data = ming_fileio_library.parse_table_with_headers(
        output_psms_file)

    psm_table_data["kl_strict"] = []
    psm_table_data["kl_unstrict"] = []
    psm_table_data["kl_interpeak"] = []

    psm_table_data["ambiguity_total_score"] = []
    psm_table_data["first_second_unique_ratio"] = []
    psm_table_data["first_unique_count"] = []
    psm_table_data["first_unique_intensity"] = []
    psm_table_data["numberpsms"] = []
    psm_table_data["second_unique_count"] = []
    psm_table_data["second_unique_intensity"] = []
    psm_table_data["spectrum_unique_key"] = []
    psm_table_data["modified_sequence"] = []

    for i in range(psm_rows):
        key = psm_table_data["filename"][i] + ":" + psm_table_data["scan"][i]
        if key in kl_dict:
            psm_table_data["kl_strict"].append(kl_dict[key]["kl_strict"])
            psm_table_data["kl_unstrict"].append(kl_dict[key]["kl_unstrict"])
            psm_table_data["kl_interpeak"].append(kl_dict[key]["kl_interpeak"])
        else:
            psm_table_data["kl_strict"].append(-1)
            psm_table_data["kl_unstrict"].append(-1)
            psm_table_data["kl_interpeak"].append(-1)

        #writing the ambiguity stuff, but just assuming no ambiguity
        psm_table_data["ambiguity_total_score"].append("-1")
        psm_table_data["first_second_unique_ratio"].append("-1")
        psm_table_data["first_unique_count"].append("-1")
        psm_table_data["first_unique_intensity"].append("-1")
        psm_table_data["numberpsms"].append(1)
        psm_table_data["second_unique_count"].append("-1")
        psm_table_data["second_unique_intensity"].append("-1")
        psm_table_data["spectrum_unique_key"].append(key)
        psm_table_data["modified_sequence"].append(
            psm_table_data["sequence"][i][:-2])

    ming_fileio_library.write_dictionary_table_data(psm_table_data,
                                                    output_psms_file)
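
A small sketch of the key-based join above: KL metrics are indexed by "basename:scan" and misses default to -1. kl_for_psm is a hypothetical helper, assuming both sides of the join key on the file's basename.

import os

def kl_for_psm(kl_dict, filename, scan):
    #Look up the KL metrics for one PSM, defaulting to -1 on a miss
    key = os.path.basename(filename) + ":" + str(scan)
    return kl_dict.get(key, {"kl_strict": -1,
                             "kl_unstrict": -1,
                             "kl_interpeak": -1})
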
def main():
    psms_input_file = sys.argv[1]
    input_spectrum_folder = sys.argv[2]
    output_psms_file = sys.argv[3]

    psms_row, psm_table = ming_fileio_library.parse_table_with_headers(
        psms_input_file)

    peak_tolerance = 0.1

    #Determine which ones have possible bad ambiguity
    spectrum_to_number_psms_dict = defaultdict(lambda: 0)
    psm_table["spectrum_unique_key"] = []
    for i in range(psms_row):
        filename = psm_table["filename"][i]
        scan = psm_table["scan"][i]
        key = filename + ":" + scan
        psm_table["spectrum_unique_key"].append(key)
        spectrum_to_number_psms_dict[key] += 1

    psm_table["numberpsms"] = []
    spectra_to_reconsider = defaultdict(lambda: defaultdict(list))
    for i in range(psms_row):
        filename = psm_table["filename"][i]
        scan = psm_table["scan"][i]
        key = filename + ":" + scan
        number_of_psms_per_spectrum = spectrum_to_number_psms_dict[key]
        psm_table["numberpsms"].append(number_of_psms_per_spectrum)

        if number_of_psms_per_spectrum > 1:
            spectra_to_reconsider[filename][scan].append(
                psm_table["sequence"][i][:-2])

    spectrum_to_ambiguity_mapping = {}
    for filename in spectra_to_reconsider:
        scan_mapping = spectra_to_reconsider[filename]
        parameter_object = {}
        parameter_object["filename"] = os.path.join(input_spectrum_folder,
                                                    filename)
        parameter_object["scan_mapping"] = scan_mapping
        print(parameter_object)
        scan_ambiguity_mapping = calculated_ambiguity(parameter_object,
                                                      peak_tolerance)
        for key in scan_ambiguity_mapping:
            full_spectrum_key = "%s:%s" % (filename, key)
            spectrum_to_ambiguity_mapping[
                full_spectrum_key] = scan_ambiguity_mapping[key]

    psm_table["ambiguity_total_score"] = []
    psm_table["first_unique_count"] = []
    psm_table["second_unique_count"] = []
    psm_table["first_unique_intensity"] = []
    psm_table["second_unique_intensity"] = []
    psm_table["first_second_unique_ratio"] = []
    for i in range(psms_row):
        filename = psm_table["filename"][i]
        scan = psm_table["scan"][i]
        key = filename + ":" + scan
        if key in spectrum_to_ambiguity_mapping:
            psm_table["ambiguity_total_score"].append(
                spectrum_to_ambiguity_mapping[key]["ambiguity_total_score"])
            psm_table["first_unique_count"].append(
                spectrum_to_ambiguity_mapping[key]["first_unique_count"])
            psm_table["second_unique_count"].append(
                spectrum_to_ambiguity_mapping[key]["second_unique_count"])
            psm_table["first_unique_intensity"].append(
                spectrum_to_ambiguity_mapping[key]["first_unique_intensity"])
            psm_table["second_unique_intensity"].append(
                spectrum_to_ambiguity_mapping[key]["second_unique_intensity"])
            psm_table["first_second_unique_ratio"].append(
                spectrum_to_ambiguity_mapping[key]
                ["first_second_unique_ratio"])
        else:
            psm_table["ambiguity_total_score"].append(-1)
            psm_table["first_unique_count"].append(-1)
            psm_table["second_unique_count"].append(-1)
            psm_table["first_unique_intensity"].append(-1)
            psm_table["second_unique_intensity"].append(-1)
            psm_table["first_second_unique_ratio"].append(-1)

    ming_fileio_library.write_dictionary_table_data(psm_table,
                                                    output_psms_file)
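
Hedged sketch of the ambiguity pre-pass above: count PSMs per "filename:scan" key so spectra with more than one candidate peptide can be flagged; collections.Counter stands in for the defaultdict counter.

from collections import Counter

def psm_counts_per_spectrum(filenames, scans):
    #One count per filename:scan spectrum key
    return Counter(f + ":" + s for f, s in zip(filenames, scans))

#psm_counts_per_spectrum(["a.mzXML", "a.mzXML"], ["7", "7"])
#returns Counter({"a.mzXML:7": 2})
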
def main():
    input_result_filename = sys.argv[1]
    output_result_filename = sys.argv[2]

    spectrum_id_cache = {}


    input_rows, input_table = ming_fileio_library.parse_table_with_headers(input_result_filename)

    output_table = defaultdict(list)

    output_headers = ["SpectrumID", "Compound_Name", "Ion_Source", "Instrument", "Compound_Source", "PI", "Data_Collector", "Adduct"]
    output_headers += ["Precursor_MZ", "ExactMass", "Charge", "CAS_Number", "Pubmed_ID", "Smiles", "INCHI", "INCHI_AUX", "Library_Class"]
    output_headers += ["IonMode", "UpdateWorkflowName", "LibraryQualityString", "#Scan#", "SpectrumFile", "MQScore", "Organism"]
    output_headers += ["TIC_Query", "RT_Query", "MZErrorPPM", "SharedPeaks", "MassDiff", "LibMZ", "SpecMZ", "SpecCharge"]
    output_headers += ["MoleculeExplorerDatasets", "MoleculeExplorerFiles"]

    for header in output_headers:
        output_table[header] = []

    number_hits_per_query = defaultdict(lambda: 0)

    for i in range(input_rows):
        number_hits_per_query[input_table["FileScanUniqueID"][i]] += 1

    molecule_explorer_df = pd.DataFrame(ming_gnps_library.get_molecule_explorer_dataset_data())

    for i in range(input_rows):
        spectrum_id = input_table["LibrarySpectrumID"][i]
        score = input_table["MQScore"][i]
        filename = input_table["SpectrumFile"][i]
        libfilename = input_table["LibraryName"][i]
        scan = input_table["#Scan#"][i]
        TIC_Query = input_table["UnstrictEvelopeScore"][i]
        RT_Query = input_table["p-value"][i]
        SpecCharge = input_table["Charge"][i]
        SpecMZ = input_table["SpecMZ"][i]
        MZErrorPPM = input_table["mzErrorPPM"][i]
        SharedPeaks = input_table["LibSearchSharedPeaks"][i]
        MassDiff = input_table["ParentMassDiff"][i]

        print(spectrum_id)
        gnps_library_spectrum = None
        try:
            gnps_library_spectrum = None
            if spectrum_id in spectrum_id_cache:
                gnps_library_spectrum = spectrum_id_cache[spectrum_id]
            else:
                gnps_library_spectrum = ming_gnps_library.get_library_spectrum(spectrum_id)
                spectrum_id_cache[spectrum_id] = gnps_library_spectrum
        except KeyboardInterrupt:
            raise
        except:
            continue

        gnps_library_spectrum["annotations"] = sorted(gnps_library_spectrum["annotations"], key=lambda annotation: annotation["create_time"], reverse=True)

        output_table["SpectrumID"].append(spectrum_id)
        output_table["Compound_Name"].append(gnps_library_spectrum["annotations"][0]["Compound_Name"].replace("\t", ""))
        output_table["Ion_Source"].append(gnps_library_spectrum["annotations"][0]["Ion_Source"].replace("\t", ""))
        output_table["Instrument"].append(gnps_library_spectrum["annotations"][0]["Instrument"].replace("\t", ""))
        output_table["Compound_Source"].append(gnps_library_spectrum["annotations"][0]["Compound_Source"].replace("\t", ""))
        output_table["PI"].append(gnps_library_spectrum["annotations"][0]["PI"].replace("\t", ""))
        output_table["Data_Collector"].append(gnps_library_spectrum["annotations"][0]["Data_Collector"].replace("\t", ""))
        output_table["Adduct"].append(gnps_library_spectrum["annotations"][0]["Adduct"].replace("\t", ""))
        output_table["Precursor_MZ"].append(gnps_library_spectrum["annotations"][0]["Precursor_MZ"].replace("\t", ""))
        output_table["ExactMass"].append(gnps_library_spectrum["annotations"][0]["ExactMass"].replace("\t", ""))
        output_table["Charge"].append(gnps_library_spectrum["annotations"][0]["Charge"].replace("\t", ""))
        output_table["CAS_Number"].append(gnps_library_spectrum["annotations"][0]["CAS_Number"].replace("\t", ""))
        output_table["Pubmed_ID"].append(gnps_library_spectrum["annotations"][0]["Pubmed_ID"].replace("\t", ""))
        output_table["Smiles"].append(gnps_library_spectrum["annotations"][0]["Smiles"].replace("\t", ""))
        output_table["INCHI"].append(gnps_library_spectrum["annotations"][0]["INCHI"].replace("\t", ""))
        output_table["INCHI_AUX"].append(gnps_library_spectrum["annotations"][0]["INCHI_AUX"].replace("\t", ""))
        output_table["Library_Class"].append(gnps_library_spectrum["annotations"][0]["Library_Class"].replace("\t", ""))
        output_table["IonMode"].append(gnps_library_spectrum["annotations"][0]["Ion_Mode"].replace("\t", ""))

        if gnps_library_spectrum["annotations"][0]["Library_Class"] == "1":
            output_table["UpdateWorkflowName"].append("UPDATE-SINGLE-ANNOTATED-GOLD")
            output_table["LibraryQualityString"].append("Gold")
        elif gnps_library_spectrum["annotations"][0]["Library_Class"] == "2":
            output_table["UpdateWorkflowName"].append("UPDATE-SINGLE-ANNOTATED-SILVER")
            output_table["LibraryQualityString"].append("Silver")
        elif gnps_library_spectrum["annotations"][0]["Library_Class"] == "3":
            output_table["UpdateWorkflowName"].append("UPDATE-SINGLE-ANNOTATED-BRONZE")
            output_table["LibraryQualityString"].append("Bronze")
        elif gnps_library_spectrum["annotations"][0]["Library_Class"] == "4":
            output_table["UpdateWorkflowName"].append("UPDATE-SINGLE-ANNOTATED-BRONZE")
            output_table["LibraryQualityString"].append("Insilico")
        elif gnps_library_spectrum["annotations"][0]["Library_Class"] == "5":
            output_table["UpdateWorkflowName"].append("UPDATE-SINGLE-ANNOTATED-BRONZE")
            output_table["LibraryQualityString"].append("Insilico")
        elif gnps_library_spectrum["annotations"][0]["Library_Class"] == "10":
            output_table["UpdateWorkflowName"].append("UPDATE-SINGLE-ANNOTATED-BRONZE")
            output_table["LibraryQualityString"].append("Challenge")
        else:
            print("Unknown Library_Class", gnps_library_spectrum["annotations"][0]["Library_Class"])
            #Append placeholders so every column stays the same length
            output_table["UpdateWorkflowName"].append("")
            output_table["LibraryQualityString"].append("")

        output_table["#Scan#"].append(scan)
        output_table["SpectrumFile"].append(filename)
        output_table["LibraryName"].append(libfilename)
        output_table["MQScore"].append(score)
        output_table["Organism"].append(gnps_library_spectrum["spectruminfo"]["library_membership"])
        output_table["TIC_Query"].append(TIC_Query)
        output_table["RT_Query"].append(RT_Query)
        output_table["MZErrorPPM"].append(MZErrorPPM)
        output_table["SharedPeaks"].append(SharedPeaks)
        output_table["MassDiff"].append(MassDiff)
        output_table["LibMZ"].append(gnps_library_spectrum["annotations"][0]["Precursor_MZ"])
        output_table["SpecMZ"].append(SpecMZ)
        output_table["SpecCharge"].append(SpecCharge)
        output_table["FileScanUniqueID"].append(input_table["FileScanUniqueID"][i])
        output_table["NumberHits"].append(number_hits_per_query[input_table["FileScanUniqueID"][i]])


        tag_list = [ (tag["tag_desc"] + "[" + tag["tag_type"] + "]") for tag in gnps_library_spectrum["spectrum_tags"]]
        tag_string = "||".join(tag_list).replace("\t", "")

        output_table["tags"].append(tag_string)

        #Getting molecule explorer information
        compound_name = gnps_library_spectrum["annotations"][0]["Compound_Name"].replace("\t", "")
        compound_filtered_df = molecule_explorer_df[molecule_explorer_df["compound_name"] == compound_name]
        if len(compound_filtered_df) == 1:
            output_table["MoleculeExplorerDatasets"].append(compound_filtered_df.to_dict(orient="records")[0]["number_datasets"])
            output_table["MoleculeExplorerFiles"].append(compound_filtered_df.to_dict(orient="records")[0]["number_files"])
        else:
            output_table["MoleculeExplorerDatasets"].append(0)
            output_table["MoleculeExplorerFiles"].append(0)


    ming_fileio_library.write_dictionary_table_data(output_table, output_result_filename)
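
The spectrum_id_cache above is hand-rolled memoization of a repeated network fetch; functools.lru_cache expresses the same idea. fetch_library_spectrum below is a stand-in stub, not the real ming_gnps_library API.

import functools

def fetch_library_spectrum(spectrum_id):
    #Stand-in for the real network call (ming_gnps_library.get_library_spectrum)
    return {"spectrum_id": spectrum_id}

@functools.lru_cache(maxsize=None)
def get_library_spectrum_cached(spectrum_id):
    #Repeat calls with the same id hit the cache instead of the network
    return fetch_library_spectrum(spectrum_id)
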
Example #33
def main():
    params_obj = ming_proteosafe_library.parse_xml_file(open(sys.argv[1]))

    augment_task_id = params_obj["task"][0]

    all_tasks_output_dict = defaultdict(list)
    all_augments_output_dict = defaultdict(list)
    all_spectrum_files_output_dict = defaultdict(list)

    search_task_to_augment = {}
    search_task_to_extraction = {}

    all_search_tasks = set()

    process_tree = True
    while process_tree:
        print("AUGMENT", augment_task_id, len(augment_task_id))
        augment_task_information = ming_proteosafe_library.get_task_information(
            "proteomics2.ucsd.edu", augment_task_id)

        extract_task_id = ""
        previous_augment_task_id = ""

        for filename in augment_task_information["files"]:
            if filename.find("unfiltered_peptide_list") != -1:
                previous_augment_task_id = ming_fileio_library.get_root_folder(
                    filename.replace(
                        ming_fileio_library.get_root_folder(filename) + "/",
                        ""))
            if filename.find("extracted_spectra_peptides_merged") != -1:
                extract_task_id = ming_fileio_library.get_root_folder(
                    filename.replace(
                        ming_fileio_library.get_root_folder(filename) + "/",
                        ""))

        previous_augment_task_id = previous_augment_task_id.strip()
        if len(previous_augment_task_id) < 10:
            process_tree = False

        print(previous_augment_task_id, extract_task_id)

        all_augments_output_dict["augment_task"].append(augment_task_id)
        all_augments_output_dict["extract_task"].append(extract_task_id)
        all_augments_output_dict["precursor_count"].append(0)
        all_augments_output_dict["timestamp"].append(
            augment_task_information["createtime"])

        #Processing extract task_id
        extract_task_info = ming_proteosafe_library.get_task_information(
            "proteomics2.ucsd.edu", extract_task_id)
        extract_task_parameters = ming_proteosafe_library.get_task_parameters(
            "proteomics2.ucsd.edu", extract_task_id)

        tasks_to_extract = json.loads(
            extract_task_parameters["tasks_to_consolidate"][0])

        for task in tasks_to_extract:
            search_task_to_augment[task] = augment_task_id
            search_task_to_extraction[task] = extract_task_id

            all_tasks_output_dict["search_task_id"].append(task)
            all_tasks_output_dict["extract_task_id"].append(extract_task_id)
            all_tasks_output_dict["augment_task_id"].append(augment_task_id)

            all_search_tasks.add(task)

        print(extract_task_parameters["task_file"][0])
        path_to_task_file = os.path.join(
            "/data/ccms-data/uploads",
            extract_task_parameters["task_file"][0][2:-1])
        if os.path.isfile(path_to_task_file):
            print("SEARCH FILE", path_to_task_file)
            try:
                row_count, table_data = ming_fileio_library.parse_table_with_headers(
                    path_to_task_file)
                print("Rows", row_count)
                for i in range(row_count):
                    search_task_id = table_data["TASKID"][i]
                    print(i, search_task_id)

                    search_task_to_augment[search_task_id] = augment_task_id
                    search_task_to_extraction[search_task_id] = extract_task_id

                    all_tasks_output_dict["search_task_id"].append(
                        search_task_id)
                    all_tasks_output_dict["extract_task_id"].append(
                        extract_task_id)
                    all_tasks_output_dict["augment_task_id"].append(
                        augment_task_id)

                    all_search_tasks.add(search_task_id)
            except:
                raise

        augment_task_id = previous_augment_task_id

    print(len(all_search_tasks))

    for i in range(len(all_tasks_output_dict["search_task_id"])):
        search_task = all_tasks_output_dict["search_task_id"][i]
        try:
            print(search_task)
            task_information = ming_proteosafe_library.get_task_information(
                "proteomics2.ucsd.edu", search_task)
            all_tasks_output_dict["search_description"].append(
                task_information["description"])
            for filename in task_information["files"]:
                if filename.find(".mzXML") != -1 or filename.find(
                        ".mzML") != -1:
                    all_spectrum_files_output_dict["spectrum_filename"].append(
                        filename)
                    all_spectrum_files_output_dict["search_task"].append(
                        search_task)
                    all_spectrum_files_output_dict[
                        "search_description"].append(
                            task_information["description"])
        except KeyboardInterrupt:
            raise
        except:
            all_tasks_output_dict["search_description"].append("")
            print("error", search_task)
            continue

    provenance_structure = {}
    provenance_structure["search_task_to_augment"] = search_task_to_augment
    provenance_structure["search_task_to_extraction"] = search_task_to_extraction

    open(sys.argv[2], "w").write(json.dumps(provenance_structure, indent=4))

    ming_fileio_library.write_dictionary_table_data(all_tasks_output_dict,
                                                    sys.argv[3])
    ming_fileio_library.write_dictionary_table_data(all_augments_output_dict,
                                                    sys.argv[4])
    ming_fileio_library.write_dictionary_table_data(
        all_spectrum_files_output_dict, sys.argv[5])
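
A minimal sketch of the provenance walk above: follow augment-task links backwards until the previous id is missing or implausibly short (the same len < 10 stop condition), collecting each task id. get_previous_augment is a hypothetical callable standing in for the ProteoSAFe file inspection.

def walk_augment_chain(start_task_id, get_previous_augment):
    #Collect task ids until the chain terminates
    chain = []
    task_id = start_task_id
    while len(task_id.strip()) >= 10:
        chain.append(task_id)
        task_id = get_previous_augment(task_id).strip()
    return chain

#walk_augment_chain("abcdef1234567890", lambda task: "")
#returns ["abcdef1234567890"]
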
def main():
    paramxml_input_filename = sys.argv[1]
    all_matches_filename = sys.argv[2]
    summary_filename = sys.argv[3]


    params_obj = ming_proteosafe_library.parse_xml_file(open(paramxml_input_filename))

    try:
        if params_obj["MATCH_REFERENCE_DATASETS"][0] != "1":
            output_dict = {}
            ming_fileio_library.write_dictionary_table_data(output_dict, summary_filename)
            exit(0)
    except Exception:
        output_dict = {}
        ming_fileio_library.write_dictionary_table_data(output_dict, summary_filename)
        exit(0)

    try:
        dataset_dict = ming_proteosafe_library.get_all_dataset_dict()
    except Exception:
        dataset_dict = {}

    row_count, table_data = ming_fileio_library.parse_table_with_headers(all_matches_filename)

    matches_list = []
    for i in range(row_count):
        match = {}
        for key in table_data:
            match[key] = table_data[key][i]
        matches_list.append(match)

    matches_by_scan = defaultdict(list)
    for match in matches_list:
        query_spectrum_key = match["query_filename"] + ":" + match["query_scan"]

        matches_by_scan[query_spectrum_key].append(match)


    output_dict = defaultdict(list)
    for spectrum_key in matches_by_scan:
        contains_blank = 0
        datasets_contained = []
        compound_identifications = []
        spectrum_ids = []
        all_scores = []
        for match in matches_by_scan[spectrum_key]:
            if match["is_blank"] == "1":
                contains_blank = 1
            datasets_contained.append(match["dataset_id"])
            compound_identifications.append(match["identification"])
            spectrum_ids.append(match["spectrum_id"])
            all_scores.append(match["score"])
        datasets_contained = list(set(datasets_contained))
        compound_identifications = list(set(compound_identifications))
        spectrum_ids = list(set(spectrum_ids))
        dataset_descriptions = []

        for dataset_id in datasets_contained:
            #dataset_dict can be empty if the dataset listing failed above
            dataset_descriptions.append(dataset_dict.get(dataset_id, {}).get("title", "").strip())
        output_dict["query_scan"].append(matches_by_scan[spectrum_key][0]["query_scan"])
        output_dict["query_filename"].append(matches_by_scan[spectrum_key][0]["query_filename"])
        output_dict["dataset_list"].append("!".join(datasets_contained))
        output_dict["dataset_descriptions"].append("!".join(dataset_descriptions))
        output_dict["contains_blank"].append(contains_blank)
        output_dict["identification"].append("!".join(compound_identifications))
        output_dict["spectrum_id"].append("!".join(spectrum_ids))
        output_dict["best_score"].append(max(all_scores))
    for key in output_dict:
        print(key, len(output_dict[key]))

    ming_fileio_library.write_dictionary_table_data(output_dict, summary_filename)
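
A closing sketch of the consolidation pattern above: group matches by "query_filename:query_scan" and keep the best-scoring match per spectrum. The input dicts mirror the match fields used in this script; best_match_per_spectrum is a hypothetical helper.

from collections import defaultdict

def best_match_per_spectrum(matches_list):
    #Group by query spectrum, then pick the highest-scoring match per group
    matches_by_scan = defaultdict(list)
    for match in matches_list:
        key = match["query_filename"] + ":" + match["query_scan"]
        matches_by_scan[key].append(match)
    return {key: max(group, key=lambda match: float(match["score"]))
            for key, group in matches_by_scan.items()}
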