コード例 #1
0
def main():
    """Add KL statistics to a PSM table and write the combined table.

    Command-line arguments (read from ``sys.argv``):
        1: workflow parameter XML file
        2: PSM table; its ``filename`` and ``scan`` columns form the lookup key
        3: KL table with ``Filename``, ``Scan``, ``KL Strict``, ``KL`` and
           ``Interpeak intensity`` columns
        4: path of the table to write

    The output is the PSM table plus ``kl_strict``, ``kl_unstrict`` and
    ``kl_interpeak`` columns. PSM rows without a KL entry get ``-1`` in all
    three columns.
    """
    paramxml_filename = sys.argv[1]
    psms_input_file = sys.argv[2]
    kl_input_file = sys.argv[3]
    output_psms_file = sys.argv[4]

    # Only the disabled cysteine rewrite at the bottom reads the parameters.
    # The file is still parsed so this step behaves as before; the handle is
    # now closed deterministically instead of being left to the GC.
    with open(paramxml_filename) as param_file:
        parameters_obj = ming_proteosafe_library.parse_xml_file(param_file)

    row_count, kl_data = ming_fileio_library.parse_table_with_headers(
        kl_input_file)
    kl_dict = {}
    for i in range(row_count):
        # NOTE(review): KL keys use the basename of "Filename", while the PSM
        # keys below use the "filename" column as-is -- confirm they agree.
        key = os.path.basename(kl_data["Filename"][i]) + ":" + str(
            kl_data["Scan"][i])
        kl_dict[key] = {
            "kl_strict": kl_data["KL Strict"][i],
            "kl_unstrict": kl_data["KL"][i],
            "kl_interpeak": kl_data["Interpeak intensity"][i]
        }

    #Since we don't support more fields in the psm object, we're going to read this file in again as a tsv file and add the columns as necessary
    psm_rows, psm_table_data = ming_fileio_library.parse_table_with_headers(
        psms_input_file)

    kl_columns = ("kl_strict", "kl_unstrict", "kl_interpeak")
    # Placeholder entry used for PSMs that have no KL measurement.
    missing_entry = dict.fromkeys(kl_columns, -1)

    for column in kl_columns:
        psm_table_data[column] = []
    for i in range(psm_rows):
        key = psm_table_data["filename"][i] + ":" + psm_table_data["scan"][i]
        kl_entry = kl_dict.get(key, missing_entry)
        for column in kl_columns:
            psm_table_data[column].append(kl_entry[column])

    #Change C to C+57
    #if "cysteine_protease.cysteine" in parameters_obj:
    #    if parameters_obj["cysteine_protease.cysteine"][0] == "c57":
    #        #Lets replace all the cysteines
    #        for i in range(psm_rows):
    #            psm_table_data["sequence"][i] = psm_table_data["sequence"][i].replace("C", "C+57")

    ming_fileio_library.write_dictionary_table_data(psm_table_data,
                                                    output_psms_file)
コード例 #2
0
    def load_clustersummary(self, clustersummaryfilename):
        """Populate the network's nodes from a cluster summary table.

        For every row of the table at ``clustersummaryfilename`` a
        ``ClusterNode`` is created, appended to ``self.nodes`` and registered
        in ``self.index_to_node_map`` under its ``cluster index`` value.
        """
        num_rows, summary = ming_fileio_library.parse_table_with_headers(
            clustersummaryfilename)

        # The component column is optional; nodes get -1 when it is absent.
        has_component = "componentindex" in summary

        for row in range(num_rows):
            node_index = summary["cluster index"][row]
            precursor_mz = summary["precursor mass"][row]
            precursor_charge = summary["precursor charge"][row]
            # Read but not used afterwards (kept from the original flow).
            parent_mass = summary["parent mass"][row]
            spectra_count = summary["number of spectra"][row]
            files_field = summary["AllFiles"][row]

            component = summary["componentindex"][row] if has_component else -1

            node = ClusterNode(precursor_mz, precursor_charge, node_index,
                               spectra_count, component)
            node.all_files_string = files_field

            self.nodes.append(node)
            self.index_to_node_map[node_index] = node

            # Split the "###"-delimited AllFiles field into a list.
            node.constituent_spectra = node.all_files_string.split("###")
コード例 #3
0
def main():
    """Consolidate results for this node's share of the source tasks.

    Command-line arguments (read from ``sys.argv``):
        1: JSON file with ``node_partition`` and ``total_paritions`` entries
        2: workflow parameter XML file
        3: table with a ``TASKID`` column
        4: output peptide folder
        5: output PSM folder
        6: folder for the summary file, which is named after the node partition

    The task list is the JSON list stored in the ``tasks_to_consolidate``
    parameter extended by the ``TASKID`` column, sorted, and then strided so
    each node handles every ``total_paritions``-th task. When the parameter is
    empty, the summary file is written with the text ``None`` instead.
    """
    # All files are opened through context managers so handles are closed
    # deterministically rather than whenever the objects get collected.
    with open(sys.argv[1]) as parallel_file:
        parallel_json = json.loads(parallel_file.read())
    params_filename = sys.argv[2]
    task_id_file = sys.argv[3]
    output_peptide_folder = sys.argv[4]
    output_psm_folder = sys.argv[5]
    with open(params_filename) as params_file:
        params_dict = ming_proteosafe_library.parse_xml_file(params_file)

    source_tasks_text = params_dict["tasks_to_consolidate"][0]

    row_count, task_file_table = ming_fileio_library.parse_table_with_headers(
        task_id_file)

    my_node = parallel_json["node_partition"]
    # "total_paritions" (sic) is the key spelling used by the input JSON.
    total_node = parallel_json["total_paritions"]

    output_summary = os.path.join(sys.argv[6], str(my_node))

    if len(source_tasks_text) > 0:
        source_tasks_list = json.loads(source_tasks_text)
        source_tasks_list += task_file_table["TASKID"]
        source_tasks_list.sort()
        # Sorting first makes the strided split consistent across nodes.
        source_tasks_list = source_tasks_list[my_node::total_node]
        grab_all_results(source_tasks_list, output_peptide_folder,
                         output_psm_folder, output_summary, params_dict)
    else:
        with open(output_summary, "w") as summary_file:
            summary_file.write("None")
コード例 #4
0
def parse_MSGF_tsvfile(filename):
    """Parse an MSGF result table into a list of ``PSM`` objects.

    Each row becomes one ``PSM`` whose score is ``-log10`` of the ``P-value``
    column and whose peptide has ``"." + charge`` appended. A PSM is flagged
    as a decoy (``1``) when its protein contains ``REV_``. ``ppm_error`` is
    taken from ``PMError(ppm)`` when that column exists, otherwise it is
    computed from ``PMError(Da)`` and ``Precursor``.
    """
    row_total, columns = ming_fileio_library.parse_table_with_headers(filename)

    ppm_column = "PMError(ppm)"
    # Without a ppm column the error is derived from the Dalton error.
    derive_ppm_from_da = ppm_column not in columns

    results = []
    for idx in range(row_total):
        scan_number = columns["Scan#"][idx]
        sequence = columns["Peptide"][idx]
        protein_name = columns["Protein"][idx]
        neg_log_score = -math.log10(float(columns["P-value"][idx]))
        spectrum_file = columns["#SpecFile"][idx]
        charge_state = int(columns["Charge"][idx])
        fragmentation = columns["FragMethod"][idx]

        if derive_ppm_from_da:
            mass_error_ppm = float(columns["PMError(Da)"][idx]) / float(
                columns["Precursor"][idx]) * 1000000
        else:
            mass_error_ppm = float(columns[ppm_column][idx])

        # "X.PEPTIDE.Y" -> "PEPTIDE": drop the two leading and two trailing
        # characters when dots sit at the second and second-to-last positions.
        if sequence[1] == "." and sequence[-2] == ".":
            sequence = sequence[2:-2]

        is_decoy = 1 if "REV_" in protein_name else 0

        # The charge state becomes part of the peptide name.
        sequence = sequence + "." + str(charge_state)

        psm = PSM(spectrum_file,
                  scan_number,
                  sequence,
                  neg_log_score,
                  is_decoy,
                  protein_name,
                  charge_state,
                  frag_method=fragmentation)
        psm.ppm_error = mass_error_ppm
        results.append(psm)

    return results
コード例 #5
0
def parse_variant_file(filename):
    """Parse a variant table into a list of ``PSM`` objects.

    The charge comes from the ``charge`` column when present, otherwise from
    the text after the last ``.`` in ``variant_sequence``. When an
    ``unmangled_name`` column exists it replaces the ``filename`` column as
    the PSM's file name. The protein is always ``"NONE"``.
    """
    row_total, columns = ming_fileio_library.parse_table_with_headers(filename)

    has_charge_column = "charge" in columns
    has_unmangled_column = "unmangled_name" in columns

    results = []
    for idx in range(row_total):
        source_file = columns["filename"][idx]
        scan_number = int(columns["scan"][idx])
        match_score = float(columns["score"][idx])
        is_decoy = int(columns["decoy"][idx])
        sequence = columns["variant_sequence"][idx]

        if has_charge_column:
            charge_state = int(columns["charge"][idx])
        else:
            charge_state = int(sequence.split(".")[-1])

        if has_unmangled_column:
            source_file = columns["unmangled_name"][idx]

        results.append(
            PSM(source_file, scan_number, sequence, match_score, is_decoy,
                "NONE", charge_state))

    return results
コード例 #6
0
def load_filename_to_coordinate_mapping(metadata_file):
    """Map each file name in a metadata table to its coordinate record.

    The table must contain ``COORDINATE_X``, ``COORDINATE_Y`` and
    ``COORDINATE_Z`` columns; otherwise a message is printed and the process
    exits with status 1. ``COORDINATE_radius`` is optional and defaults to
    ``"0.25"``. Rows whose X value is empty are skipped.

    Returns:
        dict: ``filename`` -> ``{"x", "y", "z", "radius"}``, with the values
        taken from the table after stripping trailing whitespace.
    """
    # Local import: the module's import block is not visible from here.
    import sys

    filename_map = {}

    line_counts, table_data = ming_fileio_library.parse_table_with_headers(metadata_file)

    required_columns = ("COORDINATE_X", "COORDINATE_Y", "COORDINATE_Z")
    if not all(column in table_data for column in required_columns):
        print("COORDINATE_X, COORDINATE_Y, COORDINATE_Z not present in metadata file for ili")
        # sys.exit instead of the interactive-only builtin exit(), which is
        # injected by the site module and is not guaranteed to exist.
        sys.exit(1)

    has_radius = "COORDINATE_radius" in table_data

    for i in range(line_counts):
        filename = table_data["filename"][i].rstrip()
        x = table_data["COORDINATE_X"][i].rstrip()
        y = table_data["COORDINATE_Y"][i].rstrip()
        z = table_data["COORDINATE_Z"][i].rstrip()
        radius = table_data["COORDINATE_radius"][i].rstrip() if has_radius else "0.25"

        # Rows without an X coordinate carry no usable position.
        if len(x) < 1:
            continue

        filename_map[filename] = {"x": x, "y": y, "z": z, "radius": radius}

    return filename_map
コード例 #7
0
def load_filename_to_coordinate_mapping(metadata_file):
    """Build a ``filename -> coordinates`` lookup from a metadata table.

    ``COORDINATE_X``, ``COORDINATE_Y`` and ``COORDINATE_Z`` are mandatory
    columns; if any is missing a message is printed and the process exits
    with status 1. ``COORDINATE_radius`` is optional (default ``"0.25"``).
    Rows with an empty X value are left out of the result.

    Returns:
        dict: maps each file name to a dict with ``x``, ``y``, ``z`` and
        ``radius`` entries (table values with trailing whitespace removed).
    """
    # Local import: the module's import block is not visible from here.
    import sys

    filename_map = {}

    line_counts, table_data = ming_fileio_library.parse_table_with_headers(
        metadata_file)

    if not ("COORDINATE_X" in table_data and "COORDINATE_Y" in table_data
            and "COORDINATE_Z" in table_data):
        print(
            "COORDINATE_X, COORDINATE_Y, COORDINATE_Z not present in metadata file for ili"
        )
        # The builtin exit() is a convenience added by the site module for
        # interactive use; sys.exit() is the supported call in programs.
        sys.exit(1)

    radius_available = "COORDINATE_radius" in table_data

    for i in range(line_counts):
        filename = table_data["filename"][i].rstrip()
        x = table_data["COORDINATE_X"][i].rstrip()
        y = table_data["COORDINATE_Y"][i].rstrip()
        z = table_data["COORDINATE_Z"][i].rstrip()
        radius = "0.25"
        if radius_available:
            radius = table_data["COORDINATE_radius"][i].rstrip()

        # Skip rows that have no X coordinate.
        if len(x) < 1:
            continue

        filename_map[filename] = {
            "x": x,
            "y": y,
            "z": z,
            "radius": radius,
        }

    return filename_map
コード例 #8
0
def create_bucket_from_clusterinfo(cluster_info_filename, param_filename, clusterinfosummary_filename, output_filename, metadata_mapping):
    """Write a tab-separated table of summed precursor intensity per cluster and file.

    Only clusters whose index appears in the ``cluster index`` column of
    ``clusterinfosummary_filename`` are kept. Each spectrum adds
    ``max(#PrecIntensity, 1.0)`` to the cell of its cluster and mangled file
    name. Output columns are the mangled names containing ``"spec"``; a column
    is labelled with ``metadata_mapping[basename]`` when the real file's
    basename is a key there, otherwise with the basename without extension.
    The first column is ``#OTU ID`` (the cluster index).
    """
    # Opened first, as before, so the output file is created/truncated even
    # when a later step raises; the context manager guarantees it is closed.
    with open(output_filename, "w") as output_file:
        line_counts, table_data = ming_fileio_library.parse_table_with_headers(cluster_info_filename)
        with open(param_filename, "r") as param_file:
            param_object = ming_proteosafe_library.parse_xml_file(param_file)
        mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(param_object)

        with open(clusterinfosummary_filename) as summary_file:
            clusters_in_network = set(
                row["cluster index"]
                for row in csv.DictReader(summary_file, delimiter='\t'))

        cluster_index_to_file_map = {}
        for i in range(line_counts):
            cluster_number = table_data["#ClusterIdx"][i]
            if cluster_number not in clusters_in_network:
                continue

            if cluster_number not in cluster_index_to_file_map:
                # Start every known file at zero so each row has all columns.
                cluster_index_to_file_map[cluster_number] = dict.fromkeys(mangled_mapping.keys(), 0.0)

            mangled_filename_only = os.path.basename(table_data["#Filename"][i])
            cluster_index_to_file_map[cluster_number][mangled_filename_only] += max(float(table_data["#PrecIntensity"][i]), 1.0)

        # Only mangled names containing "spec" become output columns.
        spectrum_headers = [header for header in mangled_mapping.keys() if header.find("spec") != -1]

        output_header_list = ["#OTU ID"]
        for header in spectrum_headers:
            real_basename = os.path.basename(mangled_mapping[header])
            if real_basename in metadata_mapping:
                output_header_list.append(metadata_mapping[real_basename])
            else:
                output_header_list.append(ming_fileio_library.get_filename_without_extension(real_basename))

        output_file.write("\t".join(output_header_list) + "\n")

        for cluster_idx in cluster_index_to_file_map:
            line_output_list = [str(cluster_idx)]
            for header in spectrum_headers:
                line_output_list.append(str(cluster_index_to_file_map[cluster_idx][header]))
            output_file.write("\t".join(line_output_list) + "\n")
コード例 #9
0
def add_library_search_results_to_graph(G, library_search_filename):
    """Copy library-search hits onto the matching nodes of ``G``.

    For each table row whose ``#Scan#`` value is a node of ``G``, the listed
    columns are stored as string attributes on that node, plus a
    ``GNPSLibraryURL`` built from ``SpectrumID``. Free-text columns
    (``Compound_Name``, ``INCHI``, ``Smiles``, ``tags``) have non-ASCII
    characters replaced by spaces and backslashes doubled.
    """
    def escape_text(text):
        # Replace non-ASCII characters with spaces, then double backslashes.
        ascii_only = ''.join([ch if ord(ch) < 128 else ' ' for ch in text])
        return str(ascii_only.replace("\\", "\\\\"))

    # (column, needs_escaping) in the order attributes are set on a node.
    # "Adduct" used to be assigned twice; the first, ASCII-encoded value was
    # always overwritten by the plain str() value, so only that one is kept.
    columns = [
        ("Adduct", False),
        ("Compound_Name", True),
        ("INCHI", True),
        ("Smiles", True),
        ("MQScore", False),
        ("MassDiff", False),
        ("MZErrorPPM", False),
        ("SharedPeaks", False),
        ("tags", True),
        ("Library_Class", False),
        ("Instrument", False),
        ("IonMode", False),
        ("Ion_Source", False),
        ("PI", False),
        ("Data_Collector", False),
        ("Compound_Source", False),
        ("SpectrumID", False),
    ]

    row_count, table_data = ming_fileio_library.parse_table_with_headers(library_search_filename)

    for i in range(row_count):
        cluster_index = table_data["#Scan#"][i]

        if cluster_index not in G.node:
            continue

        for column, needs_escaping in columns:
            value = table_data[column][i]
            G.node[cluster_index][column] = escape_text(value) if needs_escaping else str(value)
        G.node[cluster_index]["GNPSLibraryURL"] = "http://gnps.ucsd.edu/ProteoSAFe/gnpslibraryspectrum.jsp?SpectrumID=" + table_data["SpectrumID"][i]
コード例 #10
0
def parse_input_consensus_feature(tsv_file):
    """Parse a consensus feature table into ``ConsensusFeature`` objects.

    Every header ending in ``_MZ`` names a data file (the header minus that
    suffix). For each row and data file, the ``<file>``, ``<file>_MZ`` and
    ``<file>_RT`` columns are read as floats into an ``LC_Feature``; the
    row's features are grouped into one ``ConsensusFeature`` identified by
    the ``#FeatureID`` column.

    Returns:
        list: one ``ConsensusFeature`` per table row.
    """
    rows, table_data = ming_fileio_library.parse_table_with_headers(tsv_file)
    headers = list(table_data.keys())
    # Function-call form so the module also parses under Python 3.
    print(headers)

    #Finding all file names
    # Match the suffix only: a header that merely contains "_MZ" elsewhere
    # would otherwise yield a wrong name once three characters are sliced off.
    mz_suffix = "_MZ"
    data_filenames = [
        header[:-len(mz_suffix)] for header in headers
        if header.endswith(mz_suffix)
    ]

    consensus_features = []

    for i in range(rows):
        file_feature_map = {}
        for filename in data_filenames:
            intensity_value = float(table_data[filename][i])
            mz_value = float(table_data[filename + "_MZ"][i])
            rt_value = float(table_data[filename + "_RT"][i])

            file_feature_map[filename] = LC_Feature(filename, mz_value,
                                                    rt_value, intensity_value)

        consensus_features.append(
            ConsensusFeature(table_data["#FeatureID"][i], file_feature_map))

    return consensus_features
コード例 #11
0
def create_bucket_from_clusterinfo(cluster_info_filename, param_filename, clusterinfosummary_filename, output_filename, metadata_mapping):
    """Write a tab-separated table of summed precursor intensity per cluster and file.

    Only clusters whose index appears in the ``cluster index`` column of
    ``clusterinfosummary_filename`` are kept. Each spectrum adds
    ``max(#PrecIntensity, 1.0)`` to the cell of its cluster and mangled file
    name. Output columns are the mangled names containing ``"spec"``; a column
    is labelled with ``metadata_mapping[basename]`` when the real file's
    basename is a key there, otherwise with the basename without extension.
    The first column is ``#OTU ID`` (the cluster index).
    """
    # Opened first, as before, so the output file is created/truncated even
    # when a later step raises; the context manager guarantees it is closed.
    with open(output_filename, "w") as output_file:
        line_counts, table_data = ming_fileio_library.parse_table_with_headers(cluster_info_filename)
        with open(param_filename, "r") as param_file:
            param_object = ming_proteosafe_library.parse_xml_file(param_file)
        mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(param_object)

        with open(clusterinfosummary_filename) as summary_file:
            clusters_in_network = set(
                row["cluster index"]
                for row in csv.DictReader(summary_file, delimiter='\t'))

        cluster_index_to_file_map = {}
        for i in range(line_counts):
            cluster_number = table_data["#ClusterIdx"][i]
            if cluster_number not in clusters_in_network:
                continue

            if cluster_number not in cluster_index_to_file_map:
                # Start every known file at zero so each row has all columns.
                cluster_index_to_file_map[cluster_number] = dict.fromkeys(mangled_mapping.keys(), 0.0)

            mangled_filename_only = os.path.basename(table_data["#Filename"][i])
            cluster_index_to_file_map[cluster_number][mangled_filename_only] += max(float(table_data["#PrecIntensity"][i]), 1.0)

        # Only mangled names containing "spec" become output columns.
        spectrum_headers = [header for header in mangled_mapping.keys() if header.find("spec") != -1]

        output_header_list = ["#OTU ID"]
        for header in spectrum_headers:
            real_basename = os.path.basename(mangled_mapping[header])
            if real_basename in metadata_mapping:
                output_header_list.append(metadata_mapping[real_basename])
            else:
                output_header_list.append(ming_fileio_library.get_filename_without_extension(real_basename))

        output_file.write("\t".join(output_header_list) + "\n")

        for cluster_idx in cluster_index_to_file_map:
            line_output_list = [str(cluster_idx)]
            for header in spectrum_headers:
                line_output_list.append(str(cluster_index_to_file_map[cluster_idx][header]))
            output_file.write("\t".join(line_output_list) + "\n")
コード例 #12
0
def main():
    """Run the KL executable on every input file and merge the result tables.

    Command-line arguments (read from ``sys.argv``):
        1: workflow parameter XML file
        2: folder whose files are processed
        3: path of the combined output table
        4: scratch folder for the per-file ``.kl`` tables
        5: path to the executable
        6: path to the isotopes table

    Each input file produces ``<scratch>/<basename>.kl``; the columns of all
    those tables are concatenated and written to the output path.
    """
    # Context manager so the parameter file handle is closed right away.
    with open(sys.argv[1]) as param_file:
        input_param = ming_proteosafe_library.parse_xml_file(param_file)
    input_folder = sys.argv[2]
    output_file = sys.argv[3]
    scratch_folder = sys.argv[4]
    path_to_executable = sys.argv[5]
    path_to_isotopes_table = sys.argv[6]

    #parent_mass_tolerance = input_param[]
    parent_mass_tolerance = 0.05

    all_input_file_paths = ming_fileio_library.list_files_in_dir(input_folder)

    output_kl_intermediates = []
    for input_file in all_input_file_paths:
        output_kl_file = os.path.join(scratch_folder,
                                      os.path.basename(input_file) + ".kl")
        # NOTE(review): the command is an unquoted shell string, so paths with
        # spaces or shell metacharacters will break or be interpreted by the
        # shell, and the exit status is ignored. Consider subprocess with an
        # argument list once it is confirmed the executable is a single path.
        cmd = (path_to_executable + " --input " + input_file +
               " --output_summary " + output_kl_file + " " +
               "--peak_tolerance " + str(parent_mass_tolerance) +
               " --isotope_file " + path_to_isotopes_table +
               "  >/dev/null 2>&1 ")
        print(cmd)
        os.system(cmd)
        output_kl_intermediates.append(output_kl_file)

    combined_table = defaultdict(list)
    for output_kl_file in output_kl_intermediates:
        row_count, table_data = ming_fileio_library.parse_table_with_headers(
            output_kl_file)
        for key in table_data:
            combined_table[key] += table_data[key]

    ming_fileio_library.write_dictionary_table_data(combined_table,
                                                    output_file)
コード例 #13
0
def create_bucket_from_clusterinfo(cluster_info_filename, param_filename,
                                   clusterinfosummary_filename,
                                   output_filename):
    """Write a tab-separated table of summed precursor intensity per cluster and file.

    When the ``CREATE_CLUSTER_BUCKETS`` parameter is not ``"1"`` the output
    file contains only the text ``No Output``. Otherwise, spectra whose
    cluster is known to the network loaded from
    ``clusterinfosummary_filename`` add their ``#PrecIntensity`` to the cell
    of their cluster and mangled file name. The header row is ``#OTU ID``
    followed by the basename of every mapped file; every field, including the
    last one on a line, is followed by a tab.
    """
    with open(param_filename, "r") as param_file:
        param_object = ming_proteosafe_library.parse_xml_file(param_file)

    # The context manager closes (and flushes) the output on every exit path,
    # including the early return below.
    with open(output_filename, "w") as output_file:
        if param_object["CREATE_CLUSTER_BUCKETS"][0] != "1":
            output_file.write("No Output")
            return

        test_network = molecular_network_library.MolecularNetwork()
        test_network.load_clustersummary(clusterinfosummary_filename)

        line_counts, table_data = ming_fileio_library.parse_table_with_headers(
            cluster_info_filename)

        mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(
            param_object)

        cluster_index_to_file_map = {}
        for i in range(line_counts):
            cluster_number = table_data["#ClusterIdx"][i]
            if test_network.get_cluster_index(cluster_number) is None:
                continue

            if cluster_number not in cluster_index_to_file_map:
                # Start every known file at zero so each row has all columns.
                cluster_index_to_file_map[cluster_number] = dict.fromkeys(
                    mangled_mapping.keys(), 0.0)

            mangled_filename_only = os.path.basename(
                table_data["#Filename"][i])
            cluster_index_to_file_map[cluster_number][
                mangled_filename_only] += float(
                    table_data["#PrecIntensity"][i])

        mangled_names = list(mangled_mapping.keys())

        # Each field is followed by a tab, so lines end in "\t\n" as before.
        output_header = "#OTU ID\t" + "".join(
            os.path.basename(mangled_mapping[header]) + "\t"
            for header in mangled_names)
        output_file.write(output_header + "\n")

        for cluster_idx in cluster_index_to_file_map:
            line_string = str(cluster_idx) + "\t" + "".join(
                str(cluster_index_to_file_map[cluster_idx][header]) + "\t"
                for header in mangled_names)
            output_file.write(line_string + "\n")
def main():
    """Filter PSM results by the configured criterion and write the survivors.

    Command-line arguments (read from ``sys.argv``):
        1: workflow parameter XML file
        2: FASTA proteome file
        3: input results table
        4: path of the filtered table to write
        5: decoy marker passed to ``add_decoy_to_results``

    The ``filter.filter`` parameter selects the filter: ``FDR`` keeps PSMs
    with ``QValue`` below ``FDR.FDR``; ``PepFDR`` keeps PSMs whose
    ``PepQValue`` and ``QValue`` are both below ``PepFDR.PepFDR``; ``FPR``
    only prints a message. Any other value keeps nothing.
    """
    # Context manager so the parameter file handle is closed right away.
    with open(sys.argv[1]) as param_file:
        params = ming_proteosafe_library.parse_xml_file(param_file)
    # NOTE(review): the parsed proteome is not used below; the call is kept in
    # case callers rely on it failing for a bad FASTA path -- confirm.
    proteome = ming_protein_library.parse_fasta_proteome_file(sys.argv[2])

    row_count, table_data = ming_fileio_library.parse_table_with_headers(sys.argv[3])
    decoy_marker = sys.argv[5]

    add_decoy_to_results(table_data, row_count, decoy_marker)
    psm_results = add_fdr_to_results(table_data, row_count)

    #Performing filters
    filter_type = params["filter.filter"][0]
    passing_psms = []
    if filter_type == "FDR":
        fdr_threshold = float(params["FDR.FDR"][0])
        passing_psms = [
            psm for psm in psm_results if psm["QValue"] < fdr_threshold
        ]
    elif filter_type == "PepFDR":
        fdr_threshold = float(params["PepFDR.PepFDR"][0])
        passing_psms = [
            psm for psm in psm_results
            if psm["PepQValue"] < fdr_threshold and psm["QValue"] < fdr_threshold
        ]
    elif filter_type == "FPR":
        print("Lets do nothing, don't know what this is")

    # Pivot the surviving PSM dicts into column lists for the table writer.
    output_table = defaultdict(list)
    for psm in passing_psms:
        for key in psm:
            output_table[key].append(psm[key])

    ming_fileio_library.write_dictionary_table_data(output_table, sys.argv[4])
コード例 #15
0
def parse_input_consensus_feature(tsv_file):
    """Parse a consensus feature table into ``ConsensusFeature`` objects.

    Every header ending in ``_MZ`` names a data file (the header minus that
    suffix). For each row and data file, the ``<file>``, ``<file>_MZ`` and
    ``<file>_RT`` columns are read as floats into an ``LC_Feature``; the
    row's features are grouped into one ``ConsensusFeature`` identified by
    the ``#FeatureID`` column.

    Returns:
        list: one ``ConsensusFeature`` per table row.
    """
    rows, table_data = ming_fileio_library.parse_table_with_headers(tsv_file)
    headers = list(table_data.keys())
    # Function-call form so the module also parses under Python 3.
    print(headers)

    #Finding all file names
    # Match the suffix only: a header that merely contains "_MZ" elsewhere
    # would otherwise yield a wrong name once three characters are sliced off.
    mz_suffix = "_MZ"
    data_filenames = [header[:-len(mz_suffix)] for header in headers if header.endswith(mz_suffix)]

    consensus_features = []

    for i in range(rows):
        file_feature_map = {}
        for filename in data_filenames:
            intensity_value = float(table_data[filename][i])
            mz_value = float(table_data[filename + "_MZ"][i])
            rt_value = float(table_data[filename + "_RT"][i])

            file_feature_map[filename] = LC_Feature(filename, mz_value, rt_value, intensity_value)

        consensus_features.append(ConsensusFeature(table_data["#FeatureID"][i], file_feature_map))

    return consensus_features
コード例 #16
0
def add_library_search_results_to_graph(G, library_search_filename, annotation_prefix=""):
    """Copy library-search hits onto the matching nodes of ``G``.

    For each table row whose ``#Scan#`` value is a node of ``G``, the listed
    columns are stored as string attributes on that node under
    ``annotation_prefix + column``, plus a ``GNPSLibraryURL`` attribute built
    from ``SpectrumID``. Free-text columns (``Compound_Name``, ``INCHI``,
    ``Smiles``, ``tags``) have non-ASCII characters replaced by spaces and
    backslashes doubled.
    """
    def escape_text(text):
        # Replace non-ASCII characters with spaces, then double backslashes.
        ascii_only = ''.join([ch if ord(ch) < 128 else ' ' for ch in text])
        return str(ascii_only.replace("\\", "\\\\"))

    # (column, needs_escaping) in the order attributes are set on a node.
    # "Adduct" used to be assigned twice; the first, ASCII-encoded value was
    # always overwritten by the plain str() value, so only that one is kept.
    columns = [
        ("Adduct", False),
        ("Compound_Name", True),
        ("INCHI", True),
        ("Smiles", True),
        ("MQScore", False),
        ("MassDiff", False),
        ("MZErrorPPM", False),
        ("SharedPeaks", False),
        ("tags", True),
        ("Library_Class", False),
        ("Instrument", False),
        ("IonMode", False),
        ("Ion_Source", False),
        ("PI", False),
        ("Data_Collector", False),
        ("Compound_Source", False),
        ("SpectrumID", False),
    ]

    row_count, table_data = ming_fileio_library.parse_table_with_headers(library_search_filename)

    for i in range(row_count):
        cluster_index = table_data["#Scan#"][i]

        if cluster_index not in G.node:
            continue

        for column, needs_escaping in columns:
            value = table_data[column][i]
            G.node[cluster_index][annotation_prefix + column] = escape_text(value) if needs_escaping else str(value)
        G.node[cluster_index][annotation_prefix + "GNPSLibraryURL"] = "http://gnps.ucsd.edu/ProteoSAFe/gnpslibraryspectrum.jsp?SpectrumID=" + table_data["SpectrumID"][i]
コード例 #17
0
def load_features_table(input_filename):
    """Read a feature table and return one ``Feature`` per row.

    Each ``Feature`` is built from the row's ``#rt``, ``mz`` and
    ``intensity`` columns, converted to float in that order.
    """
    num_rows, columns = ming_fileio_library.parse_table_with_headers(input_filename)
    return [
        Feature(float(columns["#rt"][row]), float(columns["mz"][row]), float(columns["intensity"][row]))
        for row in range(num_rows)
    ]
コード例 #18
0
def main():
    """Partition peptides into three groups and report detectable products.

    Command-line arguments (read from ``sys.argv``):
        1: identification results file
        2: table with a ``Peptides`` column

    Prints the total number of products, the number of acquirable products
    summed over all partitions, and then the peptides of each partition.
    """
    input_results_filename = sys.argv[1]
    input_peptide_list_filename = sys.argv[2]

    products_to_rt_map = parse_identification_file(input_results_filename)
    line_counts, table_data = ming_fileio_library.parse_table_with_headers(
        input_peptide_list_filename)
    all_peptides = table_data["Peptides"]

    full_peptides_to_rt = map_products_to_peptide_rt(products_to_rt_map,
                                                     all_peptides)
    partitioned_peptide_list = partition_peptides_random(
        full_peptides_to_rt, 3)
    #partitioned_peptide_list = partition_peptides_number_products(full_peptides_to_rt, 3)

    # Single-argument print(...) calls produce the same output on Python 2
    # and also parse on Python 3, unlike the former print statements.
    print("Total Products: " + str(len(products_to_rt_map)))
    total_detectable_products = sum(
        count_number_of_acquireable_products(peptide_list,
                                             full_peptides_to_rt)
        for peptide_list in partitioned_peptide_list)
    print("Total Products Detectable: " + str(total_detectable_products))

    for peptide_list in partitioned_peptide_list:
        print("Partition=================")
        for peptide in peptide_list:
            print(peptide)
コード例 #19
0
def create_ili_output_from_clusterinfo(cluster_info_filename, param_filename, clusterinfosummary_filename, filename_coordinate_mapping, output_filename):
    """Write a comma-separated table of per-sample cluster intensities with coordinates.

    Spectra whose cluster is known to the network loaded from
    ``clusterinfosummary_filename`` add their ``#PrecIntensity`` to the cell
    of their cluster and mangled file name. One output row is written for
    each mangled name that contains ``"spec"`` and whose real file basename
    is a key of ``filename_coordinate_mapping``: the real file name, its
    ``x``/``y``/``z``/``radius`` values, then one summed intensity per
    cluster. The header row is ``filename,X,Y,Z,radius`` followed by the
    cluster indices.
    """
    # Opened first, as before, so the output file is created/truncated even
    # when a later step raises; the context manager guarantees it is closed.
    with open(output_filename, "w") as output_file:
        test_network = molecular_network_library.MolecularNetwork()
        test_network.load_clustersummary(clusterinfosummary_filename)
        line_counts, table_data = ming_fileio_library.parse_table_with_headers(cluster_info_filename)
        with open(param_filename, "r") as param_file:
            param_object = ming_proteosafe_library.parse_xml_file(param_file)
        mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(param_object)

        cluster_index_to_file_map = {}
        for i in range(line_counts):
            cluster_number = table_data["#ClusterIdx"][i]
            if test_network.get_cluster_index(cluster_number) is None:
                continue

            if cluster_number not in cluster_index_to_file_map:
                # Start every known file at zero so each cluster has all samples.
                cluster_index_to_file_map[cluster_number] = dict.fromkeys(mangled_mapping.keys(), 0.0)

            mangled_filename_only = os.path.basename(table_data["#Filename"][i])
            cluster_index_to_file_map[cluster_number][mangled_filename_only] += float(table_data["#PrecIntensity"][i])

        all_headers = ["filename", "X", "Y", "Z", "radius"]
        all_headers.extend(cluster_index_to_file_map)

        #writing header
        output_file.write(",".join(all_headers) + "\n")

        for sample_name in mangled_mapping:
            if sample_name.find("spec") == -1:
                continue
            real_filename = mangled_mapping[sample_name]
            real_basename = os.path.basename(real_filename)

            # Samples without coordinates are left out of the output.
            if real_basename not in filename_coordinate_mapping:
                continue

            coordinate_object = filename_coordinate_mapping[real_basename]
            line_output = [
                real_filename,
                coordinate_object["x"],
                coordinate_object["y"],
                coordinate_object["z"],
                coordinate_object["radius"],
            ]
            print(line_output, coordinate_object)
            for cluster_idx in cluster_index_to_file_map:
                line_output.append(str(cluster_index_to_file_map[cluster_idx][sample_name]))
            output_file.write(",".join(line_output) + "\n")
コード例 #20
0
def load_features_table(input_filename):
    """Read a headered feature table and build one Feature per row.

    The "#rt", "mz" and "intensity" cells of each row are converted to float
    and passed positionally, in that order, to the Feature constructor.

    Returns:
        List of Feature objects in row order.
    """
    n_rows, columns = ming_fileio_library.parse_table_with_headers(
        input_filename)
    return [
        Feature(float(columns["#rt"][row]),
                float(columns["mz"][row]),
                float(columns["intensity"][row]))
        for row in range(n_rows)
    ]
コード例 #21
0
def parse_MSGFPlus_tsvfile(filename):
    """Parse an MS-GF+ style TSV result table into a list of PSM objects.

    For each row:
      * the score is -log10 of the "EValue" cell;
      * the ppm error is read from "PrecursorError(ppm)" when that column
        exists, otherwise it is derived as
        "PrecursorError(Da)" / "Precursor" * 1e6;
      * a peptide of the form "X.PEPTIDE.Y" has its two flanking characters
        and dots removed, then ".<charge>" is appended;
      * the PSM is marked as decoy (1) when the protein string contains
        "XXX_", else 0.

    Args:
        filename: path of the TSV file to read.

    Returns:
        List of PSM objects, each with ppm_error and frag_method set.

    Raises:
        ValueError: from math.log10 when an EValue is not positive, or from
            float()/int() on a non-numeric cell.
    """
    rows, table_data = ming_fileio_library.parse_table_with_headers(filename)

    scan_header = "ScanNum"
    peptide_header = "Peptide"
    protein_header = "Protein"
    score_header = "EValue"
    filename_header = "#SpecFile"
    charge_header = "Charge"
    ppm_error_header = "PrecursorError(ppm)"
    da_pm_error_header = "PrecursorError(Da)"
    precursor_header = "Precursor"
    frag_method_header = "FragMethod"

    decoy_indicator = "XXX_"

    # Without a ppm column the ppm error is computed from the Da error.
    parse_da_error = ppm_error_header not in table_data

    psm_list = []
    for i in range(rows):
        scan = table_data[scan_header][i]
        peptide = table_data[peptide_header][i]
        protein = table_data[protein_header][i]
        score = -math.log10(float(table_data[score_header][i]))
        # Separate local so the `filename` argument is not shadowed.
        spectrum_file = table_data[filename_header][i]
        charge = int(table_data[charge_header][i])
        frag_method = table_data[frag_method_header][i]
        if parse_da_error:
            ppm_error = (float(table_data[da_pm_error_header][i]) /
                         float(table_data[precursor_header][i]) * 1000000)
        else:
            ppm_error = float(table_data[ppm_error_header][i])

        # Strip flanking residues. The length guard avoids an IndexError on
        # very short cells and makes sure the two dots are distinct
        # characters (for a 3-character "X.Y" both indices hit the same dot).
        if len(peptide) >= 4 and peptide[1] == "." and peptide[-2] == ".":
            peptide = peptide[2:-2]

        decoy = 1 if decoy_indicator in protein else 0

        # Adding charge state to peptide name
        peptide += "." + str(charge)

        new_psm = PSM(spectrum_file, scan, peptide, score, decoy, protein,
                      charge)
        new_psm.ppm_error = ppm_error
        new_psm.frag_method = frag_method
        psm_list.append(new_psm)

    return psm_list
コード例 #22
0
def load_masses(input_filename):
    """Load peptide masses from a headered table, sorted by m/z.

    Returns:
        List of [resolvability, peptide, m/z] entries taken from the
        "Resolaveability", "Peptide" and "m/z" columns, ordered by ascending
        m/z (as float).
    """
    n_rows, columns = ming_fileio_library.parse_table_with_headers(input_filename)

    entries = [
        [columns["Resolaveability"][row], columns["Peptide"][row], float(columns["m/z"][row])]
        for row in range(n_rows)
    ]

    # Order by the numeric m/z stored in the third slot.
    entries.sort(key=lambda entry: entry[2])
    return entries
コード例 #23
0
def load_variant_to_score(filtered_filename):
    """Map each "variant_sequence" cell to its "score" cell as a float.

    When a variant occurs on several rows, the last row wins.
    """
    n_rows, columns = ming_fileio_library.parse_table_with_headers(
        filtered_filename)

    scores_by_variant = {}
    for row in range(n_rows):
        sequence = columns["variant_sequence"][row]
        scores_by_variant[sequence] = float(columns["score"][row])
    return scores_by_variant
コード例 #24
0
def create_ili_output_from_clusterinfo(cluster_info_filename, param_filename, clusterinfosummary_filename, filename_coordinate_mapping, output_filename):
    """Write a CSV of per-sample cluster intensities annotated with coordinates.

    Precursor intensities from the cluster info table (columns
    "#ClusterIdx", "#Filename", "#PrecIntensity") are summed per cluster and
    per mangled file name. One CSV row is written for every mangled name that
    contains "spec" and whose real file's basename is a key of
    filename_coordinate_mapping. The row holds the real file name, the
    "x"/"y"/"z"/"radius" entries of its coordinate object, and one summed
    intensity per cluster.

    Args:
        cluster_info_filename: path of the headered cluster info table.
        param_filename: path of the parameter XML used to obtain the
            mangled-name -> real-name file mapping.
        clusterinfosummary_filename: unused; kept for interface
            compatibility.
        filename_coordinate_mapping: mapping keyed by file basename whose
            values provide "x", "y", "z" and "radius".
        output_filename: path of the CSV file to write.

    Raises:
        KeyError: if the basename of a "#Filename" cell is not a key of the
            mangled file mapping.
    """
    # The output is opened first, as before, so it is created/truncated even
    # when parsing fails; the context manager guarantees it is closed.
    with open(output_filename, "w") as output_file:
        line_counts, table_data = ming_fileio_library.parse_table_with_headers(cluster_info_filename)
        with open(param_filename, "r") as param_file:
            param_object = ming_proteosafe_library.parse_xml_file(param_file)
        mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(param_object)

        cluster_index_to_file_map = {}
        for i in range(line_counts):
            cluster_number = table_data["#ClusterIdx"][i]

            if cluster_number not in cluster_index_to_file_map:
                # Every known mangled file starts at zero intensity.
                cluster_index_to_file_map[cluster_number] = dict.fromkeys(mangled_mapping, 0.0)

            mangled_filename_only = os.path.basename(table_data["#Filename"][i])
            cluster_index_to_file_map[cluster_number][mangled_filename_only] += float(table_data["#PrecIntensity"][i])

        all_headers = ["filename", "X", "Y", "Z", "radius"]
        all_headers.extend(cluster_index_to_file_map)

        # writing header
        output_file.write(",".join(all_headers) + "\n")

        for sample_name in mangled_mapping:
            if sample_name.find("spec") == -1:
                continue
            real_filename = mangled_mapping[sample_name]

            coordinate_key = os.path.basename(real_filename)
            if coordinate_key not in filename_coordinate_mapping:
                continue

            coordinate_object = filename_coordinate_mapping[coordinate_key]
            line_output = [real_filename]
            # str() keeps join() working when coordinates are numeric.
            for axis in ("x", "y", "z", "radius"):
                line_output.append(str(coordinate_object[axis]))
            print(line_output, coordinate_object)
            for cluster_idx in cluster_index_to_file_map:
                line_output.append(str(cluster_index_to_file_map[cluster_idx][sample_name]))
            output_file.write(",".join(line_output) + "\n")
コード例 #25
0
def load_precursor_to_protein_mapping(input_filename):
    """Map each "original_peptide" cell to its "proteins_mapped" cell.

    When a precursor occurs on several rows, the last row wins.
    """
    n_rows, columns = ming_fileio_library.parse_table_with_headers(
        input_filename)

    proteins_by_precursor = {}
    for row in range(n_rows):
        precursor = columns["original_peptide"][row]
        proteins_by_precursor[precursor] = columns["proteins_mapped"][row]
    return proteins_by_precursor
コード例 #26
0
def load_score_cutoff_by_length(filtered_filename):
    """Return the minimum "score" seen for each "length" value.

    The result is a defaultdict whose default for unseen lengths is 10000;
    that default is also the starting value each row's score is compared
    against.
    """
    n_rows, columns = ming_fileio_library.parse_table_with_headers(
        filtered_filename)

    cutoffs = defaultdict(lambda: 10000)
    for row in range(n_rows):
        peptide_length = int(columns["length"][row])
        row_score = float(columns["score"][row])
        cutoffs[peptide_length] = min(row_score, cutoffs[peptide_length])
    return cutoffs
コード例 #27
0
def get_scan_mapping_for_collision_method(path_to_original_results):
    """Map "<#SpecFile>_<scan>" keys to the row's "FragMethod" cell.

    The scan is read from the "Scan#" column when present, otherwise from
    "ScanNum". The input path is printed after the table is parsed.
    """
    n_rows, columns = ming_fileio_library.parse_table_with_headers(
        path_to_original_results)
    print(path_to_original_results)

    scan_header = "Scan#" if "Scan#" in columns else "ScanNum"

    frag_method_by_scan = {}
    for row in range(n_rows):
        scan_key = columns["#SpecFile"][row] + "_" + columns[scan_header][row]
        frag_method_by_scan[scan_key] = columns["FragMethod"][row]
    return frag_method_by_scan
コード例 #28
0
def main():
    """Concatenate all tables in a folder, column by column, into one table.

    argv[1] is the folder of intermediate tables, argv[2] the output path.
    """
    intermediate_folder = sys.argv[1]
    merged_path = sys.argv[2]

    merged_columns = defaultdict(list)
    for partial_path in ming_fileio_library.list_files_in_dir(intermediate_folder):
        _, partial_table = ming_fileio_library.parse_table_with_headers(partial_path)
        # Append this file's values to the running column of the same name.
        for header in partial_table:
            merged_columns[header].extend(partial_table[header])

    ming_fileio_library.write_dictionary_table_data(merged_columns, merged_path)
コード例 #29
0
def main():
    """Merge every table found in argv[1] into a single TSV at argv[2].

    Columns with the same header are concatenated in file order; each file
    name is printed as it is loaded.
    """
    source_folder = sys.argv[1]
    merged_tsv_path = sys.argv[2]

    table_paths = ming_fileio_library.list_files_in_dir(source_folder)

    combined = defaultdict(list)
    for table_path in table_paths:
        print("loading", table_path)
        _, columns = ming_fileio_library.parse_table_with_headers(table_path)
        for header in columns:
            combined[header].extend(columns[header])

    ming_fileio_library.write_dictionary_table_data(combined, merged_tsv_path)
コード例 #30
0
def proteins_to_include(input_filename):
    """Return the set of proteins passing the FDR and sequence-count filters.

    A protein (the "protein" cell) is kept when its "fdr" value is <= 0.01
    and its "number_of_non_overlapping_sequences" value is greater than 1.

    Args:
        input_filename: path of the headered protein table.

    Returns:
        Set of protein identifiers that pass both filters.

    Raises:
        ValueError: if an "fdr" or "number_of_non_overlapping_sequences"
            cell is not numeric.
    """
    row_count, table_data = ming_fileio_library.parse_table_with_headers(
        input_filename)

    protein_set = set()
    for i in range(row_count):
        fdr = float(table_data["fdr"][i])
        protein = table_data["protein"][i]
        # The previous code applied len() to the cell, which measures the
        # length of the text (e.g. "5" -> 1), so counts 2-9 never passed.
        # NOTE(review): assumes this column holds an integer count - confirm
        # against the code that writes the table.
        number_of_non_overlapping_sequences = int(
            table_data["number_of_non_overlapping_sequences"][i])

        if fdr <= 0.01 and number_of_non_overlapping_sequences > 1:
            protein_set.add(protein)

    return protein_set
コード例 #31
0
	def load_gnps_librarysearch(self, identification_filename):
		"""Load library-search hits and attach them to their cluster nodes.

		Every row becomes a ClusterLibraryIdentification appended to
		self.identifications. When the row's "#Scan#" value is a key of
		self.index_to_node_map, the hit is also appended to that node's
		library_identifications.
		"""
		n_rows, columns = ming_fileio_library.parse_table_with_headers(identification_filename)
		for row in range(n_rows):
			name = columns["Compound_Name"][row]
			smiles_string = columns["Smiles"][row]
			inchi_string = columns["INCHI"][row]
			library_id = columns["SpectrumID"][row]
			match_score = columns["MQScore"][row]
			scan_id = columns["#Scan#"][row]

			hit = ClusterLibraryIdentification(library_id, name, smiles_string, inchi_string, match_score, scan_id)
			self.identifications.append(hit)

			# Link the hit to its cluster node when that node is known.
			if scan_id in self.index_to_node_map:
				self.index_to_node_map[scan_id].library_identifications.append(hit)
コード例 #32
0
def main():
    """Merge search result tables, keeping the top-K hits per spectrum.

    argv[1] is the folder of result tables, argv[2] the parameter XML and
    argv[3] the output TSV. Rows are grouped by
    "<SpectrumFile>___<#Scan#>", sorted within each group by descending
    "MQScore" (as float), and truncated to TOP_K_RESULTS entries. K falls
    back to 1 when the parameter is missing or not an integer.
    """
    input_folder_path = sys.argv[1]
    param_xml_filename = sys.argv[2]
    output_tsv = sys.argv[3]

    files = ming_fileio_library.list_files_in_dir(input_folder_path)
    # Close the parameter file deterministically instead of leaking it.
    with open(param_xml_filename) as param_file:
        params_obj = ming_proteosafe_library.parse_xml_file(param_file)

    # Only the failures a missing/malformed parameter can cause are caught;
    # a bare except would also swallow KeyboardInterrupt and real bugs.
    try:
        top_k = int(params_obj["TOP_K_RESULTS"][0])
    except (KeyError, IndexError, TypeError, ValueError):
        top_k = 1

    # Turn the column-oriented tables into one dict per row.
    merged_results = []
    for input_file in files:
        print("loading", input_file)
        row_count, table_data = ming_fileio_library.parse_table_with_headers(input_file)
        for i in range(row_count):
            merged_results.append(
                dict((key, table_data[key][i]) for key in table_data))

    results_per_spectrum = defaultdict(list)
    for result_obj in merged_results:
        spectrum_unique_key = result_obj["SpectrumFile"] + "___" + result_obj["#Scan#"]
        results_per_spectrum[spectrum_unique_key].append(result_obj)

    output_results = []
    for spectrum_results in results_per_spectrum.values():
        sorted_results = sorted(spectrum_results, key=lambda spectrum_obj: float(spectrum_obj["MQScore"]), reverse=True)
        output_results += sorted_results[:top_k]

    # Back to column orientation for the table writer.
    output_dict = defaultdict(list)
    for result_obj in output_results:
        for key in result_obj:
            output_dict[key].append(result_obj[key])

    ming_fileio_library.write_dictionary_table_data(output_dict, output_tsv)
コード例 #33
0
def main():
    """Concatenate the tables in folder argv[1] into the TSV argv[2].

    Values of identically named columns are appended in file order, and
    each input file name is printed before it is parsed.
    """
    folder = sys.argv[1]
    destination = sys.argv[2]

    merged = defaultdict(list)
    for path in ming_fileio_library.list_files_in_dir(folder):
        print("loading", path)
        parsed = ming_fileio_library.parse_table_with_headers(path)
        columns = parsed[1]
        for header in columns:
            merged[header].extend(columns[header])

    ming_fileio_library.write_dictionary_table_data(merged, destination)
コード例 #34
0
def load_masses(input_filename):
    """Load [resolvability, peptide, m/z] triples ordered by ascending m/z.

    Values come from the "Resolaveability", "Peptide" and "m/z" columns;
    only m/z is converted to float.
    """
    n_rows, columns = ming_fileio_library.parse_table_with_headers(
        input_filename)

    triples = []
    for row in range(n_rows):
        resolvability = columns["Resolaveability"][row]
        peptide = columns["Peptide"][row]
        mz = float(columns["m/z"][row])
        triples.append([resolvability, peptide, mz])

    # Sort on the m/z value held in the last position.
    triples.sort(key=lambda triple: triple[2])
    return triples
コード例 #35
0
def load_metadata_mapping(metadata_folder):
    """Map "filename" cells to "#SampleID" cells from a single metadata file.

    Returns an empty dict unless the folder lists exactly one file.
    """
    metadata_files = ming_fileio_library.list_files_in_dir(metadata_folder)
    if len(metadata_files) != 1:
        return {}

    n_rows, columns = ming_fileio_library.parse_table_with_headers(metadata_files[0])

    sample_id_by_file = {}
    for row in range(n_rows):
        data_file = columns["filename"][row]
        sample_id_by_file[data_file] = columns["#SampleID"][row]
    return sample_id_by_file
コード例 #36
0
def parse_psm_file(filename, load_extra_metadata=False):
    """Parse a headered PSM table into a list of PSM objects.

    Required columns: "filename", "scan", "score", "decoy", "sequence",
    "charge", "ppm_error" and "FDR". Optional columns: "FragMethod"
    (default "N/A"), "collision_energy" (default 0.0) and "unmangled_name",
    which replaces the file name when present. The protein is always "NONE".

    Args:
        filename: path of the table to read.
        load_extra_metadata: when true, every column outside the known
            header list is stored per PSM in its extra_metadata dict.

    Returns:
        List of PSM objects in row order.
    """
    n_rows, columns = ming_fileio_library.parse_table_with_headers(filename)

    standard_headers = set([
        "filename", "scan", "score", "decoy", "sequence", "charge",
        "ppm_error", "unmangled_name", "FDR", "collision_energy", "FragMethod"
    ])
    extra_headers = set(columns.keys()).difference(standard_headers)

    # Column presence is the same for every row, so check it once.
    has_frag_method = "FragMethod" in columns
    has_collision_energy = "collision_energy" in columns
    has_unmangled_name = "unmangled_name" in columns

    psms = []
    for row in range(n_rows):
        source_file = columns["filename"][row]
        scan = int(columns["scan"][row])
        score = float(columns["score"][row])
        decoy = int(columns["decoy"][row])
        sequence = columns["sequence"][row]
        charge = int(columns["charge"][row])
        ppm_error = float(columns["ppm_error"][row])
        fdr = float(columns["FDR"][row])
        frag_method = columns["FragMethod"][row] if has_frag_method else "N/A"
        if has_collision_energy:
            collision_energy = float(columns["collision_energy"][row])
        else:
            collision_energy = 0.0

        if has_unmangled_name:
            source_file = columns["unmangled_name"][row]

        psm = PSM(source_file, scan, sequence, score, decoy, "NONE", charge)
        psm.ppm_error = ppm_error
        psm.fdr = fdr
        psm.frag_method = frag_method
        psm.collision_energy = collision_energy

        if load_extra_metadata:
            psm.extra_metadata = dict(
                (header, columns[header][row]) for header in extra_headers)

        psms.append(psm)

    return psms
コード例 #37
0
def main():
    """Compute masses for peptides and their prefix/suffix splits.

    argv[1] is a table with a "Peptides" column and argv[2] a ppm tolerance.
    Each peptide is kept whole and also split at positions 4..13 into a
    prefix and a suffix. For every distinct string, the mass at charges 2, 3
    and 4 is stored under "<peptide>.<charge>" and the map is handed to
    find_resolveable_peptides together with the tolerance.
    """
    input_filename = sys.argv[1]
    ppm_tolerance = float(sys.argv[2])
    n_rows, columns = ming_fileio_library.parse_table_with_headers(
        input_filename)

    candidates = []
    for row in range(n_rows):
        peptide = columns["Peptides"][row]
        candidates.append(peptide)
        for cut in range(4, 14):
            candidates.extend([peptide[:cut], peptide[cut:]])

    # Drop duplicates before the mass calculation.
    candidates = list(set(candidates))

    peptide_mass_map = {}
    for peptide in candidates:
        for charge in (2, 3, 4):
            key = peptide + "." + str(charge)
            peptide_mass_map[key] = mass.calculate_mass(sequence=peptide,
                                                        ion_type='M',
                                                        charge=charge)

    # Determine uniqueness
    find_resolveable_peptides(peptide_mass_map, ppm_tolerance)
def load_group_attribute_mappings(metadata_filename):
    """Build group -> files and attribute -> groups maps from a metadata table.

    Only columns whose header contains "ATTRIBUTE_" are used. For each such
    column and each row whose right-stripped "filename" cell is longer than
    two characters, the file is appended to the list for the cell's value
    and the value is added to the set for the column.

    Returns:
        (group_to_files_mapping, attributes_to_groups_mapping), a
        defaultdict(list) and a defaultdict(set).
    """
    n_rows, columns = ming_fileio_library.parse_table_with_headers(metadata_filename)

    attributes_to_groups = defaultdict(set)
    group_to_files = defaultdict(list)
    for header in columns:
        if "ATTRIBUTE_" not in header:
            continue
        for row in range(n_rows):
            sample_file = columns["filename"][row].rstrip()
            if len(sample_file) <= 2:
                continue
            group = columns[header][row]
            group_to_files[group].append(sample_file)
            attributes_to_groups[header].add(group)

    return group_to_files, attributes_to_groups
コード例 #39
0
def main():
    """Rewrite converted file names in a TSV back to the original names.

    argv[1] is the folder of original input files, argv[2] the input TSV and
    argv[3] the output TSV. Each further argument is a "from:to" extension
    pair. For every regular file in the folder whose extension is a "from"
    extension, the name with the "to" extension is replaced by the original
    base name in every cell of the table.
    """
    input_folder = sys.argv[1]
    input_tsvfile = sys.argv[2]
    output_tsvfile = sys.argv[3]

    extension_conversion_mapping = {}
    for arg_index, conversion_parameter in enumerate(sys.argv[4:], 4):
        print(arg_index)
        print(conversion_parameter)
        pieces = conversion_parameter.split(":")
        from_extension = pieces[0]
        to_extension = pieces[1]
        extension_conversion_mapping[from_extension] = to_extension

    # Converted name -> original base name.
    renamed_to_original = {}
    for entry in os.listdir(input_folder):
        full_path = os.path.join(input_folder, entry)
        if not os.path.isfile(full_path):
            continue
        extension = os.path.splitext(full_path)[1][1:]
        if extension not in extension_conversion_mapping:
            continue
        original_name = os.path.basename(full_path)
        stem = os.path.splitext(original_name)[0]
        renamed = stem + "." + extension_conversion_mapping[extension]
        renamed_to_original[renamed] = original_name

    row_count, table_data = ming_fileio_library.parse_table_with_headers(
        input_tsvfile)

    for header in table_data:
        cells = table_data[header]
        for i in range(row_count):
            for converted_name in renamed_to_original:
                cells[i] = cells[i].replace(
                    converted_name, renamed_to_original[converted_name])

    ming_fileio_library.write_dictionary_table_data(table_data, output_tsvfile)
コード例 #40
0
def main():
    """Merge the intermediate tables in argv[1] into one table at argv[2].

    Each column of the output is the concatenation, in file order, of the
    same-named columns of the inputs.
    """
    intermediate_folder = sys.argv[1]
    destination = sys.argv[2]

    partial_paths = ming_fileio_library.list_files_in_dir(intermediate_folder)

    merged = defaultdict(list)
    for partial_path in partial_paths:
        parsed = ming_fileio_library.parse_table_with_headers(partial_path)
        partial_columns = parsed[1]
        for header in partial_columns:
            merged[header].extend(partial_columns[header])

    ming_fileio_library.write_dictionary_table_data(merged, destination)
コード例 #41
0
    def load_gnps_librarysearch(self, identification_filename):
        """Load library-search hits and attach them to their cluster nodes.

        Every row becomes a ClusterLibraryIdentification appended to
        self.identifications. When the row's "#Scan#" value is a key of
        self.index_to_node_map, the hit is also appended to that node's
        library_identifications.
        """
        n_rows, columns = ming_fileio_library.parse_table_with_headers(
            identification_filename)
        for row in range(n_rows):
            name = columns["Compound_Name"][row]
            smiles_string = columns["Smiles"][row]
            inchi_string = columns["INCHI"][row]
            library_id = columns["SpectrumID"][row]
            match_score = columns["MQScore"][row]
            scan_id = columns["#Scan#"][row]

            hit = ClusterLibraryIdentification(
                library_id, name, smiles_string, inchi_string, match_score,
                scan_id)
            self.identifications.append(hit)

            # Link the hit to its cluster node when that node is known.
            if scan_id in self.index_to_node_map:
                node = self.index_to_node_map[scan_id]
                node.library_identifications.append(hit)
コード例 #42
0
def load_identification_file_as_map(input_results_filename):
    """Map integer scan numbers to their identification details.

    Each value is a dict with "identification" (the "Compound_Name" cell)
    and "spectrum_id" (the "SpectrumID" cell). When a scan occurs on several
    rows, the last row wins. The input path is printed before parsing.
    """
    print("Loading", input_results_filename)
    n_rows, columns = ming_fileio_library.parse_table_with_headers(
        input_results_filename)

    by_scan = {}
    for row in range(n_rows):
        scan_number = int(columns["#Scan#"][row])
        by_scan[scan_number] = {
            "identification": columns["Compound_Name"][row],
            "spectrum_id": columns["SpectrumID"][row],
        }
    return by_scan
コード例 #43
0
def parse_msplit_file(filename, load_extra_metadata=False):
    """Parse an MSPLIT style result table into a list of PSM objects.

    Each PSM is built from "internalFilename", "Scan#" (as int),
    "Annotation", "cosine(M,A)" and "Charge". Score and charge are passed on
    exactly as read from the table, decoy is always 0 and protein is always
    "NONE".

    Args:
        filename: path of the table to read.
        load_extra_metadata: accepted but not used by this parser.
    """
    n_rows, columns = ming_fileio_library.parse_table_with_headers(filename)

    psms = []
    for row in range(n_rows):
        source_file = columns["internalFilename"][row]
        scan = int(columns["Scan#"][row])
        score = columns["cosine(M,A)"][row]
        annotation = columns["Annotation"][row]
        charge = columns["Charge"][row]
        psms.append(PSM(source_file, scan, annotation, score, 0, "NONE", charge))
    return psms
コード例 #44
0
def parse_psm_file(filename, load_extra_metadata=False):
    """Parse a headered PSM table into a list of PSM objects.

    Required columns: "filename", "scan", "score", "decoy", "sequence",
    "charge", "ppm_error" and "FDR". Optional columns: "FragMethod"
    (default "N/A"), "collision_energy" (default 0.0) and "unmangled_name",
    which replaces the file name when present. The protein is always "NONE".

    Args:
        filename: path of the table to read.
        load_extra_metadata: when true, every column outside the known
            header list is stored per PSM in its extra_metadata dict.

    Returns:
        List of PSM objects in row order.
    """
    n_rows, columns = ming_fileio_library.parse_table_with_headers(filename)

    standard_headers = set(["filename", "scan", "score", "decoy", "sequence", "charge", "ppm_error", "unmangled_name", "FDR", "collision_energy", "FragMethod"])
    extra_headers = set(columns.keys()).difference(standard_headers)

    # Optional columns are either present for all rows or for none.
    frag_method_present = "FragMethod" in columns
    collision_energy_present = "collision_energy" in columns
    unmangled_name_present = "unmangled_name" in columns

    psms = []
    for row in range(n_rows):
        source_file = columns["filename"][row]
        scan = int(columns["scan"][row])
        score = float(columns["score"][row])
        decoy = int(columns["decoy"][row])
        sequence = columns["sequence"][row]
        charge = int(columns["charge"][row])
        ppm_error = float(columns["ppm_error"][row])
        fdr = float(columns["FDR"][row])

        frag_method = "N/A"
        if frag_method_present:
            frag_method = columns["FragMethod"][row]

        collision_energy = 0.0
        if collision_energy_present:
            collision_energy = float(columns["collision_energy"][row])

        if unmangled_name_present:
            source_file = columns["unmangled_name"][row]

        psm = PSM(source_file, scan, sequence, score, decoy, "NONE", charge)
        psm.ppm_error = ppm_error
        psm.fdr = fdr
        psm.frag_method = frag_method
        psm.collision_energy = collision_energy

        if load_extra_metadata:
            metadata = {}
            for header in extra_headers:
                metadata[header] = columns[header][row]
            psm.extra_metadata = metadata

        psms.append(psm)

    return psms
コード例 #45
0
def main():
    """Compute masses for peptides and their prefix/suffix splits.

    argv[1] is a table with a "Peptides" column and argv[2] a ppm tolerance.
    Each peptide is kept whole and also split at positions 4..13 into a
    prefix and a suffix. For every distinct string, the mass at charges 2, 3
    and 4 is stored under "<peptide>.<charge>" and the map is handed to
    find_resolveable_peptides together with the tolerance.
    """
    input_filename = sys.argv[1]
    ppm_tolerance = float(sys.argv[2])
    n_rows, columns = ming_fileio_library.parse_table_with_headers(input_filename)

    pieces = []
    for row in range(n_rows):
        peptide = columns["Peptides"][row]
        pieces.append(peptide)
        for offset in range(10):
            split_at = offset + 4
            pieces += [peptide[:split_at], peptide[split_at:]]

    # Collapse repeated strings so each mass is computed once.
    pieces = list(set(pieces))

    peptide_mass_map = {}
    for peptide in pieces:
        for charge in (2, 3, 4):
            peptide_mass_map[peptide + "." + str(charge)] = mass.calculate_mass(sequence=peptide, ion_type='M', charge=charge)

    # Determine uniqueness
    find_resolveable_peptides(peptide_mass_map, ppm_tolerance)
コード例 #46
0
def main():
    """Print tab-separated masses for all peptide substrings of length 4..13.

    argv[1] is a table with a "Peptides" column. For every distinct
    substring returned by find_all_substring_of_length (lengths 4 to 13),
    one line "<peptide>\\t<charge>\\t<mass>" is printed for charges 2, 3
    and 4.
    """
    input_filename = sys.argv[1]
    line_counts, table_data = ming_fileio_library.parse_table_with_headers(input_filename)

    all_sub_peptides = []
    for i in range(line_counts):
        peptide = table_data["Peptides"][i]
        for length in range(10):
            all_sub_peptides += find_all_substring_of_length(peptide, length + 4)

    all_sub_peptides = list(set(all_sub_peptides))

    # print() with one parenthesised argument works on Python 2 and 3; the
    # former print statements were a SyntaxError on Python 3.
    for peptide in all_sub_peptides:
        for charge in (2, 3, 4):
            peptide_mass = mass.calculate_mass(sequence=peptide, ion_type='M', charge=charge)
            print(peptide + "\t" + str(charge) + "\t" + str(peptide_mass))
コード例 #47
0
def main():
    """Partition peptides and report how many products remain detectable.

    argv[1] is an identification results file and argv[2] a table with a
    "Peptides" column. Products are mapped to peptides with
    map_products_to_peptide_rt, the peptides are partitioned with
    partition_peptides_random (called with 3), and the total product count,
    the summed detectable count over all partitions and the members of each
    partition are printed.
    """
    input_results_filename = sys.argv[1]
    input_peptide_list_filename = sys.argv[2]

    products_to_rt_map = parse_identification_file(input_results_filename)
    line_counts, table_data = ming_fileio_library.parse_table_with_headers(input_peptide_list_filename)
    all_peptides = table_data["Peptides"]

    full_peptides_to_rt = map_products_to_peptide_rt(products_to_rt_map, all_peptides)
    partitioned_peptide_list = partition_peptides_random(full_peptides_to_rt, 3)

    # print() with one parenthesised argument works on Python 2 and 3; the
    # former print statements were a SyntaxError on Python 3.
    print("Total Products: " + str(len(products_to_rt_map)))
    total_detectable_products = 0
    for peptide_list in partitioned_peptide_list:
        total_detectable_products += count_number_of_acquireable_products(peptide_list, full_peptides_to_rt)
    print("Total Products Detectable: " + str(total_detectable_products))

    for peptide_list in partitioned_peptide_list:
        print("Partition=================")
        for peptide in peptide_list:
            print(peptide)
コード例 #48
0
def parse_variant_file(filename):
    """Parse a headered variant table into a list of PSM objects.

    The charge is read from the "charge" column when it exists; otherwise it
    is the integer after the last "." of the "variant_sequence" cell. The
    file name is taken from "unmangled_name" when that column exists,
    otherwise from "filename". The protein is always "NONE".
    """
    n_rows, columns = ming_fileio_library.parse_table_with_headers(filename)

    # Column presence does not change between rows.
    has_charge_column = "charge" in columns
    has_unmangled_name = "unmangled_name" in columns

    psms = []
    for row in range(n_rows):
        source_file = columns["filename"][row]
        scan = int(columns["scan"][row])
        score = float(columns["score"][row])
        decoy = int(columns["decoy"][row])
        sequence = columns["variant_sequence"][row]

        if has_charge_column:
            charge = int(columns["charge"][row])
        else:
            charge = int(sequence.split(".")[-1])

        if has_unmangled_name:
            source_file = columns["unmangled_name"][row]

        psms.append(PSM(source_file, scan, sequence, score, decoy, "NONE", charge))

    return psms
コード例 #49
0
	def load_pairsinfo(self, pairs_filename):
		"""Load network pairs and extend the adjacency lists.

		When the table has a "CLUSTERID1" header, pairs are read from the
		"CLUSTERID1", "CLUSTERID2", "Cosine" and "DeltaMZ" columns. Otherwise
		the file is re-parsed without headers and columns 0, 1, 4 and 2 are
		used for the same four values. Every pair in self.pairs then adds
		each endpoint to the other's list in self.index_to_neighbors.
		"""
		n_rows, table = ming_fileio_library.parse_table_with_headers(pairs_filename)

		if "CLUSTERID1" in table:
			first_col, second_col, cosine_col, deltamz_col = "CLUSTERID1", "CLUSTERID2", "Cosine", "DeltaMZ"
		else:
			n_rows, table = ming_fileio_library.parse_table_without_headers(pairs_filename)
			first_col, second_col, cosine_col, deltamz_col = 0, 1, 4, 2

		for row in range(n_rows):
			first = table[first_col][row]
			second = table[second_col][row]
			cosine = table[cosine_col][row]
			deltamz = table[deltamz_col][row]
			self.pairs.append(NetworkPair(first, second, cosine, deltamz))

		# Keep the adjacency lists in step with the pair list.
		for pair in self.pairs:
			first = pair.node1
			second = pair.node2

			for endpoint in (first, second):
				if endpoint not in self.index_to_neighbors:
					self.index_to_neighbors[endpoint] = []

			self.index_to_neighbors[first].append(second)
			self.index_to_neighbors[second].append(first)
コード例 #50
0
	def load_clustersummary(self, clustersummaryfilename):
		"""Create one ClusterNode per row of the cluster summary table.

		Each node is built from "precursor mass", "precursor charge",
		"cluster index", "number of spectra" and, when the column exists,
		"componentindex" (otherwise -1). The node is appended to self.nodes
		and registered in self.index_to_node_map under its cluster index. Its
		"AllFiles" string is stored and also split on "###" into
		constituent_spectra.
		"""
		n_rows, columns = ming_fileio_library.parse_table_with_headers(clustersummaryfilename)

		# Column presence is the same for every row.
		has_component_index = "componentindex" in columns

		for row in range(n_rows):
			index = columns["cluster index"][row]
			precursor_mz = columns["precursor mass"][row]
			precursor_charge = columns["precursor charge"][row]
			# Read but not used further, exactly as before.
			parent_mass = columns["parent mass"][row]
			spectra_count = columns["number of spectra"][row]
			files_string = columns["AllFiles"][row]

			component = columns["componentindex"][row] if has_component_index else -1

			node = ClusterNode(precursor_mz, precursor_charge, index, spectra_count, component)
			node.all_files_string = files_string

			self.nodes.append(node)
			self.index_to_node_map[index] = node

			# Expose the individual spectra that make up the cluster.
			node.constituent_spectra = node.all_files_string.split("###")
コード例 #51
0
def main():
    """Annotate library search hits with GNPS library metadata and write a table.

    Command line: ``<input_result_filename> <output_result_filename>``.

    Every row of the input table is looked up by its "LibrarySpectrumID" via
    ming_gnps_library.get_library_spectrum (results are cached per ID). The
    most recently created annotation of the library spectrum supplies the
    descriptive columns; the remaining columns are copied from the input row.
    Rows whose library spectrum cannot be fetched are reported and skipped.
    """
    input_result_filename = sys.argv[1]
    output_result_filename = sys.argv[2]

    # Library spectra fetched so far, keyed by spectrum ID.
    spectrum_id_cache = {}

    input_rows, input_table = ming_fileio_library.parse_table_with_headers(input_result_filename)

    output_table = defaultdict(list)

    output_headers = ["SpectrumID", "Compound_Name", "Ion_Source", "Instrument", "Compound_Source", "PI", "Data_Collector", "Adduct"]
    output_headers += ["Precursor_MZ", "ExactMass", "Charge", "CAS_Number", "Pubmed_ID", "Smiles", "INCHI", "INCHI_AUX", "Library_Class"]
    output_headers += ["IonMode", "UpdateWorkflowName", "LibraryQualityString", "#Scan#", "SpectrumFile", "MQScore", "Organism"]
    output_headers += ["TIC_Query", "RT_Query", "MZErrorPPM", "SharedPeaks", "MassDiff", "LibMZ", "SpecMZ", "SpecCharge"]

    for header in output_headers:
        output_table[header] = []

    # (output column, annotation key) pairs copied from the newest annotation
    # with tab characters removed.
    annotation_columns = [
        ("Compound_Name", "Compound_Name"),
        ("Ion_Source", "Ion_Source"),
        ("Instrument", "Instrument"),
        ("Compound_Source", "Compound_Source"),
        ("PI", "PI"),
        ("Data_Collector", "Data_Collector"),
        ("Adduct", "Adduct"),
        ("Precursor_MZ", "Precursor_MZ"),
        ("ExactMass", "ExactMass"),
        ("Charge", "Charge"),
        ("CAS_Number", "CAS_Number"),
        ("Pubmed_ID", "Pubmed_ID"),
        ("Smiles", "Smiles"),
        ("INCHI", "INCHI"),
        ("INCHI_AUX", "INCHI_AUX"),
        ("Library_Class", "Library_Class"),
        ("IonMode", "Ion_Mode"),
    ]

    # Library_Class -> (UpdateWorkflowName, LibraryQualityString)
    library_class_labels = {
        "1": ("UPDATE-SINGLE-ANNOTATED-GOLD", "Gold"),
        "2": ("UPDATE-SINGLE-ANNOTATED-SILVER", "Silver"),
        "3": ("UPDATE-SINGLE-ANNOTATED-BRONZE", "Bronze"),
        "4": ("UPDATE-SINGLE-ANNOTATED-BRONZE", "Insilico"),
        "5": ("UPDATE-SINGLE-ANNOTATED-BRONZE", "Insilico"),
        "10": ("UPDATE-SINGLE-ANNOTATED-BRONZE", "Challenge"),
    }

    # Number of input rows sharing each FileScanUniqueID.
    number_hits_per_query = defaultdict(lambda: 0)
    for i in range(input_rows):
        number_hits_per_query[input_table["FileScanUniqueID"][i]] += 1

    for i in range(input_rows):
        spectrum_id = input_table["LibrarySpectrumID"][i]
        score = input_table["MQScore"][i]
        filename = input_table["SpectrumFile"][i]
        libfilename = input_table["LibraryName"][i]
        scan = input_table["#Scan#"][i]
        # NOTE(review): TIC_Query / RT_Query are filled from the
        # "UnstrictEvelopeScore" / "p-value" input columns - presumably the
        # search tool reuses those columns; confirm against its output format.
        TIC_Query = input_table["UnstrictEvelopeScore"][i]
        RT_Query = input_table["p-value"][i]
        SpecCharge = input_table["Charge"][i]
        SpecMZ = input_table["SpecMZ"][i]
        MZErrorPPM = input_table["mzErrorPPM"][i]
        SharedPeaks = input_table["LibSearchSharedPeaks"][i]
        MassDiff = input_table["ParentMassDiff"][i]

        print(spectrum_id)
        try:
            if spectrum_id in spectrum_id_cache:
                gnps_library_spectrum = spectrum_id_cache[spectrum_id]
            else:
                gnps_library_spectrum = ming_gnps_library.get_library_spectrum(spectrum_id)
                spectrum_id_cache[spectrum_id] = gnps_library_spectrum
        except Exception as error:
            # Best effort: one failed lookup must not abort the whole table.
            # KeyboardInterrupt/SystemExit are not Exception subclasses and
            # therefore still propagate.
            print("Skipping", spectrum_id, "- could not fetch library spectrum:", error)
            continue

        # Newest annotation first; index 0 is the one reported below.
        gnps_library_spectrum["annotations"] = sorted(gnps_library_spectrum["annotations"], key=lambda annotation: annotation["create_time"], reverse=True)
        newest_annotation = gnps_library_spectrum["annotations"][0]

        output_table["SpectrumID"].append(spectrum_id)
        for column_name, annotation_key in annotation_columns:
            output_table[column_name].append(newest_annotation[annotation_key].replace("\t", ""))

        library_class = newest_annotation["Library_Class"]
        if library_class in library_class_labels:
            workflow_name, quality_string = library_class_labels[library_class]
        else:
            print("BULLLSHIT", library_class)
            # Still emit a value for both columns: leaving them out would make
            # these two columns shorter than the others and shift every
            # following row.
            workflow_name, quality_string = "N/A", "N/A"
        output_table["UpdateWorkflowName"].append(workflow_name)
        output_table["LibraryQualityString"].append(quality_string)

        output_table["#Scan#"].append(scan)
        output_table["SpectrumFile"].append(filename)
        output_table["LibraryName"].append(libfilename)
        output_table["MQScore"].append(score)
        output_table["Organism"].append(gnps_library_spectrum["spectruminfo"]["library_membership"])
        output_table["TIC_Query"].append(TIC_Query)
        output_table["RT_Query"].append(RT_Query)
        output_table["MZErrorPPM"].append(MZErrorPPM)
        output_table["SharedPeaks"].append(SharedPeaks)
        output_table["MassDiff"].append(MassDiff)
        output_table["LibMZ"].append(newest_annotation["Precursor_MZ"])
        output_table["SpecMZ"].append(SpecMZ)
        output_table["SpecCharge"].append(SpecCharge)
        output_table["FileScanUniqueID"].append(input_table["FileScanUniqueID"][i])
        output_table["NumberHits"].append(number_hits_per_query[input_table["FileScanUniqueID"][i]])

        # One "description[type]" entry per tag, joined with "||".
        tag_list = [(tag["tag_desc"] + "[" + tag["tag_type"] + "]") for tag in gnps_library_spectrum["spectrum_tags"]]
        tag_string = "||".join(tag_list).replace("\t", "")

        output_table["tags"].append(tag_string)

    ming_fileio_library.write_dictionary_table_data(output_table, output_result_filename)
コード例 #52
0
def loading_network(filename, hasHeaders=False):
    """Build an nx.MultiGraph from a table of node pairs.

    Args:
        filename: Path of the pairs table.
        hasHeaders: When equal to True the table is parsed by column name
            (CLUSTERID1, CLUSTERID2, DeltaMZ, MEH, Cosine or COSINE,
            OtherScore, optional EdgeAnnotation). Otherwise columns are taken
            by position 0-5 in the order node1, node2, mass difference,
            property1, cosine score, explained intensity.

    Returns:
        A MultiGraph with one edge per table row. Each edge carries the keys
        node1, node2, mass_difference, property1, cosine_score,
        explained_intensity, component (-1), EdgeType ("Cosine"),
        EdgeAnnotation and EdgeScore. An empty graph is returned when the
        parser reports a row count of -1.

    Raises:
        TypeError: A headered table has rows but neither a "Cosine" nor a
            "COSINE" column (the score list is None and gets subscripted).
    """
    if hasHeaders == True:  # noqa: E712 - equality kept so only True/1 select the header parser
        row_count, table_data = ming_fileio_library.parse_table_with_headers(filename)

        if row_count == -1:
            return nx.MultiGraph()

        node1_list = table_data["CLUSTERID1"]
        node2_list = table_data["CLUSTERID2"]

        mass_difference = table_data["DeltaMZ"]
        property1 = table_data["MEH"]
        # "COSINE" wins when both spellings are present.
        cosine_score = None
        if "Cosine" in table_data:
            cosine_score = table_data["Cosine"]
        if "COSINE" in table_data:
            cosine_score = table_data["COSINE"]
        explained_intensity = table_data["OtherScore"]

        # Columns whose length does not match the node column are replaced by
        # the node1 values so that per-row indexing below still works.
        if len(property1) != len(node1_list):
            property1 = node1_list
        if len(explained_intensity) != len(node1_list):
            explained_intensity = node1_list
        if "EdgeAnnotation" in table_data:
            edge_annotation = table_data["EdgeAnnotation"]
        else:
            edge_annotation = [" "] * len(node1_list)

    else:
        row_count, table_data = ming_fileio_library.parse_table_without_headers(filename)

        if row_count == -1:
            return nx.MultiGraph()

        node1_list = table_data[0]
        node2_list = table_data[1]

        mass_difference = table_data[2]
        property1 = table_data[3]
        cosine_score = table_data[4]
        explained_intensity = table_data[5]
        edge_annotation = [" "] * len(node1_list)

    graph_nodes = set()
    edges_to_add = []
    for i in range(row_count):
        edge_attributes = {
            "node1": node1_list[i],
            "node2": node2_list[i],
            "mass_difference": mass_difference[i],
            "property1": property1[i],
            "cosine_score": float(cosine_score[i]),
            "explained_intensity": float(explained_intensity[i]),
            "component": -1,
            "EdgeType": "Cosine",
            "EdgeAnnotation": edge_annotation[i].rstrip(),
            "EdgeScore": float(cosine_score[i]),
        }

        graph_nodes.add(edge_attributes["node1"])
        graph_nodes.add(edge_attributes["node2"])
        edges_to_add.append((edge_attributes["node1"], edge_attributes["node2"], edge_attributes))

    G = nx.MultiGraph()
    G.add_nodes_from(graph_nodes)
    G.add_edges_from(edges_to_add)

    return G
コード例 #53
0
def main():
    """Match query spectra against public datasets and write the matches table.

    Command line: ``<paramxml> <parallel_params_json> <input_spectra_folder>
    <library_search_results> <output_matches>``.

    When the workflow parameter FIND_MATCHES_IN_PUBLIC_DATA is absent or not
    "1", an empty table (headers only) is written and the process exits with
    status 0. Otherwise every match returned by finding_matches_in_public_data
    for "specs_ms.mgf" becomes one output row, extended with the library
    identification of the query scan (if any) and the number of neighbors of
    the matched scan in the dataset's most recent molecular network.
    """
    paramxml_input_filename = sys.argv[1]
    parallel_param_filename = sys.argv[2]
    input_spectra_folder = sys.argv[3]
    library_search_results_filename = sys.argv[4]
    output_matches_filename = sys.argv[5]

    with open(paramxml_input_filename) as paramxml_file:
        params_obj = ming_proteosafe_library.parse_xml_file(paramxml_file)

    output_map = {"specs_filename" : [],"specs_scan" : [], "dataset_filename" : [], "dataset_scan" : [], "score" : [], "dataset_id" : [], "dataset_title" : [], "dataset_neighbors" : [], "Compound_Name" : [], "SpectrumID" : []}

    # Decide first, exit afterwards: calling exit() inside the try block would
    # raise SystemExit into the handler and write the empty table twice.
    try:
        search_public_data = params_obj["FIND_MATCHES_IN_PUBLIC_DATA"][0] == "1"
    except (KeyError, IndexError, TypeError):
        search_public_data = False

    if not search_public_data:
        ming_fileio_library.write_dictionary_table_data(output_map, output_matches_filename)
        sys.exit(0)

    with open(parallel_param_filename) as parallel_param_file:
        params_map = json.load(parallel_param_file)

    # Both partition keys must be present in the parameter file even though
    # the partition slicing below is currently commented out.
    partition_total = params_map["total_paritions"]
    partition_of_node = params_map["node_partition"]

    dataset_dict = params_map["dataset_dict"]
    all_datasets = params_map["all_datasets"]

    #all_datasets = all_datasets[partition_of_node::partition_total]

    all_matches = finding_matches_in_public_data(os.path.join(input_spectra_folder, "specs_ms.mgf"), all_datasets)

    # Library identifications keyed by scan (as read from the table), used to
    # label the query side of each match.
    library_search_result_count, library_search_data = ming_fileio_library.parse_table_with_headers(library_search_results_filename)
    scan_to_library_map = {}
    for i in range(library_search_result_count):
        scan = library_search_data["Scan"][i]
        scan_to_library_map[scan] = {"Compound_Name" : library_search_data["Compound_Name"][i], "SpectrumID" : library_search_data["SpectrumID"][i]}

    no_library_hit = {"Compound_Name" : "", "SpectrumID" : ""}

    for dataset in all_matches:
        dataset_matches = all_matches[dataset]["matches"]
        if len(dataset_matches) == 0:
            continue

        dataset_info = dataset_dict[dataset]

        # The network is looked up once per dataset and reused for all of its matches.
        most_recent_molecular_networking_job = ming_gnps_library.get_most_recent_continuous_networking_of_dataset(dataset_info["task"])
        molecular_network = get_molecular_network_obj(most_recent_molecular_networking_job)

        for match in dataset_matches:
            output_map['specs_filename'].append("specs_ms.mgf")
            output_map['specs_scan'].append(match.query_scan)
            output_map['dataset_id'].append(dataset_info["dataset"])
            output_map['dataset_title'].append(dataset_info["title"])
            output_map['dataset_filename'].append(match.filename)
            output_map['dataset_scan'].append(match.scan)
            output_map['score'].append(match.score)

            # Library identification of the query scan; empty strings when none.
            library_hit = scan_to_library_map.get(str(match.query_scan), no_library_hit)
            output_map['Compound_Name'].append(library_hit["Compound_Name"])
            output_map['SpectrumID'].append(library_hit["SpectrumID"])

            # Neighbor count of the matched scan; 0 when no network is available.
            if molecular_network is not None:
                neighbors_in_dataset = molecular_network.get_node_neighbors(match.scan)
                output_map['dataset_neighbors'].append(len(neighbors_in_dataset))
            else:
                output_map['dataset_neighbors'].append(0)

    ming_fileio_library.write_dictionary_table_data(output_map, output_matches_filename)
コード例 #54
0
def main():
    """Add an ATTRIBUTE_DefaultGroup column to the metadata table and write it out.

    Command line: ``proteosafe_parameters metadata_folder output_metadata_file``.

    Input files are assigned to G1..G6 according to the "specone-" ..
    "specsix-" marker found in their mangled name. If the metadata folder
    holds exactly one file it is used as the starting table; otherwise the
    table starts empty. Every existing row receives the default group of its
    file ("NoDefaultGroup" when unknown), and every input file missing from
    the "filename" column is appended as a new row with "N/A" in all other
    columns.
    """
    parser = argparse.ArgumentParser(description='Creating Clustering Info Summary')
    parser.add_argument('proteosafe_parameters', help='proteosafe_parameters')
    parser.add_argument('metadata_folder', help='metadata_folder')
    parser.add_argument('output_metadata_file', help='output_metadata_file')
    args = parser.parse_args()

    with open(args.proteosafe_parameters) as parameters_file:
        param_obj = ming_proteosafe_library.parse_xml_file(parameters_file)

    mangled_file_mapping = ming_proteosafe_library.get_mangled_file_mapping(param_obj)

    # Checked in this order; when several markers occur in one name the last
    # matching group wins.
    marker_to_group = [
        ("specone-", "G1"),
        ("spectwo-", "G2"),
        ("specthree-", "G3"),
        ("specfour-", "G4"),
        ("specfive-", "G5"),
        ("specsix-", "G6"),
    ]

    # Base name of each input file -> default group.
    file_to_group_mapping = {}
    for mangled_name in mangled_file_mapping:
        for marker, group in marker_to_group:
            if marker in mangled_name:
                file_to_group_mapping[os.path.basename(mangled_file_mapping[mangled_name])] = group

    metadata_files_in_folder = ming_fileio_library.list_files_in_dir(args.metadata_folder)

    row_count = 0
    table_data = defaultdict(list)
    if len(metadata_files_in_folder) == 1:
        row_count, table_data = ming_fileio_library.parse_table_with_headers(metadata_files_in_folder[0])

    print(table_data)
    for key in table_data:
        print(key, len(table_data[key]))

    for i in range(row_count):
        print(i)
        filename = table_data["filename"][i]
        if len(filename) < 2:
            # The row stays in the table, so it still needs a group value;
            # skipping the append would shift the group of every later row.
            table_data["ATTRIBUTE_DefaultGroup"].append("NoDefaultGroup")
            continue
        print(filename, filename[0], filename[-1])

        # Drop one surrounding double quote on either side.
        if filename[0] == "\"":
            filename = filename[1:]
        if filename[-1] == "\"":
            filename = filename[:-1]

        table_data["filename"][i] = filename

        group_name = file_to_group_mapping.get(os.path.basename(filename), "NoDefaultGroup")
        table_data["ATTRIBUTE_DefaultGroup"].append(group_name)

    # Only touch the "filename" column when there is something to compare, so
    # an otherwise empty table does not gain an empty column.
    known_filenames = set(table_data["filename"]) if file_to_group_mapping else set()

    for input_filename, group_name in file_to_group_mapping.items():
        if input_filename in known_filenames:
            continue

        # New row for an input file the metadata did not mention.
        for key in table_data:
            if key != "ATTRIBUTE_DefaultGroup" and key != "filename":
                table_data[key].append("N/A")

        table_data["ATTRIBUTE_DefaultGroup"].append(group_name)
        table_data["filename"].append(input_filename)

    ming_fileio_library.write_dictionary_table_data(table_data, args.output_metadata_file)
コード例 #55
0
def add_clusterinfo_summary_to_graph(G, cluster_info_summary_filename):
    """Copy columns of a cluster info summary table onto the matching graph nodes.

    Rows whose "cluster index" is not a node of G are ignored. For matching
    nodes the following attributes are set, in this order:

    * the default columns, converted to their declared type; a value that is
      missing or cannot be converted becomes 0.0 / 0 / "N/A";
    * G1..G6 as floats (0.0 on failure);
    * every column whose header contains "GNPSGROUP", as int, else float,
      else -1;
    * every column whose header contains "ATTRIBUTE_", unconverted ("" on
      failure);
    * the optional columns, handled like the default ones but only when the
      column exists in the table.

    Args:
        G: networkx graph whose node attribute dicts are updated in place.
        cluster_info_summary_filename: Path of the summary table.
    """
    row_count, table_data = ming_fileio_library.parse_table_with_headers(cluster_info_summary_filename)

    default_listed_columns = [("precursor mass", "float"), \
    ("charge", "int"), \
    ("parent mass", "float"), \
    ("number of spectra", "int"), \
    ("cluster index", "int"), \
    ("sum(precursor intensity)", "float"), \
    ("RTMean", "float"), \
    ("AllGroups", "string"), ("DefaultGroups", "string"), \
    ("RTConsensus", "float"), ("UniqueFileSources", "string")]

    optional_listed_columns = [("Correlated Features Group ID", "string"), \
    ("Annotated Adduct Features ID", "string"), \
    ("Best Ion", "string"), \
    ("neutral M mass", "float"), \
    ("MS2 Verification Comment", "string"), \
    ("ProteoSAFeClusterLink", "string"), \
    ("GNPSLinkout_Cluster", "string"), \
    ("GNPSLinkout_Network", "string"), ("componentindex", "string")]

    group_columns = ["G1", "G2", "G3", "G4", "G5", "G6"]

    # Declared column type -> (converter, value stored when conversion fails).
    coercions = {
        "float": (float, 0.0),
        "int": (int, 0),
        "string": (str, "N/A"),
    }

    # Graph.node was removed in NetworkX 2.4; Graph.nodes is subscriptable
    # from 2.0 on. Releases where Graph.nodes is a plain method fall back to
    # the old Graph.node mapping.
    node_attributes = G.nodes if hasattr(G.nodes, "__getitem__") else G.node

    def typed_value(column_name, type_name, row):
        """Return the cell converted to type_name, or that type's fallback."""
        converter, fallback = coercions[type_name]
        try:
            return converter(table_data[column_name][row])
        except Exception:
            return fallback

    group_headers = [header for header in table_data if "GNPSGROUP" in header]
    attribute_headers = [header for header in table_data if "ATTRIBUTE_" in header]
    present_optional_columns = [column for column in optional_listed_columns if column[0] in table_data]

    for i in range(row_count):
        cluster_index = table_data["cluster index"][i]

        if cluster_index not in node_attributes:
            continue

        attributes = node_attributes[cluster_index]

        for key_name, type_name in default_listed_columns:
            attributes[key_name] = typed_value(key_name, type_name, i)

        for group_name in group_columns:
            attributes[group_name] = typed_value(group_name, "float", i)

        # Group columns: prefer an integer, accept a float, otherwise -1.
        for header in group_headers:
            try:
                attributes[header] = int(table_data[header][i])
            except Exception:
                try:
                    attributes[header] = float(table_data[header][i])
                except Exception:
                    attributes[header] = -1

        # Attribute columns are copied without conversion.
        for header in attribute_headers:
            try:
                attributes[header] = table_data[header][i]
            except Exception:
                attributes[header] = ""

        for key_name, type_name in present_optional_columns:
            attributes[key_name] = typed_value(key_name, type_name, i)
コード例 #56
0
def _write_default_group_lines(output_group_file, mangled_names, file_path_prefix):
    """Write one "GROUP_G<n>=" line per default group listing its mangled files."""
    marker_to_group = [
        ("spec-", "G1"),
        ("spectwo-", "G2"),
        ("specthree-", "G3"),
        ("specfour-", "G4"),
        ("specfive-", "G5"),
        ("specsix-", "G6"),
    ]

    default_groupings = {group: [] for _, group in marker_to_group}
    for mangled_name in mangled_names:
        for marker, group in marker_to_group:
            if marker in mangled_name:
                default_groupings[group].append(mangled_name.rstrip())

    # A group without files still gets its (empty) line.
    for group, members in default_groupings.items():
        member_paths = [os.path.join(file_path_prefix, member) for member in members]
        output_group_file.write("GROUP_" + group + "=" + ";".join(member_paths) + "\n")


def _write_metadata_group_lines(row_count, table_data, reverse_file_mangling, file_path_prefix, output_group_file, output_attribute_file):
    """Derive group and attribute lines from the ATTRIBUTE_ columns of a metadata table."""
    attribute_headers = [key for key in table_data if "ATTRIBUTE_" in key]

    attributes_to_groups_mapping = defaultdict(set)
    group_to_files_mapping = defaultdict(list)
    for i in range(row_count):
        basename_filename = os.path.basename(table_data["filename"][i]).rstrip()
        if basename_filename not in reverse_file_mangling:
            # Filename is not part of the sample set.
            continue

        mangled_path = os.path.join(file_path_prefix, reverse_file_mangling[basename_filename])
        for key in attribute_headers:
            group_name = table_data[key][i]
            if len(group_name) < 1:
                continue
            group_to_files_mapping[group_name].append(mangled_path)
            attributes_to_groups_mapping[key.replace("ATTRIBUTE_", "")].add(group_name)

    for group_name in group_to_files_mapping:
        output_group_file.write("GROUP_" + group_name + "=" + ";".join(group_to_files_mapping[group_name]) + "\n")

    for attribute_name in attributes_to_groups_mapping:
        output_attribute_file.write(attribute_name + "=" + ";".join(list(attributes_to_groups_mapping[attribute_name])) + "\n")


def _write_legacy_group_lines(groupmapping_filename, reverse_file_mangling, file_path_prefix, output_group_file):
    """Rewrite a "name=file;file" group mapping file in terms of mangled file paths."""
    with open(groupmapping_filename, errors='ignore') as groupmapping_file:
        for line in groupmapping_file:
            splits = line.rstrip().split("=")
            if len(splits) < 2:
                continue

            group_name = splits[0]
            group_files = []
            for filename in splits[1].split(";"):
                basename_filename = os.path.basename(filename)
                # Files that are not part of the input are dropped from the group.
                if basename_filename in reverse_file_mangling:
                    group_files.append(os.path.join(file_path_prefix, reverse_file_mangling[basename_filename]))

            output_group_file.write(group_name + "=" + ";".join(group_files) + "\n")


def main():
    """Write the group and attribute mapping files for a workflow run.

    The default groups G1..G6 are always written first. If the metadata
    folder holds exactly one file, groups and attributes are derived from its
    ATTRIBUTE_ columns and the process exits with status 0. Otherwise the
    legacy group mapping and attribute mapping files are used when exactly
    one of each is present. More than one metadata file, more than one legacy
    mapping file, or a metadata file without a "filename" header ends the
    process with status 1.

    Both output files are opened in a ``with`` block so they are flushed and
    closed on every path, including the early exits.
    """
    parser = argparse.ArgumentParser(description='Group Mapping from input, defaults and metadata file')
    parser.add_argument('proteosafe_parameters', help='proteosafe_parameters')
    parser.add_argument('groupmapping_folder', help='groupmapping_folder')
    parser.add_argument('attributemapping_folder', help='attributemapping_folder')
    parser.add_argument('metadata_folder', help='metadata_folder')
    parser.add_argument('output_groupmapping_file', help='output_groupmapping_file')
    parser.add_argument('output_attributemapping_file', help='output_attributemapping_file')
    parser.add_argument('inputspectrafolder', help='inputspectrafolder')
    args = parser.parse_args()

    with open(args.proteosafe_parameters) as parameters_file:
        param_obj = ming_proteosafe_library.parse_xml_file(parameters_file)
    mangled_file_mapping = ming_proteosafe_library.get_mangled_file_mapping(param_obj)
    reverse_file_mangling = ming_proteosafe_library.get_reverse_mangled_file_mapping(param_obj)
    file_path_prefix = args.inputspectrafolder

    with open(args.output_groupmapping_file, "w") as output_group_file, \
            open(args.output_attributemapping_file, "w") as output_attribute_file:
        _write_default_group_lines(output_group_file, mangled_file_mapping.keys(), file_path_prefix)

        # Decide between the metadata file and the legacy mapping files.
        metadata_files_in_folder = ming_fileio_library.list_files_in_dir(args.metadata_folder)
        groupmapping_files_in_folder = ming_fileio_library.list_files_in_dir(args.groupmapping_folder)
        attributemapping_files_in_folder = ming_fileio_library.list_files_in_dir(args.attributemapping_folder)

        if len(metadata_files_in_folder) > 1:
            print("Too many metafile inputted")
            exit(1)
        if len(metadata_files_in_folder) == 1:
            row_count, table_data = ming_fileio_library.parse_table_with_headers(metadata_files_in_folder[0])

            if not "filename" in table_data:
                print("Missing 'filename' header in metadata file. Please specify the file name that goes along with each piece of metadata with the header: filename")
                exit(1)

            _write_metadata_group_lines(row_count, table_data, reverse_file_mangling, file_path_prefix, output_group_file, output_attribute_file)
            exit(0)

        # No metadata file: fall back on the old group/attribute mapping files.
        if len(groupmapping_files_in_folder) > 1 or len(attributemapping_files_in_folder) > 1:
            print("Too many group/attribute mappings inputted")
            exit(1)

        if len(groupmapping_files_in_folder) == 1:
            _write_legacy_group_lines(groupmapping_files_in_folder[0], reverse_file_mangling, file_path_prefix, output_group_file)

        if len(attributemapping_files_in_folder) == 1:
            # The attribute mapping is passed through unchanged.
            with open(attributemapping_files_in_folder[0]) as attributemapping_file:
                for line in attributemapping_file:
                    output_attribute_file.write(line)
コード例 #57
0
def main():
    """Annotate library-search results with metadata from the GNPS library.

    Command-line arguments:
        sys.argv[1]: path of the input result table, read with
            ``ming_fileio_library.parse_table_with_headers``.
        sys.argv[2]: path the annotated table is written to with
            ``ming_fileio_library.write_dictionary_table_data``.

    For each input row the spectrum named by ``LibrarySpectrumID`` is fetched
    via ``ming_gnps_library.get_library_spectrum``. Rows whose lookup raises
    are skipped; all other rows produce exactly one value in every output
    column, so the columns always have equal length.
    """
    input_result_filename = sys.argv[1]
    output_result_filename = sys.argv[2]

    input_rows, input_table = ming_fileio_library.parse_table_with_headers(input_result_filename)

    # Fields copied verbatim from the first annotation of the library spectrum.
    annotation_fields = [
        "Compound_Name", "Ion_Source", "Instrument", "Compound_Source", "PI",
        "Data_Collector", "Adduct", "Precursor_MZ", "ExactMass", "Charge",
        "CAS_Number", "Pubmed_ID", "Smiles", "INCHI", "INCHI_AUX", "Library_Class",
    ]

    output_headers = ["SpectrumID"] + annotation_fields
    output_headers += ["IonMode", "UpdateWorkflowName", "LibraryQualityString", "#Scan#", "SpectrumFile", "MQScore", "Organism"]
    output_headers += ["TIC_Query", "RT_Query", "MZErrorPPM", "SharedPeaks", "MassDiff", "LibMZ", "SpecMZ", "SpecCharge"]
    # These two columns used to appear only once a row had been written;
    # declaring them up front keeps the output schema independent of the data.
    output_headers += ["LibraryName", "tags"]

    # Library_Class -> (UpdateWorkflowName, LibraryQualityString)
    quality_by_class = {
        "1": ("UPDATE-SINGLE-ANNOTATED-GOLD", "Gold"),
        "2": ("UPDATE-SINGLE-ANNOTATED-SILVER", "Silver"),
        "3": ("UPDATE-SINGLE-ANNOTATED-BRONZE", "Bronze"),
        "4": ("UPDATE-SINGLE-ANNOTATED-BRONZE", "Insilico"),
        "10": ("UPDATE-SINGLE-ANNOTATED-BRONZE", "Challenge"),
    }
    # Without a fallback an unrecognised class left these two columns one
    # entry short, misaligning every following row.
    # NOTE(review): placeholder values chosen here - confirm what downstream
    # consumers expect for unknown library classes.
    unknown_quality = ("UPDATE-SINGLE-ANNOTATED-BRONZE", "Unknown")

    output_table = defaultdict(list)
    for header in output_headers:
        output_table[header] = []

    for i in range(input_rows):
        spectrum_id = input_table["LibrarySpectrumID"][i]

        # Query-side values are read before the library lookup, so a missing
        # input column fails immediately rather than after the lookup.
        row = {
            "SpectrumID": spectrum_id,
            "MQScore": input_table["MQScore"][i],
            "SpectrumFile": input_table["SpectrumFile"][i],
            "LibraryName": input_table["LibraryName"][i],
            "#Scan#": input_table["#Scan#"][i],
            "TIC_Query": input_table["UnstrictEvelopeScore"][i],
            "RT_Query": input_table["p-value"][i],
            "SpecCharge": input_table["Charge"][i],
            "SpecMZ": input_table["SpecMZ"][i],
            "MZErrorPPM": input_table["mzErrorPPM"][i],
            "SharedPeaks": input_table["LibSearchSharedPeaks"][i],
            "MassDiff": input_table["ParentMassDiff"][i],
        }

        print(spectrum_id)
        try:
            gnps_library_spectrum = ming_gnps_library.get_library_spectrum(spectrum_id)
        except Exception:
            # Best effort: a spectrum that cannot be fetched is left out of
            # the output. KeyboardInterrupt and SystemExit are not subclasses
            # of Exception and therefore still propagate.
            continue

        annotation = gnps_library_spectrum["annotations"][0]
        for field in annotation_fields:
            row[field] = annotation[field]
        row["IonMode"] = annotation["Ion_Mode"]
        row["LibMZ"] = annotation["Precursor_MZ"]
        row["Organism"] = gnps_library_spectrum["spectruminfo"]["library_membership"]

        workflow_name, quality_string = quality_by_class.get(str(annotation["Library_Class"]), unknown_quality)
        row["UpdateWorkflowName"] = workflow_name
        row["LibraryQualityString"] = quality_string

        # Tabs are stripped from each description before joining; join() never
        # leaves a trailing separator, including for a single very short tag.
        row["tags"] = "||".join(
            tag["tag_desc"].replace("\t", "") for tag in gnps_library_spectrum["spectrum_tags"]
        )

        # Append the finished row in one pass so every column grows together.
        for header in output_headers:
            output_table[header].append(row[header])

    ming_fileio_library.write_dictionary_table_data(output_table, output_result_filename)