Example 1
def main():
    parallel_json = json.loads(open(sys.argv[1]).read())
    params_filename = sys.argv[2]
    task_id_file = sys.argv[3]
    output_peptide_folder = sys.argv[4]
    output_psm_folder = sys.argv[5]
    #output_summary = sys.argv[5]
    params_dict = ming_proteosafe_library.parse_xml_file(open(params_filename))

    source_tasks_text = params_dict["tasks_to_consolidate"][0]

    row_count, task_file_table = ming_fileio_library.parse_table_with_headers(
        task_id_file)

    my_node = parallel_json["node_partition"]
    total_node = parallel_json["total_paritions"]

    output_summary = os.path.join(sys.argv[6], str(my_node))

    if len(source_tasks_text) > 0:
        source_tasks_list = json.loads(source_tasks_text)
        source_tasks_list += task_file_table["TASKID"]
        source_tasks_list.sort()
        source_tasks_list = source_tasks_list[my_node::total_node]
        grab_all_results(source_tasks_list, output_peptide_folder,
                         output_psm_folder, output_summary, params_dict)
    else:
        open(output_summary, "w").write("None")
def main():
    parser = argparse.ArgumentParser(description='Creating Clustering Info Summary')
    parser.add_argument('proteosafe_parameters', help='proteosafe_parameters')
    parser.add_argument('networking_pairs_results_file', help='networking_pairs_results_file')
    parser.add_argument('networking_pairs_results_file_filtered', help='networking_pairs_results_file_filtered')
    parser.add_argument('networking_pairs_results_file_filtered_classic_output', help='networking_pairs_results_file_filtered_classic_output')
    args = parser.parse_args()

    param_obj = ming_proteosafe_library.parse_xml_file(open(args.proteosafe_parameters))

    top_k_val = 10
    max_component_size = 0

    if "TOPK" in param_obj:
        top_k_val = int(param_obj["TOPK"][0])

    if "MAXIMUM_COMPONENT_SIZE" in param_obj:
        max_component_size = int(param_obj["MAXIMUM_COMPONENT_SIZE"][0])

    G = molecular_network_filtering_library.loading_network(args.networking_pairs_results_file, hasHeaders=True)
    if G is None:
        exit(0)

    molecular_network_filtering_library.filter_top_k(G, top_k_val)
    molecular_network_filtering_library.filter_component(G, max_component_size)
    molecular_network_filtering_library.output_graph_with_headers(G, args.networking_pairs_results_file_filtered)

    molecular_network_filtering_library.output_graph(G, args.networking_pairs_results_file_filtered_classic_output)
Example 3
def main():
    paramxml_input_filename = sys.argv[1]
    output_json_folder = sys.argv[2]
    parallelism = int(sys.argv[3])

    params_obj = ming_proteosafe_library.parse_xml_file(
        open(paramxml_input_filename))

    try:
        if params_obj["FIND_MATCHES_IN_PUBLIC_DATA"][0] != "1":
            parallelism = 1
    except:
        parallelism = 1

    #dataset_dict = ming_proteosafe_library.get_all_dataset_dict()
    all_datasets = ming_proteosafe_library.get_all_datasets()

    for i in range(parallelism):
        output_map = {"node_partition": i, "total_paritions": parallelism}
        partitioned_datasets = all_datasets[i::parallelism]
        output_map["all_datasets"] = partitioned_datasets

        dataset_map = {}
        for dataset in partitioned_datasets:
            dataset_map[dataset["dataset"]] = dataset

        output_map["dataset_dict"] = dataset_map
        output_filename = os.path.join(output_json_folder, str(i) + ".json")
        open(output_filename, "w").write(json.dumps(output_map))
Example 4
def create_bucket_from_clusterinfo(cluster_info_filename, param_filename, clusterinfosummary_filename, output_filename, metadata_mapping):
    output_file = open(output_filename, "w")
    line_counts, table_data = ming_fileio_library.parse_table_with_headers(cluster_info_filename)
    param_object = ming_proteosafe_library.parse_xml_file(open(param_filename, "r"))
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(param_object)


    clusters_in_network = set()
    for row in csv.DictReader(open(clusterinfosummary_filename), delimiter='\t'):
        clusters_in_network.add(row["cluster index"])

    cluster_index_to_file_map = {}

    clusters_map = {}
    all_files = {}
    for i in range(line_counts):
        cluster_number = table_data["#ClusterIdx"][i]
        if not(cluster_number in clusters_in_network):
            continue

        if not (cluster_number in clusters_map):
            clusters_map[cluster_number] = []
            cluster_index_to_file_map[cluster_number] = {}
            #Adding all file names to mapping
            for mangled_name in mangled_mapping.keys():
                cluster_index_to_file_map[cluster_number][mangled_name] = 0.0

        #print table_data["#Filename"][i].split("/")[1]
        mangled_filename_only = os.path.basename(table_data["#Filename"][i])
        cluster_index_to_file_map[cluster_number][mangled_filename_only] += max(float(table_data["#PrecIntensity"][i]), 1.0)
        spectrum_info = {"filename":table_data["#Filename"][i], "intensity": table_data["#PrecIntensity"][i]}
        all_files[table_data["#Filename"][i]] = 1
        clusters_map[cluster_number].append(spectrum_info)

    output_header_list = []
    output_header_list.append("#OTU ID")
    for header in mangled_mapping.keys():
        if header.find("spec") == -1:
            continue
        if os.path.basename(mangled_mapping[header]) in metadata_mapping:
            output_header_list.append(metadata_mapping[os.path.basename(mangled_mapping[header])])
        else:
            output_header_list.append(ming_fileio_library.get_filename_without_extension(os.path.basename(mangled_mapping[header])))

    output_file.write("\t".join(output_header_list) + "\n")

    for cluster_idx in cluster_index_to_file_map:
        line_output_list = []
        line_output_list.append(str(cluster_idx))
        #line_string = str(cluster_idx) + "\t"
        for header in mangled_mapping.keys():
            if header.find("spec") == -1:
                continue
            line_output_list.append(str(cluster_index_to_file_map[cluster_idx][header]))
            #line_string += str(cluster_index_to_file_map[cluster_idx][header]) + "\t"

        #print line_string
        #output_file.write(line_string + "\n")
        output_file.write("\t".join(line_output_list) + "\n")
    output_file.close()
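
The function above writes a tab-separated bucket table with one row per cluster and one column per input file, each cell holding the summed precursor intensity. A minimal sketch of that layout, assuming hypothetical cluster/file records:

import sys
from collections import defaultdict

# Hypothetical per-spectrum records: (cluster index, mangled filename, precursor intensity).
records = [("1", "spec-00000.mzXML", 1200.0),
           ("1", "spec-00001.mzXML", 800.0),
           ("2", "spec-00000.mzXML", 300.0)]
filenames = ["spec-00000.mzXML", "spec-00001.mzXML"]

# Sum intensity per (cluster, file), defaulting missing cells to 0.0.
bucket = defaultdict(lambda: {name: 0.0 for name in filenames})
for cluster, filename, intensity in records:
    bucket[cluster][filename] += intensity

# "#OTU ID" header followed by one intensity column per file, as in the function above.
sys.stdout.write("\t".join(["#OTU ID"] + filenames) + "\n")
for cluster in bucket:
    sys.stdout.write("\t".join([cluster] + [str(bucket[cluster][f]) for f in filenames]) + "\n")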
Example 5
def main():
    usage()

    output_file_path = sys.argv[2]
    input_file_path = sys.argv[4]
    params_file_path = sys.argv[5]
    top_k_val = 10
    max_component_size = 0

    params = ming_proteosafe_library.parse_xml_file(open(
        params_file_path, "r"))

    if "TOPK" in params:
        top_k_val = int(params["TOPK"][0])

    if "MAXIMUM_COMPONENT_SIZE" in params:
        max_component_size = int(params["MAXIMUM_COMPONENT_SIZE"][0])

    #Doing other filtering
    G = molecular_network_filtering_library.loading_network(input_file_path,
                                                            hasHeaders=True)
    #Returning None means that there are no edges in the output
    if G is None:
        exit(0)
    molecular_network_filtering_library.filter_top_k(G, top_k_val)
    molecular_network_filtering_library.filter_component(G, max_component_size)
    molecular_network_filtering_library.output_graph(G, output_file_path)
def main():
    paramxml_input_filename = sys.argv[1]
    output_json_folder = sys.argv[2]
    parallelism = int(sys.argv[3])

    params_obj = ming_proteosafe_library.parse_xml_file(open(paramxml_input_filename))

    try:
       if params_obj["FIND_MATCHES_IN_PUBLIC_DATA"][0] != "1":
           parallelism = 1
    except:
       parallelism = 1

    #dataset_dict = ming_proteosafe_library.get_all_dataset_dict()
    all_datasets = ming_proteosafe_library.get_all_datasets()

    for i in range(parallelism):
        output_map = {"node_partition" : i, "total_paritions" : parallelism}
        partitioned_datasets = all_datasets[i::parallelism]
        output_map["all_datasets"] = partitioned_datasets

        dataset_map = {}
        for dataset in partitioned_datasets:
            dataset_map[dataset["dataset"]] = dataset

        output_map["dataset_dict"] = dataset_map
        output_filename = os.path.join(output_json_folder, str(i) + ".json")
        open(output_filename, "w").write(json.dumps(output_map))
Example 7
def main():
    usage()

    output_file_path = sys.argv[2]
    input_file_path = sys.argv[4]
    params_file_path = sys.argv[5]
    top_k_val = 10
    max_component_size = 0

    params = ming_proteosafe_library.parse_xml_file(open(params_file_path, "r"))

    if "TOPK" in params:
        top_k_val = int(params["TOPK"][0])

    if "MAXIMUM_COMPONENT_SIZE" in params:
        max_component_size = int(params["MAXIMUM_COMPONENT_SIZE"][0])

    #Doing other filtering
    G = molecular_network_filtering_library.loading_network(input_file_path, hasHeaders=True)
    #Returning None means that there are no edges in the output
    if G is None:
        exit(0)
    molecular_network_filtering_library.filter_top_k(G, top_k_val)
    molecular_network_filtering_library.filter_component(G, max_component_size)
    molecular_network_filtering_library.output_graph(G, output_file_path)
def create_bucket_from_clusterinfo(cluster_info_filename, param_filename, clusterinfosummary_filename, output_filename, metadata_mapping):
    output_file = open(output_filename, "w")
    line_counts, table_data = ming_fileio_library.parse_table_with_headers(cluster_info_filename)
    param_object = ming_proteosafe_library.parse_xml_file(open(param_filename, "r"))
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(param_object)


    clusters_in_network = set()
    for row in csv.DictReader(open(clusterinfosummary_filename), delimiter='\t'):
        clusters_in_network.add(row["cluster index"])

    cluster_index_to_file_map = {}

    clusters_map = {}
    all_files = {}
    for i in range(line_counts):
        cluster_number = table_data["#ClusterIdx"][i]
        if not(cluster_number in clusters_in_network):
            continue

        if not (cluster_number in clusters_map):
            clusters_map[cluster_number] = []
            cluster_index_to_file_map[cluster_number] = {}
            #Adding all file names to mapping
            for mangled_name in mangled_mapping.keys():
                cluster_index_to_file_map[cluster_number][mangled_name] = 0.0

        #print table_data["#Filename"][i].split("/")[1]
        mangled_filename_only = os.path.basename(table_data["#Filename"][i])
        cluster_index_to_file_map[cluster_number][mangled_filename_only] += max(float(table_data["#PrecIntensity"][i]), 1.0)
        spectrum_info = {"filename":table_data["#Filename"][i], "intensity": table_data["#PrecIntensity"][i]}
        all_files[table_data["#Filename"][i]] = 1
        clusters_map[cluster_number].append(spectrum_info)

    output_header_list = []
    output_header_list.append("#OTU ID")
    for header in mangled_mapping.keys():
        if header.find("spec") == -1:
            continue
        if os.path.basename(mangled_mapping[header]) in metadata_mapping:
            output_header_list.append(metadata_mapping[os.path.basename(mangled_mapping[header])])
        else:
            output_header_list.append(ming_fileio_library.get_filename_without_extension(os.path.basename(mangled_mapping[header])))

    output_file.write("\t".join(output_header_list) + "\n")

    for cluster_idx in cluster_index_to_file_map:
        line_output_list = []
        line_output_list.append(str(cluster_idx))
        #line_string = str(cluster_idx) + "\t"
        for header in mangled_mapping.keys():
            if header.find("spec") == -1:
                continue
            line_output_list.append(str(cluster_index_to_file_map[cluster_idx][header]))
            #line_string += str(cluster_index_to_file_map[cluster_idx][header]) + "\t"

        #print line_string
        #output_file.write(line_string + "\n")
        output_file.write("\t".join(line_output_list) + "\n")
    output_file.close()
def main():
    input_param = ming_proteosafe_library.parse_xml_file(open(sys.argv[1]))
    input_folder = sys.argv[2]
    output_file = sys.argv[3]
    scratch_folder = sys.argv[4]
    path_to_executable = sys.argv[5]
    path_to_isotopes_table = sys.argv[6]

    #parent_mass_tolerance = input_param[]
    parent_mass_tolerance = 0.05

    all_input_file_paths = ming_fileio_library.list_files_in_dir(input_folder)

    output_kl_intermediates = []
    for input_file in all_input_file_paths:
        output_kl_file = os.path.join(scratch_folder,
                                      os.path.basename(input_file) + ".kl")
        cmd = path_to_executable + " --input " + input_file + " --output_summary " + output_kl_file + " " + "--peak_tolerance " + str(
            parent_mass_tolerance
        ) + " --isotope_file " + path_to_isotopes_table + "  >/dev/null 2>&1 "
        print(cmd)
        os.system(cmd)
        #subprocess.call([cmd])
        output_kl_intermediates.append(output_kl_file)

    combined_table = defaultdict(list)
    for output_kl_file in output_kl_intermediates:
        row_count, table_data = ming_fileio_library.parse_table_with_headers(
            output_kl_file)
        for key in table_data:
            combined_table[key] += table_data[key]

    ming_fileio_library.write_dictionary_table_data(combined_table,
                                                    output_file)
def create_bucket_from_clusterinfo(cluster_info_filename, param_filename,
                                   clusterinfosummary_filename,
                                   output_filename):
    param_object = ming_proteosafe_library.parse_xml_file(
        open(param_filename, "r"))
    output_file = open(output_filename, "w")
    if param_object["CREATE_CLUSTER_BUCKETS"][0] != "1":
        output_file.write("No Output")
        return

    test_network = molecular_network_library.MolecularNetwork()
    test_network.load_clustersummary(clusterinfosummary_filename)

    line_counts, table_data = ming_fileio_library.parse_table_with_headers(
        cluster_info_filename)

    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(
        param_object)

    cluster_index_to_file_map = {}

    clusters_map = {}
    all_files = {}
    for i in range(line_counts):
        cluster_number = table_data["#ClusterIdx"][i]
        if test_network.get_cluster_index(cluster_number) is None:
            continue

        if not (cluster_number in clusters_map):
            clusters_map[cluster_number] = []
            cluster_index_to_file_map[cluster_number] = {}
            #Adding all file names to mapping
            for mangled_name in mangled_mapping.keys():
                cluster_index_to_file_map[cluster_number][mangled_name] = 0.0

        #print table_data["#Filename"][i].split("/")[1]
        mangled_filename_only = os.path.basename(table_data["#Filename"][i])
        cluster_index_to_file_map[cluster_number][
            mangled_filename_only] += float(table_data["#PrecIntensity"][i])
        spectrum_info = {
            "filename": table_data["#Filename"][i],
            "intensity": table_data["#PrecIntensity"][i]
        }
        all_files[table_data["#Filename"][i]] = 1
        clusters_map[cluster_number].append(spectrum_info)

    output_header = "#OTU ID\t"
    for header in mangled_mapping.keys():
        output_header += os.path.basename(mangled_mapping[header]) + "\t"

    output_file.write(output_header + "\n")

    for cluster_idx in cluster_index_to_file_map:
        line_string = str(cluster_idx) + "\t"
        for header in mangled_mapping.keys():
            line_string += str(
                cluster_index_to_file_map[cluster_idx][header]) + "\t"

        #print line_string
        output_file.write(line_string + "\n")
def main():
    params = ming_proteosafe_library.parse_xml_file(open(sys.argv[1]))
    proteome = ming_protein_library.parse_fasta_proteome_file(sys.argv[2])

    row_count, table_data = ming_fileio_library.parse_table_with_headers(sys.argv[3])
    decoy_marker = sys.argv[5]

    add_decoy_to_results(table_data, row_count, decoy_marker)
    psm_results = add_fdr_to_results(table_data, row_count)

    output_table = defaultdict(list)

    #Performing filters
    filter_type = params["filter.filter"][0]
    if filter_type == "FDR":
        fdr_threshold = float(params["FDR.FDR"][0])
        for psm in psm_results:
            if psm["QValue"] < fdr_threshold:
                for key in psm:
                    output_table[key].append(psm[key])
    if filter_type == "PepFDR":
        fdr_threshold = float(params["PepFDR.PepFDR"][0])
        for psm in psm_results:
            if psm["PepQValue"] < fdr_threshold and psm["QValue"] < fdr_threshold:
                for key in psm:
                    output_table[key].append(psm[key])
    if filter_type == "FPR":
        print("Lets do nothing, don't know what this is")

    ming_fileio_library.write_dictionary_table_data(output_table, sys.argv[4])
def load_parameters_file(self, paramsfilename):
    #Loading the file mapping
    parameters = ming_proteosafe_library.parse_xml_file(
        open(paramsfilename, "r"))
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(
        parameters)
    self.mangled_mapping = mangled_mapping
def main():
    parser = argparse.ArgumentParser(description='Create parallel parameters')
    parser.add_argument('mgf_filename', help='Input mgf file to network')
    parser.add_argument('workflow_parameters',
                        help='proteosafe xml parameters')
    parser.add_argument('parameters_output_folder',
                        help='output folder for parameters')
    parser.add_argument('--parallelism',
                        default=1,
                        type=int,
                        help='Parallelism')
    args = parser.parse_args()

    params_object = ming_proteosafe_library.parse_xml_file(
        open(args.workflow_parameters))

    #Determining number of spectra in mgf file
    number_of_spectra, number_real_spectra = number_scans_in_mgf_file(
        args.mgf_filename)

    parallelism = args.parallelism
    if parallelism > number_of_spectra:
        parallelism = 1

    recommended_parallelism = max(1, int(number_real_spectra / 1000))

    print("recommended_parallelism", recommended_parallelism)

    parallelism = min(recommended_parallelism, parallelism)

    number_per_partition = int(number_of_spectra / parallelism)

    for i in range(parallelism):
        output_parameter_file = open(
            os.path.join(args.parameters_output_folder,
                         str(i) + ".params"), "w")
        output_parameter_file.write("ALIGNS_FORMAT=%s\n" % ("tsv"))
        output_parameter_file.write("MIN_MATCHED_PEAKS=%s\n" %
                                    (params_object["MIN_MATCHED_PEAKS"][0]))
        output_parameter_file.write(
            "TOLERANCE_PEAK=%s\n" %
            (params_object["tolerance.Ion_tolerance"][0]))
        output_parameter_file.write(
            "TOLERANCE_PM=%s\n" % (params_object["tolerance.PM_tolerance"][0]))
        output_parameter_file.write("PAIRS_MIN_COSINE=%s\n" %
                                    (params_object["PAIRS_MIN_COSINE"][0]))
        #output_parameter_file.write("MAX_SHIFT=%s\n" % (params_object["MAX_SHIFT"][0]))
        output_parameter_file.write("MAX_SHIFT=%s\n" % ("9999"))
        output_parameter_file.write("MIN_RATIO=%s\n" % ("0.4"))
        output_parameter_file.write("INPUT_SPECTRA_MS2=%s\n" %
                                    (args.mgf_filename))

        start_idx = number_per_partition * i
        end_idx = number_per_partition * (i + 1) - 1
        if i == parallelism - 1:
            end_idx = number_of_spectra

        output_parameter_file.write("IDX_START=%d\n" % (start_idx))
        output_parameter_file.write("IDX_END=%d\n" % (end_idx))
def main():
    param_filename = sys.argv[1]
    metadata_folder = sys.argv[2]
    input_clusterinfo_file = sys.argv[3]
    input_clusterinfosummary = sys.argv[4]
    ili_stl_model_folder = sys.argv[5]
    output_ili_filename = sys.argv[6]
    view_ili_html_filename = sys.argv[7]

    create_output = True
    param_object = ming_proteosafe_library.parse_xml_file(
        open(param_filename, "r"))
    try:
        if param_object["CREATE_ILI_OUTPUT"][0] != "1":
            create_output = False
    except:
        create_output = False

    if create_output:
        ili_stl_model_files_in_folder = ming_fileio_library.list_files_in_dir(
            ili_stl_model_folder)
        metadata_files_in_folder = ming_fileio_library.list_files_in_dir(
            metadata_folder)
        if len(metadata_files_in_folder) != 1:
            print(
                "Metadata file not provided, cannot create ili compatible output without coordinates"
            )
            exit(1)
        filename_coordinate_mapping = load_filename_to_coordinate_mapping(
            metadata_files_in_folder[0])
        create_ili_output_from_clusterinfo(input_clusterinfo_file,
                                           param_filename,
                                           input_clusterinfosummary,
                                           filename_coordinate_mapping,
                                           output_ili_filename)

        if len(ili_stl_model_files_in_folder) == 1:
            output_ili_html_file = open(view_ili_html_filename, "w")
            output_ili_html_file.write("<script>\n")
            output_ili_html_file.write(
                'window.location.replace("https://ili.embl.de/?https://gnps.ucsd.edu/ProteoSAFe/DownloadResultFile?task=%s&block=main&file=ili_stl_model/ili_stl_model-00000.stl;https://gnps.ucsd.edu/ProteoSAFe/DownloadResultFile?task=%s&block=main&file=ili_output/ili_quant.csv")\n'
                % (param_object["task"][0], param_object["task"][0]))
            output_ili_html_file.write("</script>\n")
            output_ili_html_file.close()

        if len(ili_stl_model_files_in_folder) == 0:
            output_ili_html_file = open(view_ili_html_filename, "w")
            output_ili_html_file.write(
                "No STL file uploaded, cannot directly link to ili\n")
            output_ili_html_file.close()

        if len(ili_stl_model_files_in_folder) > 1:
            output_ili_html_file = open(view_ili_html_filename, "w")
            output_ili_html_file.write("Too many stl files uploaded\n")
            output_ili_html_file.close()
    else:
        open(output_ili_filename, "w").write("No Output")
        open(view_ili_html_filename, "w").write(
            "ili output was not selected or no metadata file was provided")
Example 15
def create_ili_output_from_clusterinfo(cluster_info_filename, param_filename, clusterinfosummary_filename, filename_coordinate_mapping, output_filename):
    output_file = open(output_filename, "w")
    test_network = molecular_network_library.MolecularNetwork()
    test_network.load_clustersummary(clusterinfosummary_filename)
    line_counts, table_data = ming_fileio_library.parse_table_with_headers(cluster_info_filename)
    param_object = ming_proteosafe_library.parse_xml_file(open(param_filename, "r"))
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(param_object)

    cluster_index_to_file_map = {}

    clusters_map = {}
    all_files = {}
    for i in range(line_counts):
        cluster_number = table_data["#ClusterIdx"][i]
        if test_network.get_cluster_index(cluster_number) is None:
            continue

        if not (cluster_number in clusters_map):
            clusters_map[cluster_number] = []
            cluster_index_to_file_map[cluster_number] = {}
            #Adding all file names to mapping
            for mangled_name in mangled_mapping.keys():
                cluster_index_to_file_map[cluster_number][mangled_name] = 0.0

        #print table_data["#Filename"][i].split("/")[1]
        mangled_filename_only = os.path.basename(table_data["#Filename"][i])
        cluster_index_to_file_map[cluster_number][mangled_filename_only] += float(table_data["#PrecIntensity"][i])
        spectrum_info = {"filename":table_data["#Filename"][i], "intensity": table_data["#PrecIntensity"][i]}
        all_files[table_data["#Filename"][i]] = 1
        clusters_map[cluster_number].append(spectrum_info)

    all_headers = ["filename", "X", "Y", "Z", "radius"]
    for cluster_idx in cluster_index_to_file_map:
        all_headers.append(cluster_idx)

    #writing header
    output_file.write(",".join(all_headers) + "\n")

    for sample_name in mangled_mapping:
        if sample_name.find("spec") == -1:
            continue
        real_filename = mangled_mapping[sample_name]

        if not os.path.basename(real_filename) in filename_coordinate_mapping:
            continue

        line_output = [real_filename]
        coordinate_object = filename_coordinate_mapping[os.path.basename(real_filename)]
        line_output.append(coordinate_object["x"])
        line_output.append(coordinate_object["y"])
        line_output.append(coordinate_object["z"])
        line_output.append(coordinate_object["radius"])
        print(line_output, coordinate_object)
        for cluster_idx in cluster_index_to_file_map:
            line_output.append(str(cluster_index_to_file_map[cluster_idx][sample_name]))
        output_file.write(",".join(line_output) + "\n")

    output_file.close()
Example 16
def main():
    parser = argparse.ArgumentParser(description='Create parallel parameters')
    parser.add_argument('workflow_parameters',
                        help='proteosafe xml parameters')
    parser.add_argument('input_mgf', help='Input mgf file to network')
    parser.add_argument('library_folder', help='library_folder')
    parser.add_argument('library_matches', help='output matches')
    parser.add_argument('binary_path', help='binary_path')
    args = parser.parse_args()

    params_object = ming_proteosafe_library.parse_xml_file(
        open(args.workflow_parameters))

    library_files = ming_fileio_library.list_files_in_dir(args.library_folder)

    temp_parameters_file = "temp_parameters" + ".params"

    output_parameter_file = open(temp_parameters_file, "w")
    #Search Criteria

    output_parameter_file.write("SCORE_THRESHOLD=%s\n" %
                                (params_object["SCORE_THRESHOLD"][0]))
    output_parameter_file.write("MIN_MATCHED_PEAKS_SEARCH=%s\n" %
                                (params_object["MIN_MATCHED_PEAKS"][0]))
    output_parameter_file.write("TOP_K_RESULTS=%s\n" %
                                (params_object["TOP_K_RESULTS"][0]))
    output_parameter_file.write("search_peak_tolerance=%s\n" %
                                (params_object["tolerance.Ion_tolerance"][0]))
    output_parameter_file.write("search_parentmass_tolerance=%s\n" %
                                (params_object["tolerance.PM_tolerance"][0]))
    output_parameter_file.write("ANALOG_SEARCH=%s\n" %
                                (params_object["ANALOG_SEARCH"][0]))
    output_parameter_file.write("MAX_SHIFT_MASS=%s\n" %
                                (params_object["MAX_SHIFT_MASS"][0]))

    #Filtering Criteria
    output_parameter_file.write("FILTER_PRECURSOR_WINDOW=%s\n" %
                                (params_object["FILTER_PRECURSOR_WINDOW"][0]))
    output_parameter_file.write("MIN_PEAK_INT=%s\n" %
                                (params_object["MIN_PEAK_INT"][0]))
    output_parameter_file.write("WINDOW_FILTER=%s\n" %
                                (params_object["WINDOW_FILTER"][0]))
    output_parameter_file.write("FILTER_LIBRARY=%s\n" %
                                (params_object["FILTER_LIBRARY"][0]))

    output_parameter_file.write("EXISTING_LIBRARY_MGF=%s\n" %
                                (" ".join(library_files)))

    output_parameter_file.write("RESULTS_DIR=%s\n" % (args.library_matches))
    output_parameter_file.write("searchspectra=%s\n" % (args.input_mgf))

    output_parameter_file.close()

    cmd = "%s ExecSpectralLibrarySearchMolecular %s -ll 0" % (
        args.binary_path, temp_parameters_file)
    os.system(cmd)
def create_ili_output_from_clusterinfo(cluster_info_filename, param_filename, clusterinfosummary_filename, filename_coordinate_mapping, output_filename):
    output_file = open(output_filename, "w")
    line_counts, table_data = ming_fileio_library.parse_table_with_headers(cluster_info_filename)
    param_object = ming_proteosafe_library.parse_xml_file(open(param_filename, "r"))
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(param_object)

    cluster_index_to_file_map = {}

    clusters_map = {}
    all_files = {}
    for i in range(line_counts):
        cluster_number = table_data["#ClusterIdx"][i]

        if not (cluster_number in clusters_map):
            clusters_map[cluster_number] = []
            cluster_index_to_file_map[cluster_number] = {}
            #Adding all file names to mapping
            for mangled_name in mangled_mapping.keys():
                cluster_index_to_file_map[cluster_number][mangled_name] = 0.0

        #print table_data["#Filename"][i].split("/")[1]
        mangled_filename_only = os.path.basename(table_data["#Filename"][i])
        cluster_index_to_file_map[cluster_number][mangled_filename_only] += float(table_data["#PrecIntensity"][i])
        spectrum_info = {"filename":table_data["#Filename"][i], "intensity": table_data["#PrecIntensity"][i]}
        all_files[table_data["#Filename"][i]] = 1
        clusters_map[cluster_number].append(spectrum_info)

    all_headers = ["filename", "X", "Y", "Z", "radius"]
    for cluster_idx in cluster_index_to_file_map:
        all_headers.append(cluster_idx)

    #writing header
    output_file.write(",".join(all_headers) + "\n")

    for sample_name in mangled_mapping:
        if sample_name.find("spec") == -1:
            continue
        real_filename = mangled_mapping[sample_name]

        if not os.path.basename(real_filename) in filename_coordinate_mapping:
            continue

        line_output = [real_filename]
        coordinate_object = filename_coordinate_mapping[os.path.basename(real_filename)]
        line_output.append(coordinate_object["x"])
        line_output.append(coordinate_object["y"])
        line_output.append(coordinate_object["z"])
        line_output.append(coordinate_object["radius"])
        print(line_output, coordinate_object)
        for cluster_idx in cluster_index_to_file_map:
            line_output.append(str(cluster_index_to_file_map[cluster_idx][sample_name]))
        output_file.write(",".join(line_output) + "\n")

    output_file.close()
def main():
    parser = argparse.ArgumentParser(description='Creating Clustering Info Summary')
    parser.add_argument('params_xml', help='params_xml')
    parser.add_argument('input_clusterinfo_summary', help='Input cluster info summary')
    parser.add_argument('input_network_pairs_file', help='network_pairs_file')
    parser.add_argument('input_library_search_file', help='network_pairs_file')
    parser.add_argument('output_clusterinfo_summary', help='output file')
    parser.add_argument('output_component_summary', help='output component file')
    args = parser.parse_args()

    param_obj = ming_proteosafe_library.parse_xml_file(open(args.params_xml))

    all_clusterinfo_list = ming_fileio_library.parse_table_with_headers_object_list(args.input_clusterinfo_summary)

    library_ids_dict = load_library_id_dict(args.input_library_search_file)
    nodes_to_component, component_to_nodes = load_pairs_dict(args.input_network_pairs_file)

    for cluster in all_clusterinfo_list:
        cluster_index = cluster["cluster index"]
        if cluster_index in nodes_to_component:
            cluster["componentindex"] = nodes_to_component[cluster_index]
            cluster["GNPSLinkout_Network"] = "https://gnps.ucsd.edu/ProteoSAFe/result.jsp?view=network_displayer&componentindex=%s&task=%s" % (nodes_to_component[cluster_index], param_obj["task"][0])
        else:
            cluster["componentindex"] = "-1"
            cluster["GNPSLinkout_Network"] = 'https://gnps.ucsd.edu/ProteoSAFe/result.jsp?task=%s&view=view_all_clusters_withID#{"main.cluster index_lowerinput":"%s","main.cluster index_upperinput":"%s"}' % (param_obj["task"][0], cluster_index, cluster_index)

        if cluster_index in library_ids_dict:
            cluster["LibraryID"] = library_ids_dict[cluster_index]["Compound_Name"]
            cluster["MQScore"] = library_ids_dict[cluster_index]["MQScore"]
            cluster["SpectrumID"] = library_ids_dict[cluster_index]["SpectrumID"]
        else:
            cluster["LibraryID"] = "N/A"
            cluster["MQScore"] = "N/A"
            cluster["SpectrumID"] = "N/A"

    ming_fileio_library.write_list_dict_table_data(all_clusterinfo_list, args.output_clusterinfo_summary)

    output_component_list = []

    for componentindex in component_to_nodes:
        output_dict = {}
        output_dict["ComponentIndex"] = componentindex
        output_dict["NodeCount"] = len(component_to_nodes[componentindex])
        output_dict["#Spectra"] = len(component_to_nodes[componentindex])
        all_lib_identifications = []
        for node in component_to_nodes[componentindex]:
            if node in library_ids_dict:
                all_lib_identifications.append(library_ids_dict[node]["Compound_Name"])
        output_dict["AllIDs"] = "!".join(all_lib_identifications)
        output_component_list.append(output_dict)

    ming_fileio_library.write_list_dict_table_data(output_component_list, args.output_component_summary)
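
A minimal sketch of the component summary built above: invert a hypothetical node-to-component mapping and emit one row per component.

from collections import defaultdict

# Hypothetical node -> component assignment (the kind of mapping load_pairs_dict returns above).
nodes_to_component = {"1": "0", "2": "0", "3": "5"}

component_to_nodes = defaultdict(list)
for node, component in nodes_to_component.items():
    component_to_nodes[component].append(node)

# One summary row per connected component.
summary = [{"ComponentIndex": c, "NodeCount": len(nodes)} for c, nodes in component_to_nodes.items()]
print(summary)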
def main():
    paramxml_input_filename = sys.argv[1]
    output_mgf_file = sys.argv[2]

    params_obj = ming_proteosafe_library.parse_xml_file(open(paramxml_input_filename))

    #Validating the spectrum string
    if masst_validator.validate(params_obj["spectrum_string"][0], int(params_obj["MIN_MATCHED_PEAKS"][0])) != 0:
        print("Validation Error on Input")
        exit(1)

    spectrum_collection = get_spectrum_collection_from_param_obj(params_obj)
    spectrum_collection.save_to_mgf(open(output_mgf_file, "w"))
def main():
    parser = argparse.ArgumentParser(description='Invoking new workflow with parameters of given workflow')
    parser.add_argument('workflowparamters', help='workflowparamters')
    parser.add_argument('credentials', help='credentials.json')
    parser.add_argument('outputhtml', default='output.html', help='output html with a url')
    parser.add_argument('--serverurl', default='proteomics2.ucsd.edu', help='Server URL, default is proteomics2.ucsd.edu, other options are massive.ucsd.edu and gnps.ucsd.edu')
    parser.add_argument('--parametermapping', action='append', help='mapping of current workflow parameters to new parameters in the format: <old parameter>:<new parameter>')
    parser.add_argument('--newparameters', action='append', help='parameter key: <param name>:<parameter value>')
    parser.add_argument('--runparameter', default='NONE', help='Workflow xml parameter to check if this parameter is equal to "1" to actually invoke the workflow')
    args = parser.parse_args()

    credentials = json.loads(open(args.credentials).read())

    workflow_parameters_map = ming_proteosafe_library.parse_xml_file(open(args.workflowparamters))

    if args.runparameter != "NONE":
        if workflow_parameters_map[args.runparameter][0] == "0":
            output_html_file = open(args.outputhtml, "w")
            output_html_file.write("User chose not to run tool\n")
            output_html_file.close()
            exit(0)

    new_parameters = {}

    new_parameters["desc"] = "Analysis subroutine from ProteoSAFe job %s" % (workflow_parameters_map["task"][0])

    if args.newparameters is not None:
        for parameter_string in args.newparameters:
            parameter_key = parameter_string.split(":")[0]
            parameter_value = parameter_string.split(":")[1]

            new_parameters[parameter_key] = parameter_value

    if args.parametermapping is not None:
        for parameter_string in args.parametermapping:
            parameter_old_key = parameter_string.split(":")[0]
            parameter_new_key = parameter_string.split(":")[1]

            new_parameters[parameter_new_key] = workflow_parameters_map[parameter_old_key][0]

    task_id = ming_proteosafe_library.invoke_workflow(args.serverurl, new_parameters, credentials["username"], credentials["password"])
    if task_id is None:
        exit(1)
    ming_proteosafe_library.wait_for_workflow_finish(args.serverurl, task_id)

    """Writing HTML output"""
    output_html_file = open(args.outputhtml, "w")
    output_html_file.write("<script>\n")
    output_html_file.write('window.open("https://%s/ProteoSAFe/status.jsp?task=%s", "_blank")\n' % (args.serverurl, task_id))
    output_html_file.write("</script>\n")
    output_html_file.close()
Example 21
def main():
    parser = argparse.ArgumentParser(
        description='Creates enriched cluster info summary')
    parser.add_argument('param_xml', help='param_xml')
    parser.add_argument('input_clustersummary', help='input_clustersummary')
    parser.add_argument('input_clusterinfo', help='input_clusterinfo')
    parser.add_argument('output_clusterinfo', help='output_clusterinfo')
    args = parser.parse_args()

    params_object = ming_proteosafe_library.parse_xml_file(open(
        args.param_xml))
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(
        params_object)

    #Creating acceptable clusters to include in cluster info
    included_clusters = set()
    for row in csv.DictReader(open(args.input_clustersummary), delimiter='\t'):
        included_clusters.add(row["cluster index"])

    with open(args.input_clusterinfo) as input_clusterinfo:
        field_names = [
            "cluster index", "AllFiles", "sum(precursor intensity)", "RTMean",
            "RTStdErr", "parent mass", "ScanNumber", "ProteosafeFilePath",
            "Original_Path"
        ]
        output_clusterinfo_writer = csv.DictWriter(open(
            args.output_clusterinfo, "w"),
                                                   fieldnames=field_names,
                                                   delimiter='\t')
        output_clusterinfo_writer.writeheader()

        input_clusterinfo_reader = csv.DictReader(input_clusterinfo,
                                                  delimiter='\t')
        for row in input_clusterinfo_reader:
            if not (row["#ClusterIdx"] in included_clusters):
                continue
            output_dict = {}
            output_dict["cluster index"] = row["#ClusterIdx"]
            output_dict["AllFiles"] = row["#Filename"]
            output_dict["sum(precursor intensity)"] = row["#PrecIntensity"]
            output_dict["RTMean"] = row["#RetTime"]
            output_dict["RTStdErr"] = "0"
            output_dict["parent mass"] = row["#ParentMass"]
            output_dict["ScanNumber"] = row["#Scan"]
            output_dict["ProteosafeFilePath"] = os.path.join(
                "spec", os.path.basename(row["#Filename"]))
            output_dict["Original_Path"] = "f." + mangled_mapping[
                os.path.basename(row["#Filename"])]
            output_clusterinfo_writer.writerow(output_dict)

    exit(0)
def main():
    paramxml_filename = sys.argv[1]
    psms_input_file = sys.argv[2]
    kl_input_file = sys.argv[3]
    output_psms_file = sys.argv[4]

    parameters_obj = ming_proteosafe_library.parse_xml_file(
        open(paramxml_filename))

    row_count, kl_data = ming_fileio_library.parse_table_with_headers(
        kl_input_file)
    kl_dict = {}
    for i in range(row_count):
        filename = os.path.basename(kl_data["Filename"][i])
        scan = kl_data["Scan"][i]
        kl_strict = (kl_data["KL Strict"][i])
        kl_unstrict = (kl_data["KL"][i])
        interpeak_intensity = (kl_data["Interpeak intensity"][i])
        key = filename + ":" + str(scan)
        kl_dict[key] = {
            "kl_strict": kl_strict,
            "kl_unstrict": kl_unstrict,
            "kl_interpeak": interpeak_intensity
        }

    #Since we don't support more fields in the psm object, we're going to read this file in again as a tsv file and add the columns as necessary
    psm_rows, psm_table_data = ming_fileio_library.parse_table_with_headers(
        psms_input_file)
    psm_table_data["kl_strict"] = []
    psm_table_data["kl_unstrict"] = []
    psm_table_data["kl_interpeak"] = []
    for i in range(psm_rows):
        key = psm_table_data["filename"][i] + ":" + psm_table_data["scan"][i]
        if key in kl_dict:
            psm_table_data["kl_strict"].append(kl_dict[key]["kl_strict"])
            psm_table_data["kl_unstrict"].append(kl_dict[key]["kl_unstrict"])
            psm_table_data["kl_interpeak"].append(kl_dict[key]["kl_interpeak"])
        else:
            psm_table_data["kl_strict"].append(-1)
            psm_table_data["kl_unstrict"].append(-1)
            psm_table_data["kl_interpeak"].append(-1)

    #Change C to C+57
    #if "cysteine_protease.cysteine" in parameters_obj:
    #    if parameters_obj["cysteine_protease.cysteine"][0] == "c57":
    #        #Lets replace all the cysteines
    #        for i in range(psm_rows):
    #            psm_table_data["sequence"][i] = psm_table_data["sequence"][i].replace("C", "C+57")

    ming_fileio_library.write_dictionary_table_data(psm_table_data,
                                                    output_psms_file)
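
A minimal sketch of the join above: annotate a PSM table with KL values looked up by a composite filename:scan key, defaulting to -1 when a scan has no KL entry (hypothetical data).

# Hypothetical KL table keyed by "filename:scan" and a PSM table to annotate.
kl_dict = {"a.mzXML:1": {"kl_strict": 0.5}}
psm_table = {"filename": ["a.mzXML", "b.mzXML"], "scan": ["1", "7"], "kl_strict": []}

for i in range(len(psm_table["filename"])):
    key = psm_table["filename"][i] + ":" + psm_table["scan"][i]
    psm_table["kl_strict"].append(kl_dict[key]["kl_strict"] if key in kl_dict else -1)

print(psm_table["kl_strict"])  # [0.5, -1]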
def main():
    params_filename = sys.argv[1]
    output_peptide_folder = sys.argv[2]
    output_psm_folder = sys.argv[3]
    output_summary = sys.argv[4]
    params_dict = ming_proteosafe_library.parse_xml_file(open(params_filename))

    source_tasks_text = params_dict["tasks_to_consolidate"][0]

    if len(source_tasks_text) > 0:
        source_tasks_list = json.loads(source_tasks_text)
        grab_all_results(source_tasks_list, output_peptide_folder,
                         output_psm_folder, output_summary)
    else:
        open(output_summary, "w").write("None")
def main():
    input_folder_path = sys.argv[1]
    param_xml_filename = sys.argv[2]
    output_tsv = sys.argv[3]

    files = ming_fileio_library.list_files_in_dir(input_folder_path)
    params_obj = ming_proteosafe_library.parse_xml_file(open(param_xml_filename))

    top_k = 1
    try:
        top_k = int(params_obj["TOP_K_RESULTS"][0])
    except:
        top_k = 1

    #merged_dict = defaultdict(list)
    merged_results = []

    for input_file in files:
        print("loading", input_file)
        row_count, table_data = ming_fileio_library.parse_table_with_headers(input_file)
        for i in range(row_count):
            result_dict = {}
            for key in table_data:
                result_dict[key] = table_data[key][i]
            merged_results.append(result_dict)


    results_per_spectrum = defaultdict(list)

    for result_obj in merged_results:
        spectrum_unique_key = result_obj["SpectrumFile"] + "___" + result_obj["#Scan#"]

        results_per_spectrum[spectrum_unique_key].append(result_obj)

    output_results = []
    for spectrum_unique_key in results_per_spectrum:
        sorted_results = sorted(results_per_spectrum[spectrum_unique_key], key=lambda spectrum_obj: float(spectrum_obj["MQScore"]), reverse=True)
        filtered_results = sorted_results[:top_k]
        output_results += filtered_results

    output_dict = defaultdict(list)

    for result_obj in output_results:
        for key in result_obj:
            output_dict[key].append(result_obj[key])


    ming_fileio_library.write_dictionary_table_data(output_dict, output_tsv)
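
A minimal sketch of the per-spectrum top-k filtering above, assuming hypothetical (spectrum key, score) results.

from collections import defaultdict

# Hypothetical search results grouped by spectrum key, then trimmed to the top_k scores.
results = [("file.mgf___1", 0.9), ("file.mgf___1", 0.7),
           ("file.mgf___1", 0.95), ("file.mgf___2", 0.5)]
top_k = 2

per_spectrum = defaultdict(list)
for key, score in results:
    per_spectrum[key].append(score)

# Keep only the top_k highest-scoring results per spectrum.
filtered = {key: sorted(scores, reverse=True)[:top_k] for key, scores in per_spectrum.items()}
print(filtered)  # {'file.mgf___1': [0.95, 0.9], 'file.mgf___2': [0.5]}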
Example 25
def main():
    parser = argparse.ArgumentParser(description='Creates bucket table')
    parser.add_argument('input_clusterinfo_file', help='input_clusterinfo_file')
    parser.add_argument('param_filename', help='param_filename')
    parser.add_argument('input_clusterinfosummary', help='input_clusterinfosummary')
    parser.add_argument('output_filename', help='output_filename')
    parser.add_argument('output_biom_filename', help='output_biom_filename')
    parser.add_argument('python_runtime', help='python_runtime')
    parser.add_argument('biom_run_script', help='biom_run_script')
    parser.add_argument('--metadata_folder', help='Metadata folder')
    args = parser.parse_args()

    # input_clusterinfo_file = sys.argv[1]
    # param_filename = sys.argv[2]
    # input_clusterinfosummary = sys.argv[3]
    # output_filename = sys.argv[4]
    # output_biom_filename = sys.argv[5]
    # python_runtime = sys.argv[6]
    # biom_run_script = sys.argv[7]

    input_clusterinfo_file = args.input_clusterinfo_file
    param_filename = args.param_filename
    input_clusterinfosummary = args.input_clusterinfosummary
    output_filename = args.output_filename
    output_biom_filename = args.output_biom_filename
    python_runtime = args.python_runtime
    biom_run_script = args.biom_run_script

    metadata_mapping = {}
    try:
        metadata_mapping = load_metadata_mapping(args.metadata_folder)
    except:
        metadata_mapping = {}

    create_buckets = True
    param_object = ming_proteosafe_library.parse_xml_file(open(param_filename, "r"))
    try:
        if param_object["CREATE_CLUSTER_BUCKETS"][0] != "1":
            create_buckets = False
    except:
        create_buckets = False

    if create_buckets:
        create_bucket_from_clusterinfo(input_clusterinfo_file, param_filename, input_clusterinfosummary, output_filename, metadata_mapping)
        create_biom_file(output_filename, output_biom_filename, python_runtime, biom_run_script)
    else:
        open(output_filename, "w").write("No Output")
        open(output_biom_filename, "w").write("No Output")
Example 26
def main():
    paramxml_input_filename = sys.argv[1]
    parallel_param_filename = sys.argv[2]
    input_spectra_folder = sys.argv[3]
    library_search_results_filename = sys.argv[4]
    output_matches_filename = sys.argv[5]

    params_obj = ming_proteosafe_library.parse_xml_file(
        open(paramxml_input_filename))

    try:
        if params_obj["MATCH_REFERENCE_DATASETS"][0] != "1":
            output_map = {"EMPTY": []}
            ming_fileio_library.write_dictionary_table_data(
                output_map, output_matches_filename)
            exit(0)
    except:
        output_map = {"EMPTY": []}
        ming_fileio_library.write_dictionary_table_data(
            output_map, output_matches_filename)
        exit(0)

    #Loading a dict of identifications
    identifications_map = load_identification_file_as_map(
        library_search_results_filename)

    #If we are doing parallel
    partition_total = 1
    partition_of_node = 0
    params_map = json.loads(open(parallel_param_filename).read())
    partition_total = params_map["total_paritions"]
    partition_of_node = params_map["node_partition"]

    all_datasets = params_map["all_datasets"]

    all_matches = finding_matches_in_public_data(
        os.path.join(input_spectra_folder, "specs_ms.mgf"), all_datasets,
        identifications_map)

    output_map = defaultdict(list)
    for match in all_matches:
        for key in match:
            output_map[key].append(match[key])

    ming_fileio_library.write_dictionary_table_data(output_map,
                                                    output_matches_filename)
def main():
    param_filename = sys.argv[1]
    metadata_folder = sys.argv[2]
    input_clusterinfo_file = sys.argv[3]
    input_clusterinfosummary = sys.argv[4]
    ili_stl_model_folder = sys.argv[5]
    output_ili_filename = sys.argv[6]
    view_ili_html_filename = sys.argv[7]

    create_output = True
    param_object = ming_proteosafe_library.parse_xml_file(open(param_filename, "r"))
    try:
        if param_object["CREATE_ILI_OUTPUT"][0] != "1":
            create_output = False
    except:
        create_output = False

    if create_output:
        ili_stl_model_files_in_folder = ming_fileio_library.list_files_in_dir(ili_stl_model_folder)
        metadata_files_in_folder = ming_fileio_library.list_files_in_dir(metadata_folder)
        if len(metadata_files_in_folder) != 1:
            print("Metadata file not provided, cannot create ili compatible output without coordinates")
            exit(1)
        filename_coordinate_mapping = load_filename_to_coordinate_mapping(metadata_files_in_folder[0])
        create_ili_output_from_clusterinfo(input_clusterinfo_file, param_filename, input_clusterinfosummary, filename_coordinate_mapping, output_ili_filename)

        if len(ili_stl_model_files_in_folder) == 1:
            output_ili_html_file = open(view_ili_html_filename, "w")
            output_ili_html_file.write("<script>\n")
            output_ili_html_file.write('window.location.replace("https://ili.embl.de/?https://gnps.ucsd.edu/ProteoSAFe/DownloadResultFile?task=%s&block=main&file=ili_stl_model/ili_stl_model-00000.stl;https://gnps.ucsd.edu/ProteoSAFe/DownloadResultFile?task=%s&block=main&file=ili_output/ili_quant.csv")\n' % (param_object["task"][0],param_object["task"][0]))
            output_ili_html_file.write("</script>\n")
            output_ili_html_file.close()

        if len(ili_stl_model_files_in_folder) == 0:
            output_ili_html_file = open(view_ili_html_filename, "w")
            output_ili_html_file.write("No STL file uploaded, cannot directly link to ili\n")
            output_ili_html_file.close()

        if len(ili_stl_model_files_in_folder) > 1:
            output_ili_html_file = open(view_ili_html_filename, "w")
            output_ili_html_file.write("Too many stl files uploaded\n")
            output_ili_html_file.close()
    else:
        open(output_ili_filename, "w").write("No Output")
        open(view_ili_html_filename, "w").write("ili output was not selected or no metadata file was provided")
def main():
    input_parameters_xml = sys.argv[1]
    output_filename = sys.argv[2]

    param_obj = ming_proteosafe_library.parse_xml_file(
        open(input_parameters_xml))

    print(param_obj.keys())

    peak_list_list = []
    metadata_list = []

    if "spec_on_server" in param_obj:
        peak_list_list += (param_obj["spec_on_server"])
    if "spec_on_server_group2" in param_obj:
        peak_list_list += (param_obj["spec_on_server_group2"])
    if "spec_on_server_group3" in param_obj:
        peak_list_list += (param_obj["spec_on_server_group3"])
    if "spec_on_server_group4" in param_obj:
        peak_list_list += (param_obj["spec_on_server_group4"])
    if "spec_on_server_group5" in param_obj:
        peak_list_list += (param_obj["spec_on_server_group5"])
    if "spec_on_server_group6" in param_obj:
        peak_list_list += (param_obj["spec_on_server_group6"])
    if "spec_on_server_group6" in param_obj:
        peak_list_list += (param_obj["spec_on_server_group6"])
    if "metadatafile" in param_obj:
        metadata_list += (param_obj["metadatafile"])

    params_dict = {}
    params_dict["desc"] = "GNPS - Data for Analysis For GNPS Job " + param_obj[
        "task"][0]
    params_dict["workflow"] = "MASSIVE-COMPLETE"
    params_dict["peak_list_files"] = ";".join(peak_list_list)
    params_dict["other_files"] = ";".join(metadata_list)

    output_file = open(output_filename, "w")
    output_file.write("filenames\tmetadatanames\ttask\n")
    output_file.write(";".join(peak_list_list))
    output_file.write("\t")
    output_file.write(";".join(metadata_list))
    output_file.write("\t")
    output_file.write(param_obj["task"][0])
    output_file.write("\n")
def main():
    paramxml_input_filename = sys.argv[1]
    pairs_info_filename = sys.argv[2]
    clusterinfo_filename = sys.argv[3]
    output_all_paths_filename = sys.argv[4]
    output_all_paths_histogram_filename = sys.argv[5]

    params_obj = ming_proteosafe_library.parse_xml_file(open(paramxml_input_filename))
    try:
        if params_obj["CREATE_TOPOLOGY_SIGNATURES"][0] != "1":
            open(output_all_paths_filename, "w").write("NONE")
            open(output_all_paths_histogram_filename, "w").write("NONE")
            exit(0)
    except:
        open(output_all_paths_filename, "w").write("NONE")
        open(output_all_paths_histogram_filename, "w").write("NONE")
        exit(0)

    find_features_in_network(clusterinfo_filename, pairs_info_filename, output_all_paths_filename, output_all_paths_histogram_filename)
def main():
    paramxml_input_filename = sys.argv[1]
    parallel_param_filename = sys.argv[2]
    output_matches_filename = sys.argv[3]
    output_filename_unique_files = sys.argv[4]
    output_filename_all_matches = sys.argv[5]

    params_obj = ming_proteosafe_library.parse_xml_file(open(paramxml_input_filename))

    output_map = {"specs_filename" : [],"specs_scan" : [], "dataset_filename" : [], "dataset_scan" : [], "score" : [], "dataset_id" : [], "dataset_title" : [], "dataset_description" : [], "matchedpeaks" : [], "mzerror" : []}

    match_parameters = get_parameters(params_obj)

    try:
       if params_obj["FIND_MATCHES_IN_PUBLIC_DATA"][0] != "1":
           ming_fileio_library.write_dictionary_table_data(output_map, output_matches_filename)
           exit(0)
    except:
       ming_fileio_library.write_dictionary_table_data(output_map, output_matches_filename)
       exit(0)

    #If we are doing parallel
    partition_total = 1
    partition_of_node = 0
    params_map = json.loads(open(parallel_param_filename).read())
    partition_total = params_map["total_paritions"]
    partition_of_node = params_map["node_partition"]

    dataset_dict = params_map["dataset_dict"]
    all_datasets = params_map["all_datasets"]

    SEARCH_RAW = False
    try:
        if params_obj["SEARCH_RAW"][0] == "1":
            SEARCH_RAW = True
    except:
        print("Param Not Found", "SEARCH_RAW")

    """Matchign Clustered Data"""
    if SEARCH_RAW:
        match_unclustered(match_parameters, get_spectrum_collection_from_param_obj(params_obj), dataset_dict, all_datasets, output_matches_filename, output_filename_unique_files, output_filename_all_matches)
    else:
        match_clustered(match_parameters, get_spectrum_collection_from_param_obj(params_obj), dataset_dict, all_datasets, output_matches_filename, output_filename_unique_files, output_filename_all_matches)
def main():
    parallel_json = json.loads(open(sys.argv[1]).read())
    params_filename = sys.argv[2]
    input_folder_of_results = sys.argv[3]
    output_folder = sys.argv[4]

    my_node = parallel_json["node_partition"]
    total_node = parallel_json["total_paritions"]

    all_input_files = ming_fileio_library.list_files_in_dir(input_folder_of_results)
    all_input_files.sort()

    ###
    ### TODO We will have to read parameters and see if we need to eliminate some PSMs, with PSM FDR filter, KL Filter, ambiguity score filter, unique intensity filter
    ###

    params_obj = ming_proteosafe_library.parse_xml_file(open(params_filename))
    total_file_count = 0
    all_input_files = all_input_files[my_node::total_node]
    current_working_psm_set = ming_psm_library.PSMset("Ming")

    for input_file in all_input_files:
        #Assume these are variant files
        #We can treat this like a psm file and then combine all of the as a new variants file
        total_file_count += 1
        print(input_file, total_file_count, "of", len(all_input_files))
        input_pickle = open(input_file, 'rb')
        temp_psm_set = pickle.load(input_pickle)
        print("Loaded", len(temp_psm_set.psms))

        for psm in temp_psm_set.psms:
            precursor_string = "%s:%d" % (psm.annotation, psm.charge)
            score = psm.score

            #Determine minimum score cutoff
            current_score = psm.sorting_value()
            peptide_length = len(psm.get_stripped_sequence())

            current_working_psm_set.psms.append(psm)

    #Saving out psms
    output_filename = os.path.join(output_folder, str(my_node) + ".psms")
    current_working_psm_set.write_output(open(output_filename, "w"), True)
Example 32
def main():
    parser = argparse.ArgumentParser(
        description='Invoking new workflow with parameters of given workflow')
    parser.add_argument('workflowparamters', help='workflowparamters')
    parser.add_argument('output_mgf', help='output_mgf')
    parser.add_argument('output_tsv', help='output_tsv')
    args = parser.parse_args()

    workflow_parameters_map = ming_proteosafe_library.parse_xml_file(
        open(args.workflowparamters))

    usi_list = workflow_parameters_map["usi_string"][0].split("\n")
    usi_list = [usi for usi in usi_list if len(usi) > 5]

    output_mgf = open(args.output_mgf, "w")

    output_results_list = []

    for i, usi in enumerate(usi_list):
        #Spectrum
        precursor_mz, peaks = _get_spectrum(usi)
        if precursor_mz == None:
            continue

        output_mgf.write("BEGIN IONS\n")
        output_mgf.write("TITLE=USI:{}\n".format(usi))
        output_mgf.write("PEPMASS={}\n".format(precursor_mz))
        output_mgf.write("CHARGE=0\n")
        output_mgf.write("SCANS={}\n".format(i + 1))
        for peak in peaks:
            output_mgf.write("{} {}\n".format(peak[0], peak[1]))
        output_mgf.write("END IONS\n")

        output_dict = {}
        output_dict["usi"] = usi
        output_dict["filename"] = args.output_mgf
        output_dict["scan"] = i + 1

        output_results_list.append(output_dict)

    df = pd.DataFrame(output_results_list)
    df.to_csv(args.output_tsv, sep="\t", index=False)
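Note: `_get_spectrum` is not defined in this snippet. Below is a minimal sketch of what it could look like, assuming a USI resolver that returns JSON with a precursor m/z and a peak list; the URL and field names are hypothetical, not the actual implementation.

import requests

def _get_spectrum(usi):
    #Sketch only: the resolver URL and JSON field names are assumptions
    resolver_url = "https://example-usi-resolver.org/json"
    response = requests.get(resolver_url, params={"usi": usi}, timeout=30)
    if response.status_code != 200:
        return None, None
    spectrum_json = response.json()
    #Return the precursor m/z and a list of (m/z, intensity) pairs
    return spectrum_json.get("precursor_mz"), spectrum_json.get("peaks", [])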
Example no. 33
def main():
    paramxml_filename = sys.argv[1]
    input_spectrum_filename = sys.argv[2]
    input_spectrum_all = sys.argv[3]
    psms_input_file = sys.argv[4]
    input_collision_energy_folder = sys.argv[5]
    output_psms_file = sys.argv[6]

    parameters_obj = ming_proteosafe_library.parse_xml_file(open(paramxml_filename))
    scan_metadata_maps = load_collision_energy_mapping(input_collision_energy_folder)

    target_filename_list, decoy_filename_list = determine_set_of_target_and_decoy_spectrum_files(parameters_obj)

    input_psm_set = ming_psm_library.PSMset("input psms")
    input_psm_set.load_MSGF_Plus_tsvfile(psms_input_file)

    """Filtering on Collision Energy"""
    print("Size Before Filtering", len(input_psm_set.psms))
    filter_psms_to_acceptable_metadata(input_psm_set, scan_metadata_maps, parameters_obj)
    print("Size After CE Filtering", len(input_psm_set.psms))

    """Filtering to current file"""
    current_file_psms = get_psms_to_current_file(input_psm_set, input_spectrum_filename)
    target_file_psms = get_psms_to_target_file(input_psm_set, target_filename_list)
    print(len(current_file_psms), len(target_file_psms))

    output_decoys_list = []
    if os.path.basename(input_spectrum_filename) in target_filename_list:
        #no filtering, just save
        print("Target")
        output_decoys_list = target_file_psms
    else:
        #Find top scoring hit for each precursor

        blacklisted_decoy_peptides = json.loads(parameters_obj["blacklisted_decoy_peptides_json"][0])
        current_file_psms = filtering_out_blacklisted_decoys(current_file_psms, blacklisted_decoy_peptides)
        output_decoys_list = filtering_out_high_scoring_decoys(current_file_psms, target_file_psms, os.path.join(input_spectrum_all, target_filename_list[0]), input_spectrum_filename)

    output_decoys_list = filtering_redundant_identifications_per_scan(output_decoys_list)
    input_psm_set.psms = output_decoys_list

    input_psm_set.write_output(open(output_psms_file, "w"))
def main():
    input_file_of_tsv_results = sys.argv[1]
    input_params_xml_filename = sys.argv[2]
    input_library_identifications_filename = sys.argv[3]
    input_cutoff_scores = sys.argv[4]
    output_folder = sys.argv[5]

    output_filename = os.path.join(output_folder, os.path.basename(input_file_of_tsv_results))

    params_object = ming_proteosafe_library.parse_xml_file(open(input_params_xml_filename))
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(params_object)

    library_scans_to_identification = library_scans_to_identification_info(input_library_identifications_filename)

    cutoff_dict = json.loads(open(input_cutoff_scores).read())

    psm_list = ming_psm_library.parse_MSGFPlus_tsvfile(input_file_of_tsv_results)
    output_results_dict = process_ambiguity(psm_list, mangled_mapping, library_scans_to_identification, cutoff_dict)

    ming_fileio_library.write_dictionary_table_data(output_results_dict, output_filename)
def name_demangle_filenames(input_file, output_file, path_to_param,
                            old_filename_header, new_filename_header):
    row_count, table_data = ming_fileio_library.parse_table_with_headers(
        input_file)
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(
        ming_proteosafe_library.parse_xml_file(open(path_to_param)))

    if old_filename_header == new_filename_header:
        for i in range(row_count):
            mangled_name = table_data[old_filename_header][i]
            unmangled_name = mangled_mapping[mangled_name]
            table_data[new_filename_header][i] = unmangled_name
    else:
        table_data[new_filename_header] = []
        for i in range(row_count):
            mangled_name = table_data[old_filename_header][i]
            unmangled_name = mangled_mapping[mangled_name]
            table_data[new_filename_header].append(unmangled_name)

    ming_fileio_library.write_dictionary_table_data(table_data, output_file)
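For context, `get_mangled_file_mapping` returns a dictionary from ProteoSAFe's internal upload names to the user's original paths, so the demangling above is a plain dictionary lookup. An illustrative mapping with hypothetical paths:

#Illustration only: hypothetical mangled -> original filename mapping
mangled_mapping = {
    "spec-00000.mzXML": "uploads/user/plate1/sampleA.mzXML",
    "spec-00001.mzXML": "uploads/user/plate1/sampleB.mzXML",
}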
def main():
    parser = argparse.ArgumentParser(description='Create parallel parameters')
    parser.add_argument('library_folder', help='Input library folder')
    parser.add_argument('workflow_parameters', help='proteosafe xml parameters')
    parser.add_argument('parameters_output_folder', help='output folder for parameters')
    parser.add_argument('--parallelism', default=1, type=int, help='Parallelism')
    args = parser.parse_args()

    params_object = ming_proteosafe_library.parse_xml_file(open(args.workflow_parameters))

    library_files = ming_fileio_library.list_files_in_dir(args.library_folder)

    for i in range(args.parallelism):
        output_parameter_file = open(os.path.join(args.parameters_output_folder, str(i) + ".params"), "w")
        #Search Criteria
        output_parameter_file.write("MIN_MATCHED_PEAKS=%s\n" % (params_object["MIN_MATCHED_PEAKS"][0]))
        output_parameter_file.write("TOP_K_RESULTS=%s\n" % (params_object["TOP_K_RESULTS"][0]))
        output_parameter_file.write("search_peak_tolerance=%s\n" % (params_object["tolerance.Ion_tolerance"][0]))
        output_parameter_file.write("search_parentmass_tolerance=%s\n" % (params_object["tolerance.PM_tolerance"][0]))
        output_parameter_file.write("ANALOG_SEARCH=%s\n" % (params_object["ANALOG_SEARCH"][0]))
        output_parameter_file.write("MAX_SHIFT_MASS=%s\n" % (params_object["MAX_SHIFT_MASS"][0]))
        output_parameter_file.write("SEARCH_LIBQUALITY=%s\n" % (params_object["SEARCH_LIBQUALITY"][0]))

        #Filtering Criteria
        output_parameter_file.write("FILTER_PRECURSOR_WINDOW=%s\n" % (params_object["FILTER_PRECURSOR_WINDOW"][0]))
        output_parameter_file.write("MIN_PEAK_INT=%s\n" % (params_object["MIN_PEAK_INT"][0]))
        output_parameter_file.write("WINDOW_FILTER=%s\n" % (params_object["WINDOW_FILTER"][0]))
        output_parameter_file.write("FILTER_LIBRARY=%s\n" % (params_object["FILTER_LIBRARY"][0]))

        output_parameter_file.write("NODEIDX=%d\n" % (i))
        output_parameter_file.write("NODECOUNT=%d\n" % (args.parallelism))


        #For GC
        output_parameter_file.write("FORCE_EXACT_MATCH=%s\n" % (params_object["FORCE_EXACT_MATCH"][0]))

        #Libraries
        output_parameter_file.write("EXISTING_LIBRARY_MGF=%s\n" % (" ".join(library_files)))

        output_parameter_file.close()
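For reference, each generated N.params file is a flat key=value text file; an illustrative 0.params with hypothetical values:

MIN_MATCHED_PEAKS=6
TOP_K_RESULTS=1
search_peak_tolerance=0.5
search_parentmass_tolerance=2.0
ANALOG_SEARCH=0
MAX_SHIFT_MASS=100
SEARCH_LIBQUALITY=3
FILTER_PRECURSOR_WINDOW=1
MIN_PEAK_INT=0.0
WINDOW_FILTER=1
FILTER_LIBRARY=1
NODEIDX=0
NODECOUNT=4
FORCE_EXACT_MATCH=0
EXISTING_LIBRARY_MGF=library/GNPS-LIBRARY.mgf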
def main():
    parser = argparse.ArgumentParser(description='Creates enriched cluster info summary')
    parser.add_argument('param_xml', help='param_xml')
    parser.add_argument('input_clustersummary', help='input_clustersummary')
    parser.add_argument('input_clusterinfo', help='input_clusterinfo')
    parser.add_argument('output_clusterinfo', help='output_clusterinfo')
    args = parser.parse_args()

    params_object = ming_proteosafe_library.parse_xml_file(open(args.param_xml))
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(params_object)


    #Creating acceptable clusters to include in cluster info
    included_clusters = set()
    for row in csv.DictReader(open(args.input_clustersummary), delimiter='\t'):
        included_clusters.add(row["cluster index"])

    with open(args.input_clusterinfo) as input_clusterinfo:
        field_names = ["cluster index", "AllFiles", "sum(precursor intensity)", "RTMean", "RTStdErr", "parent mass", "ScanNumber", "ProteosafeFilePath", "Original_Path"]
        output_clusterinfo_writer = csv.DictWriter(open(args.output_clusterinfo, "w"), fieldnames=field_names, delimiter='\t')
        output_clusterinfo_writer.writeheader()

        input_clusterinfo_reader = csv.DictReader(input_clusterinfo, delimiter='\t')
        for row in input_clusterinfo_reader:
            if not (row["#ClusterIdx"] in included_clusters):
                continue
            output_dict = {}
            output_dict["cluster index"] = row["#ClusterIdx"]
            output_dict["AllFiles"] = row["#Filename"]
            output_dict["sum(precursor intensity)"] = row["#PrecIntensity"]
            output_dict["RTMean"] = row["#RetTime"]
            output_dict["RTStdErr"] = "0"
            output_dict["parent mass"] = row["#ParentMass"]
            output_dict["ScanNumber"] = row["#Scan"]
            output_dict["ProteosafeFilePath"] = os.path.join("spec", os.path.basename(row["#Filename"]))
            output_dict["Original_Path"] = "f." + mangled_mapping[os.path.basename(row["#Filename"])]
            output_clusterinfo_writer.writerow(output_dict)

    exit(0)
Example no. 38
def main():
    paramxml_input_filename = sys.argv[1]
    pairs_info_filename = sys.argv[2]
    clusterinfo_filename = sys.argv[3]
    output_all_paths_filename = sys.argv[4]
    output_all_paths_histogram_filename = sys.argv[5]

    params_obj = ming_proteosafe_library.parse_xml_file(
        open(paramxml_input_filename))
    try:
        if params_obj["CREATE_TOPOLOGY_SIGNATURES"][0] != "1":
            open(output_all_paths_filename, "w").write("NONE")
            open(output_all_paths_histogram_filename, "w").write("NONE")
            exit(0)
    except:
        open(output_all_paths_filename, "w").write("NONE")
        open(output_all_paths_histogram_filename, "w").write("NONE")
        exit(0)

    find_features_in_network(clusterinfo_filename, pairs_info_filename,
                             output_all_paths_filename,
                             output_all_paths_histogram_filename)
Example no. 39
def name_demangle_filenames_and_instrument_collision(input_file, output_file,
                                                     path_to_param,
                                                     path_to_original_results,
                                                     old_filename_header,
                                                     new_filename_header):
    row_count, table_data = ming_fileio_library.parse_table_with_headers(
        input_file, skip_incomplete_lines=True)
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(
        ming_proteosafe_library.parse_xml_file(open(path_to_param)))

    if not "FragMethod" in table_data:
        print("Demangling", path_to_original_results, input_file)
        collision_mapping = get_scan_mapping_for_collision_method(
            path_to_original_results)

        #Adding collision column
        table_data["FragMethod"] = []
        print(len(table_data["filename"]), len(table_data["scan"]))
        for i in range(row_count):
            key = table_data["filename"][i] + "_" + table_data["scan"][i]
            if key in collision_mapping:
                table_data["FragMethod"].append(collision_mapping[key])
            else:
                table_data["FragMethod"].append("NO_COLLISION")

    if old_filename_header == new_filename_header:
        for i in range(row_count):
            mangled_name = table_data[old_filename_header][i]
            unmangled_name = mangled_mapping[mangled_name]
            table_data[new_filename_header][i] = unmangled_name
    else:
        table_data[new_filename_header] = []
        for i in range(row_count):
            mangled_name = table_data[old_filename_header][i]
            unmangled_name = mangled_mapping[mangled_name]
            table_data[new_filename_header].append(unmangled_name)

    ming_fileio_library.write_dictionary_table_data(table_data, output_file)
def main():
    paramxml_input_filename = sys.argv[1]
    output_json_folder = sys.argv[2]
    parallelism = int(sys.argv[3])

    params_obj = ming_proteosafe_library.parse_xml_file(
        open(paramxml_input_filename))

    try:
        if params_obj["MATCH_REFERENCE_DATASETS"][0] != "1":
            parallelism = 1
    except:
        parallelism = 1
    all_datasets = []
    try:
        temp_datasets = ming_proteosafe_library.get_all_datasets()

        #Filtering datasets to reference datasets
        for dataset in temp_datasets:
            if dataset["title"].find("GNPS_ref_") != -1:
                all_datasets.append(dataset)

    except:
        all_datasets = []

    for i in range(parallelism):
        output_map = {"node_partition": i, "total_paritions": parallelism}
        partitioned_datasets = all_datasets[i::parallelism]
        output_map["all_datasets"] = partitioned_datasets

        dataset_map = {}
        for dataset in partitioned_datasets:
            dataset_map[dataset["dataset"]] = dataset

        output_map["dataset_dict"] = dataset_map
        output_filename = os.path.join(output_json_folder, str(i) + ".json")
        open(output_filename, "w").write(json.dumps(output_map))
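The stride slice `all_datasets[i::parallelism]` is what spreads the datasets across partitions; a small self-contained illustration with hypothetical dataset IDs:

#Illustration only: how a stride slice partitions a list across nodes
datasets = ["MSV001", "MSV002", "MSV003", "MSV004", "MSV005"]
parallelism = 2
for node in range(parallelism):
    print(node, datasets[node::parallelism])
#0 ['MSV001', 'MSV003', 'MSV005']
#1 ['MSV002', 'MSV004']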
def main():
    parser = argparse.ArgumentParser(description='Create parallel parameters')
    parser.add_argument('mgf_filename', help='Input mgf file to network')
    parser.add_argument('workflow_parameters', help='proteosafe xml parameters')
    parser.add_argument('parameters_output_folder', help='output folder for parameters')
    parser.add_argument('--parallelism', default=1, type=int, help='Parallelism')
    args = parser.parse_args()

    params_object = ming_proteosafe_library.parse_xml_file(open(args.workflow_parameters))

    #Determining the number of spectra in the mgf file
    number_of_spectra = number_scans_in_mgf_file(args.mgf_filename)

    parallelism = args.parallelism
    if parallelism > number_of_spectra:
        parallelism = 1

    number_per_partition = int(number_of_spectra/parallelism)
    for i in range(parallelism):
        output_parameter_file = open(os.path.join(args.parameters_output_folder, str(i) + ".params"), "w")
        output_parameter_file.write("ALIGNS_FORMAT=%s\n" % ("tsv"))
        output_parameter_file.write("MIN_MATCHED_PEAKS=%s\n" % (params_object["MIN_MATCHED_PEAKS"][0]))
        output_parameter_file.write("TOLERANCE_PEAK=%s\n" % (params_object["tolerance.Ion_tolerance"][0]))
        output_parameter_file.write("TOLERANCE_PM=%s\n" % (params_object["tolerance.PM_tolerance"][0]))
        output_parameter_file.write("PAIRS_MIN_COSINE=%s\n" % (params_object["PAIRS_MIN_COSINE"][0]))
        output_parameter_file.write("MAX_SHIFT=%s\n" % (params_object["MAX_SHIFT"][0]))
        output_parameter_file.write("INPUT_SPECTRA_MS2=%s\n" % (args.mgf_filename))


        start_idx = number_per_partition * i
        end_idx = number_per_partition * (i + 1) - 1
        if i == parallelism - 1:
            end_idx = number_of_spectra

        output_parameter_file.write("IDX_START=%d\n" % (start_idx))
        output_parameter_file.write("IDX_END=%d\n" % (end_idx))
def main():
    parser = argparse.ArgumentParser(description='Group Mapping from input, defaults and metadata file')
    parser.add_argument('proteosafe_parameters', help='proteosafe_parameters')
    parser.add_argument('groupmapping_folder', help='groupmapping_folder')
    parser.add_argument('attributemapping_folder', help='attributemapping_folder')
    parser.add_argument('metadata_folder', help='metadata_folder')
    parser.add_argument('output_groupmapping_file', help='output_groupmapping_file')
    parser.add_argument('output_attributemapping_file', help='output_attributemapping_file')
    parser.add_argument('inputspectrafolder', help='inputspectrafolder')
    args = parser.parse_args()

    param_obj = ming_proteosafe_library.parse_xml_file(open(args.proteosafe_parameters))
    mangled_file_mapping = ming_proteosafe_library.get_mangled_file_mapping(param_obj)
    reverse_file_mangling = ming_proteosafe_library.get_reverse_mangled_file_mapping(param_obj)
    file_path_prefix = args.inputspectrafolder

    output_group_file = open(args.output_groupmapping_file, "w")
    output_attribute_file = open(args.output_attributemapping_file, "w")

    """
    Writing Default Grouping to output file
    """
    default_groupings = {'G1' : [] , 'G2' : [] ,'G3' : [] ,'G4' : [] ,'G5' : [] ,'G6' : [] }
    for mangled_name in mangled_file_mapping.keys():
        if mangled_name.find("spec-") != -1:
            default_groupings['G1'].append(mangled_name.rstrip())
        if mangled_name.find("spectwo-") != -1:
            default_groupings['G2'].append(mangled_name.rstrip())
        if mangled_name.find("specthree-") != -1:
            default_groupings['G3'].append(mangled_name.rstrip())
        if mangled_name.find("specfour-") != -1:
            default_groupings['G4'].append(mangled_name.rstrip())
        if mangled_name.find("specfive-") != -1:
            default_groupings['G5'].append(mangled_name.rstrip())
        if mangled_name.find("specsix-") != -1:
            default_groupings['G6'].append(mangled_name.rstrip())

    for default_group_key in default_groupings.keys():
        default_group_string = ""
        default_group_string += "GROUP_" + default_group_key +"="
        for mangled_name in default_groupings[default_group_key]:
            default_group_string += os.path.join(file_path_prefix, mangled_name) + ";"
        if len(default_groupings[default_group_key]) > 0:
            default_group_string = default_group_string[:-1]
        output_group_file.write(default_group_string + "\n")


    """Determining output whether to use group mapping file or metadata file"""
    metadata_files_in_folder = ming_fileio_library.list_files_in_dir(args.metadata_folder)
    groupmapping_files_in_folder = ming_fileio_library.list_files_in_dir(args.groupmapping_folder)
    attributemapping_files_in_folder = ming_fileio_library.list_files_in_dir(args.attributemapping_folder)

    if len(metadata_files_in_folder) > 1:
        print("Too many metafile inputted")
        exit(1)
    if len(metadata_files_in_folder) == 1:
        #Using metadata file
        row_count, table_data = ming_fileio_library.parse_table_with_headers(metadata_files_in_folder[0])

        if not "filename" in table_data:
            print("Missing 'filename' header in metadata file. Please specify the file name that goes along with each piece of metadata with the header: filename")
            exit(1)

        attributes_to_groups_mapping = defaultdict(set)
        group_to_files_mapping = defaultdict(list)
        for i in range(row_count):
            filename = table_data["filename"][i]
            basename_filename = os.path.basename(filename).rstrip()
            if basename_filename in reverse_file_mangling:
                mangled_name = reverse_file_mangling[basename_filename]
                for key in table_data:
                    if key.find("ATTRIBUTE_") != -1:
                        group_name = table_data[key][i]
                        if len(group_name) < 1:
                            continue
                        group_to_files_mapping[group_name].append(os.path.join(file_path_prefix, mangled_name))
                        attributes_to_groups_mapping[key.replace("ATTRIBUTE_", "")].add(group_name)
            else:
                #Filename is not part of sample set
                continue

        for group_name in group_to_files_mapping:
            group_string = "GROUP_" + group_name + "="  + ";".join(group_to_files_mapping[group_name])
            output_group_file.write(group_string + "\n")

        for attribute_name in attributes_to_groups_mapping:
            attribute_string = attribute_name + "=" + ";".join(list(attributes_to_groups_mapping[attribute_name]))
            output_attribute_file.write(attribute_string + "\n")
        exit(0)

    """Falling back on old group mapping file"""
    if len(groupmapping_files_in_folder) > 1 or len(attributemapping_files_in_folder) > 1:
        print("Too many group/attribute mappings inputted")
        exit(1)

    if len(groupmapping_files_in_folder) == 1:
        for line in open(groupmapping_files_in_folder[0], errors='ignore'):
            splits = line.rstrip().split("=")
            if len(splits) < 2:
                continue

            group_name = splits[0]
            group_files = []
            for filename in splits[1].split(";"):
                if os.path.basename(filename) in reverse_file_mangling:
                    mangled_name = reverse_file_mangling[os.path.basename(filename)]
                    group_files.append(os.path.join(file_path_prefix, mangled_name))

            group_string = group_name + "=" + ";".join(group_files)
            output_group_file.write(group_string + "\n")

    if len(attributemapping_files_in_folder) == 1:
        for line in open(attributemapping_files_in_folder[0]):
            output_attribute_file.write(line)
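The two output files written above are plain text: each group line has the form GROUP_<name>=<semicolon-separated file paths> and each attribute line has the form <attribute>=<semicolon-separated group names>. An illustrative fragment with hypothetical names and paths:

GROUP_G1=spectra/spec-00000.mzXML;spectra/spec-00001.mzXML
GROUP_Control=spectra/spec-00000.mzXML
GROUP_Treatment=spectra/spec-00001.mzXML
SampleType=Control;Treatment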
def main():
    parser = argparse.ArgumentParser(description='Creating Clustering Info Summary')
    parser.add_argument('params_xml', help='params_xml')
    parser.add_argument('consensus_feature_file', help='Consensus Quantification File')
    parser.add_argument('metadata_folder', help='metadata_folder')
    parser.add_argument('mgf_filename', help='mgf_filename')
    parser.add_argument('output_clusterinfo_summary', help='output file')
    args = parser.parse_args()

    param_obj = ming_proteosafe_library.parse_xml_file(open(args.params_xml))

    task_id = param_obj["task"][0]

    group_to_files_mapping = defaultdict(list)
    attributes_to_groups_mapping = defaultdict(set)

    metadata_files = glob.glob(os.path.join(args.metadata_folder, "*"))
    if len(metadata_files) == 1:
        group_to_files_mapping, attributes_to_groups_mapping = load_group_attribute_mappings(metadata_files[0])

    ROW_NORMALIZATION = "None"
    try:
        ROW_NORMALIZATION = param_obj["QUANT_FILE_NORM"][0]
    except:
        ROW_NORMALIZATION = "None"

    GROUP_COUNT_AGGREGATE_METHOD = "Sum"
    try:
        GROUP_COUNT_AGGREGATE_METHOD = param_obj["GROUP_COUNT_AGGREGATE_METHOD"][0]
    except:
        GROUP_COUNT_AGGREGATE_METHOD = "None"


    quantification_list = ming_fileio_library.parse_table_with_headers_object_list(args.consensus_feature_file, delimiter=",")
    input_filenames, input_filename_headers = determine_input_files(quantification_list[0].keys())

    ### Filling in Quantification table if it is missing values
    for quantification_object in quantification_list:
        ###Handling empty quantification
        for filename in input_filename_headers:
            try:
                if len(quantification_object[filename]) == 0:
                    #print(filename, quantification_object[filename], quantification_object["row ID"])
                    quantification_object[filename] = 0
            except:
                #Value is missing or already numeric; leave it unchanged
                pass

    print("Number of Features", len(quantification_list))

    #Doing row sum normalization
    if ROW_NORMALIZATION == "RowSum":
        print("ROW SUM NORM")
        for filename_header in input_filename_headers:
            file_quants = [float(quantification_object[filename_header]) for quantification_object in quantification_list]
            for quantification_object in quantification_list:
                quantification_object[filename_header] = float(quantification_object[filename_header]) / sum(file_quants)

    """Loading MS2 Spectra"""
    mgf_collection = ming_spectrum_library.SpectrumCollection(args.mgf_filename)
    mgf_collection.load_from_file()

    clusters_list = []
    for quantification_object in quantification_list:

        cluster_obj = {}
        cluster_obj["cluster index"] = quantification_object["row ID"]
        cluster_obj["precursor mass"] = "{0:.4f}".format(float(quantification_object["row m/z"]))
        cluster_obj["RTConsensus"] = "{0:.4f}".format(float(quantification_object["row retention time"]))

        all_charges = []

        """Checking about the charge of this cluster"""
        try:
            spectrum_object = mgf_collection.scandict[int(cluster_obj["cluster index"])]
            charge = int(spectrum_object.charge)
        except:
            charge = 0

        """Checking if this spectrum has no peaks"""
        # try:
        #     spectrum_object = mgf_collection.scandict[int(cluster_obj["cluster index"])]
        #
        # except:
        #     continue

        all_files = [os.path.basename(filename) for filename in input_filename_headers if float(quantification_object[filename]) > 0]
        abundance_per_file = [(os.path.basename(filename), float(quantification_object[filename])) for filename in input_filename_headers]
        all_abundances = [float(quantification_object[filename]) for filename in input_filename_headers]

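        #Approximate the singly protonated parent mass as m/z * z - (z - 1), treating the proton mass as 1 Da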
        if charge != 0:
            cluster_obj["parent mass"] = "{0:.4f}".format(float(quantification_object["row m/z"]) * charge - charge + 1)
        else:
            cluster_obj["parent mass"] = "{0:.4f}".format(float(quantification_object["row m/z"]))
        cluster_obj["precursor charge"] = charge

        try:
            cluster_obj["RTMean"] = statistics.mean(all_retention_times)
            cluster_obj["RTStdErr"] = statistics.stdev(all_retention_times)
        except:
            cluster_obj["RTMean"] = cluster_obj["RTConsensus"]
            cluster_obj["RTStdErr"] = 0

        cluster_obj["GNPSLinkout_Cluster"] = 'https://gnps.ucsd.edu/ProteoSAFe/result.jsp?task=%s&view=view_all_clusters_withID#{"main.cluster index_lowerinput":"%s","main.cluster index_upperinput":"%s"}' % (task_id, quantification_object["row ID"], quantification_object["row ID"])
        #cluster_obj["AllFiles"] = "###".join(all_files)

        cluster_obj["sum(precursor intensity)"] = sum(all_abundances)
        cluster_obj["SumPeakIntensity"] = sum(all_abundances)
        cluster_obj["number of spectra"] = len(all_files)
        cluster_obj["UniqueFileSourcesCount"] = len(all_files)

        group_abundances = determine_group_abundances(group_to_files_mapping, abundance_per_file, operation=GROUP_COUNT_AGGREGATE_METHOD)

        default_groups = ["G1", "G2", "G3", "G4", "G5", "G6"]
        for group in group_to_files_mapping:
            group_header = "GNPSGROUP:" + group
            if group in default_groups:
                continue
            cluster_obj[group_header] = group_abundances[group]

        for group in default_groups:
            cluster_obj[group] = group_abundances[group]

        #Writing attributes
        for attribute in attributes_to_groups_mapping:
            groups_to_include = []
            for group in attributes_to_groups_mapping[attribute]:
                if group_abundances[group] > 0.0:
                    groups_to_include.append(group)
            if len(groups_to_include) == 0:
                cluster_obj[attribute] = ""
            else:
                cluster_obj[attribute] = ",".join(groups_to_include)


        """
        Enriching the cluster info with adduct collapsing information
        """
        enrich_adduct_annotations(cluster_obj, quantification_object)


        clusters_list.append(cluster_obj)

    ming_fileio_library.write_list_dict_table_data(clusters_list, args.output_clusterinfo_summary)
import sys
import os
import ming_proteosafe_library


param_obj = ming_proteosafe_library.parse_xml_file(open(sys.argv[1]))
output_filename = sys.argv[2]

output_sentences = []
output_sentences.append("<strong>Network Description</strong><br><br>\n\n")
output_sentences.append("A molecular network was created with the feature based molecular networking workflow (https://ccms-ucsd.github.io/GNPSDocumentation/featurebasedmolecularnetworking/) on the GNPS website (http://gnps.ucsd.edu).")
if param_obj["FILTER_PRECURSOR_WINDOW"][0] == "1":
    output_sentences.append("The data was filtered by removing all MS/MS fragment ions within +/- 17 Da of the precursor m/z.")
if param_obj["WINDOW_FILTER"][0] == "1":
    output_sentences.append("MS/MS spectra were window filtered by choosing only the top 6 fragment ions in the +/- 50Da window throughout the spectrum.")
output_sentences.append("The precursor ion mass tolerance was set to %s Da and a MS/MS fragment ion tolerance of %s Da." % (param_obj["tolerance.PM_tolerance"][0], param_obj["tolerance.Ion_tolerance"][0]))
output_sentences.append("A network was then created where edges were filtered to have a cosine score above %s and more than %s matched peaks." % (param_obj["PAIRS_MIN_COSINE"][0], param_obj["MIN_MATCHED_PEAKS"][0]))
output_sentences.append("Further, edges between two nodes were kept in the network if and only if each of the nodes appeared in each other's respective top %s most similar nodes." % (param_obj["TOPK"][0]))
output_sentences.append("Finally, the maximum size of a molecular family was set to %s, and the lowest scoring edges were removed from molecular families until the molecular family size was below this threshold." % (param_obj["MAXIMUM_COMPONENT_SIZE"][0]))
output_sentences.append("The spectra in the network were then searched against GNPS' spectral libraries.")
if param_obj["FILTER_LIBRARY"][0] == "1":
    output_sentences.append("The library spectra were filtered in the same manner as the input data.")
output_sentences.append("All matches kept between network spectra and library spectra were required to have a score above %s and at least %s matched peaks." % (param_obj["SCORE_THRESHOLD"][0], param_obj["MIN_MATCHED_PEAKS_SEARCH"][0]))
output_sentences.append("<br><br>\n<strong>Citation</strong><br><br>\n")
output_sentences.append('Wang, Mingxun, et al. "Sharing and community curation of mass spectrometry data with Global Natural Products Social Molecular Networking." Nature Biotechnology 34.8 (2016): 828-837. PMID: 27504778, https://www.nature.com/articles/nbt.3597')

open(output_filename, "w").write(" ".join(output_sentences))
def main():
    parser = argparse.ArgumentParser(description='Running library search parallel')
    parser.add_argument('spectra_folder', help='input spectra folder')
    parser.add_argument('json_parameters', help='parallel partition json parameters')
    parser.add_argument('workflow_parameters', help='proteosafe xml parameters')
    parser.add_argument('library_folder', help='spectral library folder')
    parser.add_argument('result_folder', help='output folder for results')
    parser.add_argument('convert_binary', help='path to the spectrum conversion binary')
    parser.add_argument('librarysearch_binary', help='path to the library search binary')
    parser.add_argument('--parallelism', default=1, type=int, help='Parallelism')
    args = parser.parse_args()

    parallel_json = json.loads(open(args.json_parameters).read())

    params_object = ming_proteosafe_library.parse_xml_file(open(args.workflow_parameters))
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(params_object)
    library_files = ming_fileio_library.list_files_in_dir(args.library_folder)
    spectra_files = ming_fileio_library.list_files_in_dir(args.spectra_folder)

    spectra_files.sort()

    print(spectra_files)
    spectra_files = spectra_files[parallel_json["node_partition"]::parallel_json["total_paritions"]]
    print(spectra_files)

    temp_folder = "temp"
    try:
        os.mkdir(temp_folder)
    except:
        print("folder error")

    tempresults_folder = "tempresults"
    try:
        os.mkdir(tempresults_folder)
    except:
        print("folder error")


    list_of_spectrumfiles = chunks(spectra_files, 5)
    parameter_list = []
    for spectrum_files_chunk in list_of_spectrumfiles:
        param_dict = {}
        param_dict["spectra_files"] = spectrum_files_chunk
        param_dict["temp_folder"] = temp_folder
        param_dict["tempresults_folder"] = tempresults_folder
        param_dict["args"] = args
        param_dict["params_object"] = params_object
        param_dict["library_files"] = library_files

        parameter_list.append(param_dict)

    #for param_dict in parameter_list:
    #    search_wrapper(param_dict)
    print("Parallel to execute", len(parameter_list))
    ming_parallel_library.run_parallel_job(search_wrapper, parameter_list, 5)


    """Merging Files and adding full path"""
    all_result_files = ming_fileio_library.list_files_in_dir(tempresults_folder)
    full_result_list = []
    for input_file in all_result_files:
        result_list = ming_fileio_library.parse_table_with_headers_object_list(input_file)
        full_result_list += result_list

    for result_object in full_result_list:
        mangled_name = os.path.basename(result_object["SpectrumFile"])
        full_path = mangled_mapping[mangled_name]
        result_object["full_CCMS_path"] = full_path

    ming_fileio_library.write_list_dict_table_data(full_result_list, os.path.join(args.result_folder, str(uuid.uuid4()) + ".tsv"))
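Note: `chunks` is not defined in this snippet. A minimal sketch of the usual helper, assuming it simply splits a list into consecutive sublists of at most the given size:

def chunks(items, chunk_size):
    #Sketch only: split a list into consecutive sublists of at most chunk_size elements
    return [items[i:i + chunk_size] for i in range(0, len(items), chunk_size)]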
def main():
    parser = argparse.ArgumentParser(description='Creates enriched cluster info summary')
    parser.add_argument('param_xml', help='param_xml')
    parser.add_argument('input_clusterinfo_file', help='input_clusterinfo_file')
    parser.add_argument('input_clusterinfosummary_file', help='input_clusterinfosummary_file')
    parser.add_argument('input_group_mapping_filename', help='input_group_mapping_filename')
    parser.add_argument('input_attribute_mapping_filename', help='input_attribute_mapping_filename')
    parser.add_argument('input_networking_pairs', help='input_networking_pairs')
    parser.add_argument('input_library_search', help='input_library_search')
    parser.add_argument('output_clusterinfosummary_filename', help='output_clusterinfosummary_filename')
    args = parser.parse_args()

    """Loading group filenames"""
    group_to_files, files_to_groups = load_group_mapping(args.input_group_mapping_filename)
    print("Loaded Group Mapping")
    cluster_summary_list = ming_fileio_library.parse_table_with_headers_object_list(args.input_clusterinfosummary_file)
    print("Loaded Cluster Summary")

    attribute_to_groups = load_attribute_mapping(args.input_attribute_mapping_filename)

    params_object = ming_proteosafe_library.parse_xml_file(open(args.param_xml))
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(params_object)

    CLUSTER_MIN_SIZE = int(params_object["CLUSTER_MIN_SIZE"][0])
    RUN_MSCLUSTER = params_object["RUN_MSCLUSTER"][0]

    #Calculating the spectrum counts per group
    cluster_to_group_counts = defaultdict(lambda: defaultdict(lambda: 0))
    cluster_to_files = defaultdict(set)
    cluster_to_RT = defaultdict(list)
    line_count = 0
    for line in open(args.input_clusterinfo_file):
        line_count += 1
        if line_count == 1:
            continue
        if line_count % 10000 == 0:
            print(line_count)

        splits = line.rstrip().split("\t")
        cluster_index = splits[0]
        filename = os.path.basename(splits[1])
        rt = float(splits[6])

        group_membership = files_to_groups[filename]
        cluster_to_files[cluster_index].add(filename)
        cluster_to_RT[cluster_index].append(rt)

        for group in group_membership:
            cluster_to_group_counts[cluster_index][group] += 1

    if RUN_MSCLUSTER == "on":
        cluster_summary_list = filter_clusters_based_on_cluster_size(cluster_summary_list, CLUSTER_MIN_SIZE)

    print(len(cluster_summary_list))

    print("Setting up grouping", len(group_to_files.keys()))
    for cluster_summary_object in cluster_summary_list:
        cluster_index = cluster_summary_object["cluster index"]
        for group in group_to_files:
            group_count = 0
            if group in cluster_to_group_counts[cluster_index]:
                group_count = cluster_to_group_counts[cluster_index][group]
            cluster_summary_object[group] = group_count

        for attribute in attribute_to_groups:
            groups_to_include = []
            for group in attribute_to_groups[attribute]:
                if group in cluster_summary_object:
                    if cluster_summary_object[group] > 0:
                        groups_to_include.append(group)

            cluster_summary_object[attribute] = ",".join(groups_to_include).replace("GNPSGROUP:", "")


    print("Default Attributes")
    calculate_default_attributes(cluster_summary_list, group_to_files.keys())

    print("calculate_cluster_file_stats")
    calculate_cluster_file_stats(cluster_summary_list, cluster_to_files, mangled_mapping)

    print("rt stats")
    calculate_rt_stats(cluster_summary_list, cluster_to_RT)

    print("calculate_ancillary_information")
    calculate_ancillary_information(cluster_summary_list, params_object["task"][0])

    print("populate_network_component")
    populate_network_component(cluster_summary_list, args.input_networking_pairs)

    print("populate_network_identifications")
    populate_network_identifications(cluster_summary_list, args.input_library_search)

    ming_fileio_library.write_list_dict_table_data(cluster_summary_list, args.output_clusterinfosummary_filename)
def main():
    parser = argparse.ArgumentParser(description='Running Kovats wrapper')
    parser.add_argument('libFiles', help='input')
    parser.add_argument('input', help='input')
    parser.add_argument('input_filtered', help='input_filtered')
    parser.add_argument('workflow_parameters', help='workflow_parameters')
    parser.add_argument('carbonMarker', help='Carbon_Marker_File')
    parser.add_argument('result_nonfiltered', help='Kovats_Result_Nonfiltered')
    parser.add_argument('result_filtered', help='Kovats_Result_Filtered')

    args = parser.parse_args()
    lib = args.libFiles
    param = args.workflow_parameters
    input = args.input
    input_filtered = args.input_filtered
    carbonMarker = args.carbonMarker
    result_nonfiltered = args.result_nonfiltered
    result_filtered = args.result_filtered

    #parse params
    params_obj = ming_proteosafe_library.parse_xml_file(open(param))
    try:
        cosineScore = float(params_obj["Kovats_Filter_Cosine_Threshold"][0])
    except:
        cosineScore = 0.9
    try:
        errorFilter = float(params_obj["Error_Filter_Threshold"][0])/100
    except:
        errorFilter = 0.1
    optin = False
    try:
        if params_obj["runKovats"][0] == "on":
            optin = True
    except:
        optin = False
    '''try:
        minimunFeature = int(params_obj["polyFitting_data_point"][0])
    except:
        minimunFeature = 10'''
    # set minimumFeature to be 10 currently
    minimunFeature = 10
    if not optin:
        #Write the opt-out placeholder to both output files
        open(result_nonfiltered, 'w').write('Kovats Calculation Opt Out')
        open(result_filtered, 'w').write('Kovats Calculation Opt Out')
        return
    #if there is no csv file
    if carbonMarker == '':
        supporting_file = polyFitting.getParams(input_filtered,cosineScore,1.5,lib,minimunFeature)
        if supporting_file is None:
            #Write the failure placeholder to both output files
            open(result_nonfiltered, 'w').write('Not enough data for polynomial fitting')
            open(result_filtered, 'w').write('Not enough data for polynomial fitting')
            return
        mode = 'p'
        #try:
        #    supporting_file = polyFitting.getParams(input,cosineScore,1.5)
        #    mode = 'p'
        #except:
        #    empty_tsv = open(result,'w')
        #    empty_tsv.write(param+'\n')
        #    empty_tsv.write(input+'\n')
        #    empty_tsv.write(carbonMarker+'\n')
        #    empty_tsv.write(result+'\n')
        #    return
    else:
        supporting_file = carbonMarker
        mode = 'm'
    #try:
    #    mapping.csv_builder(input,mode,supporting_file,cosineScore,errorFilter,result,lib)
    #except:
    #    empty_tsv = open(result,'w')
    #    empty_tsv.write('48,exit')
    #    return
    mapping.csv_builder(input,mode,supporting_file,cosineScore,errorFilter,result_nonfiltered,result_filtered,lib)
	def load_parameters_file(self, paramsfilename):
		#Loading the file mapping
		parameters = ming_proteosafe_library.parse_xml_file(open(paramsfilename, "r"))
		mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(parameters)
		self.mangled_mapping = mangled_mapping
Example no. 49
def main():
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('param_xml', help='param_xml')
    parser.add_argument('cluster_buckets', help='cluster_buckets')
    parser.add_argument('metadata_folder', help='metadata_folder')
    parser.add_argument('output_folder', help='output_folder')
    args = parser.parse_args()

    param_object = ming_proteosafe_library.parse_xml_file(open(args.param_xml, "r"))

    if param_object["CREATE_CLUSTER_BUCKETS"][0] == "0":
        print("Do not do things")
        exit(0)

    reverse_file_mangling = ming_proteosafe_library.get_reverse_mangled_file_mapping(param_object)

    """Reading Metadata File"""
    metadata_files_in_folder = ming_fileio_library.list_files_in_dir(args.metadata_folder)
    object_list = []

    if len(metadata_files_in_folder) != 1:
        for real_name in reverse_file_mangling:
            mangled_name = reverse_file_mangling[real_name]
            if mangled_name.find("spec") == -1:
                continue
            object_list.append({"filename" : real_name})
    else:
        print(metadata_files_in_folder[0])
        object_list = ming_fileio_library.parse_table_with_headers_object_list(metadata_files_in_folder[0])
        if len(object_list) == 0:
            for real_name in reverse_file_mangling:
                mangled_name = reverse_file_mangling[real_name]
                if mangled_name.find("spec") == -1:
                    continue
                object_list.append({"filename" : real_name})

    #Writing headers
    header_list = ["#SampleID", "BarcodeSequence", "LinkerPrimerSequence"]
    for key in object_list[0]:
        if not key in header_list:
            header_list.append(key)

    header_list.append("ATTRIBUTE_GNPSDefaultGroup")

    for metadata_object in object_list:
        if not "#SampleID" in metadata_object:
            if "#SampleID" in metadata_object:
                metadata_object["#SampleID"] = metadata_object["#SampleID"]
            else:
                #Stripping off all non-alphanumeric characters
                metadata_object["#SampleID"] = ''.join(ch for ch in metadata_object["filename"] if ch.isalnum())
        if not "Description" in metadata_object:
            metadata_object["Description"] = "LoremIpsum"
        if not "BarcodeSequence" in metadata_object:
            metadata_object["BarcodeSequence"] = "GATACA"
        if not "LinkerPrimerSequence" in metadata_object:
            metadata_object["LinkerPrimerSequence"] = "GATACA"

        try:
            mangled_name = reverse_file_mangling[metadata_object["filename"]]
            if mangled_name.find("spec-") != -1:
                metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G1"
            elif mangled_name.find("spectwo-") != -1:
                metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G2"
            elif mangled_name.find("specthree-") != -1:
                metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G3"
            elif mangled_name.find("specfour-") != -1:
                metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G4"
            elif mangled_name.find("specfive-") != -1:
                metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G5"
            elif mangled_name.find("specsix-") != -1:
                metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G6"
        except:
            print(metadata_object["filename"], "Not Mapped")
            metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "Not Mapped"

    output_metadata_filename = os.path.join(args.output_folder, "qiime2_metadata.tsv")
    output_manifest_filename = os.path.join(args.output_folder, "qiime2_manifest.tsv")

    for metadatum in object_list:
        if "sample_name" in metadatum:
            if len(metadatum["sample_name"]) > 1:
                metadatum["#SampleID"] = metadatum["sample_name"]

    metadata_df = pd.DataFrame(object_list)
    metadata_df.to_csv(output_metadata_filename, index=False, sep="\t", columns=header_list)

    """Outputting Manifest Filename"""
    manifest_df = pd.DataFrame()
    manifest_df["sample_name"] = metadata_df["#SampleID"]
    manifest_df["filepath"] = metadata_df["filename"]
    manifest_df.to_csv(output_manifest_filename, index=False, sep=",")

    """Calling remote server to do the calculation"""
    SERVER_BASE = "http://dorresteinappshub.ucsd.edu:5024"
    #SERVER_BASE = "http://mingwangbeta.ucsd.edu:5024"
    files = {'manifest': open(output_manifest_filename, 'r'), \
    'metadata': open(output_metadata_filename, 'r'), \
    'bucket': open(args.cluster_buckets, 'r')}


    r_post = requests.post(SERVER_BASE + "/processclassic", files=files)
    response_dict = r_post.json()

    with open(os.path.join(args.output_folder, "qiime2_table.qza"), 'wb') as f:
        r = requests.get(SERVER_BASE + response_dict["table_qza"], stream=True)
        r.raw.decode_content = True
        shutil.copyfileobj(r.raw, f)

    with open(os.path.join(args.output_folder, "qiime2_emperor.qzv"), 'wb') as f:
        r = requests.get(SERVER_BASE + response_dict["emperor_qzv"], stream=True)
        r.raw.decode_content = True
        shutil.copyfileobj(r.raw, f)
def main():
    paramxml_input_filename = sys.argv[1]
    parallel_param_filename = sys.argv[2]
    input_spectra_folder = sys.argv[3]
    library_search_results_filename = sys.argv[4]
    output_matches_filename = sys.argv[5]

    params_obj = ming_proteosafe_library.parse_xml_file(open(paramxml_input_filename))

    output_map = {"specs_filename" : [],"specs_scan" : [], "dataset_filename" : [], "dataset_scan" : [], "score" : [], "dataset_id" : [], "dataset_title" : [], "dataset_neighbors" : [], "Compound_Name" : [], "SpectrumID" : []}

    try:
        if params_obj["FIND_MATCHES_IN_PUBLIC_DATA"][0] != "1":
            ming_fileio_library.write_dictionary_table_data(output_map, output_matches_filename)
            exit(0)
    except:
        ming_fileio_library.write_dictionary_table_data(output_map, output_matches_filename)
        exit(0)


    #If we are doing parallel
    partition_total = 1
    partition_of_node = 0
    params_map = json.loads(open(parallel_param_filename).read())
    partition_total = params_map["total_paritions"]
    partition_of_node = params_map["node_partition"]

    dataset_dict = params_map["dataset_dict"]
    all_datasets = params_map["all_datasets"]

    #print(len(all_datasets))
    #print(partition_of_node)
    #print(partition_total)

    #all_datasets = all_datasets[partition_of_node::partition_total]

    all_matches = finding_matches_in_public_data(os.path.join(input_spectra_folder, "specs_ms.mgf"), all_datasets)

    #Let's parse the library search results and use them to annotate the matches
    library_search_result_count, library_search_data = ming_fileio_library.parse_table_with_headers(library_search_results_filename)
    scan_to_library_map = {}
    for i in range(library_search_result_count):
        scan = library_search_data["Scan"][i]
        scan_to_library_map[scan] = {"Compound_Name" : library_search_data["Compound_Name"][i], "SpectrumID" : library_search_data["SpectrumID"][i]}

    for dataset in all_matches:
        #For each dataset, lets try to find the clustering information
        if len(all_matches[dataset]["matches"]) == 0:
            continue

        most_recent_molecular_networking_job = ming_gnps_library.get_most_recent_continuous_networking_of_dataset(dataset_dict[dataset]["task"])
        molecular_network = get_molecular_network_obj(most_recent_molecular_networking_job)

        for match in all_matches[dataset]["matches"]:
            output_map['specs_filename'].append("specs_ms.mgf")
            output_map['specs_scan'].append(match.query_scan)
            output_map['dataset_id'].append(dataset_dict[dataset]["dataset"])
            output_map['dataset_title'].append(dataset_dict[dataset]["title"])
            output_map['dataset_filename'].append(match.filename)
            output_map['dataset_scan'].append(match.scan)
            output_map['score'].append(match.score)

            #List the library identifications
            if str(match.query_scan) in scan_to_library_map:
                output_map['Compound_Name'].append(scan_to_library_map[str(match.query_scan)]["Compound_Name"])
                output_map['SpectrumID'].append(scan_to_library_map[str(match.query_scan)]["SpectrumID"])
            else:
                output_map['Compound_Name'].append("")
                output_map['SpectrumID'].append("")

            #Let's find all the analogs available
            if molecular_network != None:
                neighbors_in_dataset = molecular_network.get_node_neighbors(match.scan)
                output_map['dataset_neighbors'].append(len(neighbors_in_dataset))
            else:
                output_map['dataset_neighbors'].append(0)



    ming_fileio_library.write_dictionary_table_data(output_map, output_matches_filename)
def main():
    parser = argparse.ArgumentParser(description='Creating Clustering Info Summary')
    parser.add_argument('proteosafe_parameters', help='proteosafe_parameters')
    parser.add_argument('metadata_folder', help='metadata_folder')
    parser.add_argument('output_metadata_file', help='output_metadata_file')
    args = parser.parse_args()

    param_obj = ming_proteosafe_library.parse_xml_file(open(args.proteosafe_parameters))

    mangled_file_mapping = ming_proteosafe_library.get_mangled_file_mapping(param_obj)

    default_group_mapping = defaultdict(list)
    file_to_group_mapping = {}
    for mangled_name in mangled_file_mapping:
        if mangled_name.find("specone-") != -1:
            default_group_mapping["G1"].append(mangled_file_mapping[mangled_name])
            file_to_group_mapping[os.path.basename(mangled_file_mapping[mangled_name])] = "G1"
        if mangled_name.find("spectwo-") != -1:
            default_group_mapping["G2"].append(mangled_file_mapping[mangled_name])
            file_to_group_mapping[os.path.basename(mangled_file_mapping[mangled_name])] = "G2"
        if mangled_name.find("specthree-") != -1:
            default_group_mapping["G3"].append(mangled_file_mapping[mangled_name])
            file_to_group_mapping[os.path.basename(mangled_file_mapping[mangled_name])] = "G3"
        if mangled_name.find("specfour-") != -1:
            default_group_mapping["G4"].append(mangled_file_mapping[mangled_name])
            file_to_group_mapping[os.path.basename(mangled_file_mapping[mangled_name])] = "G4"
        if mangled_name.find("specfive-") != -1:
            default_group_mapping["G5"].append(mangled_file_mapping[mangled_name])
            file_to_group_mapping[os.path.basename(mangled_file_mapping[mangled_name])] = "G5"
        if mangled_name.find("specsix-") != -1:
            default_group_mapping["G6"].append(mangled_file_mapping[mangled_name])
            file_to_group_mapping[os.path.basename(mangled_file_mapping[mangled_name])] = "G6"

    metadata_files_in_folder = ming_fileio_library.list_files_in_dir(args.metadata_folder)

    row_count = 0
    table_data = defaultdict(list)
    if len(metadata_files_in_folder) == 1:
        row_count, table_data = ming_fileio_library.parse_table_with_headers(metadata_files_in_folder[0])

    print(table_data)
    for key in table_data:
        print(key, len(table_data[key]))

    for i in range(row_count):
        print(i)
        filename = table_data["filename"][i]
        if len(filename) < 2:
            continue
        print(filename, filename[0], filename[-1])

        if filename[0] == "\"":
            filename = filename[1:]
        if filename[-1] == "\"":
            filename = filename[:-1]

        table_data["filename"][i] = filename

        basename_filename = os.path.basename(filename)
        group_name = "NoDefaultGroup"
        if basename_filename in file_to_group_mapping:
            group_name = file_to_group_mapping[basename_filename]
        table_data["ATTRIBUTE_DefaultGroup"].append(group_name)



    for input_filename in file_to_group_mapping:
        if input_filename in table_data["filename"]:
            continue
        else:
            for key in table_data:
                if key != "ATTRIBUTE_DefaultGroup" and key != "filename":
                    table_data[key].append("N/A")

            table_data["ATTRIBUTE_DefaultGroup"].append(file_to_group_mapping[input_filename])
            table_data["filename"].append(input_filename)

    ming_fileio_library.write_dictionary_table_data(table_data, args.output_metadata_file)
Example no. 52
def main():
    parser = argparse.ArgumentParser(description='Running msaccess summary in parallel')
    parser.add_argument('spectra_folder', help='input spectra folder')
    parser.add_argument('workflow_parameters', help='proteosafe xml parameters')
    parser.add_argument('result_file', help='output summary result file')
    parser.add_argument('msaccess_binary', help='path to the msaccess binary')
    parser.add_argument('--parallelism', default=1, type=int, help='Parallelism')
    args = parser.parse_args()


    params_object = ming_proteosafe_library.parse_xml_file(open(args.workflow_parameters))
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(params_object)
    spectra_files = ming_fileio_library.list_files_in_dir(args.spectra_folder)

    spectra_files.sort()


    tempresults_folder = "tempresults"
    try:
        os.mkdir(tempresults_folder)
    except:
        print("folder error")

    parameter_list = []
    for spectrum_file in spectra_files:
        param_dict = {}
        param_dict["spectrum_file"] = spectrum_file
        param_dict["tempresults_folder"] = tempresults_folder
        param_dict["args"] = args

        parameter_list.append(param_dict)

    #for param_dict in parameter_list:
    #    search_wrapper(param_dict)
    print("Parallel to execute", len(parameter_list))
    ming_parallel_library.run_parallel_job(summary_wrapper, parameter_list, 10)


    """Merging Files and adding full path"""
    all_result_files = ming_fileio_library.list_files_in_dir(tempresults_folder)
    full_result_list = []
    for input_file in all_result_files:
        try:
            result_list = ming_fileio_library.parse_table_with_headers_object_list(input_file)
            for result in result_list:
                output_dict = {}
                output_dict["Filename"] = result["Filename"]
                output_dict["Vendor"] = result["Vendor"]
                output_dict["Model"] = result["Model"]
                output_dict["MS1s"] = result["MS1s"]
                output_dict["MS2s"] = result["MS2s"]
                full_result_list.append(output_dict)
        except:
            #raise
            print("Error", input_file)

        #print(result_list)
        #full_result_list += result_list

    for result_object in full_result_list:
        mangled_name = os.path.basename(result_object["Filename"])
        full_path = mangled_mapping[mangled_name]
        result_object["full_CCMS_path"] = full_path

    ming_fileio_library.write_list_dict_table_data(full_result_list, args.result_file)