Example #1
def main():
    parser = argparse.ArgumentParser(description='Running library search parallel')
    parser.add_argument('filestats', help='filestats')
    parser.add_argument('dbresults', help='dbresults')
    parser.add_argument('output_filestats', help='output filestats table with identification counts')
    args = parser.parse_args()

    identified_spectra_in_filename = defaultdict(set)

    all_identifications = ming_fileio_library.parse_table_with_headers_object_list(args.dbresults)
    for identification in all_identifications:
        filename = identification["full_CCMS_path"]
        scan = identification["#Scan#"]

        identified_spectra_in_filename[filename].add(scan)

    print(identified_spectra_in_filename)

    output_list = []
    file_summaries = ming_fileio_library.parse_table_with_headers_object_list(args.filestats)

    for file_summary in file_summaries:
        filename = file_summary["full_CCMS_path"]
        count = len(identified_spectra_in_filename[filename])
        file_summary["identified_ms2"] = count
        percent_identified = 0.0
        try:
            percent_identified = float(count) / float(file_summary["MS2s"])
        except (ValueError, ZeroDivisionError, KeyError):
            #Leave at 0.0 when the MS2 count is missing, non-numeric, or zero
            percent_identified = 0.0

        file_summary["percent_identified"] = percent_identified
        output_list.append(file_summary)

    ming_fileio_library.write_list_dict_table_data(output_list, args.output_filestats)
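
#A minimal, self-contained sketch of the "percent identified" computation
#above, using only the standard-library csv module. The file paths and the
#tab delimiter are assumptions for illustration; the column names
#(full_CCMS_path, #Scan#, MS2s) are taken from the snippet above.
import csv
from collections import defaultdict

def summarize_identifications(filestats_path, dbresults_path, output_path):
    identified = defaultdict(set)
    with open(dbresults_path) as handle:
        for row in csv.DictReader(handle, delimiter="\t"):
            identified[row["full_CCMS_path"]].add(row["#Scan#"])

    with open(filestats_path) as handle:
        summaries = list(csv.DictReader(handle, delimiter="\t"))

    for summary in summaries:
        count = len(identified[summary["full_CCMS_path"]])
        summary["identified_ms2"] = count
        try:
            summary["percent_identified"] = count / float(summary["MS2s"])
        except (ValueError, ZeroDivisionError, KeyError):
            summary["percent_identified"] = 0.0

    with open(output_path, "w", newline="") as handle:
        writer = csv.DictWriter(handle, fieldnames=list(summaries[0].keys()), delimiter="\t")
        writer.writeheader()
        writer.writerows(summaries)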
def main():
    input_library_identifications = sys.argv[1]
    output_library_identifications = sys.argv[2]

    annotations_list = ming_fileio_library.parse_table_with_headers_object_list(input_library_identifications)

    already_identified_compounds = set()
    already_identified_spectra = set()

    annotations_list = sorted(annotations_list, key=lambda identification: float(identification["MQScore"]), reverse=True)

    output_annotation_list = []
    for annotation in annotations_list:
        compound_name = annotation["Compound_Name"]
        spectrum_identifier = annotation["#Scan#"] + ":" + annotation["SpectrumFile"]

        if compound_name in already_identified_compounds:
            continue
        if spectrum_identifier in already_identified_spectra:
            continue

        print(compound_name, spectrum_identifier)

        output_annotation_list.append(annotation)
        already_identified_compounds.add(compound_name)
        already_identified_spectra.add(spectrum_identifier)

    ming_fileio_library.write_list_dict_table_data(output_annotation_list, output_library_identifications)
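
#Hedged sketch of the dedup pattern above: sort annotations by MQScore
#(descending) and keep only the first hit per compound name and per spectrum.
#The toy rows in the usage comment are hypothetical.
def dedup_best_hits(annotations):
    annotations = sorted(annotations, key=lambda row: float(row["MQScore"]), reverse=True)
    seen_compounds, seen_spectra, best = set(), set(), []
    for row in annotations:
        spectrum_id = row["#Scan#"] + ":" + row["SpectrumFile"]
        if row["Compound_Name"] in seen_compounds or spectrum_id in seen_spectra:
            continue
        best.append(row)
        seen_compounds.add(row["Compound_Name"])
        seen_spectra.add(spectrum_id)
    return best

#Usage (toy rows): dedup_best_hits([
#    {"Compound_Name": "A", "#Scan#": "1", "SpectrumFile": "f.mzML", "MQScore": "0.7"},
#    {"Compound_Name": "A", "#Scan#": "2", "SpectrumFile": "f.mzML", "MQScore": "0.9"}])
#keeps only the higher-scoring scan 2 hit for compound A.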
def main():
    parser = argparse.ArgumentParser(description='Creates alan table')
    parser.add_argument('input_identifications_filename', help='input_identifications_filename')
    parser.add_argument('output_filename', help='output_filename')
    args = parser.parse_args()

    print(args.input_identifications_filename)

    data_list = ming_fileio_library.parse_table_with_headers_object_list(args.input_identifications_filename)

    all_filenames = set()
    compounds_to_files = defaultdict(set)
    for data_object in data_list:
        query_filename = "f." + data_object["full_CCMS_path"]
        compound_name = data_object["Compound_Name"]
        all_filenames.add(query_filename)
        compounds_to_files[compound_name].add(query_filename)

    output_list = []
    for compound_name in compounds_to_files:
        output_dict = {}
        output_dict["LibraryID"] = compound_name
        output_dict["TotalFiles"] = len(compounds_to_files[compound_name])
        for filename in compounds_to_files[compound_name]:
            output_dict[filename] = "1"

        for filename in all_filenames:
            if not filename in output_dict:
                output_dict[filename] = "0"

        output_list.append(output_dict)

    ming_fileio_library.write_list_dict_table_data(output_list, args.output_filename)
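
#Sketch of the same presence/absence table built with pandas (already used
#elsewhere in this codebase). Column names mirror the snippet above; the
#crosstab formulation is an assumed equivalent, not the workflow's own code.
import pandas as pd

def build_presence_table(identifications_df):
    df = identifications_df.copy()
    df["query_filename"] = "f." + df["full_CCMS_path"]
    presence = pd.crosstab(df["Compound_Name"], df["query_filename"])
    presence = (presence > 0).astype(int)
    presence.insert(0, "TotalFiles", presence.sum(axis=1))
    return presence.reset_index().rename(columns={"Compound_Name": "LibraryID"})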
Example #4
def simple_presence_of_merged_spectra_processing(input_integrals_filename, output_clusterinfo_filename, mangled_mapping):
    extension_stripped_mangled_mapping = {}
    for key in mangled_mapping:
        without_ext = ming_fileio_library.get_filename_without_extension(key)
        extension_stripped_mangled_mapping[without_ext] = mangled_mapping[key]


    header_order = open(input_integrals_filename).readline().rstrip().split(",")[1:]

    table_list = ming_fileio_library.parse_table_with_headers_object_list(input_integrals_filename, delimiter=",")
    #Removing the extra header information rows
    table_list = table_list[2:]

    output_dict = defaultdict(list)

    print("for zheng's sanity print the wholetable ----")
    print(table_list)
    for result_object in table_list:
        try:
            sample_name = result_object["RTS:"]
        except KeyError:
            sample_name = "unknown"
        scan_number = 0
        for header in header_order:
            scan_number += 1
            abundance = result_object[header]
            output_dict["filename"].append( sample_name )
            output_dict["abundance"].append( abundance )
            output_dict["scan_number"].append( scan_number )
            output_dict["RT"].append( header )

    ming_fileio_library.write_dictionary_table_data(output_dict, output_clusterinfo_filename)
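
#Hedged sketch of the same wide-to-long reshape with pandas.melt, assuming the
#sample-name column is literally labeled "RTS:", the remaining columns are
#retention times, and the first two data rows are extra header rows (matching
#table_list[2:] above). scan_number is recovered from the RT column order.
import pandas as pd

def integrals_to_long(integrals_csv):
    table = pd.read_csv(integrals_csv)
    rt_columns = list(table.columns[1:])
    table = table.iloc[2:]
    long_table = table.melt(id_vars=["RTS:"], value_vars=rt_columns,
                            var_name="RT", value_name="abundance")
    long_table = long_table.rename(columns={"RTS:": "filename"})
    long_table["scan_number"] = long_table["RT"].map(
        {rt: index + 1 for index, rt in enumerate(rt_columns)})
    return long_table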
def main():
    input_library_identifications = sys.argv[1]
    output_library_identifications = sys.argv[2]

    annotations_list = ming_fileio_library.parse_table_with_headers_object_list(
        input_library_identifications)

    already_identified_compounds = set()
    already_identified_spectra = set()

    annotations_list = sorted(
        annotations_list,
        key=lambda identification: float(identification["MQScore"]),
        reverse=True)

    output_annotation_list = []
    for annotation in annotations_list:
        compound_name = annotation["Compound_Name"]
        spectrum_identifier = annotation["#Scan#"] + ":" + annotation[
            "SpectrumFile"]

        if compound_name in already_identified_compounds:
            continue
        if spectrum_identifier in already_identified_spectra:
            continue

        print(compound_name, spectrum_identifier)

        output_annotation_list.append(annotation)
        already_identified_compounds.add(compound_name)
        already_identified_spectra.add(spectrum_identifier)

    ming_fileio_library.write_list_dict_table_data(
        output_annotation_list, output_library_identifications)
def add_additional_edges(G, path_to_supplemental_edges):
    edge_list = ming_fileio_library.parse_table_with_headers_object_list(path_to_supplemental_edges, delimiter=",")

    edges_to_add = []

    for additional_edge_row in edge_list:
        try:
            node1 = additional_edge_row["ID1"]
            node2 = additional_edge_row["ID2"]
            
            node1_mz = G.node[node1]["precursor mass"]
            node2_mz = G.node[node2]["precursor mass"]

            mass_difference = float(node1_mz) - float(node2_mz)

            edgetype = additional_edge_row["EdgeType"]
            score = additional_edge_row["Score"]
            annotation = additional_edge_row["Annotation"]

            edge_object = {}
            edge_object["node1"] = node1
            edge_object["node2"] = node2
            edge_object["EdgeType"] = edgetype
            edge_object["EdgeAnnotation"] = annotation.rstrip()
            edge_object["EdgeScore"] = float(score)
            edge_object["mass_difference"] = mass_difference
            edges_to_add.append((node1, node2, edge_object))
        except Exception:
            print("Error Adding Edge")
            continue

    G.add_edges_from(edges_to_add)

    return G
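
#Minimal sketch of the bulk edge insertion pattern above with networkx:
#(node1, node2, attribute_dict) tuples passed to G.add_edges_from. Note that
#the G.node[...] accessor used above was removed in networkx 2.4; newer
#versions expose node attributes as G.nodes[...]. The toy rows are hypothetical.
import networkx as nx

def add_supplemental_edges(G, rows):
    edges_to_add = []
    for row in rows:
        attributes = {
            "EdgeType": row["EdgeType"],
            "EdgeAnnotation": row["Annotation"].rstrip(),
            "EdgeScore": float(row["Score"]),
        }
        edges_to_add.append((row["ID1"], row["ID2"], attributes))
    G.add_edges_from(edges_to_add)
    return G

#Usage: add_supplemental_edges(nx.Graph(), [{"ID1": "1", "ID2": "2",
#    "EdgeType": "manual", "Annotation": "bridge ", "Score": "0.5"}])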
def add_additional_edges(G, path_to_supplemental_edges):
    edge_list = ming_fileio_library.parse_table_with_headers_object_list(
        path_to_supplemental_edges, delimiter=",")

    edges_to_add = []

    for additional_edge_row in edge_list:
        node1 = additional_edge_row["ID1"]
        node2 = additional_edge_row["ID2"]

        edgetype = additional_edge_row["EdgeType"]
        score = additional_edge_row["Score"]
        annotation = additional_edge_row["Annotation"]

        edge_object = {}
        edge_object["node1"] = node1
        edge_object["node2"] = node2
        edge_object["EdgeType"] = edgetype
        edge_object["EdgeAnnotation"] = annotation.rstrip()
        edge_object["EdgeScore"] = float(score)

        edges_to_add.append((node1, node2, edge_object))

    G.add_edges_from(edges_to_add)

    return G
def add_additional_edges(G, path_to_supplemental_edges):
    edge_list = ming_fileio_library.parse_table_with_headers_object_list(path_to_supplemental_edges, delimiter=",")

    edges_to_add = []

    for additional_edge_row in edge_list:
        node1 = additional_edge_row["ID1"]
        node2 = additional_edge_row["ID2"]

        edgetype = additional_edge_row["EdgeType"]
        score = additional_edge_row["Score"]
        annotation = additional_edge_row["Annotation"]

        edge_object = {}
        edge_object["node1"] = node1
        edge_object["node2"] = node2
        edge_object["EdgeType"] = edgetype
        edge_object["EdgeAnnotation"] = annotation.rstrip()
        edge_object["EdgeScore"] = float(score)

        edges_to_add.append((node1, node2, edge_object))

    G.add_edges_from(edges_to_add)

    return G
def main():
    results_filename = sys.argv[1]
    output_filename_unique_files = sys.argv[2]
    output_filename_all_matches = sys.argv[3]

    all_datasets = ming_gnps_library.get_all_datasets(gnps_only=True)
    all_matches = ming_fileio_library.parse_table_with_headers_object_list(
        results_filename)

    output_source_list = []
    output_match_list = []

    MetaDataServerStatus = test_metadata_server()

    for match_object in all_matches:
        dataset_accession = match_object["dataset_id"]
        dataset_scan = match_object["dataset_scan"]

        #output_source_list += trace_filename(all_datasets, dataset_accession, dataset_scan)
        current_filelist, current_match_list = trace_filename_filesystem(
            all_datasets,
            dataset_accession,
            dataset_scan,
            enrichmetadata=MetaDataServerStatus)
        output_source_list += current_filelist
        output_match_list += current_match_list

    ming_fileio_library.write_list_dict_table_data(
        output_source_list, output_filename_unique_files)
    ming_fileio_library.write_list_dict_table_data(
        output_match_list, output_filename_all_matches)
def main():
    parser = argparse.ArgumentParser(description='Creates alan table')
    parser.add_argument('input_clusterinfosummary',
                        help='input_clusterinfosummary')
    parser.add_argument('output_filename', help='output_filename')
    args = parser.parse_args()

    print(args.input_clusterinfosummary)

    data_list = ming_fileio_library.parse_table_with_headers_object_list(
        args.input_clusterinfosummary)

    all_filenames = []
    for data_object in data_list:
        if "UniqueFileSources" in data_object:
            all_filenames += data_object["UniqueFileSources"].split("|")
        else:
            filenames = list(
                set([
                    filename.split(":")[0]
                    for filename in data_object["AllFiles"].split("###")
                    if len(filename) > 2
                ]))
            all_filenames += filenames

    all_filenames = list(set(all_filenames))

    compounds_to_files = defaultdict(list)
    for data_object in data_list:
        filenames = []
        if "UniqueFileSources" in data_object:
            filenames = data_object["UniqueFileSources"].split("|")
        else:
            filenames = list(
                set([
                    filename.split(":")[0]
                    for filename in data_object["AllFiles"].split("###")
                    if len(filename) > 2
                ]))
        compound_name = data_object["LibraryID"]
        compounds_to_files[compound_name] += filenames

    output_list = []
    for compound_name in compounds_to_files:
        output_dict = {}
        output_dict["LibraryID"] = compound_name
        output_dict["TotalFiles"] = len(compounds_to_files[compound_name])
        for filename in compounds_to_files[compound_name]:
            output_dict[filename] = "1"

        for filename in all_filenames:
            if not filename in output_dict:
                output_dict[filename] = "0"

        output_list.append(output_dict)

    ming_fileio_library.write_list_dict_table_data(output_list,
                                                   args.output_filename)
def load_library_id_dict(library_filename):
    results_list = ming_fileio_library.parse_table_with_headers_object_list(library_filename)

    output_dict = {}
    for result_obj in results_list:
        scan = result_obj["#Scan#"]
        output_dict[scan] = result_obj

    return output_dict
def load_library_id_dict(library_filename):
    results_list = ming_fileio_library.parse_table_with_headers_object_list(
        library_filename)

    output_dict = {}
    for result_obj in results_list:
        scan = result_obj["#Scan#"]
        output_dict[scan] = result_obj

    return output_dict
def main():
    parser = argparse.ArgumentParser(description='Creating Clustering Info Summary')
    parser.add_argument('params_xml', help='params_xml')
    parser.add_argument('input_clusterinfo_summary', help='Input cluster info summary')
    parser.add_argument('input_network_pairs_file', help='network_pairs_file')
    parser.add_argument('input_library_search_file', help='library_search_file')
    parser.add_argument('output_clusterinfo_summary', help='output file')
    parser.add_argument('output_component_summary', help='output component file')
    args = parser.parse_args()

    param_obj = ming_proteosafe_library.parse_xml_file(open(args.params_xml))

    all_clusterinfo_list = ming_fileio_library.parse_table_with_headers_object_list(args.input_clusterinfo_summary)

    library_ids_dict = load_library_id_dict(args.input_library_search_file)
    nodes_to_component, component_to_nodes = load_pairs_dict(args.input_network_pairs_file)

    for cluster in all_clusterinfo_list:
        cluster_index = cluster["cluster index"]
        if cluster_index in nodes_to_component:
            cluster["componentindex"] = nodes_to_component[cluster_index]
            cluster["GNPSLinkout_Network"] = "https://gnps.ucsd.edu/ProteoSAFe/result.jsp?view=network_displayer&componentindex=%s&task=%s" % (nodes_to_component[cluster_index], param_obj["task"][0])
        else:
            cluster["componentindex"] = "-1"
            cluster["GNPSLinkout_Network"] = 'https://gnps.ucsd.edu/ProteoSAFe/result.jsp?task=%s&view=view_all_clusters_withID#{"main.cluster index_lowerinput":"%s","main.cluster index_upperinput":"%s"}' % (param_obj["task"][0], cluster_index, cluster_index)

        if cluster_index in library_ids_dict:
            cluster["LibraryID"] = library_ids_dict[cluster_index]["Compound_Name"]
            cluster["MQScore"] = library_ids_dict[cluster_index]["MQScore"]
            cluster["SpectrumID"] = library_ids_dict[cluster_index]["SpectrumID"]
        else:
            cluster["LibraryID"] = "N/A"
            cluster["MQScore"] = "N/A"
            cluster["SpectrumID"] = "N/A"

    ming_fileio_library.write_list_dict_table_data(all_clusterinfo_list, args.output_clusterinfo_summary)

    output_component_list = []

    for componentindex in component_to_nodes:
        output_dict = {}
        output_dict["ComponentIndex"] = componentindex
        output_dict["NodeCount"] = len(component_to_nodes[componentindex])
        output_dict["#Spectra"] = len(component_to_nodes[componentindex])
        all_lib_identifications = []
        for node in component_to_nodes[componentindex]:
            if node in library_ids_dict:
                all_lib_identifications.append(library_ids_dict[node]["Compound_Name"])
        output_dict["AllIDs"] = "!".join(all_lib_identifications)
        output_component_list.append(output_dict)

    ming_fileio_library.write_list_dict_table_data(output_component_list, args.output_component_summary)
def main():
    results_filename = sys.argv[1]
    output_filename = sys.argv[2]

    input_results = ming_fileio_library.parse_table_with_headers_object_list(
        results_filename)
    output_results = []

    #Check if server is up

    for result_object in input_results:
        filename = result_object["filename"]
        get_metadata_information_per_filename(filename)

    ming_fileio_library.write_list_dict_table_data(output_results,
                                                   output_filename)
def main():
    input_intermediate_folder = sys.argv[1]
    output_filename = sys.argv[2]

    all_protein_stats = {}

    #Creating a command line for each partition
    all_intermediate_files = ming_fileio_library.list_files_in_dir(
        input_intermediate_folder)
    output_list = []
    for parallel_output_filename in all_intermediate_files:
        result_list = ming_fileio_library.parse_table_with_headers_object_list(
            parallel_output_filename)
        output_list += result_list

    ming_fileio_library.write_list_dict_table_data(output_list,
                                                   output_filename)
def populate_network_identifications(cluster_summary_list, library_search_filename):
    clusters_to_identifications = {}
    library_ids_list = ming_fileio_library.parse_table_with_headers_object_list(library_search_filename)
    for library_id in library_ids_list:
        cluster_index = library_id["#Scan#"]
        clusters_to_identifications[cluster_index] = library_id

    fields_to_copy = ["Smiles", "MQScore", "MassDiff", "MZErrorPPM", "SpectrumID"]
    for cluster in cluster_summary_list:
        cluster_index = cluster["cluster index"]
        if cluster_index in clusters_to_identifications:
            cluster["LibraryID"] = clusters_to_identifications[cluster_index]["Compound_Name"]
            for field in fields_to_copy:
                cluster[field] = clusters_to_identifications[cluster_index][field]
        else:
            cluster["LibraryID"] = "N/A"
            for field in fields_to_copy:
                cluster[field] = "N/A"
def load_pairs_dict(pairs_filename):
    results_list = ming_fileio_library.parse_table_with_headers_object_list(pairs_filename)

    node_to_component = {}
    component_to_node = defaultdict(set)

    for result_obj in results_list:
        node1 = result_obj["CLUSTERID1"]
        node2 = result_obj["CLUSTERID2"]
        component = result_obj["ComponentIndex"]

        node_to_component[node1] = component
        node_to_component[node2] = component

        component_to_node[component].add(node1)
        component_to_node[component].add(node2)

    return node_to_component, component_to_node
def populate_network_identifications(cluster_summary_list, library_search_filename):
    clusters_to_identifications = {}
    library_ids_list = ming_fileio_library.parse_table_with_headers_object_list(library_search_filename)
    for library_id in library_ids_list:
        cluster_index = library_id["#Scan#"]
        clusters_to_identifications[cluster_index] = library_id

    fields_to_copy = ["Smiles", "MQScore", "MassDiff", "MZErrorPPM", "SpectrumID"]
    for cluster in cluster_summary_list:
        cluster_index = cluster["cluster index"]
        if cluster_index in clusters_to_identifications:
            cluster["LibraryID"] = clusters_to_identifications[cluster_index]["Compound_Name"]
            for field in fields_to_copy:
                cluster[field] = clusters_to_identifications[cluster_index][field]
        else:
            cluster["LibraryID"] = "N/A"
            for field in fields_to_copy:
                cluster[field] = "N/A"
def load_pairs_dict(pairs_filename):
    results_list = ming_fileio_library.parse_table_with_headers_object_list(
        pairs_filename)

    node_to_component = {}
    component_to_node = defaultdict(set)

    for result_obj in results_list:
        node1 = result_obj["CLUSTERID1"]
        node2 = result_obj["CLUSTERID2"]
        component = result_obj["ComponentIndex"]

        node_to_component[node1] = component
        node_to_component[node2] = component

        component_to_node[component].add(node1)
        component_to_node[component].add(node2)

    return node_to_component, component_to_node
def populate_dataset_metadata(input_metadata_filename):
    Filename.create_table(True)
    Attribute.create_table(True)
    AttributeTerm.create_table(True)
    Compound.create_table(True)
    CompoundFilenameConnection.create_table(True)
    FilenameAttributeConnection.create_table(True)
    CompoundTag.create_table(True)
    CompoundTagFilenameConnection.create_table(True)

    #Check if dataset metadata is in the database already
    included_accessions = []
    # try:
    #     accession_attribute = Attribute.select().where(Attribute.categoryname == "ATTRIBUTE_DatasetAccession")[0]
    #     for joined in FilenameAttributeConnection.select().where(FilenameAttributeConnection.attribute == accession_attribute).group_by(FilenameAttributeConnection.attributeterm):
    #         included_accessions.append(joined.attributeterm.term)
    # except:
    #     print("No Accessions")

    result_list = ming_fileio_library.parse_table_with_headers_object_list(
        input_metadata_filename, "\t")

    metadata_by_accession = defaultdict(list)

    for result in result_list:
        massive_accession = result["MassiveID"]
        metadata_by_accession[massive_accession].append(result)

    total_added_files = 0

    for dataset_accession in metadata_by_accession:
        print("Attempting Import", dataset_accession)
        if dataset_accession in included_accessions:
            print("Skipping %s, already imported" % (dataset_accession))
            continue
        added_files = add_metadata_per_accession(
            dataset_accession, metadata_by_accession[dataset_accession])
        total_added_files += added_files
        print(dataset_accession, len(metadata_by_accession[dataset_accession]),
              added_files)

    return total_added_files
def main():
    parser = argparse.ArgumentParser(description='Creates alan table')
    parser.add_argument('input_identifications_filename',
                        help='input_identifications_filename')
    parser.add_argument('output_filename', help='output_filename')
    args = parser.parse_args()

    print(args.input_identifications_filename)

    data_list = ming_fileio_library.parse_table_with_headers_object_list(
        args.input_identifications_filename)

    all_filenames = set()
    compounds_to_files = defaultdict(set)
    for data_object in data_list:
        query_filename = "f." + data_object["full_CCMS_path"]
        compound_name = data_object["Compound_Name"]
        all_filenames.add(query_filename)
        compounds_to_files[compound_name].add(query_filename)

    output_list = []
    for compound_name in compounds_to_files:
        output_dict = {}
        output_dict["LibraryID"] = compound_name
        output_dict["TotalFiles"] = len(compounds_to_files[compound_name])
        for filename in compounds_to_files[compound_name]:
            output_dict[filename] = "1"

        for filename in all_filenames:
            if not filename in output_dict:
                output_dict[filename] = "0"

        output_list.append(output_dict)

    ming_fileio_library.write_list_dict_table_data(output_list,
                                                   args.output_filename)
def main():
    parser = argparse.ArgumentParser(description='Creates enriched cluster info summary')
    parser.add_argument('param_xml', help='param_xml')
    parser.add_argument('input_clusterinfo_file', help='input_clusterinfo_file')
    parser.add_argument('input_clusterinfosummary_file', help='input_clusterinfosummary_file')
    parser.add_argument('input_group_mapping_filename', help='input_group_mapping_filename')
    parser.add_argument('input_attribute_mapping_filename', help='input_attribute_mapping_filename')
    parser.add_argument('input_networking_pairs', help='input_networking_pairs')
    parser.add_argument('input_library_search', help='input_library_search')
    parser.add_argument('output_clusterinfosummary_filename', help='output_clusterinfosummary_filename')
    args = parser.parse_args()

    """Loading group filenames"""
    group_to_files, files_to_groups = load_group_mapping(args.input_group_mapping_filename)
    print("Loaded Group Mapping")
    cluster_summary_list = ming_fileio_library.parse_table_with_headers_object_list(args.input_clusterinfosummary_file)
    print("Loaded Cluster Summary")

    attribute_to_groups = load_attribute_mapping(args.input_attribute_mapping_filename)

    params_object = ming_proteosafe_library.parse_xml_file(open(args.param_xml))
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(params_object)

    CLUSTER_MIN_SIZE = int(params_object["CLUSTER_MIN_SIZE"][0])
    RUN_MSCLUSTER = params_object["RUN_MSCLUSTER"][0]

    #Calculating the spectrum counts per group
    cluster_to_group_counts = defaultdict(lambda: defaultdict(lambda: 0))
    cluster_to_files = defaultdict(set)
    cluster_to_RT = defaultdict(list)
    line_count = 0
    for line in open(args.input_clusterinfo_file):
        line_count += 1
        if line_count == 1:
            continue
        if line_count % 10000 == 0:
            print(line_count)

        splits = line.rstrip().split("\t")
        cluster_index = splits[0]
        filename = os.path.basename(splits[1])
        rt = float(splits[6])

        group_membership = files_to_groups[filename]
        cluster_to_files[cluster_index].add(filename)
        cluster_to_RT[cluster_index].append(rt)

        for group in group_membership:
            cluster_to_group_counts[cluster_index][group] += 1

    if RUN_MSCLUSTER == "on":
        cluster_summary_list = filter_clusters_based_on_cluster_size(cluster_summary_list, CLUSTER_MIN_SIZE)

    print(len(cluster_summary_list))

    print("Setting up grouping", len(group_to_files.keys()))
    for cluster_summary_object in cluster_summary_list:
        cluster_index = cluster_summary_object["cluster index"]
        for group in group_to_files:
            group_count = 0
            if group in cluster_to_group_counts[cluster_index]:
                group_count = cluster_to_group_counts[cluster_index][group]
            cluster_summary_object[group] = group_count

        for attribute in attribute_to_groups:
            groups_to_include = []
            for group in attribute_to_groups[attribute]:
                if group in cluster_summary_object:
                    if cluster_summary_object[group] > 0:
                        groups_to_include.append(group)

            cluster_summary_object[attribute] = ",".join(groups_to_include).replace("GNPSGROUP:", "")


    print("Default Attributes")
    calculate_default_attributes(cluster_summary_list, group_to_files.keys())

    print("calculate_cluster_file_stats")
    calculate_cluster_file_stats(cluster_summary_list, cluster_to_files, mangled_mapping)

    print("rt stats")
    calculate_rt_stats(cluster_summary_list, cluster_to_RT)

    print("calculate_ancillary_information")
    calculate_ancillary_information(cluster_summary_list, params_object["task"][0])

    print("populate_network_component")
    populate_network_component(cluster_summary_list, args.input_networking_pairs)

    print("populate_network_identifications")
    populate_network_identifications(cluster_summary_list, args.input_library_search)

    ming_fileio_library.write_list_dict_table_data(cluster_summary_list, args.output_clusterinfosummary_filename)
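
#Hedged sketch of the per-cluster, per-group spectral counting above, using a
#Counter nested in a defaultdict instead of defaultdict(lambda: defaultdict(lambda: 0)).
#Column positions (cluster index in column 0, filename in column 1) follow the
#loop over input_clusterinfo_file; files_to_groups is the same mapping loaded
#from the group mapping file.
import os
from collections import Counter, defaultdict

def count_spectra_per_group(clusterinfo_lines, files_to_groups):
    cluster_to_group_counts = defaultdict(Counter)
    for line in clusterinfo_lines:
        splits = line.rstrip().split("\t")
        cluster_index = splits[0]
        filename = os.path.basename(splits[1])
        for group in files_to_groups.get(filename, []):
            cluster_to_group_counts[cluster_index][group] += 1
    return cluster_to_group_counts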
Example #23
def main():
    parser = argparse.ArgumentParser(description='Running library search parallel')
    parser.add_argument('spectra_folder', help='spectrafolder')
    parser.add_argument('workflow_parameters', help='workflow parameters XML')
    parser.add_argument('result_file', help='output summary result file')
    parser.add_argument('msaccess_binary', help='path to the msaccess binary')
    parser.add_argument('--parallelism', default=1, type=int, help='Parallelism')
    args = parser.parse_args()


    params_object = ming_proteosafe_library.parse_xml_file(open(args.workflow_parameters))
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(params_object)
    spectra_files = ming_fileio_library.list_files_in_dir(args.spectra_folder)

    spectra_files.sort()


    tempresults_folder = "tempresults"
    try:
        os.mkdir(tempresults_folder)
    except OSError:
        print("folder error")

    parameter_list = []
    for spectrum_file in spectra_files:
        param_dict = {}
        param_dict["spectrum_file"] = spectrum_file
        param_dict["tempresults_folder"] = tempresults_folder
        param_dict["args"] = args

        parameter_list.append(param_dict)

    #for param_dict in parameter_list:
    #    search_wrapper(param_dict)
    print("Parallel to execute", len(parameter_list))
    ming_parallel_library.run_parallel_job(summary_wrapper, parameter_list, 10)


    """Merging Files and adding full path"""
    all_result_files = ming_fileio_library.list_files_in_dir(tempresults_folder)
    full_result_list = []
    for input_file in all_result_files:
        try:
            result_list = ming_fileio_library.parse_table_with_headers_object_list(input_file)
            for result in result_list:
                output_dict = {}
                output_dict["Filename"] = result["Filename"]
                output_dict["Vendor"] = result["Vendor"]
                output_dict["Model"] = result["Model"]
                output_dict["MS1s"] = result["MS1s"]
                output_dict["MS2s"] = result["MS2s"]
                full_result_list.append(output_dict)
        except Exception:
            #raise
            print("Error", input_file)

        #print(result_list)
        #full_result_list += result_list
    
    used_files = set()
    for result_object in full_result_list:
        mangled_name = os.path.basename(result_object["Filename"])
        full_path = mangled_mapping[mangled_name]
        result_object["full_CCMS_path"] = full_path
        result_object["CCMS_filename"] = os.path.basename(full_path)
        used_files.add(full_path)

    for mangled_name in spectra_files:
        full_path = mangled_mapping[os.path.basename(mangled_name)]
        if full_path in used_files:
            continue

        output_dict = {}
        output_dict["full_CCMS_path"] = full_path
        output_dict["CCMS_filename"] = os.path.basename(full_path)
        full_result_list.append(output_dict)

    pd.DataFrame(full_result_list).to_csv(args.result_file, sep="\t", index=False)
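
#Sketch of the final merge step above: map mangled basenames back to their
#original CCMS paths and append placeholder rows for input files that produced
#no msaccess output. mangled_mapping is the {mangled_basename: full_path}
#dictionary built elsewhere in this workflow; the row dicts are illustrative.
import os

def attach_full_paths(result_rows, mangled_mapping, all_spectra_files):
    used_paths = set()
    for row in result_rows:
        full_path = mangled_mapping[os.path.basename(row["Filename"])]
        row["full_CCMS_path"] = full_path
        row["CCMS_filename"] = os.path.basename(full_path)
        used_paths.add(full_path)

    for spectrum_file in all_spectra_files:
        full_path = mangled_mapping[os.path.basename(spectrum_file)]
        if full_path not in used_paths:
            result_rows.append({"full_CCMS_path": full_path,
                                "CCMS_filename": os.path.basename(full_path)})
    return result_rows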
Example #24
def main():
    parser = argparse.ArgumentParser(description='Modifying script')
    parser.add_argument('param_xml', help='param_xml')
    parser.add_argument('metadata_folder', help='metadata_folder')
    parser.add_argument('output_metadata_table', help='output_metadata_table')
    parser.add_argument('output_view_emporer', help='output_view_emperor html redirect')
    args = parser.parse_args()

    param_object = ming_proteosafe_library.parse_xml_file(
        open(args.param_xml, "r"))
    """Outputting html"""
    from urllib.parse import urlencode, quote_plus
    parameters_for_qiime = {
        'biom':
        'http://gnps.ucsd.edu/ProteoSAFe/DownloadResultFile?task=%s&block=main&file=biom_output/networking_quant.biom'
        % (param_object["task"][0]),
        'metadata':
        'http://gnps.ucsd.edu/ProteoSAFe/DownloadResultFile?task=%s&block=main&file=metadata_for_qiime/metadata_for_qiime.txt'
        % (param_object["task"][0])
    }

    output_html_file = open(args.output_view_emporer, "w")
    output_html_file.write("<script>\n")
    output_html_file.write(
        'window.location.replace("https://mingwangbeta.ucsd.edu/emperor?%s")\n'
        % urlencode(parameters_for_qiime))
    output_html_file.write("</script>\n")
    output_html_file.close()

    reverse_file_mangling = ming_proteosafe_library.get_reverse_mangled_file_mapping(
        param_object)

    metadata_files_in_folder = ming_fileio_library.list_files_in_dir(
        args.metadata_folder)

    object_list = []

    if len(metadata_files_in_folder) != 1:
        for real_name in reverse_file_mangling:
            mangled_name = reverse_file_mangling[real_name]
            if mangled_name.find("spec") == -1:
                continue
            object_list.append({"filename": real_name})
        #open(args.output_metadata_table, "w").write("NO OUTPUT")
        #open(args.output_view_emporer, "w").write("Please Include Metadata File")
        #exit(0)
    else:
        object_list = ming_fileio_library.parse_table_with_headers_object_list(
            metadata_files_in_folder[0])

        if len(object_list) == 0:
            for real_name in reverse_file_mangling:
                mangled_name = reverse_file_mangling[real_name]
                if mangled_name.find("spec") == -1:
                    continue
                object_list.append({"filename": real_name})
            #open(args.output_metadata_table, "w").write("NO OUTPUT")
            #open(args.output_view_emporer, "w").write("Please Include Non Empty Metadata File")
            #exit(0)

    #Writing headers
    header_list = ["#SampleID", "BarcodeSequence", "LinkerPrimerSequence"]
    for key in object_list[0]:
        if not key in header_list:
            header_list.append(key)

    header_list.append("ATTRIBUTE_GNPSDefaultGroup")

    for metadata_object in object_list:
        if not "#SampleID" in metadata_object:
            metadata_object[
                "#SampleID"] = ming_fileio_library.get_filename_without_extension(
                    metadata_object["filename"])
        if not "BarcodeSequence" in metadata_object:
            metadata_object["BarcodeSequence"] = "GATACA"
        if not "LinkerPrimerSequence" in metadata_object:
            metadata_object["LinkerPrimerSequence"] = "GATACA"

        mangled_name = reverse_file_mangling[metadata_object["filename"]]
        if mangled_name.find("spec-") != -1:
            metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G1"
        elif mangled_name.find("spectwo-") != -1:
            metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G2"
        elif mangled_name.find("specthree-") != -1:
            metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G3"
        elif mangled_name.find("specfour-") != -1:
            metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G4"
        elif mangled_name.find("specfive-") != -1:
            metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G5"
        elif mangled_name.find("specsix-") != -1:
            metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G6"

    ming_fileio_library.write_list_dict_table_data(object_list,
                                                   args.output_metadata_table,
                                                   header_list)
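
#Small sketch of the redirect page written above: urlencode the biom and
#metadata download URLs for a given task id and emit a one-line JavaScript
#redirect to the emperor viewer. The URLs are copied from the snippet above.
from urllib.parse import urlencode

def write_emperor_redirect(output_path, task_id):
    parameters_for_qiime = {
        "biom": "http://gnps.ucsd.edu/ProteoSAFe/DownloadResultFile?task=%s&block=main&file=biom_output/networking_quant.biom" % task_id,
        "metadata": "http://gnps.ucsd.edu/ProteoSAFe/DownloadResultFile?task=%s&block=main&file=metadata_for_qiime/metadata_for_qiime.txt" % task_id,
    }
    with open(output_path, "w") as output_html_file:
        output_html_file.write("<script>\n")
        output_html_file.write('window.location.replace("https://mingwangbeta.ucsd.edu/emperor?%s")\n'
                               % urlencode(parameters_for_qiime))
        output_html_file.write("</script>\n")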
Example #25
def main():
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('param_xml', help='param_xml')
    parser.add_argument('cluster_buckets', help='cluster_buckets')
    parser.add_argument('metadata_folder', help='metadata_folder')
    parser.add_argument('output_folder', help='output_folder')
    args = parser.parse_args()

    param_object = ming_proteosafe_library.parse_xml_file(open(args.param_xml, "r"))

    if param_object["CREATE_CLUSTER_BUCKETS"][0] == "0":
        print("Do not do things")
        exit(0)

    reverse_file_mangling = ming_proteosafe_library.get_reverse_mangled_file_mapping(param_object)

    """Reading Metadata File"""
    metadata_files_in_folder = ming_fileio_library.list_files_in_dir(args.metadata_folder)
    object_list = []

    if len(metadata_files_in_folder) != 1:
        for real_name in reverse_file_mangling:
            mangled_name = reverse_file_mangling[real_name]
            if mangled_name.find("spec") == -1:
                continue
            object_list.append({"filename" : real_name})
    else:
        print(metadata_files_in_folder[0])
        object_list = ming_fileio_library.parse_table_with_headers_object_list(metadata_files_in_folder[0])
        if len(object_list) == 0:
            for real_name in reverse_file_mangling:
                mangled_name = reverse_file_mangling[real_name]
                if mangled_name.find("spec") == -1:
                    continue
                object_list.append({"filename" : real_name})

    #Writing headers
    header_list = ["#SampleID", "BarcodeSequence", "LinkerPrimerSequence"]
    for key in object_list[0]:
        if not key in header_list:
            header_list.append(key)

    header_list.append("ATTRIBUTE_GNPSDefaultGroup")

    for metadata_object in object_list:
        if not "#SampleID" in metadata_object:
            #Stripping off all non-alphanumeric characters
            metadata_object["#SampleID"] = ''.join(ch for ch in metadata_object["filename"] if ch.isalnum())
        if not "Description" in metadata_object:
            metadata_object["Description"] = "LoremIpsum"
        if not "BarcodeSequence" in metadata_object:
            metadata_object["BarcodeSequence"] = "GATACA"
        if not "LinkerPrimerSequence" in metadata_object:
            metadata_object["LinkerPrimerSequence"] = "GATACA"

        try:
            mangled_name = reverse_file_mangling[metadata_object["filename"]]
            if mangled_name.find("spec-") != -1:
                metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G1"
            elif mangled_name.find("spectwo-") != -1:
                metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G2"
            elif mangled_name.find("specthree-") != -1:
                metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G3"
            elif mangled_name.find("specfour-") != -1:
                metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G4"
            elif mangled_name.find("specfive-") != -1:
                metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G5"
            elif mangled_name.find("specsix-") != -1:
                metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G6"
        except KeyError:
            print(metadata_object["filename"], "Not Mapped")
            metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "Not Mapped"

    output_metadata_filename = os.path.join(args.output_folder, "qiime2_metadata.tsv")
    output_manifest_filename = os.path.join(args.output_folder, "qiime2_manifest.tsv")

    for metadatum in object_list:
        if "sample_name" in metadatum:
            if len(metadatum["sample_name"]) > 1:
                metadatum["#SampleID"] = metadatum["sample_name"]

    metadata_df = pd.DataFrame(object_list)
    metadata_df.to_csv(output_metadata_filename, index=False, sep="\t", columns=header_list)

    """Outputting Manifest Filename"""
    manifest_df = pd.DataFrame()
    manifest_df["sample_name"] = metadata_df["#SampleID"]
    manifest_df["filepath"] = metadata_df["filename"]
    manifest_df.to_csv(output_manifest_filename, index=False, sep=",")

    """Calling remote server to do the calculation"""
    SERVER_BASE = "http://dorresteinappshub.ucsd.edu:5024"
    #SERVER_BASE = "http://mingwangbeta.ucsd.edu:5024"
    files = {'manifest': open(output_manifest_filename, 'r'), \
    'metadata': open(output_metadata_filename, 'r'), \
    'bucket': open(args.cluster_buckets, 'r')}


    r_post = requests.post(SERVER_BASE + "/processclassic", files=files)
    response_dict = r_post.json()

    with open(os.path.join(args.output_folder, "qiime2_table.qza"), 'wb') as f:
        r = requests.get(SERVER_BASE + response_dict["table_qza"], stream=True)
        r.raw.decode_content = True
        shutil.copyfileobj(r.raw, f)

    with open(os.path.join(args.output_folder, "qiime2_emperor.qzv"), 'wb') as f:
        r = requests.get(SERVER_BASE + response_dict["emperor_qzv"], stream=True)
        r.raw.decode_content = True
        shutil.copyfileobj(r.raw, f)
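
#Hedged sketch of the streamed artifact download used above: stream the
#response with requests and copy it to disk with shutil.copyfileobj so large
#.qza/.qzv files are never held in memory. raise_for_status() is an added
#safety check not present in the snippet.
import shutil
import requests

def fetch_artifact(server_base, relative_url, output_path):
    response = requests.get(server_base + relative_url, stream=True)
    response.raise_for_status()
    response.raw.decode_content = True
    with open(output_path, "wb") as output_handle:
        shutil.copyfileobj(response.raw, output_handle)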
def main():
    input_filename = sys.argv[1]
    output_tsv = sys.argv[2]

    results_list = ming_fileio_library.parse_table_with_headers_object_list(
        input_filename)
    results_by_compound_name = defaultdict(list)
    for result in results_list:
        annotation_string = result["Compound_Name"]
        results_by_compound_name[annotation_string].append(result)

    output_results = []
    for compound_name in results_by_compound_name:
        best_result = sorted(results_by_compound_name[compound_name],
                             key=lambda result: float(result["MQScore"]),
                             reverse=True)[0]

        all_RTs = [
            float(result["RT_Query"])
            for result in results_by_compound_name[compound_name]
        ]
        all_MZs = [
            float(result["SpecMZ"])
            for result in results_by_compound_name[compound_name]
        ]
        all_MZ_ppmerror = [
            float(result["MZErrorPPM"])
            for result in results_by_compound_name[compound_name]
        ]

        rt_mean = statistics.mean(all_RTs)
        rt_median = statistics.median(all_RTs)
        mz_mean = statistics.mean(all_MZs)
        mz_ppm_mean = statistics.mean(all_MZ_ppmerror)

        rt_max = max(all_RTs)
        rt_min = min(all_RTs)

        mz_max = max(all_MZs)
        mz_min = min(all_MZs)

        #STDDev
        rt_stdev = 0.0
        mz_stdev = 0.0
        ppmerror_stdev = 0.0
        if len(all_RTs) > 1:
            rt_stdev = statistics.stdev(all_RTs)
            mz_stdev = statistics.stdev(all_MZs)
            ppmerror_stdev = statistics.stdev(all_MZ_ppmerror)

        best_result["rt_mean"] = rt_mean
        best_result["rt_median"] = rt_median
        best_result["mz_mean"] = mz_mean
        best_result["mz_ppm_mean"] = mz_ppm_mean
        best_result["rt_max"] = rt_max
        best_result["rt_min"] = rt_min
        best_result["mz_max"] = mz_max
        best_result["mz_min"] = mz_min
        best_result["rt_stdev"] = rt_stdev
        best_result["mz_stdev"] = mz_stdev
        best_result["ppmerror_stdev"] = ppmerror_stdev
        best_result["number_spectra"] = len(all_RTs)

        output_results.append(best_result)

    ming_fileio_library.write_list_dict_table_data(output_results, output_tsv)
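
#Sketch of the per-compound aggregation above using the statistics module:
#keep the best-scoring hit for each compound and attach summary statistics of
#the retention times and m/z values across all of its hits.
import statistics

def summarize_compound_hits(hits):
    best = max(hits, key=lambda hit: float(hit["MQScore"]))
    all_RTs = [float(hit["RT_Query"]) for hit in hits]
    all_MZs = [float(hit["SpecMZ"]) for hit in hits]
    best["rt_mean"] = statistics.mean(all_RTs)
    best["rt_median"] = statistics.median(all_RTs)
    best["mz_mean"] = statistics.mean(all_MZs)
    best["rt_stdev"] = statistics.stdev(all_RTs) if len(all_RTs) > 1 else 0.0
    best["mz_stdev"] = statistics.stdev(all_MZs) if len(all_MZs) > 1 else 0.0
    best["number_spectra"] = len(hits)
    return best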
def main():
    parser = argparse.ArgumentParser(description='Running library search parallel')
    parser.add_argument('spectra_folder', help='spectrafolder')
    parser.add_argument('json_parameters', help='proteosafe parallelism parameters (JSON)')
    parser.add_argument('workflow_parameters', help='workflow parameters XML')
    parser.add_argument('library_folder', help='library folder')
    parser.add_argument('result_folder', help='output folder for results')
    parser.add_argument('convert_binary', help='path to the conversion binary')
    parser.add_argument('librarysearch_binary', help='path to the librarysearch binary')
    parser.add_argument('--parallelism', default=1, type=int, help='Parallelism')
    args = parser.parse_args()

    parallel_json = json.loads(open(args.json_parameters).read())

    params_object = ming_proteosafe_library.parse_xml_file(open(args.workflow_parameters))
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(params_object)
    library_files = ming_fileio_library.list_files_in_dir(args.library_folder)
    spectra_files = ming_fileio_library.list_files_in_dir(args.spectra_folder)

    spectra_files.sort()

    print(spectra_files)
    spectra_files = spectra_files[parallel_json["node_partition"]::parallel_json["total_paritions"]]
    print(spectra_files)

    temp_folder = "temp"
    try:
        os.mkdir(temp_folder)
    except OSError:
        print("folder error")

    tempresults_folder = "tempresults"
    try:
        os.mkdir(tempresults_folder)
    except OSError:
        print("folder error")


    list_of_spectrumfiles = chunks(spectra_files, 5)
    parameter_list = []
    for spectrum_files_chunk in list_of_spectrumfiles:
        param_dict = {}
        param_dict["spectra_files"] = spectrum_files_chunk
        param_dict["temp_folder"] = temp_folder
        param_dict["tempresults_folder"] = tempresults_folder
        param_dict["args"] = args
        param_dict["params_object"] = params_object
        param_dict["library_files"] = library_files

        parameter_list.append(param_dict)

    #for param_dict in parameter_list:
    #    search_wrapper(param_dict)
    print("Parallel to execute", len(parameter_list))
    ming_parallel_library.run_parallel_job(search_wrapper, parameter_list, 5)


    """Merging Files and adding full path"""
    all_result_files = ming_fileio_library.list_files_in_dir(tempresults_folder)
    full_result_list = []
    for input_file in all_result_files:
        result_list = ming_fileio_library.parse_table_with_headers_object_list(input_file)
        full_result_list += result_list

    for result_object in full_result_list:
        mangled_name = os.path.basename(result_object["SpectrumFile"])
        full_path = mangled_mapping[mangled_name]
        result_object["full_CCMS_path"] = full_path

    ming_fileio_library.write_list_dict_table_data(full_result_list, os.path.join(args.result_folder, str(uuid.uuid4()) + ".tsv"))
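
#The snippet above splits work with a chunks() helper and a
#node_partition/total_paritions scheme whose definitions are not shown here.
#This is an assumed minimal version of both, for illustration only.
def chunks(items, chunk_size):
    #Yield successive chunk_size-sized slices of items
    for start in range(0, len(items), chunk_size):
        yield items[start:start + chunk_size]

def partition_for_node(items, node_index, total_partitions):
    #Take every total_partitions-th item, starting at this node's index
    return items[node_index::total_partitions]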
Example #28
def main():
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('param_xml', help='param_xml')
    parser.add_argument('cluster_buckets', help='cluster_buckets')
    parser.add_argument('metadata_folder', help='metadata_folder')
    parser.add_argument('output_folder', help='output_folder')
    args = parser.parse_args()

    param_object = ming_proteosafe_library.parse_xml_file(open(args.param_xml, "r"))

    if param_object["CREATE_CLUSTER_BUCKETS"][0] == "0":
        print("Do not do things")
        exit(0)

    reverse_file_mangling = ming_proteosafe_library.get_reverse_mangled_file_mapping(param_object)

    """Reading Metadata File"""
    metadata_files_in_folder = ming_fileio_library.list_files_in_dir(args.metadata_folder)
    object_list = []

    if len(metadata_files_in_folder) != 1:
        for real_name in reverse_file_mangling:
            mangled_name = reverse_file_mangling[real_name]
            if mangled_name.find("spec") == -1:
                continue
            object_list.append({"filename" : real_name})
    else:
        object_list_temp = ming_fileio_library.parse_table_with_headers_object_list(metadata_files_in_folder[0])
        #object_list_temp = pd.read_csv(metadata_files_in_folder[0], sep="\t")

        object_list = []
        for metadata_object in object_list_temp:
            if len(metadata_object["filename"]) > 1:
                object_list.append(metadata_object)
        
        #Adding all files, if analyzed file is not in list
        for real_name in reverse_file_mangling:
            mangled_name = reverse_file_mangling[real_name]
            if mangled_name.find("spec") == -1:
                continue

            found = False
            for metadata_object in object_list:
                if os.path.basename(real_name) == metadata_object["filename"]:
                    found = True
                    break

            if found is False:
                object_list.append({"filename" : real_name})

    #Writing headers
    header_list = ["#SampleID", "BarcodeSequence", "LinkerPrimerSequence"]
    for key in object_list[0]:
        if not key in header_list:
            header_list.append(key)

    header_list.append("ATTRIBUTE_GNPSDefaultGroup")

    for metadata_object in object_list:
        if not "#SampleID" in metadata_object:
            #Previously stripped all non-alphanumeric characters:
            #metadata_object["#SampleID"] = ''.join(ch for ch in metadata_object["filename"] if ch.isalnum())
            metadata_object["#SampleID"] = metadata_object["filename"]
        if not "Description" in metadata_object:
            metadata_object["Description"] = "LoremIpsum"
        if not "BarcodeSequence" in metadata_object:
            metadata_object["BarcodeSequence"] = "GATACA"
        if not "LinkerPrimerSequence" in metadata_object:
            metadata_object["LinkerPrimerSequence"] = "GATACA"

        #Adding default grouping information
        try:
            mangled_name = reverse_file_mangling[metadata_object["filename"]]
            if mangled_name.find("spec-") != -1:
                metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G1"
            elif mangled_name.find("spectwo-") != -1:
                metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G2"
            elif mangled_name.find("specthree-") != -1:
                metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G3"
            elif mangled_name.find("specfour-") != -1:
                metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G4"
            elif mangled_name.find("specfive-") != -1:
                metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G5"
            elif mangled_name.find("specsix-") != -1:
                metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G6"
        except KeyError:
            print(metadata_object["filename"], "Not Mapped")
            metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "Not Mapped"

    output_metadata_filename = os.path.join(args.output_folder, "qiime2_metadata.tsv")
    output_manifest_filename = os.path.join(args.output_folder, "qiime2_manifest.tsv")

    for metadatum in object_list:
        if "sample_name" in metadatum:
            if len(metadatum["sample_name"]) > 1:
                metadatum["#SampleID"] = metadatum["sample_name"]


    #Removing metadata filenames that are not in the actual data
    #analysis_files = 

    metadata_df = pd.DataFrame(object_list)
    metadata_df.to_csv(output_metadata_filename, index=False, sep="\t", columns=header_list)

    """Outputting Manifest Filename"""
    manifest_df = pd.DataFrame()
    manifest_df["sample_name"] = metadata_df["#SampleID"]
    manifest_df["filepath"] = metadata_df["filename"]
    manifest_df.to_csv(output_manifest_filename, index=False, sep=",")

    """Calling remote server to do the calculation"""
    SERVER_BASE = "http://dorresteinappshub.ucsd.edu:5024"
    files = {'manifest': open(output_manifest_filename, 'r'), \
    'metadata': open(output_metadata_filename, 'r'), \
    'bucket': open(args.cluster_buckets, 'r')}

    r_post = requests.post(SERVER_BASE + "/processclassic", files=files)
    response_dict = r_post.json()

    with open(os.path.join(args.output_folder, "qiime2_table.qza"), 'wb') as f:
        r = requests.get(SERVER_BASE + response_dict["table_qza"], stream=True)
        r.raw.decode_content = True
        shutil.copyfileobj(r.raw, f)

    with open(os.path.join(args.output_folder, "qiime2_emperor.qzv"), 'wb') as f:
        r = requests.get(SERVER_BASE + response_dict["emperor_qzv"], stream=True)
        r.raw.decode_content = True
        shutil.copyfileobj(r.raw, f)
Example #29
def main():
    parser = argparse.ArgumentParser(description='Running library search parallel')
    parser.add_argument('spectra_folder', help='spectrafolder')
    parser.add_argument('workflow_parameters', help='workflow parameters XML')
    parser.add_argument('result_file', help='output summary result file')
    parser.add_argument('msaccess_binary', help='path to the msaccess binary')
    parser.add_argument('--parallelism', default=1, type=int, help='Parallelism')
    args = parser.parse_args()


    params_object = ming_proteosafe_library.parse_xml_file(open(args.workflow_parameters))
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(params_object)
    spectra_files = ming_fileio_library.list_files_in_dir(args.spectra_folder)

    spectra_files.sort()


    tempresults_folder = "tempresults"
    try:
        os.mkdir(tempresults_folder)
    except OSError:
        print("folder error")

    parameter_list = []
    for spectrum_file in spectra_files:
        param_dict = {}
        param_dict["spectrum_file"] = spectrum_file
        param_dict["tempresults_folder"] = tempresults_folder
        param_dict["args"] = args

        parameter_list.append(param_dict)

    #for param_dict in parameter_list:
    #    search_wrapper(param_dict)
    print("Parallel to execute", len(parameter_list))
    ming_parallel_library.run_parallel_job(summary_wrapper, parameter_list, 10)


    """Merging Files and adding full path"""
    all_result_files = ming_fileio_library.list_files_in_dir(tempresults_folder)
    full_result_list = []
    for input_file in all_result_files:
        try:
            result_list = ming_fileio_library.parse_table_with_headers_object_list(input_file)
            for result in result_list:
                output_dict = {}
                output_dict["Filename"] = result["Filename"]
                output_dict["Vendor"] = result["Vendor"]
                output_dict["Model"] = result["Model"]
                output_dict["MS1s"] = result["MS1s"]
                output_dict["MS2s"] = result["MS2s"]
                full_result_list.append(output_dict)
        except Exception:
            #raise
            print("Error", input_file)

        #print(result_list)
        #full_result_list += result_list

    for result_object in full_result_list:
        mangled_name = os.path.basename(result_object["Filename"])
        full_path = mangled_mapping[mangled_name]
        result_object["full_CCMS_path"] = full_path

    ming_fileio_library.write_list_dict_table_data(full_result_list, args.result_file)
def main():
    parser = argparse.ArgumentParser(description='Creating Clustering Info Summary')
    parser.add_argument('params_xml', help='params_xml')
    parser.add_argument('consensus_feature_file', help='Consensus Quantification File')
    parser.add_argument('metadata_folder', help='metadata metadata_folder')
    parser.add_argument('mgf_filename', help='mgf_filename')
    parser.add_argument('output_clusterinfo_summary', help='output file')
    args = parser.parse_args()

    param_obj = ming_proteosafe_library.parse_xml_file(open(args.params_xml))

    task_id = param_obj["task"][0]

    group_to_files_mapping = defaultdict(list)
    attributes_to_groups_mapping = defaultdict(set)

    metadata_files = glob.glob(os.path.join(args.metadata_folder, "*"))
    if len(metadata_files) == 1:
        group_to_files_mapping, attributes_to_groups_mapping = load_group_attribute_mappings(metadata_files[0])

    ROW_NORMALIZATION = "None"
    try:
        ROW_NORMALIZATION = param_obj["QUANT_FILE_NORM"][0]
    except (KeyError, IndexError):
        ROW_NORMALIZATION = "None"

    GROUP_COUNT_AGGREGATE_METHOD = "Sum"
    try:
        GROUP_COUNT_AGGREGATE_METHOD = param_obj["GROUP_COUNT_AGGREGATE_METHOD"][0]
    except (KeyError, IndexError):
        GROUP_COUNT_AGGREGATE_METHOD = "None"


    quantification_list = ming_fileio_library.parse_table_with_headers_object_list(args.consensus_feature_file, delimiter=",")
    input_filenames, input_filename_headers = determine_input_files(quantification_list[0].keys())

    ### Filling in Quantification table if it is missing values
    for quantification_object in quantification_list:
        ###Handling empty quantification
        for filename in input_filename_headers:
            try:
                if len(quantification_object[filename]) == 0:
                    #print(filename, quantification_object[filename], quantification_object["row ID"])
                    quantification_object[filename] = 0
            except (KeyError, TypeError):
                #Value is already numeric or the column is missing
                pass

    print("Number of Features", len(quantification_list))

    #Doing row sum normalization
    if ROW_NORMALIZATION == "RowSum":
        print("ROW SUM NORM")
        for filename_header in input_filename_headers:
            file_quants = [float(quantification_object[filename_header]) for quantification_object in quantification_list]
            for quantification_object in quantification_list:
                quantification_object[filename_header] = float(quantification_object[filename_header]) / sum(file_quants)

    """Loading MS2 Spectra"""
    mgf_collection = ming_spectrum_library.SpectrumCollection(args.mgf_filename)
    mgf_collection.load_from_file()

    clusters_list = []
    for quantification_object in quantification_list:

        cluster_obj = {}
        cluster_obj["cluster index"] = quantification_object["row ID"]
        cluster_obj["precursor mass"] = "{0:.4f}".format(float(quantification_object["row m/z"]))
        cluster_obj["RTConsensus"] = "{0:.4f}".format(float(quantification_object["row retention time"]))

        all_charges = []

        """Checking about the charge of this cluster"""
        try:
            spectrum_object = mgf_collection.scandict[int(cluster_obj["cluster index"])]
            charge = int(spectrum_object.charge)
        except:
            charge = 0

        """Checking if this spectrum has no peaks"""
        # try:
        #     spectrum_object = mgf_collection.scandict[int(cluster_obj["cluster index"])]
        #
        # except:
        #     continue

        all_files = [os.path.basename(filename) for filename in input_filename_headers if float(quantification_object[filename]) > 0]
        abundance_per_file = [(os.path.basename(filename), float(quantification_object[filename])) for filename in input_filename_headers]
        all_abundances = [float(quantification_object[filename]) for filename in input_filename_headers]

        if charge != 0:
            cluster_obj["parent mass"] = "{0:.4f}".format(float(quantification_object["row m/z"]) * charge - charge + 1)
        else:
            cluster_obj["parent mass"] = "{0:.4f}".format(float(quantification_object["row m/z"]))
        cluster_obj["precursor charge"] = charge

        try:
            #all_retention_times is never populated from the feature table, so this
            #always falls back to the consensus retention time below
            cluster_obj["RTMean"] = statistics.mean(all_retention_times)
            cluster_obj["RTStdErr"] = statistics.stdev(all_retention_times)
        except:
            cluster_obj["RTMean"] = cluster_obj["RTConsensus"]
            cluster_obj["RTStdErr"] = 0

        cluster_obj["GNPSLinkout_Cluster"] = 'https://gnps.ucsd.edu/ProteoSAFe/result.jsp?task=%s&view=view_all_clusters_withID#{"main.cluster index_lowerinput":"%s","main.cluster index_upperinput":"%s"}' % (task_id, quantification_object["row ID"], quantification_object["row ID"])
        #cluster_obj["AllFiles"] = "###".join(all_files)

        cluster_obj["sum(precursor intensity)"] = sum(all_abundances)
        cluster_obj["SumPeakIntensity"] = sum(all_abundances)
        cluster_obj["number of spectra"] = len(all_files)
        cluster_obj["UniqueFileSourcesCount"] = len(all_files)

        group_abundances = determine_group_abundances(group_to_files_mapping, abundance_per_file, operation=GROUP_COUNT_AGGREGATE_METHOD)

        default_groups = ["G1", "G2", "G3", "G4", "G5", "G6"]
        for group in group_to_files_mapping:
            group_header = "GNPSGROUP:" + group
            if group in default_groups:
                continue
            cluster_obj[group_header] = group_abundances[group]

        for group in default_groups:
            cluster_obj[group] = group_abundances[group]

        #Writing attributes
        for attribute in attributes_to_groups_mapping:
            groups_to_include = []
            for group in attributes_to_groups_mapping[attribute]:
                if group_abundances[group] > 0.0:
                    groups_to_include.append(group)
            if len(groups_to_include) == 0:
                cluster_obj[attribute] = ""
            else:
                cluster_obj[attribute] = ",".join(groups_to_include)


        """
        Enriching the cluster info with adduct collapsing information
        """
        enrich_adduct_annotations(cluster_obj, quantification_object)


        clusters_list.append(cluster_obj)

    ming_fileio_library.write_list_dict_table_data(clusters_list, args.output_clusterinfo_summary)
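
#determine_group_abundances is referenced above but not defined in this example.
#A minimal sketch of such a helper, assuming group_to_files_mapping maps group
#names to file basenames and the aggregation method is one of Sum/Mean/Max, could
#look like the following; it is an illustration, not the original implementation.
def determine_group_abundances(group_to_files_mapping, abundance_per_file, operation="Sum"):
    import os
    import statistics
    from collections import defaultdict

    #Index per-file abundances by basename
    abundance_by_file = defaultdict(float)
    for filename, abundance in abundance_per_file:
        abundance_by_file[os.path.basename(filename)] += abundance

    #Aggregate per group; defaultdict(float) makes unknown groups (e.g. G1-G6) read as 0.0
    group_abundances = defaultdict(float)
    for group, group_files in group_to_files_mapping.items():
        values = [abundance_by_file[os.path.basename(f)] for f in group_files]
        if len(values) == 0:
            group_abundances[group] = 0.0
        elif operation == "Mean":
            group_abundances[group] = statistics.mean(values)
        elif operation == "Max":
            group_abundances[group] = max(values)
        else:
            group_abundances[group] = sum(values)

    return group_abundances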
def main():
    parser = argparse.ArgumentParser(
        description='Creating Clustering Info Summary')
    parser.add_argument('params_xml', help='params_xml')
    parser.add_argument('input_clusterinfo_summary',
                        help='Input cluster info summary')
    parser.add_argument('input_network_pairs_file', help='network_pairs_file')
    parser.add_argument('input_library_search_file', help='library_search_file')
    parser.add_argument('output_clusterinfo_summary', help='output file')
    parser.add_argument('output_component_summary',
                        help='output component file')
    args = parser.parse_args()

    param_obj = ming_proteosafe_library.parse_xml_file(open(args.params_xml))

    all_clusterinfo_list = ming_fileio_library.parse_table_with_headers_object_list(
        args.input_clusterinfo_summary)

    library_ids_dict = load_library_id_dict(args.input_library_search_file)
    nodes_to_component, component_to_nodes = load_pairs_dict(
        args.input_network_pairs_file)

    for cluster in all_clusterinfo_list:
        cluster_index = cluster["cluster index"]
        if cluster_index in nodes_to_component:
            cluster["componentindex"] = nodes_to_component[cluster_index]
            cluster[
                "GNPSLinkout_Network"] = "https://gnps.ucsd.edu/ProteoSAFe/result.jsp?view=network_displayer&componentindex=%s&task=%s&show=true" % (
                    nodes_to_component[cluster_index], param_obj["task"][0])
        else:
            cluster["componentindex"] = "-1"
            cluster["GNPSLinkout_Network"] = 'This Node is a Singleton'

        if cluster_index in library_ids_dict:
            cluster["LibraryID"] = library_ids_dict[cluster_index][
                "Compound_Name"]
            cluster["MQScore"] = library_ids_dict[cluster_index]["MQScore"]
            cluster["SpectrumID"] = library_ids_dict[cluster_index][
                "SpectrumID"]
        else:
            cluster["LibraryID"] = "N/A"
            cluster["MQScore"] = "N/A"
            cluster["SpectrumID"] = "N/A"

    ming_fileio_library.write_list_dict_table_data(
        all_clusterinfo_list, args.output_clusterinfo_summary)

    output_component_list = []

    for componentindex in component_to_nodes:
        output_dict = {}
        output_dict["ComponentIndex"] = componentindex
        output_dict["NodeCount"] = len(component_to_nodes[componentindex])
        output_dict["#Spectra"] = len(component_to_nodes[componentindex])
        all_lib_identifications = []
        for node in component_to_nodes[componentindex]:
            if node in library_ids_dict:
                all_lib_identifications.append(
                    library_ids_dict[node]["Compound_Name"])
        output_dict["AllIDs"] = "!".join(all_lib_identifications)
        output_component_list.append(output_dict)

    ming_fileio_library.write_list_dict_table_data(
        output_component_list, args.output_component_summary)
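
#load_pairs_dict and load_library_id_dict are used above but defined elsewhere.
#A hypothetical sketch of load_pairs_dict is shown below, assuming the network
#pairs file is tab-separated with CLUSTERID1, CLUSTERID2 and ComponentIndex
#columns; the actual helper may differ.
def load_pairs_dict(network_pairs_filename):
    import csv
    from collections import defaultdict

    nodes_to_component = {}
    component_to_nodes = defaultdict(set)

    with open(network_pairs_filename) as pairs_file:
        for row in csv.DictReader(pairs_file, delimiter="\t"):
            component = row["ComponentIndex"]
            for node in (row["CLUSTERID1"], row["CLUSTERID2"]):
                nodes_to_component[node] = component
                component_to_nodes[component].add(node)

    return nodes_to_component, component_to_nodes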
Example #32
0
def main():
    parser = argparse.ArgumentParser(description='Creating Qiime2 metadata and manifest, then running Qiime2 on cluster buckets')
    parser.add_argument('param_xml', help='param_xml')
    parser.add_argument('cluster_buckets', help='cluster_buckets')
    parser.add_argument('metadata_folder', help='metadata_folder')
    parser.add_argument('output_folder', help='output_folder')
    parser.add_argument("conda_activate_bin")
    parser.add_argument("conda_environment")
    args = parser.parse_args()

    param_object = ming_proteosafe_library.parse_xml_file(open(args.param_xml, "r"))

    if param_object["CREATE_CLUSTER_BUCKETS"][0] == "0":
        print("Do not do things")
        exit(0)

    reverse_file_mangling = ming_proteosafe_library.get_reverse_mangled_file_mapping(param_object)

    """Reading Metadata File"""
    metadata_files_in_folder = ming_fileio_library.list_files_in_dir(args.metadata_folder)
    object_list = []

    if len(metadata_files_in_folder) != 1:
        for real_name in reverse_file_mangling:
            mangled_name = reverse_file_mangling[real_name]
            if mangled_name.find("spec") == -1:
                continue
            object_list.append({"filename" : real_name})
    else:
        object_list_temp = ming_fileio_library.parse_table_with_headers_object_list(metadata_files_in_folder[0])
        #object_list_temp = pd.read_csv(metadata_files_in_folder[0], sep="\t")

        object_list = []
        for metadata_object in object_list_temp:
            if len(metadata_object["filename"]) > 1:
                object_list.append(metadata_object)
        
        #Adding all files, if analyzed file is not in list
        for real_name in reverse_file_mangling:
            mangled_name = reverse_file_mangling[real_name]
            if mangled_name.find("spec") == -1:
                continue

            found = False
            for metadata_object in object_list:
                if os.path.basename(real_name) == metadata_object["filename"]:
                    found = True
                    break

            if found is False:
                object_list.append({"filename" : real_name})

    if len(object_list) == 0:
        print("Do not do things, not enough files")
        exit(0)

    #Writing headers
    header_list = ["#SampleID", "BarcodeSequence", "LinkerPrimerSequence"]
    for key in object_list[0]:
        if not key in header_list:
            header_list.append(key)

    header_list.append("ATTRIBUTE_GNPSDefaultGroup")

    for metadata_object in object_list:
        if not "#SampleID" in metadata_object:
            if "#SampleID" in metadata_object:
                metadata_object["#SampleID"] = metadata_object["#SampleID"]
            else:
                #Stripping off all non-alphanumeric characters
                #metadata_object["#SampleID"] = ''.join(ch for ch in metadata_object["filename"] if ch.isalnum())
                metadata_object["#SampleID"] = metadata_object["filename"]
        if not "Description" in metadata_object:
            metadata_object["Description"] = "LoremIpsum"
        if not "BarcodeSequence" in metadata_object:
            metadata_object["BarcodeSequence"] = "GATACA"
        if not "LinkerPrimerSequence" in metadata_object:
            metadata_object["LinkerPrimerSequence"] = "GATACA"

        #Adding default grouping information
        try:
            mangled_name = reverse_file_mangling[metadata_object["filename"]]
            if mangled_name.find("spec-") != -1:
                metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G1"
            elif mangled_name.find("spectwo-") != -1:
                metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G2"
            elif mangled_name.find("specthree-") != -1:
                metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G3"
            elif mangled_name.find("specfour-") != -1:
                metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G4"
            elif mangled_name.find("specfive-") != -1:
                metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G5"
            elif mangled_name.find("specsix-") != -1:
                metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G6"
        except:
            print(metadata_object["filename"], "Not Mapped")
            metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "Not Mapped"

    output_metadata_filename = os.path.join(args.output_folder, "qiime2_metadata.tsv")
    output_manifest_filename = os.path.join(args.output_folder, "qiime2_manifest.tsv")

    for metadatum in object_list:
        if "sample_name" in metadatum:
            if len(metadatum["sample_name"]) > 1:
                metadatum["#SampleID"] = metadatum["sample_name"]

    metadata_df = pd.DataFrame(object_list)

    """Outputting Manifest Filename"""
    manifest_df = pd.DataFrame()
    manifest_df["sample_name"] = metadata_df["#SampleID"]
    manifest_df["filepath"] = metadata_df["filename"]
    manifest_df.to_csv(output_manifest_filename, index=False, sep=",")

    #Removing protected headers
    #metadata_df = metadata_df.drop(columns=["feature", "#SampleID"], errors="ignore")
    metadata_df.to_csv(output_metadata_filename, index=False, sep="\t", columns=header_list)

    #Running Qiime2
    local_qza_table = os.path.join(args.output_folder, "qiime2_table.qza")
    local_qza_distance = os.path.join(args.output_folder, "qiime2_distance.qza")
    local_qza_pcoa = os.path.join(args.output_folder, "qiime2_pcoa.qza")
    local_qzv_emperor = os.path.join(args.output_folder, "qiime2_emperor.qzv")

    all_cmd = []
    all_cmd.append("LC_ALL=en_US && export LC_ALL && source {} {} && \
        qiime metabolomics import-gnpsnetworkingclusteringbuckettable \
        --p-manifest {} \
        --p-buckettable {} \
        --o-feature-table {}".format(args.conda_activate_bin, args.conda_environment, output_manifest_filename, args.cluster_buckets, local_qza_table))

    all_cmd.append("LC_ALL=en_US && export LC_ALL && source {} {} && \
        qiime diversity beta \
        --i-table {} \
        --p-metric cosine \
        --o-distance-matrix {}".format(args.conda_activate_bin, args.conda_environment, local_qza_table, local_qza_distance))

    all_cmd.append("LC_ALL=en_US && export LC_ALL && source {} {} && \
        qiime diversity pcoa \
        --i-distance-matrix {} \
        --o-pcoa {}".format(args.conda_activate_bin, args.conda_environment, local_qza_distance, local_qza_pcoa))

    all_cmd.append("LC_ALL=en_US && export LC_ALL && source {} {} && \
        qiime emperor plot \
        --i-pcoa {} \
        --m-metadata-file {} \
        --o-visualization {} \
        --p-ignore-missing-samples".format(args.conda_activate_bin, args.conda_environment, local_qza_pcoa, output_metadata_filename, local_qzv_emperor))

    for cmd in all_cmd:
        os.system(cmd)
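
#os.system above ignores failing commands. A possible alternative (not part of the
#original workflow) is to stop on the first failure with subprocess; running
#through bash is needed because the commands use `source`.
def run_commands_strict(commands):
    import subprocess
    for cmd in commands:
        #check=True raises CalledProcessError when a command exits non-zero
        subprocess.run(cmd, shell=True, executable="/bin/bash", check=True)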
def main():
    parser = argparse.ArgumentParser(
        description='Creating Clustering Info Summary')
    parser.add_argument('params_xml', help='params_xml')
    parser.add_argument('consensus_feature_file',
                        help='Consensus Quantification File')
    parser.add_argument('metadata_folder', help='metadata_folder')
    parser.add_argument('mgf_filename', help='mgf_filename')
    parser.add_argument('output_clusterinfo_summary', help='output file')
    args = parser.parse_args()

    param_obj = ming_proteosafe_library.parse_xml_file(open(args.params_xml))

    task_id = param_obj["task"][0]

    group_to_files_mapping = defaultdict(list)
    attributes_to_groups_mapping = defaultdict(set)

    metadata_files = glob.glob(os.path.join(args.metadata_folder, "*"))
    if len(metadata_files) == 1:
        group_to_files_mapping, attributes_to_groups_mapping = load_group_attribute_mappings(
            metadata_files[0])

    ROW_NORMALIZATION = "None"
    try:
        ROW_NORMALIZATION = param_obj["QUANT_FILE_NORM"][0]
    except:
        ROW_NORMALIZATION = "None"

    GROUP_COUNT_AGGREGATE_METHOD = "Sum"
    try:
        GROUP_COUNT_AGGREGATE_METHOD = param_obj[
            "GROUP_COUNT_AGGREGATE_METHOD"][0]
    except:
        GROUP_COUNT_AGGREGATE_METHOD = "None"

    quantification_list = ming_fileio_library.parse_table_with_headers_object_list(
        args.consensus_feature_file, delimiter=",")
    input_filenames, input_filename_headers = determine_input_files(
        quantification_list[0].keys())

    ### Filling in Quantification table if it is missing values
    for quantification_object in quantification_list:
        ###Handling empty quantification
        for filename in input_filename_headers:
            try:
                if len(quantification_object[filename]) == 0:
                    #print(filename, quantification_object[filename], quantification_object["row ID"])
                    quantification_object[filename] = 0
            except Exception:
                #Value is already numeric or the column is missing, leave it as-is
                pass

    print("Number of Features", len(quantification_list))

    #Doing row sum normalization
    if ROW_NORMALIZATION == "RowSum":
        print("ROW SUM NORM")
        for filename_header in input_filename_headers:
            file_quants = [
                float(quantification_object[filename_header])
                for quantification_object in quantification_list
            ]
            for quantification_object in quantification_list:
                quantification_object[filename_header] = float(
                    quantification_object[filename_header]) / sum(file_quants)
    """Loading MS2 Spectra"""
    mgf_collection = ming_spectrum_library.SpectrumCollection(
        args.mgf_filename)
    mgf_collection.load_from_file()

    clusters_list = []
    for quantification_object in quantification_list:

        cluster_obj = {}
        cluster_obj["cluster index"] = quantification_object["row ID"]
        cluster_obj["precursor mass"] = "{0:.4f}".format(
            float(quantification_object["row m/z"]))
        cluster_obj["RTConsensus"] = "{0:.4f}".format(
            float(quantification_object["row retention time"]))

        all_charges = []
        """Checking about the charge of this cluster"""
        try:
            spectrum_object = mgf_collection.scandict[int(
                cluster_obj["cluster index"])]
            charge = int(spectrum_object.charge)
        except:
            charge = 0
        """Checking if this spectrum has no peaks"""
        # try:
        #     spectrum_object = mgf_collection.scandict[int(cluster_obj["cluster index"])]
        #
        # except:
        #     continue

        all_files = [
            os.path.basename(filename) for filename in input_filename_headers
            if float(quantification_object[filename]) > 0
        ]
        abundance_per_file = [(os.path.basename(filename),
                               float(quantification_object[filename]))
                              for filename in input_filename_headers]
        all_abundances = [
            float(quantification_object[filename])
            for filename in input_filename_headers
        ]

        if charge != 0:
            cluster_obj["parent mass"] = "{0:.4f}".format(
                float(quantification_object["row m/z"]) * charge - charge + 1)
        else:
            cluster_obj["parent mass"] = "{0:.4f}".format(
                float(quantification_object["row m/z"]))
        cluster_obj["precursor charge"] = charge

        try:
            #all_retention_times is never populated from the feature table, so this
            #always falls back to the consensus retention time below
            cluster_obj["RTMean"] = statistics.mean(all_retention_times)
            cluster_obj["RTStdErr"] = statistics.stdev(all_retention_times)
        except:
            cluster_obj["RTMean"] = cluster_obj["RTConsensus"]
            cluster_obj["RTStdErr"] = 0

        cluster_obj[
            "GNPSLinkout_Cluster"] = 'https://gnps.ucsd.edu/ProteoSAFe/result.jsp?task=%s&view=view_all_clusters_withID&show=true#{"main.cluster index_lowerinput":"%s","main.cluster index_upperinput":"%s"}' % (
                task_id, quantification_object["row ID"],
                quantification_object["row ID"])
        #cluster_obj["AllFiles"] = "###".join(all_files)

        cluster_obj["sum(precursor intensity)"] = sum(all_abundances)
        cluster_obj["SumPeakIntensity"] = sum(all_abundances)
        cluster_obj["number of spectra"] = len(all_files)
        cluster_obj["UniqueFileSourcesCount"] = len(all_files)

        group_abundances = determine_group_abundances(
            group_to_files_mapping,
            abundance_per_file,
            operation=GROUP_COUNT_AGGREGATE_METHOD)

        default_groups = ["G1", "G2", "G3", "G4", "G5", "G6"]
        for group in group_to_files_mapping:
            group_header = "GNPSGROUP:" + group
            if group in default_groups:
                continue
            cluster_obj[group_header] = group_abundances[group]

        for group in default_groups:
            cluster_obj[group] = group_abundances[group]

        #Writing attributes
        for attribute in attributes_to_groups_mapping:
            groups_to_include = []
            for group in attributes_to_groups_mapping[attribute]:
                if group_abundances[group] > 0.0:
                    groups_to_include.append(group)
            if len(groups_to_include) == 0:
                cluster_obj[attribute] = ""
            else:
                cluster_obj[attribute] = ",".join(groups_to_include)
        """
        Enriching the cluster info with adduct collapsing information
        """
        enrich_adduct_annotations(cluster_obj, quantification_object)

        clusters_list.append(cluster_obj)

    ming_fileio_library.write_list_dict_table_data(
        clusters_list, args.output_clusterinfo_summary)
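
#determine_input_files is called on the consensus feature table header but is not
#shown in this example. A hypothetical sketch, assuming the per-file quantification
#columns are those ending in "Peak area" (e.g. "sample1.mzML Peak area"), is:
def determine_input_files(headers):
    import os
    input_filename_headers = [header for header in headers if header.rstrip().endswith("Peak area")]
    #Strip the trailing "Peak area" to recover the underlying filenames
    input_filenames = [os.path.basename(header.replace("Peak area", "").strip()) for header in input_filename_headers]
    return input_filenames, input_filename_headers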
def trace_filename_filesystem(all_datasets,
                              dataset_accession,
                              dataset_scan,
                              enrichmetadata=False):
    output_file_list = []
    output_match_list = []
    for dataset_object in all_datasets:
        if dataset_object["dataset"] == dataset_accession:
            networking_job = ming_gnps_library.get_most_recent_continuous_networking_of_dataset(
                dataset_object["task"])
            if networking_job is None:
                continue

            networking_task_info = ming_proteosafe_library.get_task_information(
                "gnps.ucsd.edu", networking_job["task"])
            task_user = networking_task_info["user"]

            clustering_path = os.path.join(
                "/data/ccms-data/tasks", task_user, networking_job["task"],
                "allclustered_spectra_info_withpath")
            clustering_files = ming_fileio_library.list_files_in_dir(
                clustering_path)
            if len(clustering_files) != 1:
                continue

            clustering_membership_list = ming_fileio_library.parse_table_with_headers_object_list(
                clustering_files[0])

            acceptable_raw_spectra = [
                spectrum for spectrum in clustering_membership_list
                if spectrum["cluster index"] == str(dataset_scan)
            ]

            for raw_spectrum in acceptable_raw_spectra:
                output_object = {}
                output_object["dataset_id"] = dataset_accession
                output_object["cluster_scan"] = dataset_scan
                output_object["filename"] = raw_spectrum["Original_Path"]
                output_object["filescan"] = raw_spectrum["ScanNumber"]
                output_object["metadata"] = ""
                output_object["basefilename"] = os.path.basename(
                    raw_spectrum["Original_Path"])

                if enrichmetadata:
                    try:
                        metadata_list = get_metadata_information_per_filename(
                            raw_spectrum["Original_Path"])
                        output_object["metadata"] = "|".join(metadata_list)
                    except:
                        print("ReDU is down")

                output_match_list.append(output_object)

            print(len(acceptable_raw_spectra))
            unique_files = list(
                set([
                    spectrum["Original_Path"]
                    for spectrum in acceptable_raw_spectra
                ]))
            print(len(unique_files))
            for source_file in unique_files:
                output_object = {}
                output_object["dataset_id"] = dataset_accession
                output_object["cluster_scan"] = dataset_scan
                output_object["filename"] = source_file
                output_object["metadata"] = ""
                output_object["basefilename"] = os.path.basename(source_file)

                if enrichmetadata:
                    try:
                        metadata_list = get_metadata_information_per_filename(
                            source_file)
                        output_object["metadata"] = "|".join(metadata_list)
                    except:
                        print("ReDU is down")

                output_file_list.append(output_object)

    #Performing a fix to make sure the spectrum is present because of a renaming from <dataset>/spectrum to <dataset>/ccms_peak
    for file_dict in output_file_list:
        splits = file_dict["filename"].split("/")
        splits[1] = splits[1].replace("spectrum", "ccms_peak")
        file_dict["filename"] = "/".join(splits)

    for file_dict in output_match_list:
        splits = file_dict["filename"].split("/")
        splits[1] = splits[1].replace("spectrum", "ccms_peak")
        file_dict["filename"] = "/".join(splits)

    return output_file_list, output_match_list
Example #35
0
def main():
    parser = argparse.ArgumentParser(
        description='Running library search parallel')
    parser.add_argument('spectra_folder', help='spectra folder')
    parser.add_argument('json_parameters', help='json parallelism parameters')
    parser.add_argument('workflow_parameters',
                        help='proteosafe xml parameters')
    parser.add_argument('library_folder', help='library folder')
    parser.add_argument('result_folder', help='output folder for results')
    parser.add_argument('convert_binary', help='conversion binary')
    parser.add_argument('librarysearch_binary',
                        help='librarysearch binary')
    parser.add_argument('--parallelism',
                        default=1,
                        type=int,
                        help='Parallelism')
    args = parser.parse_args()

    parallel_json = json.loads(open(args.json_parameters).read())

    params_object = ming_proteosafe_library.parse_xml_file(
        open(args.workflow_parameters))
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(
        params_object)
    library_files = ming_fileio_library.list_files_in_dir(args.library_folder)
    spectra_files = ming_fileio_library.list_files_in_dir(args.spectra_folder)

    spectra_files.sort()

    print(spectra_files)
    spectra_files = spectra_files[
        parallel_json["node_partition"]::parallel_json["total_paritions"]]
    print(spectra_files)

    temp_folder = "temp"
    try:
        os.mkdir(temp_folder)
    except:
        print("folder error")

    tempresults_folder = "tempresults"
    try:
        os.mkdir(tempresults_folder)
    except:
        print("folder error")

    list_of_spectrumfiles = chunks(spectra_files, 5)
    parameter_list = []
    for spectrum_files_chunk in list_of_spectrumfiles:
        param_dict = {}
        param_dict["spectra_files"] = spectrum_files_chunk
        param_dict["temp_folder"] = temp_folder
        param_dict["tempresults_folder"] = tempresults_folder
        param_dict["args"] = args
        param_dict["params_object"] = params_object
        param_dict["library_files"] = library_files

        parameter_list.append(param_dict)

    #for param_dict in parameter_list:
    #    search_wrapper(param_dict)
    print("Parallel to execute", len(parameter_list))
    ming_parallel_library.run_parallel_job(search_wrapper, parameter_list, 5)
    """Merging Files and adding full path"""
    all_result_files = ming_fileio_library.list_files_in_dir(
        tempresults_folder)
    full_result_list = []
    for input_file in all_result_files:
        result_list = ming_fileio_library.parse_table_with_headers_object_list(
            input_file)
        full_result_list += result_list

    for result_object in full_result_list:
        mangled_name = os.path.basename(result_object["SpectrumFile"])
        full_path = mangled_mapping[mangled_name]
        result_object["full_CCMS_path"] = full_path

    ming_fileio_library.write_list_dict_table_data(
        full_result_list,
        os.path.join(args.result_folder,
                     str(uuid.uuid4()) + ".tsv"))
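
#chunks and search_wrapper are referenced above but defined elsewhere in the
#workflow. chunks is a routine list-splitting helper; a minimal sketch (an
#assumption, not necessarily the original) is:
def chunks(items, chunk_size):
    #Split items into consecutive slices of at most chunk_size elements
    return [items[i:i + chunk_size] for i in range(0, len(items), chunk_size)]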
Example #36
0
def main():
    parser = argparse.ArgumentParser(
        description='Creates enriched cluster info summary')
    parser.add_argument('param_xml', help='param_xml')
    parser.add_argument('input_clusterinfo_file',
                        help='input_clusterinfo_file')
    parser.add_argument('input_clusterinfosummary_file',
                        help='input_clusterinfosummary_file')
    parser.add_argument('input_group_mapping_filename',
                        help='input_group_mapping_filename')
    parser.add_argument('input_attribute_mapping_filename',
                        help='input_attribute_mapping_filename')
    parser.add_argument('input_networking_pairs',
                        help='input_networking_pairs')
    parser.add_argument('input_library_search', help='input_library_search')
    parser.add_argument('output_clusterinfosummary_filename',
                        help='output_clusterinfosummary_filename')
    args = parser.parse_args()
    """Loading group filenames"""
    group_to_files, files_to_groups = load_group_mapping(
        args.input_group_mapping_filename)
    print("Loaded Group Mapping")
    cluster_summary_list = ming_fileio_library.parse_table_with_headers_object_list(
        args.input_clusterinfosummary_file)
    print("Loaded Cluster Summary")

    attribute_to_groups = load_attribute_mapping(
        args.input_attribute_mapping_filename)

    params_object = ming_proteosafe_library.parse_xml_file(open(
        args.param_xml))
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(
        params_object)

    CLUSTER_MIN_SIZE = int(params_object["CLUSTER_MIN_SIZE"][0])
    RUN_MSCLUSTER = params_object["RUN_MSCLUSTER"][0]

    #Calculating the spectrum counts per group
    cluster_to_group_counts = defaultdict(lambda: defaultdict(lambda: 0))
    cluster_to_files = defaultdict(set)
    cluster_to_RT = defaultdict(list)
    line_count = 0
    for line in open(args.input_clusterinfo_file):
        line_count += 1
        if line_count == 1:
            continue
        if line_count % 10000 == 0:
            print(line_count)

        splits = line.rstrip().split("\t")
        cluster_index = splits[0]
        filename = os.path.basename(splits[1])
        rt = float(splits[6])

        group_membership = files_to_groups[filename]
        cluster_to_files[cluster_index].add(filename)
        cluster_to_RT[cluster_index].append(rt)

        for group in group_membership:
            cluster_to_group_counts[cluster_index][group] += 1

    if RUN_MSCLUSTER == "on":
        cluster_summary_list = filter_clusters_based_on_cluster_size(
            cluster_summary_list, CLUSTER_MIN_SIZE)

    print(len(cluster_summary_list))

    print("Setting up grouping", len(group_to_files.keys()))
    for cluster_summary_object in cluster_summary_list:
        cluster_index = cluster_summary_object["cluster index"]
        for group in group_to_files:
            group_count = 0
            if group in cluster_to_group_counts[cluster_index]:
                group_count = cluster_to_group_counts[cluster_index][group]
            cluster_summary_object[group] = group_count

        for attribute in attribute_to_groups:
            groups_to_include = []
            for group in attribute_to_groups[attribute]:
                if group in cluster_summary_object:
                    if cluster_summary_object[group] > 0:
                        groups_to_include.append(group)

            cluster_summary_object[attribute] = ",".join(
                groups_to_include).replace("GNPSGROUP:", "")

    print("Default Attributes")
    calculate_default_attributes(cluster_summary_list, group_to_files.keys())

    print("calculate_cluster_file_stats")
    calculate_cluster_file_stats(cluster_summary_list, cluster_to_files,
                                 mangled_mapping)

    print("rt stats")
    calculate_rt_stats(cluster_summary_list, cluster_to_RT)

    print("populate_network_component")
    populate_network_component(cluster_summary_list,
                               args.input_networking_pairs)

    print("calculate_ancillary_information")
    calculate_ancillary_information(cluster_summary_list,
                                    params_object["task"][0])

    print("populate_network_identifications")
    populate_network_identifications(cluster_summary_list,
                                     args.input_library_search)

    ming_fileio_library.write_list_dict_table_data(
        cluster_summary_list, args.output_clusterinfosummary_filename)
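
#filter_clusters_based_on_cluster_size is used above but not included in this
#example. A plausible sketch, assuming the cluster size is carried in the
#"number of spectra" column of the summary, is:
def filter_clusters_based_on_cluster_size(cluster_summary_list, min_cluster_size):
    return [
        cluster for cluster in cluster_summary_list
        if int(cluster["number of spectra"]) >= min_cluster_size
    ]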