def main():
    param_filename = sys.argv[1]
    metadata_folder = sys.argv[2]
    input_clusterinfo_file = sys.argv[3]
    input_clusterinfosummary = sys.argv[4]
    ili_stl_model_folder = sys.argv[5]
    output_ili_filename = sys.argv[6]
    view_ili_html_filename = sys.argv[7]

    create_output = True
    param_object = ming_proteosafe_library.parse_xml_file(open(param_filename, "r"))
    try:
        if param_object["CREATE_ILI_OUTPUT"][0] != "1":
            create_output = False
    except:
        create_output = False

    if create_output:
        ili_stl_model_files_in_folder = ming_fileio_library.list_files_in_dir(ili_stl_model_folder)
        metadata_files_in_folder = ming_fileio_library.list_files_in_dir(metadata_folder)
        if len(metadata_files_in_folder) != 1:
            print("Metadata file not provided, cannot create ili compatible output without coordinates")
            exit(1)
        filename_coordinate_mapping = load_filename_to_coordinate_mapping(metadata_files_in_folder[0])
        create_ili_output_from_clusterinfo(input_clusterinfo_file, param_filename, input_clusterinfosummary, filename_coordinate_mapping, output_ili_filename)

        if len(ili_stl_model_files_in_folder) == 1:
            output_ili_html_file = open(view_ili_html_filename, "w")
            output_ili_html_file.write("<script>\n")
            output_ili_html_file.write('window.location.replace("https://ili.embl.de/?https://gnps.ucsd.edu/ProteoSAFe/DownloadResultFile?task=%s&block=main&file=ili_stl_model/ili_stl_model-00000.stl;https://gnps.ucsd.edu/ProteoSAFe/DownloadResultFile?task=%s&block=main&file=ili_output/ili_quant.csv")\n' % (param_object["task"][0],param_object["task"][0]))
            output_ili_html_file.write("</script>\n")
            output_ili_html_file.close()

        if len(ili_stl_model_files_in_folder) == 0:
            output_ili_html_file = open(view_ili_html_filename, "w")
            output_ili_html_file.write("No STL file uploaded, cannot directly link to ili\n")
            output_ili_html_file.close()

        if len(ili_stl_model_files_in_folder) > 1:
            output_ili_html_file = open(view_ili_html_filename, "w")
            output_ili_html_file.write("Too many stl files uploaded\n")
            output_ili_html_file.close()
    else:
        open(output_ili_filename, "w").write("No Output")
        open(view_ili_html_filename, "w").write("ili output was not selected or no metadata file was provided")
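load_filename_to_coordinate_mapping is called above but not shown in this excerpt. A minimal sketch of what it plausibly does, assuming the metadata table maps each filename to ili coordinates; the COORDINATE_* column names are illustrative assumptions, not taken from the source:

def load_filename_to_coordinate_mapping(metadata_filename):
    #Hypothetical sketch: the real column headers may differ
    filename_coordinate_mapping = {}
    row_count, table_data = ming_fileio_library.parse_table_with_headers(metadata_filename)
    for i in range(row_count):
        filename = table_data["filename"][i]
        #x, y, z, and spot radius on the ili 3D model
        coordinates = [table_data["COORDINATE_X"][i],
                       table_data["COORDINATE_Y"][i],
                       table_data["COORDINATE_Z"][i],
                       table_data["COORDINATE_RADIUS"][i]]
        filename_coordinate_mapping[filename] = coordinates
    return filename_coordinate_mapping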
def retreive_proteosafe_backend_task_directory_file(task_id, servername, source_folder_name, target_file):
    proteosafe_data_path = "/data/" + servername + "/tasks/"
    source_folder_path = os.path.join(proteosafe_data_path, task_id, source_folder_name)
    source_files = ming_fileio_library.list_files_in_dir(source_folder_path)

    if len(source_files) == 1:
        #Can Copy
        source_file = os.path.join(source_folder_path, source_files[0])
        print("Copying from " + source_file + " to " + target_file)
        shutil.copyfile(source_file, target_file)
def get_proteosafe_result_file_path(task_id, username, source_folder_name):
    proteosafe_data_path = "/data/ccms-data/tasks/"
    source_folder_path = os.path.join(proteosafe_data_path, username, task_id, source_folder_name)

    if not ming_fileio_library.is_path_present(source_folder_path):
        return []

    source_files = ming_fileio_library.list_files_in_dir(source_folder_path)

    return source_files
def get_proteosafe_backend_result_file_path(task_id, source_folder_name, site):
    proteosafe_data_path = "/data/"
    if site == "proteomics2":
        proteosafe_data_path += "beta-proteomics2"
    source_folder_path = os.path.join(proteosafe_data_path, "tasks", task_id, source_folder_name)
    if not ming_fileio_library.is_path_present(source_folder_path):
        return []

    source_files = ming_fileio_library.list_files_in_dir(source_folder_path)

    return source_files
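Typical call sites for the two lookup helpers above; the task ID, username, and folder names are placeholders, not values from the source:

#Hypothetical usage; all argument values below are placeholders
result_files = get_proteosafe_result_file_path("0123456789abcdef", "someuser", "clusterinfo_summary")
backend_files = get_proteosafe_backend_result_file_path("0123456789abcdef", "DB_search", "proteomics2")
for path in result_files + backend_files:
    print(path)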
Example 6
def main():
    input_folder_path = sys.argv[1]
    output_tsv = sys.argv[2]

    files = ming_fileio_library.list_files_in_dir(input_folder_path)

    merged_dict = defaultdict(list)

    for input_file in files:
        print("loading", input_file)
        row_count, table_data = ming_fileio_library.parse_table_with_headers(input_file)
        for key in table_data:
            merged_dict[key] += table_data[key]

    ming_fileio_library.write_dictionary_table_data(merged_dict, output_tsv)
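parse_table_with_headers returns a dict of column name to list of values, so the merge above stacks tables column-wise. The same pattern on toy data, with no file I/O:

from collections import defaultdict

table_a = {"filename": ["a.mzXML"], "scan": ["1"]}
table_b = {"filename": ["b.mzXML"], "scan": ["7"]}

merged_dict = defaultdict(list)
for table_data in [table_a, table_b]:
    for key in table_data:
        merged_dict[key] += table_data[key]

#merged_dict now holds {"filename": ["a.mzXML", "b.mzXML"], "scan": ["1", "7"]}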
def determine_filetype_of_import(input_folder):
    input_filenames = ming_fileio_library.list_files_in_dir(input_folder)
    ext = ming_fileio_library.get_filename_extension(input_filenames[0])

    if ext.upper() == ".CDF":
        return "netcdf"

    if ext.upper() == ".MZXML":
        return "mzxml"

    if ext.upper() == ".MZML":
        return "mzml"

    print("Unsupported extension")
    exit(1)
def main():
    input_intermediate_folder = sys.argv[1]
    output_filename = sys.argv[2]

    all_protein_stats = {}

    #Merging the intermediate output from each partition
    all_intermediate_files = ming_fileio_library.list_files_in_dir(input_intermediate_folder)
    output_map = defaultdict(list)
    for parallel_output_filename in all_intermediate_files:
        row_count, table_data = ming_fileio_library.parse_table_with_headers(parallel_output_filename)
        for key in table_data:
            output_map[key] += table_data[key]

    ming_fileio_library.write_dictionary_table_data(output_map, output_filename)
def main():
    input_folder_path = sys.argv[1]
    param_xml_filename = sys.argv[2]
    output_tsv = sys.argv[3]

    files = ming_fileio_library.list_files_in_dir(input_folder_path)
    params_obj = ming_proteosafe_library.parse_xml_file(open(param_xml_filename))

    top_k = 1
    try:
        top_k = int(params_obj["TOP_K_RESULTS"][0])
    except:
        top_k = 1

    #merged_dict = defaultdict(list)
    merged_results = []

    for input_file in files:
        print("loading", input_file)
        row_count, table_data = ming_fileio_library.parse_table_with_headers(input_file)
        for i in range(row_count):
            result_dict = {}
            for key in table_data:
                result_dict[key] = table_data[key][i]
            merged_results.append(result_dict)


    results_per_spectrum = defaultdict(list)

    for result_obj in merged_results:
        spectrum_unique_key = result_obj["SpectrumFile"] + "___" + result_obj["#Scan#"]

        results_per_spectrum[spectrum_unique_key].append(result_obj)

    output_results = []
    for spectrum_unique_key in results_per_spectrum:
        sorted_results = sorted(results_per_spectrum[spectrum_unique_key], key=lambda spectrum_obj: float(spectrum_obj["MQScore"]), reverse=True)
        filtered_results = sorted_results[:top_k]
        output_results += filtered_results

    output_dict = defaultdict(list)

    for result_obj in output_results:
        for key in result_obj:
            output_dict[key].append(result_obj[key])


    ming_fileio_library.write_dictionary_table_data(output_dict, output_tsv)
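The filtering above keeps the top_k identifications per (SpectrumFile, #Scan#) pair, ranked by MQScore descending. The same pattern on toy data:

from collections import defaultdict

hits = [{"SpectrumFile": "a.mzXML", "#Scan#": "1", "MQScore": "0.4"},
        {"SpectrumFile": "a.mzXML", "#Scan#": "1", "MQScore": "0.9"}]

results_per_spectrum = defaultdict(list)
for hit in hits:
    results_per_spectrum[hit["SpectrumFile"] + "___" + hit["#Scan#"]].append(hit)

top_k = 1
output_results = []
for key in results_per_spectrum:
    ranked = sorted(results_per_spectrum[key], key=lambda h: float(h["MQScore"]), reverse=True)
    output_results += ranked[:top_k]
#output_results keeps only the MQScore 0.9 hit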
Example 10
def load_metadata_mapping(metadata_folder):
    file_name_to_sample_id_mapping = {}
    all_files = ming_fileio_library.list_files_in_dir(metadata_folder)

    if len(all_files) != 1:
        return {}

    row_count, table_data = ming_fileio_library.parse_table_with_headers(all_files[0])

    for i in range(row_count):
        filename = table_data["filename"][i]
        sample_id = table_data["#SampleID"][i]

        file_name_to_sample_id_mapping[filename] = sample_id

    return file_name_to_sample_id_mapping
Example 11
def load_collision_energy_mapping(input_folder):
    scan_maps = {}

    all_files = ming_fileio_library.list_files_in_dir(input_folder)
    for input_file in all_files:
        print(input_file)
        list_of_metadata = json.loads(open(input_file).read())
        for metadata in list_of_metadata:
            filename = metadata["filename"]
            scan = metadata["scan"]
            collision_energy = metadata["scan"]

            key = filename + ":" + str(scan)
            scan_maps[key] = metadata

    return scan_maps
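For reference, each JSON file read above is expected to contain a list of objects shaped roughly like the following; any keys beyond filename and scan are whatever the upstream step emitted, so this sample is illustrative only:

#Illustrative input shape for load_collision_energy_mapping
example_metadata_list = [
    {"filename": "sample1.mzXML", "scan": 1200, "collision_energy": 35.0}
]
#The entry above would be stored under the key "sample1.mzXML:1200"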
def main():
    input_pairs = sys.argv[1]

    #Doing other filtering
    G = molecular_network_filtering_library.loading_network(input_pairs, hasHeaders=True)
    molecular_network_filtering_library.add_clusterinfo_summary_to_graph(G, sys.argv[2])
    molecular_network_filtering_library.add_library_search_results_to_graph(G, sys.argv[3])

    folder_for_additional_pairs = sys.argv[4]
    all_pairs_files = ming_fileio_library.list_files_in_dir(folder_for_additional_pairs)
    for additional_pairs_file in all_pairs_files:
        print("Adding Additional Edges", additional_pairs_file)
        molecular_network_filtering_library.add_additional_edges(G, additional_pairs_file)


    nx.write_graphml(G, sys.argv[5], infer_numeric_types=True)
Example 14
def find_matches_in_dataset(dataset_id, input_spectrum_collection,
                            identification_map):
    dataset_match_list = []
    path_to_peak_collection = os.path.join(PATH_TO_DATASET_UPLOADS, dataset_id,
                                           "peak")
    peak_files = ming_fileio_library.list_files_in_dir(path_to_peak_collection)

    for input_file in peak_files:
        print(input_file)
        relative_user_path_to_file = os.path.relpath(input_file,
                                                     PATH_TO_DATASET_UPLOADS)
        reference_spectra = ming_spectrum_library.SpectrumCollection(
            input_file)
        reference_spectra.load_from_mzXML(drop_ms1=True)

        is_blank = 0
        if input_file.find("blank") != -1:
            is_blank = 1

        for myspectrum in input_spectrum_collection.spectrum_list:

            match_list = reference_spectra.search_spectrum(
                myspectrum, 1.0, 1.0, 4, 0.7, 1)
            for match in match_list:
                match_obj = {}
                match_obj["filename"] = relative_user_path_to_file
                match_obj["scan"] = match.scan
                match_obj["score"] = match.score
                match_obj["query_filename"] = match.query_filename
                match_obj["query_scan"] = match.query_scan
                match_obj["ppm_error"] = match.ppm_error
                match_obj["is_blank"] = is_blank
                match_obj["dataset_id"] = dataset_id

                #compound identification
                if match.scan in identification_map:
                    match_obj["identification"] = identification_map[
                        match.scan]["identification"]
                    match_obj["spectrum_id"] = identification_map[
                        match.scan]["spectrum_id"]
                else:
                    match_obj["identification"] = ""
                    match_obj["spectrum_id"] = ""

                dataset_match_list.append(match_obj)

    return dataset_match_list
def main():
    input_intermediate_folder = sys.argv[1]
    output_filename = sys.argv[2]

    all_protein_stats = {}

    #Merging the intermediate output from each partition
    all_intermediate_files = ming_fileio_library.list_files_in_dir(
        input_intermediate_folder)
    output_list = []
    for parallel_output_filename in all_intermediate_files:
        result_list = ming_fileio_library.parse_table_with_headers_object_list(
            parallel_output_filename)
        output_list += result_list

    ming_fileio_library.write_list_dict_table_data(output_list,
                                                   output_filename)
def main():
    input_folder = sys.argv[1]
    output_filename_folder = sys.argv[2]

    input_files = ming_fileio_library.list_files_in_dir(input_folder)

    extension = os.path.splitext(input_files[0])[1]
    output_filename = os.path.join(output_filename_folder,
                                   "merged" + extension)
    output_file = open(output_filename, "w")

    for input_file in input_files:
        for line in open(input_file):
            output_file.write(line)

        output_file.write("\n")

    output_file.close()
Example 18
def main():
    input_intermediate_file = sys.argv[1]
    output_tsv_folder = sys.argv[2]
    output_mgf_folder = sys.argv[3]
    output_sptxt_folder = sys.argv[4]

    library_spectrum_collection = ming_spectrum_library.SpectrumCollection("library spectra")

    all_json_spectra_list = json.load(open(input_intermediate_file))
    print("Loaded", input_intermediate_file, len(all_json_spectra_list))
    for library_spectrum in all_json_spectra_list:
        lib_spec = ming_spectrum_library.PeptideLibrarySpectrum("", 0, 0, library_spectrum["peaks"], library_spectrum["mz"], library_spectrum["charge"], library_spectrum["annotation"], library_spectrum["protein"])
        if "score" in library_spectrum:
            lib_spec.score = library_spectrum["score"]
        if "variant_score" in library_spectrum:
            lib_spec.variant_score = library_spectrum["variant_score"]
        if "spectra_to_consider" in library_spectrum:
            lib_spec.num_spectra = library_spectrum["spectra_to_consider"]
        if "ranking" in library_spectrum:
            lib_spec.spectrum_ranking = library_spectrum["ranking"]
        if "proteosafe_task" in library_spectrum:
            lib_spec.proteosafe_task = library_spectrum["proteosafe_task"]
        if "originalspectrum_filename" in library_spectrum:
            lib_spec.originalfile_filename = library_spectrum["originalspectrum_filename"]
        if "originalspectrum_scan" in library_spectrum:
            lib_spec.originalfile_scan = str(library_spectrum["originalspectrum_scan"])

        library_spectrum_collection.spectrum_list.append(lib_spec)

    output_mgf_filename = os.path.join(output_mgf_folder, os.path.splitext(os.path.basename(input_intermediate_file))[0] + ".mgf")
    output_tsv_filename = os.path.join(output_tsv_folder, os.path.splitext(os.path.basename(input_intermediate_file))[0] + ".tsv")
    output_sptxt_filename = os.path.join(output_sptxt_folder, os.path.splitext(os.path.basename(input_intermediate_file))[0] + ".sptxt")

    library_spectrum_collection.save_to_mgf(open(output_mgf_filename, "w"))
    library_spectrum_collection.save_to_tsv(open(output_tsv_filename, "w"), output_mgf_filename)

    try:
        library_spectrum_collection.save_to_sptxt(open(output_sptxt_filename, "w"))
    except:
        traceback.print_exc(file=sys.stdout)
        print("MEH")
def main():
    parallel_json = json.loads(open(sys.argv[1]).read())
    params_filename = sys.argv[2]
    input_folder_of_results = sys.argv[3]
    output_folder = sys.argv[4]

    my_node = parallel_json["node_partition"]
    total_node = parallel_json["total_paritions"]

    all_input_files = ming_fileio_library.list_files_in_dir(input_folder_of_results)
    all_input_files.sort()

    ###
    ### TODO We will have to read parameters and see if we need to eliminate some PSMs, with PSM FDR filter, KL Filter, ambiguity score filter, unique intensity filter
    ###

    params_obj = ming_proteosafe_library.parse_xml_file(open(params_filename))
    total_file_count = 0
    all_input_files = all_input_files[my_node::total_node]
    current_working_psm_set = ming_psm_library.PSMset("Ming")

    for input_file in all_input_files:
        #Assume these are variant files
        #We can treat this like a psm file and then combine all of the as a new variants file
        total_file_count += 1
        print(input_file, total_file_count, "of", len(all_input_files))
        input_pickle = open(input_file, 'rb')
        temp_psm_set = pickle.load(input_pickle)
        print("Loaded", len(temp_psm_set.psms))

        for psm in temp_psm_set.psms:
            precursor_string = "%s:%d" % (psm.annotation, psm.charge)
            score = psm.score

            #Determine minimum score cutoff
            current_score = psm.sorting_value()
            peptide_length = len(psm.get_stripped_sequence())

            current_working_psm_set.psms.append(psm)

    #Saving out psms
    output_filename = os.path.join(output_folder, str(my_node) + ".psms")
    current_working_psm_set.write_output(open(output_filename, "w"), True)
def main():
    input_folder = sys.argv[1]

    input_protein_fdr_filename = sys.argv[2]
    input_peptide_protein_mapping_filename = sys.argv[3]

    precursor_to_protein_map = load_precursor_to_protein_mapping(
        input_peptide_protein_mapping_filename)

    output_mgf_folder = sys.argv[4]
    output_tsv_folder = sys.argv[5]

    output_filename_prefix = sys.argv[6]

    input_files = ming_fileio_library.list_files_in_dir(input_folder)

    all_library_spectra = []
    for input_filename in input_files:
        temp_spectra = ming_spectrum_library.load_mgf_peptide_library(
            input_filename)
        print("loaded ", len(temp_spectra), "from", input_filename)
        for spectrum in temp_spectra:
            peptide = spectrum.peptide
            protein = spectrum.protein
            if protein == "CREATION_FALSE_PROTEIN":
                continue
            spectrum.protein = precursor_to_protein_map[peptide]
        all_library_spectra += temp_spectra

    library_spectrum_collection_split = ming_spectrum_library.SpectrumCollection(
        "library spectra")
    library_spectrum_collection_split.spectrum_list = all_library_spectra

    output_tsv_filename = os.path.join(output_tsv_folder,
                                       output_filename_prefix + ".tsv")
    output_mgf_filename = os.path.join(output_mgf_folder,
                                       output_filename_prefix + ".mgf")

    library_spectrum_collection_split.save_to_mgf(
        open(output_mgf_filename, "w"))
    library_spectrum_collection_split.save_to_tsv(
        open(output_tsv_filename, "w"), output_mgf_filename)
def main():
    input_files_list = ming_fileio_library.list_files_in_dir(sys.argv[1])

    output_dict = defaultdict(list)
    output_file = open(sys.argv[2], "w")

    file_count = 0
    for input_file in input_files_list:
        row_count = 0
        for line in open(input_file):
            if file_count == 0 and row_count == 0:
                output_file.write(line)
            elif row_count != 0:
                output_file.write(line)

            row_count += 1

        file_count += 1

    output_file.close()
Example 24
def main():
    input_intermediate_folder = sys.argv[1]
    output_file = sys.argv[2]

    output_dict = defaultdict(list)

    total_rows = 0
    input_filenames = ming_fileio_library.list_files_in_dir(
        input_intermediate_folder)
    for input_filename in input_filenames:
        if total_rows > 10000000:
            continue

        row_count, table_data = ming_fileio_library.parse_table_with_headers(
            input_filename)
        total_rows += row_count
        for i in range(row_count):
            for key in table_data:
                output_dict[key].append(table_data[key][i])

    ming_fileio_library.write_dictionary_table_data(output_dict, output_file)
def main():
    parser = argparse.ArgumentParser(description='Create parallel parameters')
    parser.add_argument('library_folder', help='Input mgf file to network')
    parser.add_argument('workflow_parameters', help='proteosafe xml parameters')
    parser.add_argument('parameters_output_folder', help='output folder for parameters')
    parser.add_argument('--parallelism', default=1, type=int, help='Parallelism')
    args = parser.parse_args()

    params_object = ming_proteosafe_library.parse_xml_file(open(args.workflow_parameters))

    library_files = ming_fileio_library.list_files_in_dir(args.library_folder)

    for i in range(args.parallelism):
        output_parameter_file = open(os.path.join(args.parameters_output_folder, str(i) + ".params"), "w")
        #Search Criteria
        output_parameter_file.write("MIN_MATCHED_PEAKS=%s\n" % (params_object["MIN_MATCHED_PEAKS"][0]))
        output_parameter_file.write("TOP_K_RESULTS=%s\n" % (params_object["TOP_K_RESULTS"][0]))
        output_parameter_file.write("search_peak_tolerance=%s\n" % (params_object["tolerance.Ion_tolerance"][0]))
        output_parameter_file.write("search_parentmass_tolerance=%s\n" % (params_object["tolerance.PM_tolerance"][0]))
        output_parameter_file.write("ANALOG_SEARCH=%s\n" % (params_object["ANALOG_SEARCH"][0]))
        output_parameter_file.write("MAX_SHIFT_MASS=%s\n" % (params_object["MAX_SHIFT_MASS"][0]))
        output_parameter_file.write("SEARCH_LIBQUALITY=%s\n" % (params_object["SEARCH_LIBQUALITY"][0]))

        #Filtering Criteria
        output_parameter_file.write("FILTER_PRECURSOR_WINDOW=%s\n" % (params_object["FILTER_PRECURSOR_WINDOW"][0]))
        output_parameter_file.write("MIN_PEAK_INT=%s\n" % (params_object["MIN_PEAK_INT"][0]))
        output_parameter_file.write("WINDOW_FILTER=%s\n" % (params_object["WINDOW_FILTER"][0]))
        output_parameter_file.write("FILTER_LIBRARY=%s\n" % (params_object["FILTER_LIBRARY"][0]))

        output_parameter_file.write("NODEIDX=%d\n" % (i))
        output_parameter_file.write("NODECOUNT=%d\n" % (args.parallelism))


        #For GC
        output_parameter_file.write("FORCE_EXACT_MATCH=%s\n" % (params_object["FORCE_EXACT_MATCH"][0]))

        #Libraries
        output_parameter_file.write("EXISTING_LIBRARY_MGF=%s\n" % (" ".join(library_files)))

        output_parameter_file.close()
def determine_filenames_to_load(my_node_number, total_parallel,
                                path_to_merged_library_spectra):
    merged_library_filename = ""

    merged_library_files = ming_fileio_library.list_files_in_dir(
        path_to_merged_library_spectra)

    total_number_of_json_files = len(merged_library_files)

    json_file_number_to_load = my_node_number % total_number_of_json_files

    merged_library_filename = os.path.join(
        path_to_merged_library_spectra,
        str(json_file_number_to_load) + ".json")

    total_nodes_for_file = int(
        float(total_parallel) / float(total_number_of_json_files))
    if total_parallel % total_number_of_json_files > my_node_number % total_number_of_json_files:
        total_nodes_for_file += 1

    my_position_for_file = int(
        float(my_node_number) / float(total_number_of_json_files))

    return merged_library_filename, my_position_for_file, total_nodes_for_file
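A worked example of the partition arithmetic above, checked without touching the filesystem: with total_parallel = 10 nodes and 3 merged JSON files, node 4 reads 1.json, shares that file with nodes 1 and 7, and sits at position 1 of 3; file 0 picks up the one leftover node.

#Sanity check of determine_filenames_to_load's arithmetic for node 4 of 10 over 3 files
my_node_number, total_parallel, total_number_of_json_files = 4, 10, 3
assert my_node_number % total_number_of_json_files == 1    #node 4 loads 1.json
assert int(float(total_parallel) / float(total_number_of_json_files)) == 3    #base nodes per file
assert total_parallel % total_number_of_json_files > 0     #file 0 gets one extra node (node 9)
assert int(float(my_node_number) / float(total_number_of_json_files)) == 1    #position within 1.json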
Example 27
def main():
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('param_xml', help='metadata_folder')
    parser.add_argument('cluster_buckets', help='cluster_buckets')
    parser.add_argument('metadata_folder', help='metadata_folder')
    parser.add_argument('output_folder', help='output_folder')
    parser.add_argument("conda_activate_bin")
    parser.add_argument("conda_environment")
    args = parser.parse_args()

    param_object = ming_proteosafe_library.parse_xml_file(open(args.param_xml, "r"))

    if param_object["CREATE_CLUSTER_BUCKETS"][0] == "0":
        print("Do not do things")
        exit(0)

    reverse_file_mangling = ming_proteosafe_library.get_reverse_mangled_file_mapping(param_object)

    """Reading Metadata File"""
    metadata_files_in_folder = ming_fileio_library.list_files_in_dir(args.metadata_folder)
    object_list = []

    if len(metadata_files_in_folder) != 1:
        for real_name in reverse_file_mangling:
            mangled_name = reverse_file_mangling[real_name]
            if mangled_name.find("spec") == -1:
                continue
            object_list.append({"filename" : real_name})
    else:
        object_list_temp = ming_fileio_library.parse_table_with_headers_object_list(metadata_files_in_folder[0])
        #object_list_temp = pd.read_csv(metadata_files_in_folder[0], sep="\t")

        object_list = []
        for metadata_object in object_list_temp:
            if len(metadata_object["filename"]) > 1:
                object_list.append(metadata_object)
        
        #Adding all files, if analyzed file is not in list
        for real_name in reverse_file_mangling:
            mangled_name = reverse_file_mangling[real_name]
            if mangled_name.find("spec") == -1:
                continue

            found = False
            for metadata_object in object_list:
                if os.path.basename(real_name) == metadata_object["filename"]:
                    found = True
                    break

            if found is False:
                object_list.append({"filename" : real_name})

    if len(object_list) == 0:
        print("Do not do things, not enough files")
        exit(0)

    #Writing headers
    header_list = ["#SampleID", "BarcodeSequence", "LinkerPrimerSequence"]
    for key in object_list[0]:
        if not key in header_list:
            header_list.append(key)

    header_list.append("ATTRIBUTE_GNPSDefaultGroup")

    for metadata_object in object_list:
        if not "#SampleID" in metadata_object:
            if "#SampleID" in metadata_object:
                metadata_object["#SampleID"] = metadata_object["#SampleID"]
            else:
                #Stripping off all non-alphanumeric characters
                #metadata_object["#SampleID"] = ''.join(ch for ch in metadata_object["filename"] if ch.isalnum())
                metadata_object["#SampleID"] = metadata_object["filename"]
        if not "Description" in metadata_object:
            metadata_object["Description"] = "LoremIpsum"
        if not "BarcodeSequence" in metadata_object:
            metadata_object["BarcodeSequence"] = "GATACA"
        if not "LinkerPrimerSequence" in metadata_object:
            metadata_object["LinkerPrimerSequence"] = "GATACA"

        #Adding default grouping information
        try:
            mangled_name = reverse_file_mangling[metadata_object["filename"]]
            if mangled_name.find("spec-") != -1:
                metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G1"
            elif mangled_name.find("spectwo-") != -1:
                metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G2"
            elif mangled_name.find("specthree-") != -1:
                metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G3"
            elif mangled_name.find("specfour-") != -1:
                metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G4"
            elif mangled_name.find("specfive-") != -1:
                metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G5"
            elif mangled_name.find("specsix-") != -1:
                metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G6"
        except:
            print(metadata_object["filename"], "Not Mapped")
            metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "Not Mapped"

    output_metadata_filename = os.path.join(args.output_folder, "qiime2_metadata.tsv")
    output_manifest_filename = os.path.join(args.output_folder, "qiime2_manifest.tsv")

    for metadatum in object_list:
        if "sample_name" in metadatum:
            if len(metadatum["sample_name"]) > 1:
                metadatum["#SampleID"] = metadatum["sample_name"]

    metadata_df = pd.DataFrame(object_list)

    """Outputting Manifest Filename"""
    manifest_df = pd.DataFrame()
    manifest_df["sample_name"] = metadata_df["#SampleID"]
    manifest_df["filepath"] = metadata_df["filename"]
    manifest_df.to_csv(output_manifest_filename, index=False, sep=",")

    #Removing protected headers
    #metadata_df = metadata_df.drop(columns=["feature", "#SampleID"], errors="ignore")
    metadata_df.to_csv(output_metadata_filename, index=False, sep="\t", columns=header_list)

    #Running Qiime2
    local_qza_table = os.path.join(args.output_folder, "qiime2_table.qza")
    local_qza_distance = os.path.join(args.output_folder, "qiime2_distance.qza")
    local_qza_pcoa = os.path.join(args.output_folder, "qiime2_pcoa.qza")
    local_qzv_emperor = os.path.join(args.output_folder, "qiime2_emperor.qzv")

    all_cmd = []
    all_cmd.append("LC_ALL=en_US && export LC_ALL && source {} {} && \
        qiime metabolomics import-gnpsnetworkingclusteringbuckettable \
        --p-manifest {} \
        --p-buckettable {} \
        --o-feature-table {}".format(args.conda_activate_bin, args.conda_environment, output_manifest_filename, args.cluster_buckets, local_qza_table))

    all_cmd.append("LC_ALL=en_US && export LC_ALL && source {} {} && \
        qiime diversity beta \
        --i-table {} \
        --p-metric cosine \
        --o-distance-matrix {}".format(args.conda_activate_bin, args.conda_environment, local_qza_table, local_qza_distance))

    all_cmd.append("LC_ALL=en_US && export LC_ALL && source {} {} && \
        qiime diversity pcoa \
        --i-distance-matrix {} \
        --o-pcoa {}".format(args.conda_activate_bin, args.conda_environment, local_qza_distance, local_qza_pcoa))

    all_cmd.append("LC_ALL=en_US && export LC_ALL && source {} {} && \
        qiime emperor plot \
        --i-pcoa {} \
        --m-metadata-file {} \
        --o-visualization {} \
        --p-ignore-missing-samples".format(args.conda_activate_bin, args.conda_environment, local_qza_pcoa, output_metadata_filename, local_qzv_emperor))

    for cmd in all_cmd:
        os.system(cmd)
def trace_filename_filesystem(all_datasets,
                              dataset_accession,
                              dataset_scan,
                              enrichmetadata=False):
    output_file_list = []
    output_match_list = []
    for dataset_object in all_datasets:
        if dataset_object["dataset"] == dataset_accession:
            networking_job = ming_gnps_library.get_most_recent_continuous_networking_of_dataset(
                dataset_object["task"])
            if networking_job == None:
                continue

            networking_task_info = ming_proteosafe_library.get_task_information(
                "gnps.ucsd.edu", networking_job["task"])
            task_user = networking_task_info["user"]

            clustering_path = os.path.join(
                "/data/ccms-data/tasks", task_user, networking_job["task"],
                "allclustered_spectra_info_withpath")
            clustering_files = ming_fileio_library.list_files_in_dir(
                clustering_path)
            if len(clustering_files) != 1:
                continue

            clustering_membership_list = ming_fileio_library.parse_table_with_headers_object_list(
                clustering_files[0])

            acceptable_raw_spectra = [
                spectrum for spectrum in clustering_membership_list
                if spectrum["cluster index"] == str(dataset_scan)
            ]

            for raw_spectrum in acceptable_raw_spectra:
                output_object = {}
                output_object["dataset_id"] = dataset_accession
                output_object["cluster_scan"] = dataset_scan
                output_object["filename"] = raw_spectrum["Original_Path"]
                output_object["filescan"] = raw_spectrum["ScanNumber"]
                output_object["metadata"] = ""
                output_object["basefilename"] = os.path.basename(
                    raw_spectrum["Original_Path"])

                if enrichmetadata:
                    try:
                        metadata_list = get_metadata_information_per_filename(
                            raw_spectrum["Original_Path"])
                        output_object["metadata"] = "|".join(metadata_list)
                    except:
                        print("ReDU is down")

                output_match_list.append(output_object)

            print(len(acceptable_raw_spectra))
            unique_files = list(
                set([
                    spectrum["Original_Path"]
                    for spectrum in acceptable_raw_spectra
                ]))
            print(len(unique_files))
            for source_file in unique_files:
                output_object = {}
                output_object["dataset_id"] = dataset_accession
                output_object["cluster_scan"] = dataset_scan
                output_object["filename"] = source_file
                output_object["metadata"] = ""
                output_object["basefilename"] = os.path.basename(source_file)

                if enrichmetadata:
                    try:
                        metadata_list = get_metadata_information_per_filename(
                            source_file)
                        output_object["metadata"] = "|".join(metadata_list)
                    except:
                        print("ReDU is down")

                output_file_list.append(output_object)

    #Performing a fix to make sure the spectrum is present because of a renaming from <dataset>/spectrum to <dataset>/ccms_peak
    for file_dict in output_file_list:
        splits = file_dict["filename"].split("/")
        splits[1] = splits[1].replace("spectrum", "ccms_peak")
        file_dict["filename"] = "/".join(splits)

    for file_dict in output_match_list:
        splits = file_dict["filename"].split("/")
        splits[1] = splits[1].replace("spectrum", "ccms_peak")
        file_dict["filename"] = "/".join(splits)

    return output_file_list, output_match_list
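The rewrite above assumes relative paths whose second component names the collection, e.g. dataset/spectrum/...; the accession below is a placeholder:

#Illustrative path rewrite; MSV000000000 is a placeholder accession
path = "MSV000000000/spectrum/raw/sample.mzXML"
splits = path.split("/")
splits[1] = splits[1].replace("spectrum", "ccms_peak")
assert "/".join(splits) == "MSV000000000/ccms_peak/raw/sample.mzXML"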
Example 29
def main():
    input_json = json.loads(open(sys.argv[1]).read())
    input_intermediate_folder = sys.argv[2]
    output_folder = sys.argv[3]
    output_peptide_list_folder = sys.argv[4]

    my_node = input_json["node_partition"]

    output_filename = os.path.join(output_folder, str(my_node) + ".json")
    output_file = open(output_filename, "w")
    number_of_spectra = 0

    input_json_files = ming_fileio_library.list_files_in_dir(
        input_intermediate_folder)
    input_json_files.sort()

    all_spectra = []

    for json_filename in input_json_files:
        #Skip files
        json_basename = os.path.basename(json_filename).split(".")[0]
        bin_peptide = int(json_basename.split("_")[2])
        if bin_peptide != my_node:
            continue

        print("Loading", json_filename)
        spectrum_list = json.load(open(json_filename))
        all_spectra += spectrum_list
        print("Total Spectra", len(spectrum_list), len(all_spectra))

    peptide_dict = defaultdict(list)
    print("Creating hash")
    for spectrum in all_spectra:
        annotation = spectrum["annotation"] + "." + str(spectrum["charge"])
        peptide_dict[annotation].append(spectrum)

    print("writing out strings")
    all_annotation = list(peptide_dict.keys())
    all_annotation.sort()
    for annotation in all_annotation:
        output_file.write(json.dumps(peptide_dict[annotation]))
        output_file.write("\n")

    output_file.close()

    #Write out all the peptides into a file
    output_peptide_dict = defaultdict(list)
    for annotation_key in peptide_dict:
        max_score = -10
        if len(peptide_dict[annotation_key]) > 0:
            for spectrum in peptide_dict[annotation_key]:
                max_score = max(spectrum["score"], max_score)
            #max score per peptide
            output_peptide_dict["score"].append(max_score)
            output_peptide_dict["annotation_key"].append(annotation_key)
            output_peptide_dict["annotation"].append(
                peptide_dict[annotation_key][0]["annotation"])
            output_peptide_dict["charge"].append(
                peptide_dict[annotation_key][0]["charge"])
            output_peptide_dict["protein"].append(
                peptide_dict[annotation_key][0]["protein"])

    #writing out file
    output_peptide_filename = os.path.join(output_peptide_list_folder,
                                           str(my_node) + ".tsv")
    ming_fileio_library.write_dictionary_table_data(output_peptide_dict,
                                                    output_peptide_filename)
def main():
    parser = argparse.ArgumentParser(
        description='Creating Clustering Info Summary')
    parser.add_argument('proteosafe_parameters', help='proteosafe_parameters')
    parser.add_argument('metadata_folder', help='metadata_folder')
    parser.add_argument('output_metadata_file', help='output_metadata_file')
    args = parser.parse_args()

    param_obj = ming_proteosafe_library.parse_xml_file(
        open(args.proteosafe_parameters))

    mangled_file_mapping = ming_proteosafe_library.get_mangled_file_mapping(
        param_obj)

    default_group_mapping = defaultdict(list)
    file_to_group_mapping = {}
    for mangled_name in mangled_file_mapping:
        if mangled_name.find("specone-") != -1:
            default_group_mapping["G1"].append(
                mangled_file_mapping[mangled_name])
            file_to_group_mapping[os.path.basename(
                mangled_file_mapping[mangled_name])] = "G1"
        if mangled_name.find("spectwo-") != -1:
            default_group_mapping["G2"].append(
                mangled_file_mapping[mangled_name])
            file_to_group_mapping[os.path.basename(
                mangled_file_mapping[mangled_name])] = "G2"
        if mangled_name.find("specthree-") != -1:
            default_group_mapping["G3"].append(
                mangled_file_mapping[mangled_name])
            file_to_group_mapping[os.path.basename(
                mangled_file_mapping[mangled_name])] = "G3"
        if mangled_name.find("specfour-") != -1:
            default_group_mapping["G4"].append(
                mangled_file_mapping[mangled_name])
            file_to_group_mapping[os.path.basename(
                mangled_file_mapping[mangled_name])] = "G4"
        if mangled_name.find("specfive-") != -1:
            default_group_mapping["G5"].append(
                mangled_file_mapping[mangled_name])
            file_to_group_mapping[os.path.basename(
                mangled_file_mapping[mangled_name])] = "G5"
        if mangled_name.find("specsix-") != -1:
            default_group_mapping["G6"].append(
                mangled_file_mapping[mangled_name])
            file_to_group_mapping[os.path.basename(
                mangled_file_mapping[mangled_name])] = "G6"

    metadata_files_in_folder = ming_fileio_library.list_files_in_dir(
        args.metadata_folder)

    row_count = 0
    table_data = defaultdict(list)
    if len(metadata_files_in_folder) == 1:
        row_count, table_data = ming_fileio_library.parse_table_with_headers(
            metadata_files_in_folder[0])

    print(table_data)
    for key in table_data:
        print(key, len(table_data[key]))

    for i in range(row_count):
        print(i)
        filename = table_data["filename"][i]
        if len(filename) < 2:
            continue
        print(filename, filename[0], filename[-1])

        if filename[0] == "\"":
            filename = filename[1:]
        if filename[-1] == "\"":
            filename = filename[:-1]

        table_data["filename"][i] = filename

        basename_filename = os.path.basename(filename)
        group_name = "NoDefaultGroup"
        if basename_filename in file_to_group_mapping:
            group_name = file_to_group_mapping[basename_filename]
        table_data["ATTRIBUTE_DefaultGroup"].append(group_name)

    for input_filename in file_to_group_mapping:
        if input_filename in table_data["filename"]:
            continue
        else:
            for key in table_data:
                if key != "ATTRIBUTE_DefaultGroup" and key != "filename":
                    table_data[key].append("N/A")

            table_data["ATTRIBUTE_DefaultGroup"].append(
                file_to_group_mapping[input_filename])
            table_data["filename"].append(input_filename)

    ming_fileio_library.write_dictionary_table_data(table_data,
                                                    args.output_metadata_file)
Example 31
def main():
    parser = argparse.ArgumentParser(description='Running library search parallel')
    parser.add_argument('spectra_folder', help='spectrafolder')
    parser.add_argument('workflow_parameters', help='output folder for parameters')
    parser.add_argument('result_file', help='output folder for parameters')
    parser.add_argument('msaccess_binary', help='output folder for parameters')
    parser.add_argument('--parallelism', default=1, type=int, help='Parallelism')
    args = parser.parse_args()


    params_object = ming_proteosafe_library.parse_xml_file(open(args.workflow_parameters))
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(params_object)
    spectra_files = ming_fileio_library.list_files_in_dir(args.spectra_folder)

    spectra_files.sort()


    tempresults_folder = "tempresults"
    try:
        os.mkdir(tempresults_folder)
    except:
        print("folder error")

    parameter_list = []
    for spectrum_file in spectra_files:
        param_dict = {}
        param_dict["spectrum_file"] = spectrum_file
        param_dict["tempresults_folder"] = tempresults_folder
        param_dict["args"] = args

        parameter_list.append(param_dict)

    #for param_dict in parameter_list:
    #    summary_wrapper(param_dict)
    print("Parallel to execute", len(parameter_list))
    ming_parallel_library.run_parallel_job(summary_wrapper, parameter_list, 10)


    """Merging Files and adding full path"""
    all_result_files = ming_fileio_library.list_files_in_dir(tempresults_folder)
    full_result_list = []
    for input_file in all_result_files:
        try:
            result_list = ming_fileio_library.parse_table_with_headers_object_list(input_file)
            for result in result_list:
                output_dict = {}
                output_dict["Filename"] = result["Filename"]
                output_dict["Vendor"] = result["Vendor"]
                output_dict["Model"] = result["Model"]
                output_dict["MS1s"] = result["MS1s"]
                output_dict["MS2s"] = result["MS2s"]
                full_result_list.append(output_dict)
        except:
            #raise
            print("Error", input_file)

        #print(result_list)
        #full_result_list += result_list
    
    used_files = set()
    for result_object in full_result_list:
        mangled_name = os.path.basename(result_object["Filename"])
        full_path = mangled_mapping[mangled_name]
        result_object["full_CCMS_path"] = full_path
        result_object["CCMS_filename"] = os.path.basename(full_path)
        used_files.add(full_path)

    for mangled_name in spectra_files:
        full_path = mangled_mapping[os.path.basename(mangled_name)]
        if full_path in used_files:
            continue

        output_dict = {}
        output_dict["full_CCMS_path"] = full_path
        output_dict["CCMS_filename"] = os.path.basename(full_path)
        full_result_list.append(output_dict)

    pd.DataFrame(full_result_list).to_csv(args.result_file, sep="\t", index=False)
def main():
    parser = argparse.ArgumentParser(description='Running library search parallel')
    parser.add_argument('spectra_folder', help='spectrafolder')
    parser.add_argument('json_parameters', help='proteosafe xml parameters')
    parser.add_argument('workflow_parameters', help='output folder for parameters')
    parser.add_argument('library_folder', help='output folder for parameters')
    parser.add_argument('result_folder', help='output folder for parameters')
    parser.add_argument('convert_binary', help='output folder for parameters')
    parser.add_argument('librarysearch_binary', help='output folder for parameters')
    parser.add_argument('--parallelism', default=1, type=int, help='Parallelism')
    args = parser.parse_args()

    parallel_json = json.loads(open(args.json_parameters).read())

    params_object = ming_proteosafe_library.parse_xml_file(open(args.workflow_parameters))
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(params_object)
    library_files = ming_fileio_library.list_files_in_dir(args.library_folder)
    spectra_files = ming_fileio_library.list_files_in_dir(args.spectra_folder)

    spectra_files.sort()

    print(spectra_files)
    spectra_files = spectra_files[parallel_json["node_partition"]::parallel_json["total_paritions"]]
    print(spectra_files)

    temp_folder = "temp"
    try:
        os.mkdir(temp_folder)
    except:
        print("folder error")

    tempresults_folder = "tempresults"
    try:
        os.mkdir(tempresults_folder)
    except:
        print("folder error")


    list_of_spectrumfiles = chunks(spectra_files, 5)
    parameter_list = []
    for spectrum_files_chunk in list_of_spectrumfiles:
        param_dict = {}
        param_dict["spectra_files"] = spectrum_files_chunk
        param_dict["temp_folder"] = temp_folder
        param_dict["tempresults_folder"] = tempresults_folder
        param_dict["args"] = args
        param_dict["params_object"] = params_object
        param_dict["library_files"] = library_files

        parameter_list.append(param_dict)

    #for param_dict in parameter_list:
    #    search_wrapper(param_dict)
    print("Parallel to execute", len(parameter_list))
    ming_parallel_library.run_parallel_job(search_wrapper, parameter_list, 5)


    """Merging Files and adding full path"""
    all_result_files = ming_fileio_library.list_files_in_dir(tempresults_folder)
    full_result_list = []
    for input_file in all_result_files:
        result_list = ming_fileio_library.parse_table_with_headers_object_list(input_file)
        full_result_list += result_list

    for result_object in full_result_list:
        mangled_name = os.path.basename(result_object["SpectrumFile"])
        full_path = mangled_mapping[mangled_name]
        result_object["full_CCMS_path"] = full_path

    ming_fileio_library.write_list_dict_table_data(full_result_list, os.path.join(args.result_folder, str(uuid.uuid4()) + ".tsv"))
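The chunks helper used above is not defined in this excerpt; a minimal sketch, assuming it yields fixed-size slices of a list:

def chunks(lst, n):
    #Yield successive n-sized slices of lst
    for i in range(0, len(lst), n):
        yield lst[i:i + n]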
Example 33
def main():
    parser = argparse.ArgumentParser(
        description='Group Mapping from input, defaults and metadata file')
    parser.add_argument('proteosafe_parameters', help='proteosafe_parameters')
    parser.add_argument('groupmapping_folder', help='groupmapping_folder')
    parser.add_argument('attributemapping_folder',
                        help='attributemapping_folder')
    parser.add_argument('metadata_folder', help='metadata_folder')
    parser.add_argument('output_groupmapping_file',
                        help='output_groupmapping_file')
    parser.add_argument('output_attributemapping_file',
                        help='output_attributemapping_file')
    parser.add_argument('inputspectrafolder', help='inputspectrafolder')
    args = parser.parse_args()

    param_obj = ming_proteosafe_library.parse_xml_file(
        open(args.proteosafe_parameters))
    mangled_file_mapping = ming_proteosafe_library.get_mangled_file_mapping(
        param_obj)
    reverse_file_mangling = ming_proteosafe_library.get_reverse_mangled_file_mapping(
        param_obj)
    print(reverse_file_mangling.keys())
    file_path_prefix = args.inputspectrafolder

    output_group_file = open(args.output_groupmapping_file, "w")
    output_attribute_file = open(args.output_attributemapping_file, "w")
    """
    Writing Default Grouping to output file
    """
    default_groupings = {
        'G1': [],
        'G2': [],
        'G3': [],
        'G4': [],
        'G5': [],
        'G6': []
    }
    for mangled_name in mangled_file_mapping.keys():
        if mangled_name.find("spec-") != -1:
            default_groupings['G1'].append(mangled_name.rstrip())
        if mangled_name.find("spectwo-") != -1:
            default_groupings['G2'].append(mangled_name.rstrip())
        if mangled_name.find("specthree-") != -1:
            default_groupings['G3'].append(mangled_name.rstrip())
        if mangled_name.find("specfour-") != -1:
            default_groupings['G4'].append(mangled_name.rstrip())
        if mangled_name.find("specfive-") != -1:
            default_groupings['G5'].append(mangled_name.rstrip())
        if mangled_name.find("specsix-") != -1:
            default_groupings['G6'].append(mangled_name.rstrip())

    for default_group_key in default_groupings.keys():
        default_group_string = ""
        default_group_string += "GROUP_" + default_group_key + "="
        for mangled_name in default_groupings[default_group_key]:
            default_group_string += os.path.join(file_path_prefix,
                                                 mangled_name) + ";"
        if len(default_groupings[default_group_key]) > 0:
            default_group_string = default_group_string[:-1]
        output_group_file.write(default_group_string + "\n")
    """Determining output whether to use group mapping file or metadata file"""
    metadata_files_in_folder = ming_fileio_library.list_files_in_dir(
        args.metadata_folder)
    groupmapping_files_in_folder = ming_fileio_library.list_files_in_dir(
        args.groupmapping_folder)
    attributemapping_files_in_folder = ming_fileio_library.list_files_in_dir(
        args.attributemapping_folder)

    if len(metadata_files_in_folder) > 1:
        print("Too many metafile inputted")
        exit(1)
    if len(metadata_files_in_folder) == 1:
        #Using metadata file
        row_count, table_data = ming_fileio_library.parse_table_with_headers(
            metadata_files_in_folder[0])
        if not "filename" in table_data:
            print(
                "Missing 'filename' header in metadata file. Please specify the file name that goes along with each piece of metadata with the header: filename"
            )
            exit(1)
        attributes_to_groups_mapping = defaultdict(set)
        group_to_files_mapping = defaultdict(list)
        for i in range(row_count):
            filename = table_data["filename"][i]
            basename_filename = os.path.basename(filename).rstrip()
            print(basename_filename, len(reverse_file_mangling.keys()))
            if basename_filename in reverse_file_mangling:
                mangled_name = reverse_file_mangling[basename_filename]
                for key in table_data:
                    if key.find("ATTRIBUTE_") != -1:
                        group_name = table_data[key][i]
                        if len(group_name) < 1:
                            continue
                        group_to_files_mapping[group_name].append(
                            os.path.join(file_path_prefix, mangled_name))
                        attributes_to_groups_mapping[key.replace(
                            "ATTRIBUTE_", "")].add(group_name)
            else:
                #Filename is not part of sample set
                print(basename_filename, "missing")
                continue

        for group_name in group_to_files_mapping:
            group_string = "GROUP_" + group_name + "=" + ";".join(
                group_to_files_mapping[group_name])
            output_group_file.write(group_string + "\n")

        for attribute_name in attributes_to_groups_mapping:
            attribute_string = attribute_name + "=" + ";".join(
                list(attributes_to_groups_mapping[attribute_name]))
            output_attribute_file.write(attribute_string + "\n")
        exit(0)
    """Falling back on old group mapping file"""
    if len(groupmapping_files_in_folder) > 1 or len(
            attributemapping_files_in_folder) > 1:
        print("Too many group/attribute mappings inputted")
        exit(1)

    if len(groupmapping_files_in_folder) == 1:
        for line in open(groupmapping_files_in_folder[0], errors='ignore'):
            splits = line.rstrip().split("=")
            if len(splits) < 2:
                continue

            group_name = splits[0]
            group_files = []
            for filename in splits[1].split(";"):
                if os.path.basename(filename) in reverse_file_mangling:
                    mangled_name = reverse_file_mangling[os.path.basename(
                        filename)]
                    group_files.append(
                        os.path.join(file_path_prefix, mangled_name))

            group_string = group_name + "=" + ";".join(group_files)
            output_group_file.write(group_string + "\n")

    if len(attributemapping_files_in_folder) == 1:
        for line in open(attributemapping_files_in_folder[0]):
            output_attribute_file.write(line)
Example 34
def main():
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('param_xml', help='metadata_folder')
    parser.add_argument('cluster_buckets', help='cluster_buckets')
    parser.add_argument('metadata_folder', help='metadata_folder')
    parser.add_argument('output_folder', help='output_folder')
    args = parser.parse_args()

    param_object = ming_proteosafe_library.parse_xml_file(open(args.param_xml, "r"))

    if param_object["CREATE_CLUSTER_BUCKETS"][0] == "0":
        print("Do not do things")
        exit(0)

    reverse_file_mangling = ming_proteosafe_library.get_reverse_mangled_file_mapping(param_object)

    """Reading Metadata File"""
    metadata_files_in_folder = ming_fileio_library.list_files_in_dir(args.metadata_folder)
    object_list = []

    if len(metadata_files_in_folder) != 1:
        for real_name in reverse_file_mangling:
            mangled_name = reverse_file_mangling[real_name]
            if mangled_name.find("spec") == -1:
                continue
            object_list.append({"filename" : real_name})
    else:
        object_list_temp = ming_fileio_library.parse_table_with_headers_object_list(metadata_files_in_folder[0])

        object_list = []
        for metadata_object in object_list_temp:
            if len(metadata_object["filename"]) > 1:
                object_list.append(metadata_object)
        
        #Add any analyzed files that are missing from the metadata list
        for real_name in reverse_file_mangling:
            mangled_name = reverse_file_mangling[real_name]
            if mangled_name.find("spec") == -1:
                continue

            found = False
            for metadata_object in object_list:
                if os.path.basename(real_name) == metadata_object["filename"]:
                    found = True
                    break

            if found is False:
                object_list.append({"filename" : real_name})

    #Writing headers
    header_list = ["#SampleID", "BarcodeSequence", "LinkerPrimerSequence"]
    for key in object_list[0]:
        if not key in header_list:
            header_list.append(key)

    header_list.append("ATTRIBUTE_GNPSDefaultGroup")

    for metadata_object in object_list:
        if not "#SampleID" in metadata_object:
            #Defaulting the sample ID to the filename
            metadata_object["#SampleID"] = metadata_object["filename"]
        if not "Description" in metadata_object:
            metadata_object["Description"] = "LoremIpsum"
        if not "BarcodeSequence" in metadata_object:
            metadata_object["BarcodeSequence"] = "GATACA"
        if not "LinkerPrimerSequence" in metadata_object:
            metadata_object["LinkerPrimerSequence"] = "GATACA"

        #Adding default grouping information
        try:
            mangled_name = reverse_file_mangling[metadata_object["filename"]]
            if mangled_name.find("spec-") != -1:
                metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G1"
            elif mangled_name.find("spectwo-") != -1:
                metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G2"
            elif mangled_name.find("specthree-") != -1:
                metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G3"
            elif mangled_name.find("specfour-") != -1:
                metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G4"
            elif mangled_name.find("specfive-") != -1:
                metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G5"
            elif mangled_name.find("specsix-") != -1:
                metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G6"
        except KeyError:
            print(metadata_object["filename"], "Not Mapped")
            metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "Not Mapped"

    output_metadata_filename = os.path.join(args.output_folder, "qiime2_metadata.tsv")
    output_manifest_filename = os.path.join(args.output_folder, "qiime2_manifest.tsv")

    for metadatum in object_list:
        if "sample_name" in metadatum:
            if len(metadatum["sample_name"]) > 1:
                metadatum["#SampleID"] = metadatum["sample_name"]


    #TODO: remove metadata rows whose filenames are not in the actual data

    metadata_df = pd.DataFrame(object_list)
    metadata_df.to_csv(output_metadata_filename, index=False, sep="\t", columns=header_list)

    """Outputting Manifest Filename"""
    manifest_df = pd.DataFrame()
    manifest_df["sample_name"] = metadata_df["#SampleID"]
    manifest_df["filepath"] = metadata_df["filename"]
    manifest_df.to_csv(output_manifest_filename, index=False, sep=",")

    """Calling remote server to do the calculation"""
    SERVER_BASE = "http://dorresteinappshub.ucsd.edu:5024"
    files = {'manifest': open(output_manifest_filename, 'r'),
             'metadata': open(output_metadata_filename, 'r'),
             'bucket': open(args.cluster_buckets, 'r')}

    r_post = requests.post(SERVER_BASE + "/processclassic", files=files)
    response_dict = r_post.json()

    with open(os.path.join(args.output_folder, "qiime2_table.qza"), 'wb') as f:
        r = requests.get(SERVER_BASE + response_dict["table_qza"], stream=True)
        r.raw.decode_content = True
        shutil.copyfileobj(r.raw, f)

    with open(os.path.join(args.output_folder, "qiime2_emperor.qzv"), 'wb') as f:
        r = requests.get(SERVER_BASE + response_dict["emperor_qzv"], stream=True)
        r.raw.decode_content = True
        shutil.copyfileobj(r.raw, f)
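The streamed download at the end of this example recurs in several examples below; a small helper, sketched here as a possible extraction rather than part of the original code, captures the pattern.

import shutil
import requests

def download_stream(url, target_path):
    """Stream an HTTP response straight to disk without buffering it in memory."""
    r = requests.get(url, stream=True)
    r.raise_for_status()
    r.raw.decode_content = True
    with open(target_path, 'wb') as f:
        shutil.copyfileobj(r.raw, f)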
Example #35
def main():
    parser = argparse.ArgumentParser(description='Running library search parallel')
    parser.add_argument('spectra_folder', help='spectra folder')
    parser.add_argument('workflow_parameters', help='proteosafe xml parameters')
    parser.add_argument('result_file', help='output result file')
    parser.add_argument('msaccess_binary', help='path to the msaccess binary')
    parser.add_argument('--parallelism', default=1, type=int, help='Parallelism')
    args = parser.parse_args()


    params_object = ming_proteosafe_library.parse_xml_file(open(args.workflow_parameters))
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(params_object)
    spectra_files = ming_fileio_library.list_files_in_dir(args.spectra_folder)

    spectra_files.sort()


    tempresults_folder = "tempresults"
    try:
        os.mkdir(tempresults_folder)
    except FileExistsError:
        print("Folder already exists:", tempresults_folder)

    parameter_list = []
    for spectrum_file in spectra_files:
        param_dict = {}
        param_dict["spectrum_file"] = spectrum_file
        param_dict["tempresults_folder"] = tempresults_folder
        param_dict["args"] = args

        parameter_list.append(param_dict)

    #for param_dict in parameter_list:
    #    summary_wrapper(param_dict)
    print("Parallel jobs to execute:", len(parameter_list))
    ming_parallel_library.run_parallel_job(summary_wrapper, parameter_list, 10)


    """Merging Files and adding full path"""
    all_result_files = ming_fileio_library.list_files_in_dir(tempresults_folder)
    full_result_list = []
    for input_file in all_result_files:
        try:
            result_list = ming_fileio_library.parse_table_with_headers_object_list(input_file)
            for result in result_list:
                output_dict = {}
                output_dict["Filename"] = result["Filename"]
                output_dict["Vendor"] = result["Vendor"]
                output_dict["Model"] = result["Model"]
                output_dict["MS1s"] = result["MS1s"]
                output_dict["MS2s"] = result["MS2s"]
                full_result_list.append(output_dict)
        except:
            print("Error parsing result file", input_file)

    for result_object in full_result_list:
        mangled_name = os.path.basename(result_object["Filename"])
        full_path = mangled_mapping[mangled_name]
        result_object["full_CCMS_path"] = full_path

    ming_fileio_library.write_list_dict_table_data(full_result_list, args.result_file)
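ming_parallel_library.run_parallel_job is used throughout these examples but not shown; assuming it simply maps a worker function over the parameter dicts with a capped worker count, a minimal standard-library equivalent would look like this.

from multiprocessing import Pool

def run_parallel_job(worker, parameter_list, num_workers):
    """Apply worker to each parameter dict using a pool of num_workers processes."""
    with Pool(processes=num_workers) as pool:
        return pool.map(worker, parameter_list)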
Example #36
def main():
    parser = argparse.ArgumentParser(description='Create parallel parameters')
    parser.add_argument('library_folder', help='input library folder')
    parser.add_argument('workflow_parameters',
                        help='proteosafe xml parameters')
    parser.add_argument('parameters_output_folder',
                        help='output folder for parameters')
    parser.add_argument('parameters_analog_output_folder',
                        help='output folder for analog parameters')
    parser.add_argument('--parallelism',
                        default=1,
                        type=int,
                        help='Parallelism')
    args = parser.parse_args()

    params_object = ming_proteosafe_library.parse_xml_file(
        open(args.workflow_parameters))

    library_files = ming_fileio_library.list_files_in_dir(args.library_folder)

    for i in range(args.parallelism):
        output_parameter_file = open(
            os.path.join(args.parameters_output_folder,
                         str(i) + ".params"), "w")
        output_analog_parameter_file = open(
            os.path.join(args.parameters_analog_output_folder,
                         str(i) + ".params"), "w")

        #Search Criteria
        output_parameter_file.write(
            "MIN_MATCHED_PEAKS_SEARCH=%s\n" %
            (params_object["MIN_MATCHED_PEAKS_SEARCH"][0]))
        output_parameter_file.write("TOP_K_RESULTS=%s\n" %
                                    (params_object["TOP_K_RESULTS"][0]))
        output_parameter_file.write(
            "search_peak_tolerance=%s\n" %
            (params_object["tolerance.Ion_tolerance"][0]))
        output_parameter_file.write(
            "search_parentmass_tolerance=%s\n" %
            (params_object["tolerance.PM_tolerance"][0]))
        output_parameter_file.write("ANALOG_SEARCH=%s\n" % ("0"))
        output_parameter_file.write("MAX_SHIFT_MASS=%s\n" %
                                    (params_object["MAX_SHIFT_MASS"][0]))

        output_analog_parameter_file.write(
            "MIN_MATCHED_PEAKS_SEARCH=%s\n" %
            (params_object["MIN_MATCHED_PEAKS_SEARCH"][0]))
        output_analog_parameter_file.write("TOP_K_RESULTS=%s\n" %
                                           (params_object["TOP_K_RESULTS"][0]))
        output_analog_parameter_file.write(
            "search_peak_tolerance=%s\n" %
            (params_object["tolerance.Ion_tolerance"][0]))
        output_analog_parameter_file.write(
            "search_parentmass_tolerance=%s\n" %
            (params_object["tolerance.PM_tolerance"][0]))
        output_analog_parameter_file.write("ANALOG_SEARCH=%s\n" %
                                           (params_object["ANALOG_SEARCH"][0]))
        output_analog_parameter_file.write(
            "MAX_SHIFT_MASS=%s\n" % (params_object["MAX_SHIFT_MASS"][0]))

        #Filtering Criteria
        output_parameter_file.write(
            "FILTER_PRECURSOR_WINDOW=%s\n" %
            (params_object["FILTER_PRECURSOR_WINDOW"][0]))
        output_parameter_file.write("MIN_PEAK_INT=%s\n" %
                                    (params_object["MIN_PEAK_INT"][0]))
        output_parameter_file.write("WINDOW_FILTER=%s\n" %
                                    (params_object["WINDOW_FILTER"][0]))
        output_parameter_file.write("FILTER_LIBRARY=%s\n" %
                                    (params_object["FILTER_LIBRARY"][0]))

        output_analog_parameter_file.write(
            "FILTER_PRECURSOR_WINDOW=%s\n" %
            (params_object["FILTER_PRECURSOR_WINDOW"][0]))
        output_analog_parameter_file.write("MIN_PEAK_INT=%s\n" %
                                           (params_object["MIN_PEAK_INT"][0]))
        output_analog_parameter_file.write("WINDOW_FILTER=%s\n" %
                                           (params_object["WINDOW_FILTER"][0]))
        output_analog_parameter_file.write(
            "FILTER_LIBRARY=%s\n" % (params_object["FILTER_LIBRARY"][0]))

        #Scoring Criteria
        output_parameter_file.write(
            "MIN_MATCHED_PEAKS_SEARCH=%s\n" %
            (params_object["MIN_MATCHED_PEAKS_SEARCH"][0]))
        output_parameter_file.write("SCORE_THRESHOLD=%s\n" %
                                    (params_object["SCORE_THRESHOLD"][0]))

        output_analog_parameter_file.write(
            "MIN_MATCHED_PEAKS_SEARCH=%s\n" %
            (params_object["MIN_MATCHED_PEAKS_SEARCH"][0]))
        output_analog_parameter_file.write(
            "SCORE_THRESHOLD=%s\n" % (params_object["SCORE_THRESHOLD"][0]))

        #Parallelism
        output_parameter_file.write("NODEIDX=%d\n" % (i))
        output_parameter_file.write("NODECOUNT=%d\n" % (args.parallelism))

        output_analog_parameter_file.write("NODEIDX=%d\n" % (i))
        output_analog_parameter_file.write("NODECOUNT=%d\n" %
                                           (args.parallelism))

        #Search Library
        output_parameter_file.write("EXISTING_LIBRARY_MGF=%s\n" %
                                    (" ".join(library_files)))

        output_analog_parameter_file.write("EXISTING_LIBRARY_MGF=%s\n" %
                                           (" ".join(library_files)))

        output_parameter_file.close()
        output_analog_parameter_file.close()
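Both parameter files above receive nearly identical KEY=value lines; ANALOG_SEARCH is the only key whose value differs between them. A small helper, sketched here as a possible refactor rather than part of the original code, would remove most of the duplication.

def write_param(files, key, value):
    """Write one KEY=value line to every open parameter file handle."""
    for output_file in files:
        output_file.write("%s=%s\n" % (key, value))

#e.g. write_param([output_parameter_file, output_analog_parameter_file],
#                 "TOP_K_RESULTS", params_object["TOP_K_RESULTS"][0])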
Example #37
def main():
    parser = argparse.ArgumentParser(description='Creating Clustering Info Summary')
    parser.add_argument('proteosafe_parameters', help='proteosafe_parameters')
    parser.add_argument('metadata_folder', help='metadata_folder')
    parser.add_argument('output_metadata_file', help='output_metadata_file')
    args = parser.parse_args()

    param_obj = ming_proteosafe_library.parse_xml_file(open(args.proteosafe_parameters))

    mangled_file_mapping = ming_proteosafe_library.get_mangled_file_mapping(param_obj)

    default_group_mapping = defaultdict(list)
    file_to_group_mapping = {}
    for mangled_name in mangled_file_mapping:
        if mangled_name.find("specone-") != -1:
            default_group_mapping["G1"].append(mangled_file_mapping[mangled_name])
            file_to_group_mapping[os.path.basename(mangled_file_mapping[mangled_name])] = "G1"
        if mangled_name.find("spectwo-") != -1:
            default_group_mapping["G2"].append(mangled_file_mapping[mangled_name])
            file_to_group_mapping[os.path.basename(mangled_file_mapping[mangled_name])] = "G2"
        if mangled_name.find("specthree-") != -1:
            default_group_mapping["G3"].append(mangled_file_mapping[mangled_name])
            file_to_group_mapping[os.path.basename(mangled_file_mapping[mangled_name])] = "G3"
        if mangled_name.find("specfour-") != -1:
            default_group_mapping["G4"].append(mangled_file_mapping[mangled_name])
            file_to_group_mapping[os.path.basename(mangled_file_mapping[mangled_name])] = "G4"
        if mangled_name.find("specfive-") != -1:
            default_group_mapping["G5"].append(mangled_file_mapping[mangled_name])
            file_to_group_mapping[os.path.basename(mangled_file_mapping[mangled_name])] = "G5"
        if mangled_name.find("specsix-") != -1:
            default_group_mapping["G6"].append(mangled_file_mapping[mangled_name])
            file_to_group_mapping[os.path.basename(mangled_file_mapping[mangled_name])] = "G6"

    metadata_files_in_folder = ming_fileio_library.list_files_in_dir(args.metadata_folder)

    row_count = 0
    table_data = defaultdict(list)
    if len(metadata_files_in_folder) == 1:
        row_count, table_data = ming_fileio_library.parse_table_with_headers(metadata_files_in_folder[0])

    for i in range(row_count):
        filename = table_data["filename"][i]
        if len(filename) < 2:
            #Keep the DefaultGroup column aligned even when the filename is blank
            table_data["ATTRIBUTE_DefaultGroup"].append("NoDefaultGroup")
            continue

        if filename[0] == "\"":
            filename = filename[1:]
        if filename[-1] == "\"":
            filename = filename[:-1]

        table_data["filename"][i] = filename

        basename_filename = os.path.basename(filename)
        group_name = "NoDefaultGroup"
        if basename_filename in file_to_group_mapping:
            group_name = file_to_group_mapping[basename_filename]
        table_data["ATTRIBUTE_DefaultGroup"].append(group_name)



    for input_filename in file_to_group_mapping:
        if input_filename in table_data["filename"]:
            continue
        else:
            for key in table_data:
                if key != "ATTRIBUTE_DefaultGroup" and key != "filename":
                    table_data[key].append("N/A")

            table_data["ATTRIBUTE_DefaultGroup"].append(file_to_group_mapping[input_filename])
            table_data["filename"].append(input_filename)

    ming_fileio_library.write_dictionary_table_data(table_data, args.output_metadata_file)
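ming_fileio_library.write_dictionary_table_data is not shown in this excerpt; assuming table_data is a dict mapping column headers to equal-length value lists, which is how the code above treats it, a minimal TSV writer would be:

import csv

def write_dictionary_table_data(table_data, output_filename):
    """Write a dict of column lists as a TSV file with a single header row."""
    headers = list(table_data.keys())
    with open(output_filename, 'w', newline='') as output_file:
        writer = csv.writer(output_file, delimiter='\t')
        writer.writerow(headers)
        for row in zip(*(table_data[header] for header in headers)):
            writer.writerow(row)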
Example #38
def main():
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('param_xml', help='proteosafe xml parameters')
    parser.add_argument('cluster_buckets', help='cluster_buckets')
    parser.add_argument('metadata_folder', help='metadata_folder')
    parser.add_argument('output_folder', help='output_folder')
    args = parser.parse_args()

    param_object = ming_proteosafe_library.parse_xml_file(open(args.param_xml, "r"))

    if param_object["CREATE_CLUSTER_BUCKETS"][0] == "0":
        print("Do not do things")
        exit(0)

    reverse_file_mangling = ming_proteosafe_library.get_reverse_mangled_file_mapping(param_object)

    """Reading Metadata File"""
    metadata_files_in_folder = ming_fileio_library.list_files_in_dir(args.metadata_folder)
    object_list = []

    if len(metadata_files_in_folder) != 1:
        for real_name in reverse_file_mangling:
            mangled_name = reverse_file_mangling[real_name]
            if mangled_name.find("spec") == -1:
                continue
            object_list.append({"filename" : real_name})
    else:
        print(metadata_files_in_folder[0])
        object_list = ming_fileio_library.parse_table_with_headers_object_list(metadata_files_in_folder[0])
        if len(object_list) == 0:
            for real_name in reverse_file_mangling:
                mangled_name = reverse_file_mangling[real_name]
                if mangled_name.find("spec") == -1:
                    continue
                object_list.append({"filename" : real_name})

    #Writing headers
    header_list = ["#SampleID", "BarcodeSequence", "LinkerPrimerSequence"]
    for key in object_list[0]:
        if not key in header_list:
            header_list.append(key)

    header_list.append("ATTRIBUTE_GNPSDefaultGroup")

    for metadata_object in object_list:
        if not "#SampleID" in metadata_object:
            if "#SampleID" in metadata_object:
                metadata_object["#SampleID"] = metadata_object["#SampleID"]
            else:
                #Stripping off all non-alphanumeric characters
                metadata_object["#SampleID"] = ''.join(ch for ch in metadata_object["filename"] if ch.isalnum())
        if not "Description" in metadata_object:
            metadata_object["Description"] = "LoremIpsum"
        if not "BarcodeSequence" in metadata_object:
            metadata_object["BarcodeSequence"] = "GATACA"
        if not "LinkerPrimerSequence" in metadata_object:
            metadata_object["LinkerPrimerSequence"] = "GATACA"

        try:
            mangled_name = reverse_file_mangling[metadata_object["filename"]]
            if mangled_name.find("spec-") != -1:
                metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G1"
            elif mangled_name.find("spectwo-") != -1:
                metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G2"
            elif mangled_name.find("specthree-") != -1:
                metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G3"
            elif mangled_name.find("specfour-") != -1:
                metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G4"
            elif mangled_name.find("specfive-") != -1:
                metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G5"
            elif mangled_name.find("specsix-") != -1:
                metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G6"
        except KeyError:
            print(metadata_object["filename"], "Not Mapped")
            metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "Not Mapped"

    output_metadata_filename = os.path.join(args.output_folder, "qiime2_metadata.tsv")
    output_manifest_filename = os.path.join(args.output_folder, "qiime2_manifest.tsv")

    for metadatum in object_list:
        if "sample_name" in metadatum:
            if len(metadatum["sample_name"]) > 1:
                metadatum["#SampleID"] = metadatum["sample_name"]

    metadata_df = pd.DataFrame(object_list)
    metadata_df.to_csv(output_metadata_filename, index=False, sep="\t", columns=header_list)

    """Outputting Manifest Filename"""
    manifest_df = pd.DataFrame()
    manifest_df["sample_name"] = metadata_df["#SampleID"]
    manifest_df["filepath"] = metadata_df["filename"]
    manifest_df.to_csv(output_manifest_filename, index=False, sep=",")

    """Calling remote server to do the calculation"""
    SERVER_BASE = "http://dorresteinappshub.ucsd.edu:5024"
    #SERVER_BASE = "http://mingwangbeta.ucsd.edu:5024"
    files = {'manifest': open(output_manifest_filename, 'r'),
             'metadata': open(output_metadata_filename, 'r'),
             'bucket': open(args.cluster_buckets, 'r')}

    r_post = requests.post(SERVER_BASE + "/processclassic", files=files)
    response_dict = r_post.json()

    with open(os.path.join(args.output_folder, "qiime2_table.qza"), 'wb') as f:
        r = requests.get(SERVER_BASE + response_dict["table_qza"], stream=True)
        r.raw.decode_content = True
        shutil.copyfileobj(r.raw, f)

    with open(os.path.join(args.output_folder, "qiime2_emperor.qzv"), 'wb') as f:
        r = requests.get(SERVER_BASE + response_dict["emperor_qzv"], stream=True)
        r.raw.decode_content = True
        shutil.copyfileobj(r.raw, f)
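The three file handles passed to requests.post above are never explicitly closed; a sketch of the same upload with context-managed handles, keeping the endpoint and field names from the example and assuming the JSON response shape used above:

import requests

def post_qiime_inputs(server_base, manifest_path, metadata_path, bucket_path):
    """POST the manifest, metadata, and bucket files, closing them afterwards."""
    with open(manifest_path) as manifest_f, \
         open(metadata_path) as metadata_f, \
         open(bucket_path) as bucket_f:
        files = {'manifest': manifest_f,
                 'metadata': metadata_f,
                 'bucket': bucket_f}
        return requests.post(server_base + "/processclassic", files=files).json()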
Example #39
def main():
    parser = argparse.ArgumentParser(description='Modifying script')
    parser.add_argument('param_xml', help='proteosafe xml parameters')
    parser.add_argument('metadata_folder', help='metadata_folder')
    parser.add_argument('output_metadata_table', help='output_metadata_table')
    parser.add_argument('output_view_emperor', help='output view emperor html')
    args = parser.parse_args()

    param_object = ming_proteosafe_library.parse_xml_file(
        open(args.param_xml, "r"))
    """Outputting html"""
    from urllib.parse import urlencode, quote_plus
    parameters_for_qiime = {
        'biom': 'http://gnps.ucsd.edu/ProteoSAFe/DownloadResultFile?task=%s&block=main&file=biom_output/networking_quant.biom' % (param_object["task"][0]),
        'metadata': 'http://gnps.ucsd.edu/ProteoSAFe/DownloadResultFile?task=%s&block=main&file=metadata_for_qiime/metadata_for_qiime.txt' % (param_object["task"][0])
    }

    output_html_file = open(args.output_view_emperor, "w")
    output_html_file.write("<script>\n")
    output_html_file.write(
        'window.location.replace("https://mingwangbeta.ucsd.edu/emperor?%s")\n'
        % urlencode(parameters_for_qiime))
    output_html_file.write("</script>\n")
    output_html_file.close()

    reverse_file_mangling = ming_proteosafe_library.get_reverse_mangled_file_mapping(
        param_object)

    metadata_files_in_folder = ming_fileio_library.list_files_in_dir(
        args.metadata_folder)

    object_list = []

    if len(metadata_files_in_folder) != 1:
        for real_name in reverse_file_mangling:
            mangled_name = reverse_file_mangling[real_name]
            if mangled_name.find("spec") == -1:
                continue
            object_list.append({"filename": real_name})
        #open(args.output_metadata_table, "w").write("NO OUTPUT")
        #open(args.output_view_emperor, "w").write("Please Include Metadata File")
        #exit(0)
    else:
        object_list = ming_fileio_library.parse_table_with_headers_object_list(
            metadata_files_in_folder[0])

        if len(object_list) == 0:
            for real_name in reverse_file_mangling:
                mangled_name = reverse_file_mangling[real_name]
                if mangled_name.find("spec") == -1:
                    continue
                object_list.append({"filename": real_name})
            #open(args.output_metadata_table, "w").write("NO OUTPUT")
            #open(args.output_view_emperor, "w").write("Please Include Non Empty Metadata File")
            #exit(0)

    #Writing headers
    header_list = ["#SampleID", "BarcodeSequence", "LinkerPrimerSequence"]
    for key in object_list[0]:
        if not key in header_list:
            header_list.append(key)

    header_list.append("ATTRIBUTE_GNPSDefaultGroup")

    for metadata_object in object_list:
        if not "#SampleID" in metadata_object:
            metadata_object[
                "#SampleID"] = ming_fileio_library.get_filename_without_extension(
                    metadata_object["filename"])
        if not "BarcodeSequence" in metadata_object:
            metadata_object["BarcodeSequence"] = "GATACA"
        if not "LinkerPrimerSequence" in metadata_object:
            metadata_object["LinkerPrimerSequence"] = "GATACA"

        try:
            mangled_name = reverse_file_mangling[metadata_object["filename"]]
        except KeyError:
            #File appears in the metadata but not in the uploaded sample set
            print(metadata_object["filename"], "Not Mapped")
            metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "Not Mapped"
            continue
        if mangled_name.find("spec-") != -1:
            metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G1"
        elif mangled_name.find("spectwo-") != -1:
            metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G2"
        elif mangled_name.find("specthree-") != -1:
            metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G3"
        elif mangled_name.find("specfour-") != -1:
            metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G4"
        elif mangled_name.find("specfive-") != -1:
            metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G5"
        elif mangled_name.find("specsix-") != -1:
            metadata_object["ATTRIBUTE_GNPSDefaultGroup"] = "G6"

    ming_fileio_library.write_list_dict_table_data(object_list,
                                                   args.output_metadata_table,
                                                   header_list)
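For clarity on the redirect built at the top of this example: urlencode percent-encodes each download URL so it survives as a single query-string value. A self-contained illustration with a hypothetical task ID:

from urllib.parse import urlencode

params = {'biom': 'http://gnps.ucsd.edu/ProteoSAFe/DownloadResultFile?task=TASKID&block=main&file=biom_output/networking_quant.biom'}
print('https://mingwangbeta.ucsd.edu/emperor?' + urlencode(params))
#The ':', '/', '?', and '&' characters inside the inner URL are percent-encoded,
#so the emperor page receives it as one opaque parameter value.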
Example #40
def main():
    parser = argparse.ArgumentParser(
        description='Running library search parallel')
    parser.add_argument('spectra_folder', help='spectra folder')
    parser.add_argument('json_parameters', help='json partition parameters')
    parser.add_argument('workflow_parameters',
                        help='proteosafe xml parameters')
    parser.add_argument('library_folder', help='library folder')
    parser.add_argument('result_folder', help='output folder for results')
    parser.add_argument('convert_binary', help='path to the conversion binary')
    parser.add_argument('librarysearch_binary',
                        help='path to the librarysearch binary')
    parser.add_argument('--parallelism',
                        default=1,
                        type=int,
                        help='Parallelism')
    args = parser.parse_args()

    parallel_json = json.loads(open(args.json_parameters).read())

    params_object = ming_proteosafe_library.parse_xml_file(
        open(args.workflow_parameters))
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(
        params_object)
    library_files = ming_fileio_library.list_files_in_dir(args.library_folder)
    spectra_files = ming_fileio_library.list_files_in_dir(args.spectra_folder)

    spectra_files.sort()

    #Selecting this node's share of the spectra; the "total_paritions" spelling
    #matches the key in the input JSON
    spectra_files = spectra_files[
        parallel_json["node_partition"]::parallel_json["total_paritions"]]

    temp_folder = "temp"
    try:
        os.mkdir(temp_folder)
    except FileExistsError:
        print("Folder already exists:", temp_folder)

    tempresults_folder = "tempresults"
    try:
        os.mkdir(tempresults_folder)
    except FileExistsError:
        print("Folder already exists:", tempresults_folder)

    list_of_spectrumfiles = chunks(spectra_files, 5)
    parameter_list = []
    for spectrum_files_chunk in list_of_spectrumfiles:
        param_dict = {}
        param_dict["spectra_files"] = spectrum_files_chunk
        param_dict["temp_folder"] = temp_folder
        param_dict["tempresults_folder"] = tempresults_folder
        param_dict["args"] = args
        param_dict["params_object"] = params_object
        param_dict["library_files"] = library_files

        parameter_list.append(param_dict)

    #for param_dict in parameter_list:
    #    search_wrapper(param_dict)
    print("Parallel to execute", len(parameter_list))
    ming_parallel_library.run_parallel_job(search_wrapper, parameter_list, 5)
    """Merging Files and adding full path"""
    all_result_files = ming_fileio_library.list_files_in_dir(
        tempresults_folder)
    full_result_list = []
    for input_file in all_result_files:
        result_list = ming_fileio_library.parse_table_with_headers_object_list(
            input_file)
        full_result_list += result_list

    for result_object in full_result_list:
        mangled_name = os.path.basename(result_object["SpectrumFile"])
        full_path = mangled_mapping[mangled_name]
        result_object["full_CCMS_path"] = full_path

    ming_fileio_library.write_list_dict_table_data(
        full_result_list,
        os.path.join(args.result_folder,
                     str(uuid.uuid4()) + ".tsv"))
Example #41
def main():
    parser = argparse.ArgumentParser(description='Group Mapping from input, defaults and metadata file')
    parser.add_argument('proteosafe_parameters', help='proteosafe_parameters')
    parser.add_argument('groupmapping_folder', help='groupmapping_folder')
    parser.add_argument('attributemapping_folder', help='attributemapping_folder')
    parser.add_argument('metadata_folder', help='metadata_folder')
    parser.add_argument('output_groupmapping_file', help='output_groupmapping_file')
    parser.add_argument('output_attributemapping_file', help='output_attributemapping_file')
    parser.add_argument('inputspectrafolder', help='inputspectrafolder')
    args = parser.parse_args()

    param_obj = ming_proteosafe_library.parse_xml_file(open(args.proteosafe_parameters))
    mangled_file_mapping = ming_proteosafe_library.get_mangled_file_mapping(param_obj)
    reverse_file_mangling = ming_proteosafe_library.get_reverse_mangled_file_mapping(param_obj)
    file_path_prefix = args.inputspectrafolder

    output_group_file = open(args.output_groupmapping_file, "w")
    output_attribute_file = open(args.output_attributemapping_file, "w")

    """
    Writing Default Grouping to output file
    """
    default_groupings = {'G1': [], 'G2': [], 'G3': [], 'G4': [], 'G5': [], 'G6': []}
    for mangled_name in mangled_file_mapping:
        if mangled_name.find("spec-") != -1:
            default_groupings['G1'].append(mangled_name.rstrip())
        if mangled_name.find("spectwo-") != -1:
            default_groupings['G2'].append(mangled_name.rstrip())
        if mangled_name.find("specthree-") != -1:
            default_groupings['G3'].append(mangled_name.rstrip())
        if mangled_name.find("specfour-") != -1:
            default_groupings['G4'].append(mangled_name.rstrip())
        if mangled_name.find("specfive-") != -1:
            default_groupings['G5'].append(mangled_name.rstrip())
        if mangled_name.find("specsix-") != -1:
            default_groupings['G6'].append(mangled_name.rstrip())

    for default_group_key in default_groupings:
        group_paths = [os.path.join(file_path_prefix, mangled_name)
                       for mangled_name in default_groupings[default_group_key]]
        default_group_string = "GROUP_" + default_group_key + "=" + ";".join(group_paths)
        output_group_file.write(default_group_string + "\n")


    """Determining output whether to use group mapping file or metadata file"""
    metadata_files_in_folder = ming_fileio_library.list_files_in_dir(args.metadata_folder)
    groupmapping_files_in_folder = ming_fileio_library.list_files_in_dir(args.groupmapping_folder)
    attributemapping_files_in_folder = ming_fileio_library.list_files_in_dir(args.attributemapping_folder)

    if len(metadata_files_in_folder) > 1:
        print("Too many metafile inputted")
        exit(1)
    if len(metadata_files_in_folder) == 1:
        #Using metadata file
        row_count, table_data = ming_fileio_library.parse_table_with_headers(metadata_files_in_folder[0])

        if not "filename" in table_data:
            print("Missing 'filename' header in metadata file. Please specify the file name that goes along with each piece of metadata with the header: filename")
            exit(1)

        attributes_to_groups_mapping = defaultdict(set)
        group_to_files_mapping = defaultdict(list)
        for i in range(row_count):
            filename = table_data["filename"][i]
            basename_filename = os.path.basename(filename).rstrip()
            if basename_filename in reverse_file_mangling:
                mangled_name = reverse_file_mangling[basename_filename]
                for key in table_data:
                    if key.find("ATTRIBUTE_") != -1:
                        group_name = table_data[key][i]
                        if len(group_name) < 1:
                            continue
                        group_to_files_mapping[group_name].append(os.path.join(file_path_prefix, mangled_name))
                        attributes_to_groups_mapping[key.replace("ATTRIBUTE_", "")].add(group_name)
            else:
                #Filename is not part of sample set
                continue

        for group_name in group_to_files_mapping:
            group_string = "GROUP_" + group_name + "="  + ";".join(group_to_files_mapping[group_name])
            output_group_file.write(group_string + "\n")

        for attribute_name in attributes_to_groups_mapping:
            attribute_string = attribute_name + "=" + ";".join(list(attributes_to_groups_mapping[attribute_name]))
            output_attribute_file.write(attribute_string + "\n")
        exit(0)

    """Falling back on old group mapping file"""
    if len(groupmapping_files_in_folder) > 1 or len(attributemapping_files_in_folder) > 1:
        print("Too many group/attribute mappings inputted")
        exit(1)

    if len(groupmapping_files_in_folder) == 1:
        for line in open(groupmapping_files_in_folder[0], errors='ignore'):
            splits = line.rstrip().split("=")
            if len(splits) < 2:
                continue

            group_name = splits[0]
            group_files = []
            for filename in splits[1].split(";"):
                if os.path.basename(filename) in reverse_file_mangling:
                    mangled_name = reverse_file_mangling[os.path.basename(filename)]
                    group_files.append(os.path.join(file_path_prefix, mangled_name))

            group_string = group_name + "=" + ";".join(group_files)
            output_group_file.write(group_string + "\n")

    if len(attributemapping_files_in_folder) == 1:
        for line in open(attributemapping_files_in_folder[0]):
            output_attribute_file.write(line)
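A closing note on the mangled-name mappings used throughout these examples: get_mangled_file_mapping and get_reverse_mangled_file_mapping are not shown here. Judging from their usage, the first maps ProteoSAFe's mangled upload names to the original file paths and the second maps original basenames back to mangled names; the shapes below are hypothetical, inferred only from the code above.

#Hypothetical shapes, inferred from usage in the examples above
mangled_file_mapping = {'spec-00000.mzXML': 'uploads/sampleA.mzXML'}
reverse_file_mangling = {'sampleA.mzXML': 'spec-00000.mzXML'}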