def create_bucket_from_clusterinfo(cluster_info_filename, param_filename, clusterinfosummary_filename, output_filename, metadata_mapping):
    """Write a bucket table (clusters x input files) of summed precursor intensity.

    Only clusters listed in the cluster info summary are included.  Column
    headers come from metadata_mapping when available, otherwise the original
    filename without extension.

    cluster_info_filename: per-spectrum cluster membership TSV.
    param_filename: proteosafe params XML, used to demangle filenames.
    clusterinfosummary_filename: summary TSV defining which clusters to keep.
    output_filename: destination TSV path.
    metadata_mapping: dict of original basename -> display name.

    Fix: file handles (params, summary, output) are now closed via context
    managers instead of being leaked.
    """
    line_counts, table_data = ming_fileio_library.parse_table_with_headers(cluster_info_filename)
    with open(param_filename, "r") as param_file:
        param_object = ming_proteosafe_library.parse_xml_file(param_file)
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(param_object)

    # Clusters that made it into the network; everything else is skipped.
    clusters_in_network = set()
    with open(clusterinfosummary_filename) as summary_file:
        for row in csv.DictReader(summary_file, delimiter='\t'):
            clusters_in_network.add(row["cluster index"])

    cluster_index_to_file_map = {}
    clusters_map = {}
    all_files = {}
    for i in range(line_counts):
        cluster_number = table_data["#ClusterIdx"][i]
        if not (cluster_number in clusters_in_network):
            continue
        if not (cluster_number in clusters_map):
            clusters_map[cluster_number] = []
            cluster_index_to_file_map[cluster_number] = {}
            # Initialize every known input file to zero intensity.
            for mangled_name in mangled_mapping.keys():
                cluster_index_to_file_map[cluster_number][mangled_name] = 0.0
        mangled_filename_only = os.path.basename(table_data["#Filename"][i])
        # Floor each spectrum's contribution at 1.0 so presence always counts.
        cluster_index_to_file_map[cluster_number][mangled_filename_only] += max(float(table_data["#PrecIntensity"][i]), 1.0)
        spectrum_info = {"filename": table_data["#Filename"][i], "intensity": table_data["#PrecIntensity"][i]}
        all_files[table_data["#Filename"][i]] = 1
        clusters_map[cluster_number].append(spectrum_info)

    with open(output_filename, "w") as output_file:
        # Header: "#OTU ID" followed by one column per "spec" input file.
        output_header_list = ["#OTU ID"]
        for header in mangled_mapping.keys():
            if header.find("spec") == -1:
                continue
            original_basename = os.path.basename(mangled_mapping[header])
            if original_basename in metadata_mapping:
                output_header_list.append(metadata_mapping[original_basename])
            else:
                output_header_list.append(ming_fileio_library.get_filename_without_extension(original_basename))
        output_file.write("\t".join(output_header_list) + "\n")

        # One row per cluster, columns in the same order as the header.
        for cluster_idx in cluster_index_to_file_map:
            line_output_list = [str(cluster_idx)]
            for header in mangled_mapping.keys():
                if header.find("spec") == -1:
                    continue
                line_output_list.append(str(cluster_index_to_file_map[cluster_idx][header]))
            output_file.write("\t".join(line_output_list) + "\n")
def create_bucket_from_clusterinfo(cluster_info_filename, param_filename, clusterinfosummary_filename, output_filename):
    """Write a bucket table (clusters x input files) of summed precursor intensity.

    Emits "No Output" and stops when CREATE_CLUSTER_BUCKETS is not "1" in the
    workflow parameters.  Only clusters present in the molecular network are
    included.

    Fix: the output file is now always closed (the original leaked the handle
    on both the early-return and the normal path, risking unflushed data).
    """
    with open(param_filename, "r") as param_file:
        param_object = ming_proteosafe_library.parse_xml_file(param_file)
    output_file = open(output_filename, "w")
    try:
        if param_object["CREATE_CLUSTER_BUCKETS"][0] != "1":
            output_file.write("No Output")
            return

        test_network = molecular_network_library.MolecularNetwork()
        test_network.load_clustersummary(clusterinfosummary_filename)
        line_counts, table_data = ming_fileio_library.parse_table_with_headers(cluster_info_filename)
        mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(param_object)

        cluster_index_to_file_map = {}
        clusters_map = {}
        all_files = {}
        for i in range(line_counts):
            cluster_number = table_data["#ClusterIdx"][i]
            # Skip clusters that did not make it into the network.
            if test_network.get_cluster_index(cluster_number) == None:
                continue
            if not (cluster_number in clusters_map):
                clusters_map[cluster_number] = []
                cluster_index_to_file_map[cluster_number] = {}
                # Initialize every known input file to zero intensity.
                for mangled_name in mangled_mapping.keys():
                    cluster_index_to_file_map[cluster_number][mangled_name] = 0.0
            mangled_filename_only = os.path.basename(table_data["#Filename"][i])
            cluster_index_to_file_map[cluster_number][mangled_filename_only] += float(table_data["#PrecIntensity"][i])
            spectrum_info = {"filename": table_data["#Filename"][i], "intensity": table_data["#PrecIntensity"][i]}
            all_files[table_data["#Filename"][i]] = 1
            clusters_map[cluster_number].append(spectrum_info)

        # Header row: "#OTU ID" plus one column per (mangled) input file.
        output_header = "#OTU ID\t"
        for header in mangled_mapping.keys():
            output_header += os.path.basename(mangled_mapping[header]) + "\t"
        output_file.write(output_header + "\n")

        for cluster_idx in cluster_index_to_file_map:
            line_string = str(cluster_idx) + "\t"
            for header in mangled_mapping.keys():
                line_string += str(cluster_index_to_file_map[cluster_idx][header]) + "\t"
            output_file.write(line_string + "\n")
    finally:
        output_file.close()
def determine_filenames_to_load(my_node_number, params_obj, path_to_existing_library, path_to_new_library_spectra):
    """Find the mangled file in each library folder whose original name is <node>.json.

    Returns a (existing_library_path, new_library_path) tuple; an entry is ""
    when no file in that folder demangles to the target name.
    """
    target_name = str(my_node_number) + ".json"
    mangled_file_mapping = ming_proteosafe_library.get_mangled_file_mapping(params_obj)

    def locate(folder):
        # Last matching file wins; "" when nothing matches (same as before).
        found = ""
        for candidate in ming_fileio_library.list_files_in_dir(folder):
            mangled_basename = os.path.basename(candidate)
            original_path = mangled_file_mapping[mangled_basename]
            if os.path.basename(original_path) == target_name:
                found = os.path.join(folder, mangled_basename)
        return found

    return locate(path_to_existing_library), locate(path_to_new_library_spectra)
def load_parameters_file(self, paramsfilename):
    """Parse the proteosafe params XML and cache its mangled-filename mapping on self."""
    parsed_params = ming_proteosafe_library.parse_xml_file(open(paramsfilename, "r"))
    self.mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(parsed_params)
def create_ili_output_from_clusterinfo(cluster_info_filename, param_filename, clusterinfosummary_filename, filename_coordinate_mapping, output_filename):
    """Write an `ili-style CSV: one row per "spec" input file with coordinates,
    followed by one intensity column per network cluster.

    cluster_info_filename: per-spectrum cluster membership TSV.
    param_filename: proteosafe params XML used to demangle filenames.
    clusterinfosummary_filename: cluster summary loaded into the network filter.
    filename_coordinate_mapping: basename -> object with "x","y","z","radius".
    output_filename: destination CSV path.

    Fix: coordinate values are wrapped in str() — the original appended them
    raw, so numeric coordinates made ",".join raise a TypeError (string-valued
    coordinates pass through unchanged).  File handles are also closed now.
    """
    test_network = molecular_network_library.MolecularNetwork()
    test_network.load_clustersummary(clusterinfosummary_filename)
    line_counts, table_data = ming_fileio_library.parse_table_with_headers(cluster_info_filename)
    with open(param_filename, "r") as param_file:
        param_object = ming_proteosafe_library.parse_xml_file(param_file)
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(param_object)

    cluster_index_to_file_map = {}
    clusters_map = {}
    all_files = {}
    for i in range(line_counts):
        cluster_number = table_data["#ClusterIdx"][i]
        # Skip clusters that did not make it into the network.
        if test_network.get_cluster_index(cluster_number) == None:
            continue
        if not (cluster_number in clusters_map):
            clusters_map[cluster_number] = []
            cluster_index_to_file_map[cluster_number] = {}
            # Initialize every known input file to zero intensity.
            for mangled_name in mangled_mapping.keys():
                cluster_index_to_file_map[cluster_number][mangled_name] = 0.0
        mangled_filename_only = os.path.basename(table_data["#Filename"][i])
        cluster_index_to_file_map[cluster_number][mangled_filename_only] += float(table_data["#PrecIntensity"][i])
        spectrum_info = {"filename": table_data["#Filename"][i], "intensity": table_data["#PrecIntensity"][i]}
        all_files[table_data["#Filename"][i]] = 1
        clusters_map[cluster_number].append(spectrum_info)

    with open(output_filename, "w") as output_file:
        all_headers = ["filename", "X", "Y", "Z", "radius"]
        for cluster_idx in cluster_index_to_file_map:
            all_headers.append(cluster_idx)
        #writing header
        output_file.write(",".join(all_headers) + "\n")

        for sample_name in mangled_mapping:
            if sample_name.find("spec") == -1:
                continue
            real_filename = mangled_mapping[sample_name]
            if not os.path.basename(real_filename) in filename_coordinate_mapping:
                continue
            line_output = [real_filename]
            coordinate_object = filename_coordinate_mapping[os.path.basename(real_filename)]
            # str() guards against numeric coordinate values breaking the join.
            line_output.append(str(coordinate_object["x"]))
            line_output.append(str(coordinate_object["y"]))
            line_output.append(str(coordinate_object["z"]))
            line_output.append(str(coordinate_object["radius"]))
            print(line_output, coordinate_object)
            for cluster_idx in cluster_index_to_file_map:
                line_output.append(str(cluster_index_to_file_map[cluster_idx][sample_name]))
            output_file.write(",".join(line_output) + "\n")
def create_ili_output_from_clusterinfo(cluster_info_filename, param_filename, clusterinfosummary_filename, filename_coordinate_mapping, output_filename):
    """Write an `ili-style CSV: one row per "spec" input file with coordinates,
    followed by one intensity column per cluster.

    Unlike the network-filtered variant, every cluster in the clusterinfo
    table is included.  clusterinfosummary_filename is accepted for interface
    compatibility but not used here.

    Fix: coordinate values are wrapped in str() — the original appended them
    raw, so numeric coordinates made ",".join raise a TypeError (string-valued
    coordinates pass through unchanged).  File handles are also closed now.
    """
    line_counts, table_data = ming_fileio_library.parse_table_with_headers(cluster_info_filename)
    with open(param_filename, "r") as param_file:
        param_object = ming_proteosafe_library.parse_xml_file(param_file)
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(param_object)

    cluster_index_to_file_map = {}
    clusters_map = {}
    all_files = {}
    for i in range(line_counts):
        cluster_number = table_data["#ClusterIdx"][i]
        if not (cluster_number in clusters_map):
            clusters_map[cluster_number] = []
            cluster_index_to_file_map[cluster_number] = {}
            # Initialize every known input file to zero intensity.
            for mangled_name in mangled_mapping.keys():
                cluster_index_to_file_map[cluster_number][mangled_name] = 0.0
        mangled_filename_only = os.path.basename(table_data["#Filename"][i])
        cluster_index_to_file_map[cluster_number][mangled_filename_only] += float(table_data["#PrecIntensity"][i])
        spectrum_info = {"filename": table_data["#Filename"][i], "intensity": table_data["#PrecIntensity"][i]}
        all_files[table_data["#Filename"][i]] = 1
        clusters_map[cluster_number].append(spectrum_info)

    with open(output_filename, "w") as output_file:
        all_headers = ["filename", "X", "Y", "Z", "radius"]
        for cluster_idx in cluster_index_to_file_map:
            all_headers.append(cluster_idx)
        #writing header
        output_file.write(",".join(all_headers) + "\n")

        for sample_name in mangled_mapping:
            if sample_name.find("spec") == -1:
                continue
            real_filename = mangled_mapping[sample_name]
            if not os.path.basename(real_filename) in filename_coordinate_mapping:
                continue
            line_output = [real_filename]
            coordinate_object = filename_coordinate_mapping[os.path.basename(real_filename)]
            # str() guards against numeric coordinate values breaking the join.
            line_output.append(str(coordinate_object["x"]))
            line_output.append(str(coordinate_object["y"]))
            line_output.append(str(coordinate_object["z"]))
            line_output.append(str(coordinate_object["radius"]))
            print(line_output, coordinate_object)
            for cluster_idx in cluster_index_to_file_map:
                line_output.append(str(cluster_index_to_file_map[cluster_idx][sample_name]))
            output_file.write(",".join(line_output) + "\n")
def main():
    """Filter raw clusterinfo rows to clusters in the summary and rewrite columns.

    Reads the cluster summary to learn which cluster indices to keep, then
    streams the clusterinfo table, remapping its columns into the enriched
    schema and demangling the per-spectrum file path.

    Fix: the output file is opened via a context manager so the DictWriter's
    underlying handle is flushed and closed deterministically (the original
    passed an anonymous open() handle that was never closed).
    """
    parser = argparse.ArgumentParser(
        description='Creates enriched cluster info summary')
    parser.add_argument('param_xml', help='param_xml')
    parser.add_argument('input_clustersummary', help='input_clustersummary')
    parser.add_argument('input_clusterinfo', help='input_clusterinfo')
    parser.add_argument('output_clusterinfo', help='output_clusterinfo')
    args = parser.parse_args()

    params_object = ming_proteosafe_library.parse_xml_file(open(args.param_xml))
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(params_object)

    #Creating acceptable clusters to include in cluster info
    included_clusters = set()
    for row in csv.DictReader(open(args.input_clustersummary), delimiter='\t'):
        included_clusters.add(row["cluster index"])

    field_names = [
        "cluster index", "AllFiles", "sum(precursor intensity)", "RTMean",
        "RTStdErr", "parent mass", "ScanNumber", "ProteosafeFilePath",
        "Original_Path"
    ]
    with open(args.input_clusterinfo) as input_clusterinfo, \
            open(args.output_clusterinfo, "w") as output_clusterinfo_file:
        output_clusterinfo_writer = csv.DictWriter(output_clusterinfo_file,
                                                   fieldnames=field_names,
                                                   delimiter='\t')
        output_clusterinfo_writer.writeheader()
        input_clusterinfo_reader = csv.DictReader(input_clusterinfo, delimiter='\t')
        for row in input_clusterinfo_reader:
            if not (row["#ClusterIdx"] in included_clusters):
                continue
            output_dict = {}
            output_dict["cluster index"] = row["#ClusterIdx"]
            output_dict["AllFiles"] = row["#Filename"]
            output_dict["sum(precursor intensity)"] = row["#PrecIntensity"]
            output_dict["RTMean"] = row["#RetTime"]
            output_dict["RTStdErr"] = "0"
            output_dict["parent mass"] = row["#ParentMass"]
            output_dict["ScanNumber"] = row["#Scan"]
            output_dict["ProteosafeFilePath"] = os.path.join(
                "spec", os.path.basename(row["#Filename"]))
            output_dict["Original_Path"] = "f." + mangled_mapping[
                os.path.basename(row["#Filename"])]
            output_clusterinfo_writer.writerow(output_dict)
    exit(0)
def main():
    """CLI entry: resolve ambiguity in MSGF+ search results and write the table.

    argv: tsv_results params_xml library_identifications cutoff_scores output_folder
    """
    tsv_results_path = sys.argv[1]
    params_xml_path = sys.argv[2]
    library_identifications_path = sys.argv[3]
    cutoff_scores_path = sys.argv[4]
    destination_folder = sys.argv[5]

    # Output keeps the input results filename, relocated to the output folder.
    destination_path = os.path.join(destination_folder, os.path.basename(tsv_results_path))

    parsed_params = ming_proteosafe_library.parse_xml_file(open(params_xml_path))
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(parsed_params)
    library_scans_to_identification = library_scans_to_identification_info(library_identifications_path)
    cutoff_dict = json.loads(open(cutoff_scores_path).read())

    psm_list = ming_psm_library.parse_MSGFPlus_tsvfile(tsv_results_path)
    output_results_dict = process_ambiguity(psm_list, mangled_mapping, library_scans_to_identification, cutoff_dict)
    ming_fileio_library.write_dictionary_table_data(output_results_dict, destination_path)
def name_demangle_filenames(input_file, output_file, path_to_param, old_filename_header, new_filename_header):
    """Replace mangled filenames in one table column with their original names.

    When old and new headers are the same the column is rewritten in place;
    otherwise a new demangled column is added alongside the original.
    """
    row_count, table_data = ming_fileio_library.parse_table_with_headers(input_file)
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(
        ming_proteosafe_library.parse_xml_file(open(path_to_param)))

    if old_filename_header == new_filename_header:
        filename_column = table_data[old_filename_header]
        for row_index in range(row_count):
            filename_column[row_index] = mangled_mapping[filename_column[row_index]]
    else:
        table_data[new_filename_header] = [
            mangled_mapping[table_data[old_filename_header][row_index]]
            for row_index in range(row_count)
        ]

    ming_fileio_library.write_dictionary_table_data(table_data, output_file)
def main():
    """Filter raw clusterinfo rows to clusters in the summary and rewrite columns."""
    arg_parser = argparse.ArgumentParser(description='Creates enriched cluster info summary')
    for arg_name in ['param_xml', 'input_clustersummary', 'input_clusterinfo', 'output_clusterinfo']:
        arg_parser.add_argument(arg_name, help=arg_name)
    args = arg_parser.parse_args()

    params_object = ming_proteosafe_library.parse_xml_file(open(args.param_xml))
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(params_object)

    # Cluster indices allowed into the output.
    included_clusters = {
        row["cluster index"]
        for row in csv.DictReader(open(args.input_clustersummary), delimiter='\t')
    }

    with open(args.input_clusterinfo) as input_clusterinfo:
        field_names = ["cluster index", "AllFiles", "sum(precursor intensity)",
                       "RTMean", "RTStdErr", "parent mass", "ScanNumber",
                       "ProteosafeFilePath", "Original_Path"]
        writer = csv.DictWriter(open(args.output_clusterinfo, "w"),
                                fieldnames=field_names, delimiter='\t')
        writer.writeheader()
        for row in csv.DictReader(input_clusterinfo, delimiter='\t'):
            if row["#ClusterIdx"] not in included_clusters:
                continue
            spectrum_basename = os.path.basename(row["#Filename"])
            writer.writerow({
                "cluster index": row["#ClusterIdx"],
                "AllFiles": row["#Filename"],
                "sum(precursor intensity)": row["#PrecIntensity"],
                "RTMean": row["#RetTime"],
                "RTStdErr": "0",
                "parent mass": row["#ParentMass"],
                "ScanNumber": row["#Scan"],
                "ProteosafeFilePath": os.path.join("spec", spectrum_basename),
                "Original_Path": "f." + mangled_mapping[spectrum_basename],
            })
    exit(0)
def name_demangle_filenames_and_instrument_collision(input_file, output_file, path_to_param, path_to_original_results, old_filename_header, new_filename_header):
    """Demangle a filename column and ensure a FragMethod column exists.

    When the input table lacks FragMethod, it is reconstructed by looking up
    each "<filename>_<scan>" key in the collision mapping derived from the
    original results, defaulting to "NO_COLLISION" for unknown keys.
    """
    row_count, table_data = ming_fileio_library.parse_table_with_headers(input_file, skip_incomplete_lines=True)
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(
        ming_proteosafe_library.parse_xml_file(open(path_to_param)))

    if not "FragMethod" in table_data:
        print("Demangling", path_to_original_results, input_file)
        collision_mapping = get_scan_mapping_for_collision_method(path_to_original_results)
        #Adding collision column
        fragmentation_column = []
        print(len(table_data["filename"]), len(table_data["scan"]))
        for row_index in range(row_count):
            lookup_key = table_data["filename"][row_index] + "_" + table_data["scan"][row_index]
            fragmentation_column.append(collision_mapping.get(lookup_key, "NO_COLLISION"))
        table_data["FragMethod"] = fragmentation_column

    if old_filename_header == new_filename_header:
        # Rewrite the column in place.
        filename_column = table_data[old_filename_header]
        for row_index in range(row_count):
            filename_column[row_index] = mangled_mapping[filename_column[row_index]]
    else:
        table_data[new_filename_header] = [
            mangled_mapping[table_data[old_filename_header][row_index]]
            for row_index in range(row_count)
        ]

    ming_fileio_library.write_dictionary_table_data(table_data, output_file)
def main():
    """Run library search over this node's partition of spectra and merge results."""
    parser = argparse.ArgumentParser(
        description='Running library search parallel')
    parser.add_argument('spectra_folder', help='spectrafolder')
    parser.add_argument('json_parameters', help='proteosafe xml parameters')
    parser.add_argument('workflow_parameters', help='output folder for parameters')
    parser.add_argument('library_folder', help='output folder for parameters')
    parser.add_argument('result_folder', help='output folder for parameters')
    parser.add_argument('convert_binary', help='output folder for parameters')
    parser.add_argument('librarysearch_binary', help='output folder for parameters')
    parser.add_argument('--parallelism', default=1, type=int, help='Parallelism')
    args = parser.parse_args()

    parallel_json = json.loads(open(args.json_parameters).read())
    params_object = ming_proteosafe_library.parse_xml_file(
        open(args.workflow_parameters))
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(
        params_object)

    library_files = ming_fileio_library.list_files_in_dir(args.library_folder)
    spectra_files = ming_fileio_library.list_files_in_dir(args.spectra_folder)
    spectra_files.sort()
    print(spectra_files)
    # Take this node's stride-slice of the sorted file list.
    # NOTE(review): "total_paritions" (sic) is the key actually present in the
    # parallel JSON — do not correct the spelling without changing the producer.
    spectra_files = spectra_files[
        parallel_json["node_partition"]::parallel_json["total_paritions"]]
    print(spectra_files)

    temp_folder = "temp"
    try:
        os.mkdir(temp_folder)
    except:
        print("folder error")

    tempresults_folder = "tempresults"
    try:
        os.mkdir(tempresults_folder)
    except:
        print("folder error")

    # Batch the spectra into chunks of 5 files per search job.
    list_of_spectrumfiles = chunks(spectra_files, 5)
    parameter_list = []
    for spectrum_files_chunk in list_of_spectrumfiles:
        param_dict = {}
        param_dict["spectra_files"] = spectrum_files_chunk
        param_dict["temp_folder"] = temp_folder
        param_dict["tempresults_folder"] = tempresults_folder
        param_dict["args"] = args
        param_dict["params_object"] = params_object
        param_dict["library_files"] = library_files
        parameter_list.append(param_dict)

    #for param_dict in parameter_list:
    #    search_wrapper(param_dict)
    print("Parallel to execute", len(parameter_list))
    ming_parallel_library.run_parallel_job(search_wrapper, parameter_list, 5)

    """Merging Files and adding full path"""
    all_result_files = ming_fileio_library.list_files_in_dir(
        tempresults_folder)
    full_result_list = []
    for input_file in all_result_files:
        result_list = ming_fileio_library.parse_table_with_headers_object_list(
            input_file)
        full_result_list += result_list

    # Attach the original (unmangled) user path for each result row.
    for result_object in full_result_list:
        mangled_name = os.path.basename(result_object["SpectrumFile"])
        full_path = mangled_mapping[mangled_name]
        result_object["full_CCMS_path"] = full_path

    # Random output name avoids collisions between parallel partitions.
    ming_fileio_library.write_list_dict_table_data(
        full_result_list,
        os.path.join(args.result_folder, str(uuid.uuid4()) + ".tsv"))
def main():
    """Assemble the metadata table and tag each file with its default group (G1..G6)."""
    parser = argparse.ArgumentParser(
        description='Creating Clustering Info Summary')
    parser.add_argument('proteosafe_parameters', help='proteosafe_parameters')
    parser.add_argument('metadata_folder', help='metadata_folder')
    parser.add_argument('output_metadata_file', help='output_metadata_file')
    args = parser.parse_args()

    param_obj = ming_proteosafe_library.parse_xml_file(
        open(args.proteosafe_parameters))
    mangled_file_mapping = ming_proteosafe_library.get_mangled_file_mapping(
        param_obj)

    # Each upload slot prefix (specone-..specsix-) maps its files to a
    # default group G1..G6, keyed by the original file's basename.
    default_group_mapping = defaultdict(list)
    file_to_group_mapping = {}
    for mangled_name in mangled_file_mapping:
        if mangled_name.find("specone-") != -1:
            default_group_mapping["G1"].append(
                mangled_file_mapping[mangled_name])
            file_to_group_mapping[os.path.basename(
                mangled_file_mapping[mangled_name])] = "G1"
        if mangled_name.find("spectwo-") != -1:
            default_group_mapping["G2"].append(
                mangled_file_mapping[mangled_name])
            file_to_group_mapping[os.path.basename(
                mangled_file_mapping[mangled_name])] = "G2"
        if mangled_name.find("specthree-") != -1:
            default_group_mapping["G3"].append(
                mangled_file_mapping[mangled_name])
            file_to_group_mapping[os.path.basename(
                mangled_file_mapping[mangled_name])] = "G3"
        if mangled_name.find("specfour-") != -1:
            default_group_mapping["G4"].append(
                mangled_file_mapping[mangled_name])
            file_to_group_mapping[os.path.basename(
                mangled_file_mapping[mangled_name])] = "G4"
        if mangled_name.find("specfive-") != -1:
            default_group_mapping["G5"].append(
                mangled_file_mapping[mangled_name])
            file_to_group_mapping[os.path.basename(
                mangled_file_mapping[mangled_name])] = "G5"
        if mangled_name.find("specsix-") != -1:
            default_group_mapping["G6"].append(
                mangled_file_mapping[mangled_name])
            file_to_group_mapping[os.path.basename(
                mangled_file_mapping[mangled_name])] = "G6"

    metadata_files_in_folder = ming_fileio_library.list_files_in_dir(
        args.metadata_folder)
    row_count = 0
    table_data = defaultdict(list)
    if len(metadata_files_in_folder) == 1:
        row_count, table_data = ming_fileio_library.parse_table_with_headers(
            metadata_files_in_folder[0])
    # Debug dump of the loaded metadata table.
    # NOTE(review): placement of these prints relative to the `if` above was
    # reconstructed from mangled formatting — confirm against history.
    print(table_data)
    for key in table_data:
        print(key, len(table_data[key]))

    for i in range(row_count):
        print(i)
        filename = table_data["filename"][i]
        # NOTE(review): rows skipped here get no ATTRIBUTE_DefaultGroup entry,
        # which can leave column lengths inconsistent — confirm intent.
        if len(filename) < 2:
            continue
        print(filename, filename[0], filename[-1])
        # Strip surrounding double quotes if present.
        if filename[0] == "\"":
            filename = filename[1:]
        if filename[-1] == "\"":
            filename = filename[:-1]
        table_data["filename"][i] = filename
        basename_filename = os.path.basename(filename)
        group_name = "NoDefaultGroup"
        if basename_filename in file_to_group_mapping:
            group_name = file_to_group_mapping[basename_filename]
        table_data["ATTRIBUTE_DefaultGroup"].append(group_name)

    # Add rows for input files missing from the metadata table, padding all
    # other columns with "N/A".
    for input_filename in file_to_group_mapping:
        if input_filename in table_data["filename"]:
            continue
        else:
            for key in table_data:
                if key != "ATTRIBUTE_DefaultGroup" and key != "filename":
                    table_data[key].append("N/A")
            table_data["ATTRIBUTE_DefaultGroup"].append(
                file_to_group_mapping[input_filename])
            table_data["filename"].append(input_filename)

    ming_fileio_library.write_dictionary_table_data(table_data,
                                                    args.output_metadata_file)
def process(param_xml, metadata_folder, output_metadata_folder):
    """Pick the user metadata source and write it out as gnps_metadata.tsv.

    Exactly one metadata file in the folder is used directly; more than one is
    an error; none triggers a best-effort fetch from a Google Sheets URL in
    the workflow parameters.  Nothing is written when no metadata is found.
    (A previous default-group merging step was removed/disabled here.)
    """
    params_object = ming_proteosafe_library.parse_xml_file(open(param_xml))
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(params_object)

    input_metadata_filenames = glob.glob(os.path.join(metadata_folder, "*"))
    user_metadata_df = None

    if len(input_metadata_filenames) == 1:
        user_metadata_df = pd.read_csv(input_metadata_filenames[0], sep="\t")
    if len(input_metadata_filenames) > 1:
        print("You have selected too many metadata files, please only select one")
        exit(1)
    if len(input_metadata_filenames) == 0:
        # No file supplied: best-effort pull from Google Sheets via the proxy.
        # Any failure (bad URL, network, parse) silently leaves metadata empty.
        try:
            from urllib.parse import urlparse
            sheets_url = params_object["googlesheetsmetadata"][0]
            if len(sheets_url) > 10:
                sheets_id = urlparse(sheets_url).path.split("/")[3]
                json_url = "https://gnps-sheets-proxy.herokuapp.com/sheets.json?sheets_id={}".format(sheets_id)
                proxy_response = requests.get(json_url)
                user_metadata_df = pd.DataFrame(proxy_response.json())
        except:
            pass

    merged_metadata_df = user_metadata_df
    if merged_metadata_df is not None:
        output_metadata_filename = os.path.join(output_metadata_folder, "gnps_metadata.tsv")
        merged_metadata_df.to_csv(output_metadata_filename, sep="\t", index=False)
def main():
    """Summarize every spectrum file with msaccess in parallel and merge the tables.

    Every input file appears in the output even when its summary failed, so
    downstream consumers see a complete file inventory.
    """
    parser = argparse.ArgumentParser(description='Running library search parallel')
    parser.add_argument('spectra_folder', help='spectrafolder')
    parser.add_argument('workflow_parameters', help='output folder for parameters')
    parser.add_argument('result_file', help='output folder for parameters')
    parser.add_argument('msaccess_binary', help='output folder for parameters')
    parser.add_argument('--parallelism', default=1, type=int, help='Parallelism')
    args = parser.parse_args()

    params_object = ming_proteosafe_library.parse_xml_file(open(args.workflow_parameters))
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(params_object)

    spectra_files = sorted(ming_fileio_library.list_files_in_dir(args.spectra_folder))

    tempresults_folder = "tempresults"
    try:
        os.mkdir(tempresults_folder)
    except:
        print("folder error")

    parameter_list = [
        {"spectrum_file": spectrum_file,
         "tempresults_folder": tempresults_folder,
         "args": args}
        for spectrum_file in spectra_files
    ]
    print("Parallel to execute", len(parameter_list))
    ming_parallel_library.run_parallel_job(summary_wrapper, parameter_list, 10)

    # Merge per-file summary tables, keeping only the columns of interest.
    full_result_list = []
    for result_filename in ming_fileio_library.list_files_in_dir(tempresults_folder):
        try:
            for result in ming_fileio_library.parse_table_with_headers_object_list(result_filename):
                full_result_list.append({
                    "Filename": result["Filename"],
                    "Vendor": result["Vendor"],
                    "Model": result["Model"],
                    "MS1s": result["MS1s"],
                    "MS2s": result["MS2s"],
                })
        except:
            print("Error", result_filename)

    # Attach the original (unmangled) path to each result row.
    used_files = set()
    for result_object in full_result_list:
        full_path = mangled_mapping[os.path.basename(result_object["Filename"])]
        result_object["full_CCMS_path"] = full_path
        result_object["CCMS_filename"] = os.path.basename(full_path)
        used_files.add(full_path)

    # Ensure every input file shows up, even when msaccess produced nothing.
    for spectrum_path in spectra_files:
        full_path = mangled_mapping[os.path.basename(spectrum_path)]
        if full_path in used_files:
            continue
        full_result_list.append({
            "full_CCMS_path": full_path,
            "CCMS_filename": os.path.basename(full_path),
        })

    pd.DataFrame(full_result_list).to_csv(args.result_file, sep="\t", index=False)
def main():
    """Enrich the cluster info summary with group counts, attributes, and stats."""
    parser = argparse.ArgumentParser(description='Creates enriched cluster info summary')
    parser.add_argument('param_xml', help='param_xml')
    parser.add_argument('input_clusterinfo_file', help='input_clusterinfo_file')
    parser.add_argument('input_clusterinfosummary_file', help='input_clusterinfosummary_file')
    parser.add_argument('input_group_mapping_filename', help='input_group_mapping_filename')
    parser.add_argument('input_attribute_mapping_filename', help='input_attribute_mapping_filename')
    parser.add_argument('input_networking_pairs', help='input_networking_pairs')
    parser.add_argument('input_library_search', help='input_library_search')
    parser.add_argument('output_clusterinfosummary_filename', help='output_clusterinfosummary_filename')
    args = parser.parse_args()

    """Loading group filenames"""
    group_to_files, files_to_groups = load_group_mapping(args.input_group_mapping_filename)
    print("Loaded Group Mapping")

    cluster_summary_list = ming_fileio_library.parse_table_with_headers_object_list(args.input_clusterinfosummary_file)
    print("Loaded Cluster Summary")

    attribute_to_groups = load_attribute_mapping(args.input_attribute_mapping_filename)

    params_object = ming_proteosafe_library.parse_xml_file(open(args.param_xml))
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(params_object)
    CLUSTER_MIN_SIZE = int(params_object["CLUSTER_MIN_SIZE"][0])
    RUN_MSCLUSTER = params_object["RUN_MSCLUSTER"][0]

    #Calculating the spectrum counts per group
    cluster_to_group_counts = defaultdict(lambda: defaultdict(lambda: 0))
    cluster_to_files = defaultdict(set)
    cluster_to_RT = defaultdict(list)
    line_count = 0
    # Stream the (potentially large) clusterinfo file line by line; the first
    # line is the header.  Columns used: 0=cluster index, 1=filename, 6=RT.
    for line in open(args.input_clusterinfo_file):
        line_count += 1
        if line_count == 1:
            continue
        if line_count % 10000 == 0:
            print(line_count)
        splits = line.rstrip().split("\t")
        cluster_index = splits[0]
        filename = os.path.basename(splits[1])
        rt = float(splits[6])
        group_membership = files_to_groups[filename]
        cluster_to_files[cluster_index].add(filename)
        cluster_to_RT[cluster_index].append(rt)
        for group in group_membership:
            cluster_to_group_counts[cluster_index][group] += 1

    # Size filtering only applies when MSCluster was actually run.
    if RUN_MSCLUSTER == "on":
        cluster_summary_list = filter_clusters_based_on_cluster_size(cluster_summary_list, CLUSTER_MIN_SIZE)
    print(len(cluster_summary_list))

    print("Setting up grouping", len(group_to_files.keys()))
    for cluster_summary_object in cluster_summary_list:
        cluster_index = cluster_summary_object["cluster index"]
        # One spectrum-count column per group.
        for group in group_to_files:
            group_count = 0
            if group in cluster_to_group_counts[cluster_index]:
                group_count = cluster_to_group_counts[cluster_index][group]
            cluster_summary_object[group] = group_count
        # Attribute columns list the groups with non-zero counts.
        for attribute in attribute_to_groups:
            groups_to_include = []
            for group in attribute_to_groups[attribute]:
                if group in cluster_summary_object:
                    if cluster_summary_object[group] > 0:
                        groups_to_include.append(group)
            cluster_summary_object[attribute] = ",".join(groups_to_include).replace("GNPSGROUP:", "")

    # Remaining enrichment steps are sibling helpers that mutate the list.
    print("Default Attributes")
    calculate_default_attributes(cluster_summary_list, group_to_files.keys())
    print("calculate_cluster_file_stats")
    calculate_cluster_file_stats(cluster_summary_list, cluster_to_files, mangled_mapping)
    print("rt stats")
    calculate_rt_stats(cluster_summary_list, cluster_to_RT)
    print("calculate_ancillary_information")
    calculate_ancillary_information(cluster_summary_list, params_object["task"][0])
    print("populate_network_component")
    populate_network_component(cluster_summary_list, args.input_networking_pairs)
    print("populate_network_identifications")
    populate_network_identifications(cluster_summary_list, args.input_library_search)

    ming_fileio_library.write_list_dict_table_data(cluster_summary_list, args.output_clusterinfosummary_filename)
def main():
    """Run msaccess summary extraction over a folder of spectrum files in
    parallel, then merge the per-file result tables and annotate each row
    with the original (unmangled) path of its source file."""
    parser = argparse.ArgumentParser(description='Running library search parallel')
    parser.add_argument('spectra_folder', help='spectrafolder')
    parser.add_argument('workflow_parameters', help='output folder for parameters')
    parser.add_argument('result_file', help='output folder for parameters')
    parser.add_argument('msaccess_binary', help='output folder for parameters')
    parser.add_argument('--parallelism', default=1, type=int, help='Parallelism')
    args = parser.parse_args()

    params_object = ming_proteosafe_library.parse_xml_file(open(args.workflow_parameters))
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(params_object)

    spectra_files = ming_fileio_library.list_files_in_dir(args.spectra_folder)
    spectra_files.sort()

    tempresults_folder = "tempresults"
    try:
        os.mkdir(tempresults_folder)
    except OSError:
        # Narrowed from a bare except: folder likely already exists; any other
        # filesystem problem resurfaces when results are written into it.
        print("folder error")

    parameter_list = []
    for spectrum_file in spectra_files:
        parameter_list.append({
            "spectrum_file": spectrum_file,
            "tempresults_folder": tempresults_folder,
            "args": args,
        })

    print("Parallel to execute", len(parameter_list))
    # NOTE(review): args.parallelism is parsed but the worker count here is
    # hard-coded to 10 — confirm intent before wiring the flag through.
    ming_parallel_library.run_parallel_job(summary_wrapper, parameter_list, 10)

    """Merging Files and adding full path"""
    wanted_fields = ["Filename", "Vendor", "Model", "MS1s", "MS2s"]
    full_result_list = []
    for input_file in ming_fileio_library.list_files_in_dir(tempresults_folder):
        try:
            result_list = ming_fileio_library.parse_table_with_headers_object_list(input_file)
            for result in result_list:
                # Keep only the summary columns we care about.
                full_result_list.append({field: result[field] for field in wanted_fields})
        except Exception:
            # Best-effort merge: skip malformed/partial tables, but no longer
            # with a bare except that would also swallow KeyboardInterrupt.
            print("Error", input_file)

    for result_object in full_result_list:
        mangled_name = os.path.basename(result_object["Filename"])
        result_object["full_CCMS_path"] = mangled_mapping[mangled_name]

    ming_fileio_library.write_list_dict_table_data(full_result_list, args.result_file)
def main():
    """Merge workflow default groups (G1..G6, derived from mangled upload
    names) into the user metadata table, adding an ATTRIBUTE_DefaultGroup
    column and appending rows for input files missing from the metadata."""
    parser = argparse.ArgumentParser(description='Creating Clustering Info Summary')
    parser.add_argument('proteosafe_parameters', help='proteosafe_parameters')
    parser.add_argument('metadata_folder', help='metadata_folder')
    parser.add_argument('output_metadata_file', help='output_metadata_file')
    args = parser.parse_args()

    param_obj = ming_proteosafe_library.parse_xml_file(open(args.proteosafe_parameters))
    mangled_file_mapping = ming_proteosafe_library.get_mangled_file_mapping(param_obj)

    # The mangled-name prefix encodes the upload slot, which determines the
    # default group. Later prefixes win on multiple matches, matching the
    # original chain of independent if-statements.
    prefix_to_group = [
        ("specone-", "G1"), ("spectwo-", "G2"), ("specthree-", "G3"),
        ("specfour-", "G4"), ("specfive-", "G5"), ("specsix-", "G6"),
    ]
    file_to_group_mapping = {}
    for mangled_name in mangled_file_mapping:
        for prefix, group in prefix_to_group:
            if prefix in mangled_name:
                file_to_group_mapping[os.path.basename(mangled_file_mapping[mangled_name])] = group

    metadata_files_in_folder = ming_fileio_library.list_files_in_dir(args.metadata_folder)
    row_count = 0
    table_data = defaultdict(list)
    if len(metadata_files_in_folder) == 1:
        row_count, table_data = ming_fileio_library.parse_table_with_headers(metadata_files_in_folder[0])

    for i in range(row_count):
        filename = table_data["filename"][i]
        if len(filename) < 2:
            # FIX: previously this row was skipped entirely, so the
            # ATTRIBUTE_DefaultGroup column ended up shorter than every other
            # column and the output table was misaligned. Emit a placeholder
            # group to keep all columns at row_count entries.
            table_data["ATTRIBUTE_DefaultGroup"].append("NoDefaultGroup")
            continue
        # Strip at most one pair of surrounding double quotes.
        if filename[0] == "\"":
            filename = filename[1:]
        if filename[-1] == "\"":
            filename = filename[:-1]
        table_data["filename"][i] = filename
        basename_filename = os.path.basename(filename)
        table_data["ATTRIBUTE_DefaultGroup"].append(
            file_to_group_mapping.get(basename_filename, "NoDefaultGroup"))

    # Append rows for workflow input files absent from the metadata table.
    # NOTE(review): file_to_group_mapping keys are basenames while
    # table_data["filename"] may hold full paths — confirm this membership
    # test de-duplicates as intended.
    for input_filename in file_to_group_mapping:
        if input_filename in table_data["filename"]:
            continue
        for key in table_data:
            if key != "ATTRIBUTE_DefaultGroup" and key != "filename":
                table_data[key].append("N/A")
        table_data["ATTRIBUTE_DefaultGroup"].append(file_to_group_mapping[input_filename])
        table_data["filename"].append(input_filename)

    ming_fileio_library.write_dictionary_table_data(table_data, args.output_metadata_file)
def main():
    """Take this node's strided partition of the spectrum files, run the
    library search over them in parallel chunks, then merge the results and
    annotate each hit with its original CCMS path."""
    parser = argparse.ArgumentParser(description='Running library search parallel')
    parser.add_argument('spectra_folder', help='spectrafolder')
    parser.add_argument('json_parameters', help='proteosafe xml parameters')
    parser.add_argument('workflow_parameters', help='output folder for parameters')
    parser.add_argument('library_folder', help='output folder for parameters')
    parser.add_argument('result_folder', help='output folder for parameters')
    parser.add_argument('convert_binary', help='output folder for parameters')
    parser.add_argument('librarysearch_binary', help='output folder for parameters')
    parser.add_argument('--parallelism', default=1, type=int, help='Parallelism')
    args = parser.parse_args()

    parallel_json = json.loads(open(args.json_parameters).read())

    params_object = ming_proteosafe_library.parse_xml_file(open(args.workflow_parameters))
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(params_object)

    library_files = ming_fileio_library.list_files_in_dir(args.library_folder)
    spectra_files = ming_fileio_library.list_files_in_dir(args.spectra_folder)
    spectra_files.sort()

    print(spectra_files)
    # Each node takes a strided slice of the sorted file list.
    # NOTE(review): "total_paritions" (sic) must match the key written by the
    # producer of this JSON — do not correct the spelling on this side alone.
    spectra_files = spectra_files[parallel_json["node_partition"]::parallel_json["total_paritions"]]
    print(spectra_files)

    temp_folder = "temp"
    try:
        os.mkdir(temp_folder)
    except OSError:
        # Narrowed from a bare except: the folder usually already exists.
        print("folder error")

    tempresults_folder = "tempresults"
    try:
        os.mkdir(tempresults_folder)
    except OSError:
        print("folder error")

    parameter_list = []
    for spectrum_files_chunk in chunks(spectra_files, 5):
        parameter_list.append({
            "spectra_files": spectrum_files_chunk,
            "temp_folder": temp_folder,
            "tempresults_folder": tempresults_folder,
            "args": args,
            "params_object": params_object,
            "library_files": library_files,
        })

    print("Parallel to execute", len(parameter_list))
    # NOTE(review): worker count is hard-coded to 5; args.parallelism is
    # unused here — confirm before wiring the flag through.
    ming_parallel_library.run_parallel_job(search_wrapper, parameter_list, 5)

    """Merging Files and adding full path"""
    full_result_list = []
    for input_file in ming_fileio_library.list_files_in_dir(tempresults_folder):
        full_result_list += ming_fileio_library.parse_table_with_headers_object_list(input_file)

    for result_object in full_result_list:
        mangled_name = os.path.basename(result_object["SpectrumFile"])
        result_object["full_CCMS_path"] = mangled_mapping[mangled_name]

    ming_fileio_library.write_list_dict_table_data(
        full_result_list,
        os.path.join(args.result_folder, str(uuid.uuid4()) + ".tsv"))
def main():
    """Write the group and attribute mapping files: always the default G1..G6
    groups, then either mappings derived from a metadata table or, failing
    that, from legacy group/attribute mapping files."""
    parser = argparse.ArgumentParser(
        description='Group Mapping from input, defaults and metadata file')
    parser.add_argument('proteosafe_parameters', help='proteosafe_parameters')
    parser.add_argument('groupmapping_folder', help='groupmapping_folder')
    parser.add_argument('attributemapping_folder', help='attributemapping_folder')
    parser.add_argument('metadata_folder', help='metadata_folder')
    parser.add_argument('output_groupmapping_file', help='output_groupmapping_file')
    parser.add_argument('output_attributemapping_file', help='output_attributemapping_file')
    parser.add_argument('inputspectrafolder', help='inputspectrafolder')
    args = parser.parse_args()

    param_obj = ming_proteosafe_library.parse_xml_file(open(args.proteosafe_parameters))
    mangled_file_mapping = ming_proteosafe_library.get_mangled_file_mapping(param_obj)
    reverse_file_mangling = ming_proteosafe_library.get_reverse_mangled_file_mapping(param_obj)
    print(reverse_file_mangling.keys())

    file_path_prefix = args.inputspectrafolder

    output_group_file = open(args.output_groupmapping_file, "w")
    output_attribute_file = open(args.output_attributemapping_file, "w")

    """ Writing Default Grouping to output file """
    # Mangled-name prefix determines the default group slot.
    prefix_to_group = [
        ("spec-", "G1"), ("spectwo-", "G2"), ("specthree-", "G3"),
        ("specfour-", "G4"), ("specfive-", "G5"), ("specsix-", "G6"),
    ]
    default_groupings = {group: [] for _, group in prefix_to_group}
    for mangled_name in mangled_file_mapping.keys():
        for prefix, group in prefix_to_group:
            if prefix in mangled_name:
                default_groupings[group].append(mangled_name.rstrip())

    for default_group_key in default_groupings:
        # ";".join replaces the append-then-trim-trailing-";" construction;
        # the emitted string is identical.
        members = ";".join(os.path.join(file_path_prefix, name)
                           for name in default_groupings[default_group_key])
        output_group_file.write("GROUP_" + default_group_key + "=" + members + "\n")

    """Determining output whether to use group mapping file or metadata file"""
    metadata_files_in_folder = ming_fileio_library.list_files_in_dir(args.metadata_folder)
    groupmapping_files_in_folder = ming_fileio_library.list_files_in_dir(args.groupmapping_folder)
    attributemapping_files_in_folder = ming_fileio_library.list_files_in_dir(args.attributemapping_folder)

    if len(metadata_files_in_folder) > 1:
        print("Too many metafile inputted")
        exit(1)

    if len(metadata_files_in_folder) == 1:
        # Using metadata file
        row_count, table_data = ming_fileio_library.parse_table_with_headers(metadata_files_in_folder[0])
        if not "filename" in table_data:
            print("Missing 'filename' header in metadata file. Please specify the file name that goes along with each piece of metadata with the header: filename")
            exit(1)

        attributes_to_groups_mapping = defaultdict(set)
        group_to_files_mapping = defaultdict(list)
        for i in range(row_count):
            filename = table_data["filename"][i]
            basename_filename = os.path.basename(filename).rstrip()
            print(basename_filename, len(reverse_file_mangling.keys()))
            if basename_filename in reverse_file_mangling:
                mangled_name = reverse_file_mangling[basename_filename]
                # Every ATTRIBUTE_* column contributes a (group -> file) and
                # an (attribute -> group) association.
                for key in table_data:
                    if key.find("ATTRIBUTE_") != -1:
                        group_name = table_data[key][i]
                        if len(group_name) < 1:
                            continue
                        group_to_files_mapping[group_name].append(os.path.join(file_path_prefix, mangled_name))
                        attributes_to_groups_mapping[key.replace("ATTRIBUTE_", "")].add(group_name)
            else:
                # Filename is not part of sample set
                print(basename_filename, "missing")
                continue

        for group_name in group_to_files_mapping:
            output_group_file.write(
                "GROUP_" + group_name + "=" + ";".join(group_to_files_mapping[group_name]) + "\n")

        for attribute_name in attributes_to_groups_mapping:
            output_attribute_file.write(
                attribute_name + "=" + ";".join(list(attributes_to_groups_mapping[attribute_name])) + "\n")

        # FIX: close (and therefore flush) the outputs before exiting this
        # branch instead of relying on interpreter shutdown.
        output_group_file.close()
        output_attribute_file.close()
        exit(0)

    """Falling back on old group mapping file"""
    if len(groupmapping_files_in_folder) > 1 or len(attributemapping_files_in_folder) > 1:
        print("Too many group/attribute mappings inputted")
        exit(1)

    if len(groupmapping_files_in_folder) == 1:
        for line in open(groupmapping_files_in_folder[0], errors='ignore'):
            splits = line.rstrip().split("=")
            if len(splits) < 2:
                continue
            group_name = splits[0]
            group_files = []
            for filename in splits[1].split(";"):
                if os.path.basename(filename) in reverse_file_mangling:
                    mangled_name = reverse_file_mangling[os.path.basename(filename)]
                    group_files.append(os.path.join(file_path_prefix, mangled_name))
            output_group_file.write(group_name + "=" + ";".join(group_files) + "\n")

    if len(attributemapping_files_in_folder) == 1:
        for line in open(attributemapping_files_in_folder[0]):
            output_attribute_file.write(line)

    # FIX: explicit close of both output files (previously leaked).
    output_group_file.close()
    output_attribute_file.close()
def load_parameters_file(self, paramsfilename):
    """Parse a ProteoSAFe params XML file and cache its mangled-file mapping
    on this instance as ``self.mangled_mapping``.

    Args:
        paramsfilename: path to the workflow parameters XML file.
    """
    # FIX: close the params file instead of leaking the handle.
    with open(paramsfilename, "r") as params_file:
        parameters = ming_proteosafe_library.parse_xml_file(params_file)
    self.mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(parameters)
def main():
    """Build the enriched cluster info summary: per-group counts, attribute
    values, file and RT statistics, network components, identifications."""
    parser = argparse.ArgumentParser(description='Creates enriched cluster info summary')
    parser.add_argument('param_xml', help='param_xml')
    parser.add_argument('input_clusterinfo_file', help='input_clusterinfo_file')
    parser.add_argument('input_clusterinfosummary_file', help='input_clusterinfosummary_file')
    parser.add_argument('input_group_mapping_filename', help='input_group_mapping_filename')
    parser.add_argument('input_attribute_mapping_filename', help='input_attribute_mapping_filename')
    parser.add_argument('input_networking_pairs', help='input_networking_pairs')
    parser.add_argument('input_library_search', help='input_library_search')
    parser.add_argument('output_clusterinfosummary_filename', help='output_clusterinfosummary_filename')
    args = parser.parse_args()

    """Loading group filenames"""
    group_to_files, files_to_groups = load_group_mapping(args.input_group_mapping_filename)
    print("Loaded Group Mapping")

    cluster_summary_list = ming_fileio_library.parse_table_with_headers_object_list(args.input_clusterinfosummary_file)
    print("Loaded Cluster Summary")

    attribute_to_groups = load_attribute_mapping(args.input_attribute_mapping_filename)

    params_object = ming_proteosafe_library.parse_xml_file(open(args.param_xml))
    mangled_mapping = ming_proteosafe_library.get_mangled_file_mapping(params_object)

    min_cluster_size = int(params_object["CLUSTER_MIN_SIZE"][0])
    run_mscluster = params_object["RUN_MSCLUSTER"][0]

    # Accumulate, per cluster: spectrum counts by group, member files, RTs.
    cluster_to_group_counts = defaultdict(lambda: defaultdict(lambda: 0))
    cluster_to_files = defaultdict(set)
    cluster_to_RT = defaultdict(list)

    for row_number, raw_line in enumerate(open(args.input_clusterinfo_file), start=1):
        if row_number == 1:
            continue  # skip header
        if row_number % 10000 == 0:
            print(row_number)
        fields = raw_line.rstrip().split("\t")
        cluster_idx = fields[0]
        spectrum_file = os.path.basename(fields[1])
        rt_value = float(fields[6])
        membership = files_to_groups[spectrum_file]
        cluster_to_files[cluster_idx].add(spectrum_file)
        cluster_to_RT[cluster_idx].append(rt_value)
        for group in membership:
            cluster_to_group_counts[cluster_idx][group] += 1

    if run_mscluster == "on":
        cluster_summary_list = filter_clusters_based_on_cluster_size(cluster_summary_list, min_cluster_size)

    print(len(cluster_summary_list))
    print("Setting up grouping", len(group_to_files.keys()))

    for summary_row in cluster_summary_list:
        cluster_idx = summary_row["cluster index"]
        counts = cluster_to_group_counts[cluster_idx]
        for group in group_to_files:
            summary_row[group] = counts[group] if group in counts else 0
        # Attribute value = comma-joined groups of that attribute with
        # non-zero counts in this cluster.
        for attribute, attribute_groups in attribute_to_groups.items():
            present = [g for g in attribute_groups if g in summary_row and summary_row[g] > 0]
            summary_row[attribute] = ",".join(present).replace("GNPSGROUP:", "")

    print("Default Attributes")
    calculate_default_attributes(cluster_summary_list, group_to_files.keys())

    print("calculate_cluster_file_stats")
    calculate_cluster_file_stats(cluster_summary_list, cluster_to_files, mangled_mapping)

    print("rt stats")
    calculate_rt_stats(cluster_summary_list, cluster_to_RT)

    print("populate_network_component")
    populate_network_component(cluster_summary_list, args.input_networking_pairs)

    print("calculate_ancillary_information")
    calculate_ancillary_information(cluster_summary_list, params_object["task"][0])

    print("populate_network_identifications")
    populate_network_identifications(cluster_summary_list, args.input_library_search)

    ming_fileio_library.write_list_dict_table_data(cluster_summary_list, args.output_clusterinfosummary_filename)
def main():
    """Emit group and attribute mapping files: the default G1..G6 groups,
    then mappings from a metadata table if present, otherwise from legacy
    group/attribute mapping files."""
    parser = argparse.ArgumentParser(description='Group Mapping from input, defaults and metadata file')
    parser.add_argument('proteosafe_parameters', help='proteosafe_parameters')
    parser.add_argument('groupmapping_folder', help='groupmapping_folder')
    parser.add_argument('attributemapping_folder', help='attributemapping_folder')
    parser.add_argument('metadata_folder', help='metadata_folder')
    parser.add_argument('output_groupmapping_file', help='output_groupmapping_file')
    parser.add_argument('output_attributemapping_file', help='output_attributemapping_file')
    parser.add_argument('inputspectrafolder', help='inputspectrafolder')
    args = parser.parse_args()

    param_obj = ming_proteosafe_library.parse_xml_file(open(args.proteosafe_parameters))
    mangled_file_mapping = ming_proteosafe_library.get_mangled_file_mapping(param_obj)
    reverse_file_mangling = ming_proteosafe_library.get_reverse_mangled_file_mapping(param_obj)

    file_path_prefix = args.inputspectrafolder

    output_group_file = open(args.output_groupmapping_file, "w")
    output_attribute_file = open(args.output_attributemapping_file, "w")

    """ Writing Default Grouping to output file """
    # The mangled-name prefix selects the default group slot.
    slot_prefixes = [
        ("spec-", "G1"), ("spectwo-", "G2"), ("specthree-", "G3"),
        ("specfour-", "G4"), ("specfive-", "G5"), ("specsix-", "G6"),
    ]
    default_groupings = {slot: [] for _, slot in slot_prefixes}
    for mangled_name in mangled_file_mapping.keys():
        for prefix, slot in slot_prefixes:
            if prefix in mangled_name:
                default_groupings[slot].append(mangled_name.rstrip())

    for slot in default_groupings:
        # join emits the same string the original built by appending ";" and
        # trimming the trailing separator.
        joined = ";".join(os.path.join(file_path_prefix, name)
                          for name in default_groupings[slot])
        output_group_file.write("GROUP_" + slot + "=" + joined + "\n")

    """Determining output whether to use group mapping file or metadata file"""
    metadata_files_in_folder = ming_fileio_library.list_files_in_dir(args.metadata_folder)
    groupmapping_files_in_folder = ming_fileio_library.list_files_in_dir(args.groupmapping_folder)
    attributemapping_files_in_folder = ming_fileio_library.list_files_in_dir(args.attributemapping_folder)

    if len(metadata_files_in_folder) > 1:
        print("Too many metafile inputted")
        exit(1)

    if len(metadata_files_in_folder) == 1:
        # Using metadata file
        row_count, table_data = ming_fileio_library.parse_table_with_headers(metadata_files_in_folder[0])
        if not "filename" in table_data:
            print("Missing 'filename' header in metadata file. Please specify the file name that goes along with each piece of metadata with the header: filename")
            exit(1)

        attributes_to_groups_mapping = defaultdict(set)
        group_to_files_mapping = defaultdict(list)
        for row_index in range(row_count):
            metadata_filename = table_data["filename"][row_index]
            basename_filename = os.path.basename(metadata_filename).rstrip()
            if basename_filename in reverse_file_mangling:
                mangled_name = reverse_file_mangling[basename_filename]
                # Each ATTRIBUTE_* column yields group membership plus an
                # attribute -> group association.
                for key in table_data:
                    if key.find("ATTRIBUTE_") != -1:
                        group_name = table_data[key][row_index]
                        if len(group_name) < 1:
                            continue
                        group_to_files_mapping[group_name].append(os.path.join(file_path_prefix, mangled_name))
                        attributes_to_groups_mapping[key.replace("ATTRIBUTE_", "")].add(group_name)
            else:
                # Filename is not part of sample set
                continue

        for group_name in group_to_files_mapping:
            output_group_file.write(
                "GROUP_" + group_name + "=" + ";".join(group_to_files_mapping[group_name]) + "\n")

        for attribute_name in attributes_to_groups_mapping:
            output_attribute_file.write(
                attribute_name + "=" + ";".join(list(attributes_to_groups_mapping[attribute_name])) + "\n")

        exit(0)

    """Falling back on old group mapping file"""
    if len(groupmapping_files_in_folder) > 1 or len(attributemapping_files_in_folder) > 1:
        print("Too many group/attribute mappings inputted")
        exit(1)

    if len(groupmapping_files_in_folder) == 1:
        for raw_line in open(groupmapping_files_in_folder[0], errors='ignore'):
            pieces = raw_line.rstrip().split("=")
            if len(pieces) < 2:
                continue
            legacy_group_name = pieces[0]
            translated_files = []
            for legacy_filename in pieces[1].split(";"):
                legacy_basename = os.path.basename(legacy_filename)
                if legacy_basename in reverse_file_mangling:
                    translated_files.append(
                        os.path.join(file_path_prefix, reverse_file_mangling[legacy_basename]))
            output_group_file.write(legacy_group_name + "=" + ";".join(translated_files) + "\n")

    if len(attributemapping_files_in_folder) == 1:
        for raw_line in open(attributemapping_files_in_folder[0]):
            output_attribute_file.write(raw_line)