def ipa_ureg_validation(ipa_ureg_parameters):
    """Validate the ``ipa_ureg`` workflow parameters, exiting on the first error.

    Each parameter is a comma-separated string of ``key=value`` sub-parameters
    (keys are matched case-insensitively):

    * ``file`` (required): path to a tab-separated file in which every line
      must have exactly 3 columns.
    * ``type`` (required): label of the parameter; it is upper-cased and must
      be unique across all supplied parameters.
    * ``zscore``, ``p.adj``, ``log2fold``, ``min_set_size``, ``max_set_size``,
      ``network_overlap_ratio`` and ``network_overlap_size`` (optional): the
      value must satisfy ``is_number``.

    Args:
        ipa_ureg_parameters: iterable of parameter strings.

    Returns:
        None. One confirmation line is printed per valid parameter.

    Raises:
        SystemExit: with status 1, after a message is written to stderr, as
            soon as any check fails.
    """

    def fail(message):
        # sys.stderr.write behaves identically on Python 2 and Python 3.
        sys.stderr.write(message + "\n")
        sys.exit(1)

    # Upper-cased key -> spelling used in the error message.
    numeric_labels = {}
    for label in ("zscore", "p.adj", "log2fold", "min_set_size",
                  "max_set_size", "network_overlap_ratio",
                  "network_overlap_size"):
        numeric_labels[label.upper()] = label

    seen_types = {}

    for ureg_parameter in ipa_ureg_parameters:
        # required inputs
        ureg_file_path = None
        ureg_type = None

        for sub_param in ureg_parameter.split(","):
            parts = sub_param.split("=")
            if len(parts) != 2:
                fail("Error: the ipa_ureg parameter: " + ureg_parameter +
                     " is not in the correct format (missing =).")
            key = parts[0].upper()
            value = parts[1]

            if key == "FILE":
                ureg_file_path = value
                try:
                    with open(ureg_file_path) as ureg_file:
                        ureg_lines = ureg_file.readlines()
                except (IOError, OSError, ValueError):
                    fail("Error: the ureg file: " + ureg_file_path +
                         " cannot be opened.")
                # Lines are reported 1-based so the message points at the
                # offending line of the file.
                for line_number, line in enumerate(ureg_lines, 1):
                    if len(line.rstrip().split("\t")) != 3:
                        fail("Error: the ureg file: " + ureg_file_path +
                             " line " + str(line_number) +
                             " does not have exactly 3 columns.")
            elif key == "TYPE":
                ureg_type = value.upper()
            elif key in numeric_labels:
                if not is_number(value):
                    fail("Error: the ipa_ureg parameter: " + ureg_parameter +
                         " is not in the correct format (" +
                         numeric_labels[key] + ").")

        # tests if the required inputs have been supplied
        if ureg_file_path is None or ureg_type is None:
            fail("Error: the ipa_ureg parameter: " + ureg_parameter +
                 " is not in the correct format (missing essential "
                 "sub-parameters).")

        # every parameter must carry its own, unique type
        if ureg_type in seen_types:
            fail("Error: the ipa_ureg parameter: " + ureg_parameter +
                 " does not have a unique type.")
        seen_types[ureg_type] = True

        print("validated the ipa_ureg parameter: " + ureg_parameter)
def pde_parsing(pde_workflow_parameters, global_variables):
    """Parse the PDE workflow parameters and store them in ``global_variables``.

    Each parameter is a comma-separated string of ``key=value`` sub-parameters
    (keys are matched case-insensitively): ``file``, ``numerator``,
    ``denominator``, ``p.adj`` (default 0.05), ``log2fold`` (default 1),
    ``order`` (groups joined by ``+``) and ``gl`` (gene list file).  The
    parameters are not checked here; malformed input raises whatever the
    underlying operation raises.

    The PDE file is read as tab-separated text whose first line is skipped as
    a header and whose remaining lines must each have 4 columns (gene,
    log2fold, p, p.adj).  Gene, group and gene-list names are upper-cased.

    Args:
        pde_workflow_parameters: iterable of parameter strings.
        global_variables: dict that receives the result under the key
            ``"pde_parameters"``.

    Returns:
        The same ``global_variables`` dict, with ``"pde_parameters"`` set to a
        list holding one dict per parameter.
    """
    parsed_pde_parameters = []

    for pde_parameter in pde_workflow_parameters:
        # default sub-parameters
        pde_file_path = None
        numerator_group = None
        denominator_group = None
        p_threshold = 0.05
        fold_threshold = 1
        order_list = None
        gl_file_path = None

        # The prefixes are mutually exclusive, so at most one branch fires.
        for sub_param in pde_parameter.split(","):
            upper_sub = sub_param.upper()
            if upper_sub.startswith("FILE="):
                pde_file_path = sub_param.split("=")[1]
            elif upper_sub.startswith("NUMERATOR="):
                numerator_group = sub_param.split("=")[1].upper()
            elif upper_sub.startswith("DENOMINATOR="):
                denominator_group = sub_param.split("=")[1].upper()
            elif upper_sub.startswith("P.ADJ="):
                p_threshold = float(sub_param.split("=")[1])
            elif upper_sub.startswith("LOG2FOLD="):
                fold_threshold = float(sub_param.split("=")[1])
            elif upper_sub.startswith("ORDER="):
                order_list = [
                    group.upper()
                    for group in sub_param.split("=")[1].split("+")
                ]
            elif upper_sub.startswith("GL="):
                gl_file_path = sub_param.split("=")[1]

        # Parses the gene list if supplied (one gene per line)
        gl_dict = None
        if gl_file_path is not None:
            gl_dict = {}
            with open(gl_file_path) as gl_file:
                for line in gl_file:
                    gl_dict[line.rstrip().upper()] = True

        # Parses the pde file; the context manager closes the handle
        with open(pde_file_path) as pde_file:
            pde_lines = pde_file.readlines()

        pde_dict = {}
        for line in pde_lines[1:]:  # the first line is the header
            gene, log2fold, p, padj = line.rstrip().split("\t")
            gene = gene.upper()

            # Values that are not numbers are stored as the original text.
            log2fold_valid = bool(is_number(log2fold))
            if log2fold_valid:
                log2fold = round(float(log2fold), 2)
            p_valid = bool(is_number(p))
            if p_valid:
                p = float(p)
            padj_valid = bool(is_number(padj))
            if padj_valid:
                padj = float(padj)

            valid = log2fold_valid and p_valid and padj_valid
            sig = bool(valid and padj < p_threshold
                       and abs(log2fold) > fold_threshold)
            in_gl = gl_dict is None or gene in gl_dict

            # Stores the parsed gene
            pde_dict[gene] = {
                "sig": sig,
                "log2fold": log2fold,
                "p": p,
                "p.adj": padj,
                "valid": valid,
                "log2fold_valid": log2fold_valid,
                "p_valid": p_valid,
                "p.adj_valid": padj_valid,
                "in_gl": in_gl,
            }

        # Default order when none is supplied: denominator, then numerator.
        if order_list is None:
            order_list = [denominator_group, numerator_group]

        # Stores the parsed parameter
        parsed_pde_parameters.append({
            "pde_dict": pde_dict,
            "numerator_group": numerator_group,
            "denominator_group": denominator_group,
            "p_threshold": p_threshold,
            "fold_threshold": fold_threshold,
            "order_list": order_list,
            "gl_dict": gl_dict,
            "pde_ID": numerator_group + " vs " + denominator_group,
            "pde_file_path": pde_file_path,
        })

        print("parsed the pde parameter: " + pde_parameter)

    global_variables["pde_parameters"] = parsed_pde_parameters
    return global_variables
def pde_validation(pde_workflow_parameters, sample_groups):
    """Validate the PDE workflow parameters, exiting on the first error.

    Each parameter is a comma-separated string of ``key=value`` sub-parameters
    (keys are matched case-insensitively):

    * ``file`` (required): tab-separated file with exactly 4 columns per line
      and the header ``ID, LOG2FOLD, P, P.ADJ`` (case-insensitive).
    * ``numerator`` / ``denominator`` (required): must satisfy
      ``is_sample_group`` (called with the upper-cased name).
    * ``p.adj`` / ``log2fold`` (optional): must satisfy ``is_number``.
    * ``order`` (optional): at least 2 sample groups joined by ``+``.
    * ``gl`` (optional): path of a file that must be readable.

    Args:
        pde_workflow_parameters: iterable of parameter strings.
        sample_groups: passed through to ``is_sample_group``.

    Returns:
        None. One confirmation line is printed per valid parameter.

    Raises:
        SystemExit: with status 1, after a message is written to stderr, as
            soon as any check fails.
    """

    def fail(message):
        # sys.stderr.write behaves identically on Python 2 and Python 3.
        sys.stderr.write(message + "\n")
        sys.exit(1)

    expected_header = (("ID", "first"), ("LOG2FOLD", "second"),
                       ("P", "third"), ("P.ADJ", "fourth"))

    for pde_parameter in pde_workflow_parameters:
        # required inputs
        pde_file_path = None
        numerator = None
        denominator = None

        format_error = ("Error: the PDE parameter: " + pde_parameter +
                        " is not in the correct format.")

        for sub_param in pde_parameter.split(","):
            parts = sub_param.split("=")
            if len(parts) != 2:
                fail(format_error)
            key = parts[0].upper()
            value = parts[1]

            if key == "FILE":
                pde_file_path = value
                try:
                    with open(pde_file_path) as pde_file:
                        pde_lines = pde_file.readlines()
                except (IOError, OSError, ValueError):
                    fail("Error: the PDE file: " + pde_file_path +
                         " cannot be opened.")
                for line_number, line in enumerate(pde_lines, 1):
                    columns = line.rstrip().split("\t")
                    if len(columns) != 4:
                        fail("Error: the PDE file: " + pde_file_path +
                             " line " + str(line_number) +
                             " does not have 4 columns.")
                    if line_number == 1:
                        for column, (name, position) in zip(columns,
                                                            expected_header):
                            if column.upper() != name:
                                fail("Error: the PDE file: " + pde_file_path +
                                     " does not have \"" + name +
                                     "\" as the " + position +
                                     " column in the header.")
            elif key == "NUMERATOR":
                numerator = value
                if not is_sample_group(numerator.upper(), sample_groups):
                    fail("Error: the PDE parameter: " + pde_parameter +
                         " \"numerator=\" group is not a valid sample group")
            elif key == "DENOMINATOR":
                denominator = value
                if not is_sample_group(denominator.upper(), sample_groups):
                    fail("Error: the PDE parameter: " + pde_parameter +
                         " \"denominator=\" group is not a valid sample group")
            elif key in ("P.ADJ", "LOG2FOLD"):
                # Both thresholds must be numeric.
                if not is_number(value):
                    fail(format_error)
            elif key == "ORDER":
                order_sample_groups = value.split("+")
                if len(order_sample_groups) < 2:
                    fail("Error: the PDE parameter: " + pde_parameter +
                         " does not have at least 2 sample groups in"
                         " \" order=\"")
                for sample_group in order_sample_groups:
                    if not is_sample_group(sample_group.upper(),
                                           sample_groups):
                        fail("Error: the PDE parameter: " + pde_parameter +
                             " has a sample group in \"order=\" that is not"
                             " in the sample sheet")
            elif key == "GL":
                gl_file_path = value
                try:
                    with open(gl_file_path) as gl_file:
                        gl_file.readlines()
                except (IOError, OSError, ValueError):
                    fail("Error: the GL file: " + gl_file_path +
                         " cannot be opened.")

        # tests if the required inputs have been supplied
        if pde_file_path is None or numerator is None or denominator is None:
            fail(format_error)

        print("validated the pde parameter: " + pde_parameter)
def mde_validation(mde_workflow_parameters, sample_groups, de_parameters):
    """Validate the Mde workflow parameters, exiting on the first error.

    Each parameter is a comma-separated string of sub-parameters (prefixes are
    matched case-insensitively):

    * ``name=<name>`` (required): must be unique, ignoring case, among the
      names seen so far.
    * ``order=<g1>+<g2>[+...]`` (optional): at least 2 groups, each of which
      must satisfy ``is_sample_group``.
    * ``numerator=<x>*denominator=<y>`` (at least 2 required): both groups
      must satisfy ``is_sample_group`` and ``"<X> vs <Y>"`` (upper-cased) must
      be the ``"de_ID"`` of one of ``de_parameters``.
    * ``scc=<number>`` (optional): must satisfy ``is_number``.

    Args:
        mde_workflow_parameters: iterable of parameter strings.
        sample_groups: passed through to ``is_sample_group``.
        de_parameters: iterable of dicts, each with a ``"de_ID"`` key.

    Returns:
        None. One confirmation line is printed per valid parameter.

    Raises:
        SystemExit: with status 1, after a message is written to stderr, as
            soon as any check fails.
    """

    def fail(message):
        # sys.stderr.write behaves identically on Python 2 and Python 3.
        sys.stderr.write(message + "\n")
        sys.exit(1)

    # gets the names of the des
    de_keys = {}
    for de_parameter in de_parameters:
        de_keys[de_parameter["de_ID"]] = True

    # Upper-cased names already used; the keys must be upper-cased because
    # the membership test below is made on the upper-cased name.
    mde_names = {}

    for mde_parameter in mde_workflow_parameters:
        # required inputs
        mde_name = None
        # number of des
        de_count = 0

        for sub_param in mde_parameter.split(","):
            upper_sub = sub_param.upper()

            # Tests the name sub-parameter
            if upper_sub.startswith("NAME="):
                name_parts = sub_param.split("=")
                if len(name_parts) != 2:
                    fail("Error: the Mde parameter: " + mde_parameter +
                         " is not in the correct format.")
                mde_name = name_parts[1]
                if mde_name.upper() in mde_names:
                    fail("Error: the Mde name " + mde_name +
                         " is used more than once.")
                mde_names[mde_name.upper()] = True

            # Tests the order sub-parameter
            if upper_sub.startswith("ORDER="):
                order_parts = sub_param.split("=")
                if len(order_parts) != 2:
                    fail("Error: the Mde parameter: " + mde_parameter +
                         " is not in the correct format (order=).")
                order = order_parts[1].split("+")
                if len(order) < 2:
                    fail("Error: the Mde parameter: " + mde_parameter +
                         " is not in the correct format (> 2 sample groups"
                         " in order=)")
                for sample_group in order:
                    if not is_sample_group(sample_group.upper(),
                                           sample_groups):
                        fail("Error: the Mde parameter: " + mde_parameter +
                             " is not in the correct format (order sample"
                             " group missing from sample sheet")

            # Tests the de sub-parameters
            if upper_sub.startswith("NUMERATOR="):
                de_parts = sub_param.split("*")
                if len(de_parts) != 2:
                    fail("Error: the Mde parameter: " + mde_parameter +
                         " is not in the correct format (numerator=).")
                numerator_string, denominator_string = de_parts
                if len(numerator_string.split("=")) != 2:
                    fail("Error: the Mde parameter: " + mde_parameter +
                         " is not in the correct format"
                         " (numerator=x*deonminator=y).")
                if not denominator_string.upper().startswith("DENOMINATOR="):
                    fail("Error: the Mde parameter " + mde_parameter +
                         " is not in the correct format (denominator=).")
                if len(denominator_string.split("=")) != 2:
                    fail("Error: the Mde parameter: " + mde_parameter +
                         " is not in the correct format (denominator=).")
                numerator = numerator_string.split("=")[1]
                denominator = denominator_string.split("=")[1]
                if not is_sample_group(numerator.upper(), sample_groups):
                    fail("Error: the Mde parameter: " + mde_parameter +
                         " is not in the correct format (numerator sample"
                         " group missing from sample sheet).")
                if not is_sample_group(denominator.upper(), sample_groups):
                    fail("Error: the Mde parameter: " + mde_parameter +
                         " is not in the correct format (denominator sample"
                         " group missing from sample sheet).")
                de_ID = numerator.upper() + " vs " + denominator.upper()
                if de_ID not in de_keys:
                    fail("Error: the Mde parameter " + mde_parameter +
                         " is not in the correct format (no de parameter of"
                         " same name): " + de_ID)
                de_count += 1

            # Tests the correlation cut-off sub-parameter
            if upper_sub.startswith("SCC="):
                if not is_number(sub_param.split("=")[1]):
                    fail("Error: the Mde parameter " + mde_parameter +
                         " is not in the correct format (scc is not a"
                         " number)")

        # tests if the required inputs have been supplied
        if de_count < 2:
            fail("Error: the Mde parameter " + mde_parameter +
                 " has fewer than 2 des.")
        if mde_name is None:
            fail("Error: the Mde parameter: " + mde_parameter +
                 " is not in the correct format (missing name=).")

        print("validated the mde parameter: " + mde_parameter)
def background_validation(bg_parameter, global_variables):
    """Validate the background parameter and record which columns it provides.

    ``bg_parameter`` is a comma-separated string of ``key=value``
    sub-parameters; ``file=<path>`` is required.  The file is tab-separated:

    * The header may only contain ``ID``, ``SYMBOL``, ``BIOTYPE``,
      ``CHROMOSOME``, ``START`` and ``STOP`` (case-insensitive) and must
      contain ``ID``.
    * Every following line must have as many columns as the header has
      distinct column names, a unique ``ID``, and numeric (``is_number``)
      ``START`` / ``STOP`` values when those columns exist.

    For each optional column present, the matching ``GENE_<COLUMN>_FLAG`` entry
    of ``global_variables`` is set to True; ``GENE_COORDINATES_FLAG`` is set to
    True when the chromosome, start and stop flags are all true.

    Args:
        bg_parameter: the parameter string.
        global_variables: dict of flags, updated in place.

    Returns:
        The same ``global_variables`` dict.

    Raises:
        SystemExit: with status 1, after a message is written to stderr, as
            soon as any check fails.
    """

    def fail(message):
        # sys.stderr.write behaves identically on Python 2 and Python 3.
        sys.stderr.write(message + "\n")
        sys.exit(1)

    accepted_col_headers = ("ID", "SYMBOL", "BIOTYPE", "CHROMOSOME",
                            "START", "STOP")
    # optional column -> flag switched on when the column is present
    column_flags = (
        ("SYMBOL", "GENE_SYMBOL_FLAG"),
        ("BIOTYPE", "GENE_BIOTYPE_FLAG"),
        ("CHROMOSOME", "GENE_CHROMOSOME_FLAG"),
        ("START", "GENE_START_FLAG"),
        ("STOP", "GENE_STOP_FLAG"),
    )

    # required inputs
    background_file_path = None

    for sub_param in bg_parameter.split(","):
        parts = sub_param.split("=")
        if len(parts) != 2:
            fail("Error: the background parameter is not in a valid format.")
        if parts[0].upper() != "FILE":
            continue

        background_file_path = parts[1]
        try:
            with open(background_file_path) as background_file:
                background_lines = background_file.readlines()
        except (IOError, OSError, ValueError):
            fail("Error: the background file: \"" +
                 str(background_file_path) + "\" cannot be opened.")

        header_dict = {}
        gene_IDs_dict = {}
        for line_number, line in enumerate(background_lines, 1):
            columns = line.rstrip().split("\t")

            # Validates the header line
            if line_number == 1:
                for index, column in enumerate(columns):
                    column_name = column.upper()
                    header_dict[column_name] = index
                    if column_name not in accepted_col_headers:
                        fail("Error: the background file column header: " +
                             column_name +
                             " is not an accepted column header, e.g. " +
                             "\t\t".join(accepted_col_headers))
                if "ID" not in header_dict:
                    fail("Error: there must be a column called \"ID\" in the"
                         " background file.")

                # Sets up the types of background information in the
                # global variables
                for column_name, flag in column_flags:
                    if column_name in header_dict:
                        global_variables[flag] = True
                if (global_variables.get("GENE_CHROMOSOME_FLAG")
                        and global_variables.get("GENE_START_FLAG")
                        and global_variables.get("GENE_STOP_FLAG")):
                    global_variables["GENE_COORDINATES_FLAG"] = True
                continue

            # Validates the genes.  The column count is checked first so a
            # short line is reported as an error instead of raising
            # IndexError when its columns are read.
            if len(columns) != len(header_dict):
                fail("Error: line " + str(line_number) +
                     " of the background file does not have the same number"
                     " of columns as the header line.")
            gene_ID = columns[header_dict["ID"]]
            if gene_ID in gene_IDs_dict:
                fail("Error: line " + str(line_number) +
                     " of the background file has a duplicate gene ID. Gene"
                     " IDs MUST be unique.")
            gene_IDs_dict[gene_ID] = True
            if "START" in header_dict:
                if not is_number(columns[header_dict["START"]]):
                    fail("Error: line " + str(line_number) +
                         " of the background file has a start coordinate"
                         " that is not a number.")
            if "STOP" in header_dict:
                if not is_number(columns[header_dict["STOP"]]):
                    fail("Error: line " + str(line_number) +
                         " of the background file has a stop coordinate"
                         " that is not a number.")

    # tests if the required inputs have been supplied
    if background_file_path is None:
        fail("Error: the background parameter is not in a valid format.")

    print("validated the background parameter")
    return global_variables
def normexp_validation(norm_exp_parameter, sample_list):
    """Validate the normexp parameter, exiting on the first error.

    ``norm_exp_parameter`` is a comma-separated string of ``key=value``
    sub-parameters (keys are matched case-insensitively):

    * ``file`` (required): tab-separated file.  The first header column must
      be ``ID`` (case-insensitive); the remaining header columns are sample
      names, which must be as many as ``sample_list`` has entries and must
      each satisfy ``is_sample`` (called with the upper-cased name).  On
      every other line all cells after the first must satisfy ``is_number``.
    * ``expressed`` (optional): must satisfy ``is_number``.

    Args:
        norm_exp_parameter: the parameter string.
        sample_list: sized collection of samples, passed to ``is_sample``.

    Returns:
        None. A confirmation line is printed when the parameter is valid.

    Raises:
        SystemExit: with status 1, after a message is written to stderr, as
            soon as any check fails.
    """

    def fail(message):
        # sys.stderr.write behaves identically on Python 2 and Python 3.
        sys.stderr.write(message + "\n")
        sys.exit(1)

    format_error = "Error: the normexp parameter is not in a valid format."

    # required inputs
    norm_exp_file_path = None

    for sub_param in norm_exp_parameter.split(","):
        parts = sub_param.split("=")
        if len(parts) != 2:
            fail(format_error)
        key = parts[0].upper()
        value = parts[1]

        # Tests if the expression threshold is in the correct format
        if key == "EXPRESSED":
            if not is_number(value):
                fail(format_error)
            continue
        if key != "FILE":
            continue

        norm_exp_file_path = value
        # Only I/O and decoding failures mean "cannot be opened"; a bare
        # except would also swallow KeyboardInterrupt.
        try:
            with open(norm_exp_file_path) as norm_exp_file:
                norm_exp_lines = norm_exp_file.readlines()
        except (IOError, OSError, ValueError):
            fail("Error: the normexp file: \"" + str(norm_exp_file_path) +
                 "\" cannot be opened.")

        for line_number, line in enumerate(norm_exp_lines, 1):
            columns = line.rstrip().split("\t")

            # Every line after the header holds an ID followed by numbers
            if line_number != 1:
                for cell in columns[1:]:
                    if not is_number(cell):
                        fail("Error: the normexp file has a cell that is not"
                             " a number at line " + str(line_number) + ".")
                continue

            # Validates the header line (samples etc)
            if columns[0].upper() != "ID":
                fail("Error: the first column in normexp file is not called"
                     " \"ID\".")
            samples = columns[1:]
            if len(samples) > len(sample_list):
                fail("Error: there are more samples in the normexp file than"
                     " in the sample sheet.")
            if len(samples) < len(sample_list):
                fail("Error: there are fewer samples in the normexp file"
                     " than in the sample sheet.")
            for sample in samples:
                if not is_sample(sample.upper(), sample_list):
                    fail("Error: in the normexp file the sample " +
                         sample.upper() + " is not in the sample sheet.")

    # tests if the required inputs have been supplied
    if norm_exp_file_path is None:
        fail(format_error)

    print("validated the normexp parameter")