Beispiel #1
0
def ipa_ureg_validation(ipa_ureg_parameters):
    """Validate every ipa_ureg parameter, exiting on the first problem found.

    Each parameter is a comma separated list of ``key=value`` sub-parameters
    (keys are matched case-insensitively):

    * ``file=`` (required): path of a readable file in which every line has
      exactly 3 tab separated columns.
    * ``type=`` (required): must be unique, ignoring case, across all the
      supplied parameters.
    * ``zscore=``, ``p.adj=``, ``log2fold=``, ``min_set_size=``,
      ``max_set_size=``, ``network_overlap_ratio=``,
      ``network_overlap_size=`` (optional): the value must be accepted by the
      ``is_number`` helper (defined outside this block).

    Args:
        ipa_ureg_parameters: iterable of raw parameter strings.

    Raises:
        SystemExit: with status 1, after an error message has been written to
            stderr, as soon as a parameter fails validation.
    """

    def fail(message):
        # Same output as ``print >> sys.stderr, message`` followed by
        # ``sys.exit(1)``, spelled so it is valid on Python 2 and Python 3.
        sys.stderr.write(message + "\n")
        sys.exit(1)

    # upper-cased key -> label used in the error message
    numeric_sub_params = {
        "ZSCORE": "zscore",
        "P.ADJ": "p.adj",
        "LOG2FOLD": "log2fold",
        "MIN_SET_SIZE": "min_set_size",
        "MAX_SET_SIZE": "max_set_size",
        "NETWORK_OVERLAP_RATIO": "network_overlap_ratio",
        "NETWORK_OVERLAP_SIZE": "network_overlap_size",
    }

    seen_types = set()

    # iterates for each supplied ureg parameter:
    for ureg_parameter in ipa_ureg_parameters:

        # required inputs
        ureg_file_path = None
        ureg_type = None

        for sub_param in ureg_parameter.split(","):

            # Tests if there are two parts to the sub-parameter
            parts = sub_param.split("=")
            if len(parts) != 2:
                fail("Error: the ipa_ureg parameter: " + ureg_parameter + " is not in the correct format (missing =).")

            # exactly one "=" is present, so the key is everything before it
            key = parts[0].upper()
            value = parts[1]

            if key == "FILE":
                ureg_file_path = value

                # Tests if the ureg file can be opened (and closes it again)
                try:
                    with open(ureg_file_path) as ureg_file:
                        ureg_lines = ureg_file.readlines()
                except (IOError, OSError):
                    fail("Error: the ureg file: " + ureg_file_path + " cannot be opened.")

                # Tests if the ureg file is in the correct format; lines are
                # reported 1-based so the message points at the real line.
                for line_number, line in enumerate(ureg_lines, 1):
                    if len(line.rstrip().split("\t")) != 3:
                        fail("Error: the ureg file: " + ureg_file_path + " line " + str(line_number) + " does not have exactly 3 columns.")

            elif key == "TYPE":
                ureg_type = value.upper()

            elif key in numeric_sub_params:
                if not is_number(value):
                    fail("Error: the ipa_ureg parameter: " + ureg_parameter + " is not in the correct format (" + numeric_sub_params[key] + ").")

        # tests if the required inputs have been supplied
        if ureg_file_path is None or ureg_type is None:
            fail("Error: the ipa_ureg parameter: " + ureg_parameter + " is not in the correct format (missing essential sub-parameters).")

        # Checks for a unique ureg type:
        if ureg_type in seen_types:
            fail("Error: the ipa_ureg parameter: " + ureg_parameter + " does not have a unique type.")
        seen_types.add(ureg_type)

        print("validated the ipa_ureg parameter: " + ureg_parameter)
def pde_parsing(pde_workflow_parameters, global_variables):
    """Parse the PDE workflow parameters into ``global_variables``.

    Each parameter is a comma separated list of ``key=value`` sub-parameters
    (keys are matched case-insensitively; unknown keys are ignored):

    * ``file=``: path of the tab separated PDE file. The first line is
      skipped as a header; every other line must split into exactly four
      columns (gene, log2fold, p, p.adj).
    * ``numerator=`` / ``denominator=``: group names, stored upper-cased.
    * ``p.adj=``: adjusted p threshold (default 0.05).
    * ``log2fold=``: absolute log2 fold threshold (default 1).
    * ``order=``: "+" separated group names, stored upper-cased. Defaults to
      ``[denominator, numerator]``.
    * ``gl=``: path of a gene list file, one gene per line.

    No validation is done here: a missing file/numerator/denominator or a
    malformed line raises the underlying exception.
    NOTE(review): presumably ``pde_validation`` is run first - confirm with
    the caller.

    Args:
        pde_workflow_parameters: iterable of raw parameter strings.
        global_variables: dict that receives the result.

    Returns:
        ``global_variables`` itself, with ``"pde_parameters"`` set to a list
        holding one dict per parsed parameter.
    """

    parsed_pde_parameters = []

    for pde_parameter in pde_workflow_parameters:

        # default sub-parameters
        pde_file_path = None
        numerator_group = None
        denominator_group = None
        p_threshold = 0.05
        fold_threshold = 1
        order_list = None
        gl_file_path = None

        # gets the sub-parameters
        for sub_param in pde_parameter.split(","):
            key, separator, remainder = sub_param.partition("=")
            if not separator:
                continue

            key = key.upper()
            # only the text up to a second "=" (if any) is used as the value
            value = remainder.split("=")[0]

            if key == "FILE":
                pde_file_path = value
            elif key == "NUMERATOR":
                numerator_group = value.upper()
            elif key == "DENOMINATOR":
                denominator_group = value.upper()
            elif key == "P.ADJ":
                p_threshold = float(value)
            elif key == "LOG2FOLD":
                fold_threshold = float(value)
            elif key == "ORDER":
                order_list = [group.upper() for group in value.split("+")]
            elif key == "GL":
                gl_file_path = value

        # Parses the gl if supplied
        gl_dict = None
        if gl_file_path is not None:
            gl_dict = {}
            with open(gl_file_path) as gl_file:
                for line in gl_file:
                    gl_dict[line.rstrip().upper()] = True

        # Parses the pde file (the handle is closed as soon as it is read)
        with open(pde_file_path) as pde_file:
            pde_lines = pde_file.readlines()

        pde_dict = {}

        # the first line is the header
        for line in pde_lines[1:]:
            gene, log2fold, p, padj = line.rstrip().split("\t")
            gene = gene.upper()

            # values that are not numbers are stored as the original strings
            log2fold_valid = bool(is_number(log2fold))
            if log2fold_valid:
                log2fold = round(float(log2fold), 2)

            p_valid = bool(is_number(p))
            if p_valid:
                p = float(p)

            padj_valid = bool(is_number(padj))
            if padj_valid:
                padj = float(padj)

            valid = log2fold_valid and p_valid and padj_valid

            # only fully valid genes can be significant
            sig = valid and padj < p_threshold and abs(log2fold) > fold_threshold

            # without a gene list every gene counts as being in it
            in_gl = gl_dict is None or gene in gl_dict

            # Stores the parsed gene
            pde_dict[gene] = {
                "sig": sig,
                "log2fold": log2fold,
                "p": p,
                "p.adj": padj,
                "valid": valid,
                "log2fold_valid": log2fold_valid,
                "p_valid": p_valid,
                "p.adj_valid": padj_valid,
                "in_gl": in_gl,
            }

        # sets the order to the default (denominator, then numerator) if unsupplied
        if order_list is None:
            order_list = [denominator_group, numerator_group]

        # Stores the parsed parameter:
        parsed_pde_parameters.append({
            "pde_dict": pde_dict,
            "numerator_group": numerator_group,
            "denominator_group": denominator_group,
            "p_threshold": p_threshold,
            "fold_threshold": fold_threshold,
            "order_list": order_list,
            "gl_dict": gl_dict,
            "pde_ID": numerator_group + " vs " + denominator_group,
            "pde_file_path": pde_file_path,
        })
        print("parsed the pde parameter: " + pde_parameter)

    global_variables["pde_parameters"] = parsed_pde_parameters

    return global_variables
Beispiel #3
0
def pde_validation(pde_workflow_parameters, sample_groups):
    """Validate every PDE workflow parameter, exiting on the first problem.

    Each parameter is a comma separated list of ``key=value`` sub-parameters
    (keys are matched case-insensitively):

    * ``file=`` (required): readable tab separated file with 4 columns on
      every line and the header ``ID``, ``LOG2FOLD``, ``P``, ``P.ADJ``
      (case-insensitive).
    * ``numerator=`` / ``denominator=`` (required): the upper-cased value must
      be accepted by ``is_sample_group(value, sample_groups)``.
    * ``p.adj=`` / ``log2fold=``: the value must be accepted by ``is_number``.
    * ``order=``: at least two "+" separated groups, each accepted by
      ``is_sample_group``.
    * ``gl=``: path of a readable file.

    ``is_number`` and ``is_sample_group`` are defined outside this block.

    Args:
        pde_workflow_parameters: iterable of raw parameter strings.
        sample_groups: passed through unchanged to ``is_sample_group``.

    Raises:
        SystemExit: with status 1, after an error message has been written to
            stderr, as soon as a parameter fails validation.
    """

    def fail(message):
        # Same output as ``print >> sys.stderr, message`` followed by
        # ``sys.exit(1)``, spelled so it is valid on Python 2 and Python 3.
        sys.stderr.write(message + "\n")
        sys.exit(1)

    # expected header cells and how their position is worded in the error
    expected_header = (
        ("ID", "first"),
        ("LOG2FOLD", "second"),
        ("P", "third"),
        ("P.ADJ", "fourth"),
    )

    for pde_parameter in pde_workflow_parameters:

        bad_format = "Error: the PDE parameter: " + pde_parameter + " is not in the correct format."

        # required inputs
        pde_file_path = None
        numerator = None
        denominator = None

        for sub_param in pde_parameter.split(","):

            # Tests if there are two parts to the sub-parameter
            parts = sub_param.split("=")
            if len(parts) != 2:
                fail(bad_format)

            # exactly one "=" is present, so the key is everything before it
            key = parts[0].upper()
            value = parts[1]

            if key == "FILE":
                pde_file_path = value

                # Tests if the pde file can be opened (and closes it again)
                try:
                    with open(pde_file_path) as pde_file:
                        pde_lines = pde_file.readlines()
                except (IOError, OSError):
                    fail("Error: the PDE file: " + pde_file_path + " cannot be opened.")

                # Tests if the pde file is in the correct format:
                for line_number, line in enumerate(pde_lines, 1):
                    columns = line.rstrip().split("\t")

                    if len(columns) != 4:
                        fail("Error: the PDE file: " + pde_file_path + " line " + str(line_number) + " does not have 4 columns.")

                    if line_number == 1:
                        for index, (name, position) in enumerate(expected_header):
                            if columns[index].upper() != name:
                                fail("Error: the PDE file: " + pde_file_path + " does not have \"" + name + "\"  as the " + position + " column in the header.")

            elif key == "NUMERATOR":
                numerator = value
                if not is_sample_group(numerator.upper(), sample_groups):
                    fail("Error: the PDE parameter: " + pde_parameter + " \"numerator=\" group is not a valid sample group")

            elif key == "DENOMINATOR":
                denominator = value
                if not is_sample_group(denominator.upper(), sample_groups):
                    fail("Error: the PDE parameter: " + pde_parameter + " \"denominator=\" group is not a valid sample group")

            # Both thresholds must be numbers. The log2fold branch used to
            # re-test the "p.adj=" prefix, so log2fold was never validated.
            elif key in ("P.ADJ", "LOG2FOLD"):
                if not is_number(value):
                    fail(bad_format)

            elif key == "ORDER":
                order_sample_groups = value.split("+")

                if len(order_sample_groups) < 2:
                    fail("Error: the PDE parameter: " + pde_parameter + " does not have at least 2 sample groups in \" order=\"")

                for sample_group in order_sample_groups:
                    if not is_sample_group(sample_group.upper(), sample_groups):
                        fail("Error: the PDE parameter: " + pde_parameter + " has a sample group in \"order=\" that is not in the sample sheet")

            elif key == "GL":
                # only readability of the gene list is checked
                try:
                    with open(value) as gl_file:
                        gl_file.readlines()
                except (IOError, OSError):
                    fail("Error: the GL file: " + value + " cannot be opened.")

        # tests if the required inputs have been supplied
        if pde_file_path is None or numerator is None or denominator is None:
            fail(bad_format)

        print("validated the pde parameter: " + pde_parameter)
Beispiel #4
0
def mde_validation(mde_workflow_parameters, sample_groups, de_parameters):
    """Validate every Mde workflow parameter, exiting on the first problem.

    Each parameter is a comma separated list of sub-parameters (keys are
    matched case-insensitively):

    * ``name=`` (required): must be unique, ignoring case, across everything
      validated in this call.
    * ``order=``: at least two "+" separated groups, each accepted by
      ``is_sample_group(group, sample_groups)``.
    * ``numerator=x*denominator=y`` (at least two required): both groups must
      be accepted by ``is_sample_group`` and ``"X vs Y"`` (upper-cased) must
      be the ``"de_ID"`` of one of ``de_parameters``.
    * ``scc=``: the value must be accepted by ``is_number``.

    ``is_number`` and ``is_sample_group`` are defined outside this block.

    Args:
        mde_workflow_parameters: iterable of raw parameter strings.
        sample_groups: passed through unchanged to ``is_sample_group``.
        de_parameters: iterable of dicts that each provide a ``"de_ID"`` key.

    Raises:
        SystemExit: with status 1, after an error message has been written to
            stderr, as soon as a parameter fails validation.
    """

    def fail(message):
        # Same output as ``print >> sys.stderr, message`` followed by
        # ``sys.exit(1)``, spelled so it is valid on Python 2 and Python 3.
        sys.stderr.write(message + "\n")
        sys.exit(1)

    # gets the names of the des
    de_keys = set(de_parameter["de_ID"] for de_parameter in de_parameters)

    # upper-cased names seen so far
    mde_names = set()

    for mde_parameter in mde_workflow_parameters:

        # required inputs
        mde_name = None

        # number of des
        de_count = 0

        for sub_param in mde_parameter.split(","):
            sub_param_upper = sub_param.upper()

            # Tests the name sub-parameter
            if sub_param_upper.startswith("NAME="):
                name_parts = sub_param.split("=")
                if len(name_parts) != 2:
                    fail("Error: the Mde parameter: " + mde_parameter + " is not in the correct format.")

                mde_name = name_parts[1]

                # Names are stored upper-cased so that this lookup, which is
                # done on the upper-cased name, can actually find duplicates.
                if mde_name.upper() in mde_names:
                    fail("Error: the Mde name " + mde_name + " is used more than once.")
                mde_names.add(mde_name.upper())

            # Tests the order sub-parameter
            elif sub_param_upper.startswith("ORDER="):
                order_parts = sub_param.split("=")
                if len(order_parts) != 2:
                    fail("Error: the Mde parameter: " + mde_parameter + " is not in the correct format (order=).")

                order = order_parts[1].split("+")

                if len(order) < 2:
                    fail("Error: the Mde parameter: " + mde_parameter + " is not in the correct format (> 2 sample groups in order=)")

                for sample_group in order:
                    if not is_sample_group(sample_group.upper(), sample_groups):
                        fail("Error: the Mde parameter: " + mde_parameter + " is not in the correct format (order sample group missing from sample sheet")

            # Tests the de sub-parameters (numerator=x*denominator=y)
            elif sub_param_upper.startswith("NUMERATOR="):
                de_parts = sub_param.split("*")
                if len(de_parts) != 2:
                    fail("Error: the Mde parameter: " + mde_parameter + " is not in the correct format (numerator=).")

                numerator_string, denominator_string = de_parts

                if len(numerator_string.split("=")) != 2:
                    fail("Error: the Mde parameter: " + mde_parameter + " is not in the correct format (numerator=x*deonminator=y).")
                if not denominator_string.upper().startswith("DENOMINATOR="):
                    fail("Error: the Mde parameter " + mde_parameter + " is not in the correct format (denominator=).")
                if len(denominator_string.split("=")) != 2:
                    fail("Error: the Mde parameter: " + mde_parameter + " is not in the correct format (denominator=).")

                numerator = numerator_string.split("=")[1]
                denominator = denominator_string.split("=")[1]

                if not is_sample_group(numerator.upper(), sample_groups):
                    fail("Error: the Mde parameter: " + mde_parameter + " is not in the correct format (numerator sample group missing from sample sheet).")
                if not is_sample_group(denominator.upper(), sample_groups):
                    fail("Error: the Mde parameter: " + mde_parameter + " is not in the correct format (denominator sample group missing from sample sheet).")

                # the comparison must match one of the supplied de parameters
                de_ID = numerator.upper() + " vs " + denominator.upper()
                if de_ID not in de_keys:
                    fail("Error: the Mde parameter " + mde_parameter + " is not in the correct format (no de parameter of same name): " + de_ID)

                de_count += 1

            # Tests the correlation cut-off sub-parameter
            elif sub_param_upper.startswith("SCC="):
                if not is_number(sub_param.split("=")[1]):
                    fail("Error: the Mde parameter " + mde_parameter + " is not in the correct format (scc is not a number)")

        # tests if the required inputs have been supplied
        if de_count < 2:
            fail("Error: the Mde parameter " + mde_parameter + " has fewer than 2 des.")

        if mde_name is None:
            fail("Error: the Mde parameter: " + mde_parameter + " is not in the correct format (missing name=).")

        print("validated the mde parameter: " + mde_parameter)
def background_validation(bg_parameter, global_variables):
    """Validate the background parameter and record which columns it provides.

    ``bg_parameter`` is a comma separated list of ``key=value``
    sub-parameters; only ``file=`` (required, key matched case-insensitively)
    is inspected. The file is tab separated:

    * The first line is a header whose cells (case-insensitive) must all be
      one of ID, SYMBOL, BIOTYPE, CHROMOSOME, START, STOP, and must include
      ID.
    * Every other line must have as many columns as the header and a gene ID
      that has not been seen before. When the header has START / STOP
      columns, those cells must be accepted by ``is_number`` (defined outside
      this block).

    For each optional column found in the header the matching
    ``GENE_<COLUMN>_FLAG`` entry of ``global_variables`` is set to True;
    ``GENE_COORDINATES_FLAG`` is set to True when the chromosome, start and
    stop flags are all truthy.

    Args:
        bg_parameter: raw parameter string.
        global_variables: dict that receives the flags.

    Returns:
        ``global_variables`` itself.

    Raises:
        SystemExit: with status 1, after an error message has been written to
            stderr, when validation fails.
    """

    def fail(message):
        # Same output as ``print >> sys.stderr, message`` followed by
        # ``sys.exit(1)``, spelled so it is valid on Python 2 and Python 3.
        sys.stderr.write(message + "\n")
        sys.exit(1)

    accepted_col_headers = ("ID", "SYMBOL", "BIOTYPE", "CHROMOSOME", "START", "STOP")

    # optional header column -> flag it switches on in global_variables
    header_flags = (
        ("SYMBOL", "GENE_SYMBOL_FLAG"),
        ("BIOTYPE", "GENE_BIOTYPE_FLAG"),
        ("CHROMOSOME", "GENE_CHROMOSOME_FLAG"),
        ("START", "GENE_START_FLAG"),
        ("STOP", "GENE_STOP_FLAG"),
    )

    # required inputs
    background_file_path = None

    for sub_param in bg_parameter.split(","):

        # Tests if there are two parts to the sub-parameter
        parts = sub_param.split("=")
        if len(parts) != 2:
            fail("Error: the background parameter is not in a valid format.")

        # only the file sub-parameter is validated
        if parts[0].upper() != "FILE":
            continue

        background_file_path = parts[1]

        # Tests if the background file can be opened (and closes it again)
        try:
            with open(background_file_path) as background_file:
                background_lines = background_file.readlines()
        except (IOError, OSError):
            fail("Error: the background file: \"" + str(background_file_path) + "\" cannot be opened.")

        header_dict = {}
        gene_IDs = set()

        for line_number, line in enumerate(background_lines, 1):
            line_split = line.rstrip().split("\t")

            # Validates the header line
            if line_number == 1:
                for index, column in enumerate(line_split):
                    column = column.upper()
                    header_dict[column] = index

                    if column not in accepted_col_headers:
                        fail("Error: the background file column header: " + column + " is not an accepted column header, e.g. " + "\t\t".join(accepted_col_headers))

                if "ID" not in header_dict:
                    fail("Error: there must be a column called \"ID\" in the background file.")

                # Sets up the types of background information in the global variables
                for column, flag in header_flags:
                    if column in header_dict:
                        global_variables[flag] = True

                # .get() so that flags the caller never initialised count as unset
                if all(global_variables.get(flag) for flag in ("GENE_CHROMOSOME_FLAG", "GENE_START_FLAG", "GENE_STOP_FLAG")):
                    global_variables["GENE_COORDINATES_FLAG"] = True

                continue

            # Validates the genes. The column count is checked first so that
            # a short line is reported instead of raising an IndexError below.
            if len(line_split) != len(header_dict):
                fail("Error: line " + str(line_number) + " of the background file does not have the same number of columns as the header line.")

            gene_ID = line_split[header_dict["ID"]]
            if gene_ID in gene_IDs:
                fail("Error: line " + str(line_number) + " of the background file has a duplicate gene ID. Gene IDs MUST be unique.")
            gene_IDs.add(gene_ID)

            if "START" in header_dict:
                if not is_number(line_split[header_dict["START"]]):
                    fail("Error: line " + str(line_number) + " of the background file has a start coordinate that is not a number.")

            if "STOP" in header_dict:
                if not is_number(line_split[header_dict["STOP"]]):
                    fail("Error: line " + str(line_number) + " of the background file has a stop coordinate that is not a number.")

    # tests if the required inputs have been supplied
    if background_file_path is None:
        fail("Error: the background parameter is not in a valid format.")

    print("validated the background parameter")

    return global_variables
Beispiel #6
0
def normexp_validation(norm_exp_parameter, sample_list):
    """Validate the normexp parameter, exiting on the first problem found.

    ``norm_exp_parameter`` is a comma separated list of ``key=value``
    sub-parameters (keys are matched case-insensitively):

    * ``file=`` (required): readable tab separated file. The first header
      cell must be ``ID`` (case-insensitive) and the remaining header cells
      are the samples: there must be exactly ``len(sample_list)`` of them and
      each, upper-cased, must be accepted by
      ``is_sample(sample, sample_list)``. On every other line each cell after
      the first must be accepted by ``is_number``.
    * ``expressed=``: the value must be accepted by ``is_number``.

    ``is_number`` and ``is_sample`` are defined outside this block.

    Args:
        norm_exp_parameter: raw parameter string.
        sample_list: sized collection of samples, also passed to ``is_sample``.

    Raises:
        SystemExit: with status 1, after an error message has been written to
            stderr, when validation fails.
    """

    def fail(message):
        # Same output as ``print >> sys.stderr, message`` followed by
        # ``sys.exit(1)``, spelled so it is valid on Python 2 and Python 3.
        sys.stderr.write(message + "\n")
        sys.exit(1)

    invalid_format = "Error: the normexp parameter is not in a valid format."

    # required inputs
    norm_exp_file_path = None

    for sub_param in norm_exp_parameter.split(","):

        # Tests if there are two parts to the sub-parameter
        parts = sub_param.split("=")
        if len(parts) != 2:
            fail(invalid_format)

        # exactly one "=" is present, so the key is everything before it
        key = parts[0].upper()
        value = parts[1]

        # Tests if the expression threshold is in the correct format
        if key == "EXPRESSED":
            if not is_number(value):
                fail(invalid_format)

        # Tests the file sub-parameter
        elif key == "FILE":
            norm_exp_file_path = value

            # Tests if the normexp file can be opened (and closes it again)
            try:
                with open(norm_exp_file_path) as norm_exp_file:
                    norm_exp_lines = norm_exp_file.readlines()
            except (IOError, OSError):
                fail("Error: the normexp file: \"" + str(norm_exp_file_path) + "\" cannot be opened.")

            # checks the normexp file contents
            for line_number, line in enumerate(norm_exp_lines, 1):
                line_split = line.rstrip().split("\t")

                # every line after the header holds one number per sample
                if line_number > 1:
                    for cell in line_split[1:]:
                        if not is_number(cell):
                            fail("Error: the normexp file has a cell that is not a number at line " + str(line_number) + ".")
                    continue

                # Validates the header line (samples etc)
                if line_split[0].upper() != "ID":
                    fail("Error: the first column in normexp file is not called \"ID\".")

                samples = line_split[1:]

                # Checks the normexp file and ss have the same number of samples:
                if len(samples) > len(sample_list):
                    fail("Error: there are more samples in the normexp file than in the sample sheet.")
                if len(samples) < len(sample_list):
                    fail("Error: there are fewer samples in the normexp file than in the sample sheet.")

                # Checks the samples in the normexp are also in the ss
                for sample in samples:
                    if not is_sample(sample.upper(), sample_list):
                        fail("Error: in the normexp file the sample " + sample.upper() + " is not in the sample sheet.")

    # tests if the required inputs have been supplied
    if norm_exp_file_path is None:
        fail(invalid_format)

    print("validated the normexp parameter")