Example 1
def is_conserved(templine, headlist, yaml_commands):
    total = 0
    templinelist = templine.rstrip("\n").split("\t")

    tiering_cmds = yaml_commands[yaml_keys.kModules][yaml_keys.kTiering]
    colHeaders = yaml_utils.convertColumns(
        tiering_cmds[yaml_keys.kTConservationCols], yaml_commands)
    colThresholds = tiering_cmds[yaml_keys.kTConservationThresholds]
    thresh = tiering_cmds[yaml_keys.kTConservationGlobalThreshold]

    for idx, colHeader in enumerate(colHeaders):
        colThreshold = colThresholds[idx]
        col = templinelist[headlist.index(colHeader)]
        try:
            if (col != '' and
                (((type(colThreshold) is float or type(colThreshold) is int)
                  and float(col) >= colThreshold) or col == colThreshold)):
                total += 1
        except ValueError:
            print 'headlist ' + str(headlist)
            print 'templinelist ' + str(templinelist)
            print 'colHeader: ' + str(colHeader)
            print 'headlist index: ' + str(headlist.index(colHeader))
            print 'col: ' + str(col)
            raise
        #debug
#         elif(col == ''):
#             print 'warning: no value for '

    if total >= thresh:
        return 1  # True
    else:
        return 0  # False
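
The per-column test above reduces to one rule: a numeric threshold passes when float(col) >= threshold, and a string threshold passes on exact match. A minimal standalone sketch of that rule (the column values and thresholds below are illustrative stand-ins, not the actual modules.yml configuration):

def meets_threshold(value, threshold):
    # numeric thresholds compare by value; string thresholds by equality
    if value == '':
        return False
    if isinstance(threshold, (int, float)):
        return float(value) >= threshold
    return value == threshold

# hypothetical conservation columns and thresholds (cf. the legacy
# commented-out values in Example 8: GERP >= 2.0, phyloP == 'C', etc.)
cols = {'gerp': '3.1', 'phyloP': 'C', 'phastCons': ''}
thresholds = {'gerp': 2.0, 'phyloP': 'C', 'phastCons': 250}
total = sum(1 for name in cols if meets_threshold(cols[name], thresholds[name]))
print('conserved' if total >= 2 else 'not conserved')  # global threshold of 2
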
Example 2
def is_conserved(templine, headlist, yaml_commands):
    total = 0
    templinelist = templine.rstrip("\n").split("\t")
    
    tiering_cmds = yaml_commands[yaml_keys.kModules][yaml_keys.kTiering]
    colHeaders = yaml_utils.convertColumns(tiering_cmds[yaml_keys.kTConservationCols], yaml_commands)
    colThresholds = tiering_cmds[yaml_keys.kTConservationThresholds]
    thresh = tiering_cmds[yaml_keys.kTConservationGlobalThreshold]
    
    for idx,colHeader in enumerate(colHeaders):
        colThreshold = colThresholds[idx]
        col = templinelist[headlist.index(colHeader)]
        try:
            if(col != '' and
               (((type(colThreshold) is float or type(colThreshold) is int) and float(col) >= colThreshold)
                or col == colThreshold)
            ):
                total += 1
        except ValueError:
            print 'headlist ' + str(headlist)
            print 'templinelist ' + str(templinelist)
            print 'colHeader: ' + str(colHeader)
            print 'headlist index: ' + str(headlist.index(colHeader))
            print 'col: ' + str(col)
            raise
        #debug
#         elif(col == ''):
#             print 'warning: no value for '

    if total >= thresh:
        return 1 # True
    else:
        return 0 # False
Example 3
def tolerance_pass(line, headlist, yaml_commands):
    lineList = line.rstrip("\n").split("\t")
    tiering_cmds = yaml_commands[yaml_keys.kModules][yaml_keys.kTiering]
    tolerance_zscore_colHeaders = yaml_utils.convertColumns(tiering_cmds[yaml_keys.kTToleranceZScoreCols], yaml_commands)
    tolerance_zscore_cutoff = tiering_cmds[yaml_keys.kTToleranceZScoreCutoff]
    
    for zscore_col_header in tolerance_zscore_colHeaders:
        zscores = lineList[headlist.index(zscore_col_header)].split(yaml_commands[yaml_keys.kDDefaults][yaml_keys.kDMultimatchDelimiter]) # our delimiter
        for zscore in zscores:
            if(zscore != '' and float(zscore) > float(tolerance_zscore_cutoff)):
                return True
    #else
    return False
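
The core of tolerance_pass: a single cell may hold several delimiter-separated z-scores (one per matched record), and one value above the cutoff is enough to pass. A standalone sketch of that check (the delimiter and cutoff are illustrative; the real ones come from datasets.yml and modules.yml):

def any_score_above(cell, cutoff, delimiter='|'):
    # a cell like '1.2|3.5' holds one z-score per multimatch record
    return any(s != '' and float(s) > float(cutoff)
               for s in cell.split(delimiter))

print(any_score_above('1.2|3.5', 2))  # True: 3.5 > 2
print(any_score_above('', 2))         # False: empty cell
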
Example 4
def tolerance_pass(line, headlist, yaml_commands):
    lineList = line.rstrip("\n").split("\t")
    tiering_cmds = yaml_commands[yaml_keys.kModules][yaml_keys.kTiering]
    tolerance_zscore_colHeaders = yaml_utils.convertColumns(
        tiering_cmds[yaml_keys.kTToleranceZScoreCols], yaml_commands)
    tolerance_zscore_cutoff = tiering_cmds[yaml_keys.kTToleranceZScoreCutoff]

    for zscore_col_header in tolerance_zscore_colHeaders:
        zscores = lineList[headlist.index(zscore_col_header)].split(
            yaml_utils.get_dataset_defaults(yaml_commands)[
                yaml_keys.kDMultimatchDelimiter])  # our delimiter
        for zscore in zscores:
            if (zscore != ''
                    and float(zscore) > float(tolerance_zscore_cutoff)):
                return True
    #else
    return False
Example 5
def is_pathogenic(templine, headlist, yaml_commands):
    templinelist = templine.split("\t")
    pathogenic = 0

    tiering_cmds = yaml_commands[yaml_keys.kModules][yaml_keys.kTiering]
    nalg = tiering_cmds[yaml_keys.kTPathogenicityGlobalThreshold]

    colHeaders = yaml_utils.convertColumns(
        tiering_cmds[yaml_keys.kTPathogenicityCols], yaml_commands)
    colsThresholds = tiering_cmds[yaml_keys.kTPathogenicityThresholds]

    for idx, colHeader in enumerate(colHeaders):
        if (isinstance(colHeader, list)):
            passed = False
            colThresholdsList = colsThresholds[idx]
            for inner_idx, singleColHeader in enumerate(colHeader):
                colThresholds = yaml_utils.split_multiple_col_thresholds(
                    colThresholdsList[inner_idx], yaml_commands)
                col = templinelist[headlist.index(singleColHeader)]
                if (passes_criteria(col, colThresholds)):
                    passed = True
                    break
            if (passed):
                pathogenic += 1
            continue

        #else
        colThresholds = yaml_utils.split_multiple_col_thresholds(
            colsThresholds[idx], yaml_commands)
        col = templinelist[headlist.index(colHeader)]
        passed = passes_criteria(col, colThresholds)
        if (passed):
            pathogenic += 1

    if pathogenic >= int(nalg):
        #debug
        #         print 'is pathogenic'
        return 1
    else:
        #debug
        #         print 'not pathogenic'
        return 0
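
is_pathogenic delegates the per-column test to passes_criteria, which is not shown in these examples. Judging from the commented-out block retained in Example 7, a plausible reconstruction applies the same numeric-or-exact-match rule as is_conserved across a list of alternative thresholds (an inference from that dead code, not the verbatim STMP source):

def passes_criteria(col, colThresholds):
    # inferred: the column passes if it meets any one of the thresholds
    for colThreshold in colThresholds:
        if (col != '' and
            (((type(colThreshold) is float or type(colThreshold) is int)
              and float(col) >= colThreshold) or col == colThreshold)):
            return True
    return False
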
Example 6
def is_pathogenic(templine, headlist, yaml_commands):
    templinelist = templine.split("\t")
    pathogenic = 0
    
    tiering_cmds = yaml_commands[yaml_keys.kModules][yaml_keys.kTiering]
    nalg = tiering_cmds[yaml_keys.kTPathogenicityGlobalThreshold]
    
    colHeaders = yaml_utils.convertColumns(tiering_cmds[yaml_keys.kTPathogenicityCols], yaml_commands)
    colsThresholds = tiering_cmds[yaml_keys.kTPathogenicityThresholds]
    
    for idx,colHeader in enumerate(colHeaders):
        if(isinstance(colHeader, list)):
            passed = False
            colThresholdsList = colsThresholds[idx]
            for inner_idx,singleColHeader in enumerate(colHeader):
                colThresholds = yaml_utils.split_multiple_col_thresholds(colThresholdsList[inner_idx], yaml_commands)
                col = templinelist[headlist.index(singleColHeader)]
                if(passes_criteria(col, colThresholds)):
                    passed = True
                    break
            if(passed):
                pathogenic += 1
            continue

        #else
        colThresholds = yaml_utils.split_multiple_col_thresholds(colsThresholds[idx], yaml_commands)
        col = templinelist[headlist.index(colHeader)]
        passed = passes_criteria(col, colThresholds)
        if(passed):
            pathogenic += 1
            
    if pathogenic >= int(nalg):
        #debug
#         print 'is pathogenic'
        return 1
    else:
        #debug
#         print 'not pathogenic'
        return 0
Example 7
def is_pathogenic(templine, headlist, yaml_commands):
    templinelist = templine.split("\t")
    pathogenic = 0
    
    tiering_cmds = yaml_commands[yaml_keys.kModules][yaml_keys.kTiering]
    nalg = tiering_cmds[yaml_keys.kTPathogenicityGlobalThreshold]
    
    colHeaders = yaml_utils.convertColumns(tiering_cmds[yaml_keys.kTPathogenicityCols], yaml_commands)
    colsThresholds = tiering_cmds[yaml_keys.kTPathogenicityThresholds]
    
    for idx,colHeader in enumerate(colHeaders):
        if(isinstance(colHeader, list)):
            passed = False
            colThresholdsList = colsThresholds[idx]
            for inner_idx,singleColHeader in enumerate(colHeader):
                colThresholds = yaml_utils.split_multiple_col_thresholds(colThresholdsList[inner_idx], yaml_commands)
                col = templinelist[headlist.index(singleColHeader)]
                if(passes_criteria(col, colThresholds)):
                    passed = True
                    break
            if(passed):
                pathogenic += 1
            continue

        #else
#         if(isinstance(colsThresholds[idx], list)):
        colThresholds = yaml_utils.split_multiple_col_thresholds(colsThresholds[idx], yaml_commands)
        col = templinelist[headlist.index(colHeader)]
        passed = passes_criteria(col, colThresholds)
        if(passed):
            pathogenic += 1
            
#         passed = False
#         for colThreshold in colThresholds:
#             if(col != '' and
#                (((type(colThreshold) is float or type(colThreshold) is int) and float(col) >= colThreshold)
#                 or col == colThreshold)
#             ):
#                 passed = True
#                 break
#         if(passed):
#             pathogenic += 1
    
#     sift = templinelist[headlist.index(vcfHeaders.kSiftPred)] if vcfHeaders.kSiftPred in headlist else ''
#     pp2 = templinelist[headlist.index(vcfHeaders.kPolyphen2Pred)] if vcfHeaders.kPolyphen2Pred in headlist else ''
#     lrt = templinelist[headlist.index(vcfHeaders.kLrtPred)] if vcfHeaders.kLrtPred in headlist else ''
#     mt = templinelist[headlist.index(vcfHeaders.kMutationTasterPred)] if vcfHeaders.kMutationTasterPred in headlist else ''
#     pp2_2 = templinelist[headlist.index(vcfHeaders.kPolyphen2Pred_2)] if vcfHeaders.kPolyphen2Pred_2 in headlist else ''
#     
#     # check for header mismatches
#     if(vcfHeaders.kSiftPred not in headlist):
#         print 'warning: ' + vcfHeaders.kSiftPred + ' not found in annotated header'
#     if(vcfHeaders.kPolyphen2Pred not in headlist):
#         print 'warning: ' + vcfHeaders.kPolyphen2Pred + ' not found in annotated header'
#     if(vcfHeaders.kLrtPred not in headlist):
#         print 'warning: ' + vcfHeaders.kLrtPred + ' not found in annotated header'
#     if(vcfHeaders.kMutationTasterPred not in headlist):
#         print 'warning: ' + vcfHeaders.kMutationTasterPred + ' not found in annotated header'
#     if(vcfHeaders.kPolyphen2Pred_2 not in headlist):
#         print 'warning: ' + vcfHeaders.kPolyphen2Pred_2 + ' not found in annotated header (though currently unused)'
#     
#     if sift == "D":
#         pathogenic+=1
#         #debug
#         print 'SIFT=D'
#     if (pp2 == "P") or (pp2 == "D"):
#         pathogenic+=1
#         #debug 
#         print 'pp=P or D'
#     if lrt == "D":
#         pathogenic+=1
#         # debug
#         print 'lrt=D'
#     if (mt == "A") or (mt == "D"):
#         pathogenic+=1
#         #debug
#         print 'mt=A or D'
    if pathogenic >= int(nalg):
        #debug
#         print 'is pathogenic'
        return 1
    else:
        #debug
#         print 'not pathogenic'
        return 0
Example 8
def is_conserved(templine, headlist, yaml_commands):
    total = 0
    templinelist = templine.rstrip("\n").split("\t")
    
    tiering_cmds = yaml_commands[yaml_keys.kModules][yaml_keys.kTiering]
    colHeaders = yaml_utils.convertColumns(tiering_cmds[yaml_keys.kTConservationCols], yaml_commands)
    colThresholds = tiering_cmds[yaml_keys.kTConservationThresholds]
    thresh = tiering_cmds[yaml_keys.kTConservationGlobalThreshold]
    
    for idx,colHeader in enumerate(colHeaders):
        colThreshold = colThresholds[idx]
        col = templinelist[headlist.index(colHeader)]
        try:
            if(col != '' and
               (((type(colThreshold) is float or type(colThreshold) is int) and float(col) >= colThreshold)
                or col == colThreshold)
            ):
                total += 1
        except ValueError:
            print 'headlist ' + str(headlist)
            print 'templinelist ' + str(templinelist)
            print 'colHeader: ' + str(colHeader)
            print 'headlist index: ' + str(headlist.index(colHeader))
            print 'col: ' + str(col)
            raise
        #debug
#         elif(col == ''):
#             print 'warning: no value for '
    
#     phyloP = ''
#     if(vcfHeaders.kPhyloP_pred in headlist):
#         phyloP = templinelist[headlist.index(vcfHeaders.kPhyloP_pred)]
#     else:
#         print 'warning: ' + vcfHeaders.kPhyloP_pred + ' not found in annotated header'
#         
#     gerp = ''
#     if(vcfHeaders.kGerp in headlist):
#         gerp = templinelist[headlist.index(vcfHeaders.kGerp)]
#     else:
#         print 'warning: ' + vcfHeaders.kGerp + ' not found in annotated header'
#         
#     phast_cons = ''
#     if(vcfHeaders.kPhastConsElts46Way in headlist):
#         phast_cons = templinelist[headlist.index(vcfHeaders.kPhastConsElts46Way)]
#     else:
#         print 'warning: ' + vcfHeaders.kPhastConsElts46Way + ' not found in annotated header'
#     
#     wg_gerp = ''
#     if(vcfHeaders.kWg_gerp in headlist): 
#         wg_gerp = templinelist[headlist.index(vcfHeaders.kWg_gerp)]
#     else:
#         print 'warning: ' + vcfHeaders.kWg_gerp + ' not found in annotated header'
# 
#     if gerp != "": #and phyloP != "":
#         gerp = float(gerp)
#         if gerp >= 2.0:
#             total+=1
#     if phyloP == "C":
#         total+=1
#     if phast_cons!= "":
#         if int(phast_cons) >= 250:
#             total+=1
#     if wg_gerp != "":
#         if float(wg_gerp) >= 2.0:
#             total+=1
    if total >= thresh:
        return 1
    else:
        return 0
Example 9
def preflight_checks(yaml_commands, db_conn, raise_exception=True):
    errors = []
    warnings = []
    c = db_conn.cursor()
    modules_yaml = yaml_commands[yaml_keys.kModules]

    # TODO:
    # - add more checks for any currently unchecked yaml keys, etc. in modules.yml and datasets.yml (e.g. throw an error if a yaml key like "Consensus_Columns" is missing)
    # - if there are any currently unchecked dataset column references in modules.yml, check those (cross-reference against datasets.yml by using yaml_utils.convertColumns() to ensure the appropriate columns can be found in datasets.yml)
    # - add checks for all external files (e.g. target gene lists for tiering). Warn/error if target gene list files are empty/missing.
    # - add checks for snpeff and annovar path
    # - check that summary column (if used, e.g. as gene name column for tiering) is specified with appropriate suffix (same as the suffix specified as Consensus_Column_Suffix)
    # - perhaps even do a simulated annotation run (just the header) and compare absolute headers in modules.yml against that, too

    # WARNING: currently not checking absolute columns (only columns specified with dot (.) notation)
    # 1) check downstream column references in modules.yml against datasets.yml
    # 1-1) check consensus columns
    consensus_cols_yaml = modules_yaml[yaml_keys.kAnnotation][
        yaml_keys.kAConsensusColumns]
    for consensus_col in consensus_cols_yaml:
        if consensus_col != yaml_keys.kAConsensusColumnsOrder:
            yaml_utils.convertColumns(
                consensus_cols_yaml[consensus_col], yaml_commands
            )  # will raise exception if can't find one of the columns with dot (.) notation
    # 1-2) check tiering columns
    tiering_cols_yaml = get_all_tiering_cols(yaml_commands)
    yaml_utils.convertColumns(
        tiering_cols_yaml, yaml_commands
    )  # will raise exception if can't find one of the columns with dot (.) notation

    # 2) check datasets.yml against database (and vice-versa)
    # 2-1) check table names to see if any are missing from db/datasets.yml
    datasets_yaml = yaml_utils.get_datasets(yaml_commands)
    #     dataset_annotation_names_dict = {}
    dataset_annotation_names = []
    dataset_names = []
    for dataset in datasets_yaml:
        dataset_names.append(dataset)
        dataset_annotation_names.append(
            datasets_yaml[dataset][yaml_keys.kDAnnotation])
#         dataset_annotation_names_dict[datasets_yaml[dataset][yaml_keys.kDAnnotation]] = 1
    datasets_db = db_utils.get_table_names(c)
    # check db against datasets.yml
    for dataset in datasets_db:
        if re.sub('_r$', '', dataset) not in dataset_annotation_names:
            warning = 'warning: ' + str(
                dataset) + ' in db but not datasets.yml (or commented out)'
            warnings.append(warning)
            print warning
    # check datasets.yml against db
    for idx, dataset in enumerate(dataset_annotation_names):
        dataset_yaml_name = dataset_names[idx]
        if (dataset not in datasets_db and dataset + '_r' not in datasets_db):
            if (datasets_yaml[dataset_yaml_name][yaml_keys.kDImportIfMissing]):
                error = 'error: ' + str(
                    dataset
                ) + ' in datasets.yml but not database. Run stmp.py --update_db to load it into the database, or comment it out (using # before each line) or change Import_If_Missing to False in datasets.yml to ignore it.'
                errors.append(error)
                print error
            else:  # Import_If_Missing is False
                warning = 'warning: ' + str(
                    dataset
                ) + ' in datasets.yml but not database, and is currently not being imported (Import_If_Missing = False). If desired, change Import_If_Missing to True for this dataset in datasets.yml and run stmp.py --update_db to load it into the database. Otherwise this dataset will not be used for annotations.'
                warnings.append(warning)
                print warning

    # 2-2) check dataset columns in yaml vs db (both directions)
    # WARNING: we do not currently check the delimiter for multiple matches, if stored in the db (specified in the defaults section of datasets.yml)
    for dataset in datasets_yaml:
        dataset_annotated_name = datasets_yaml[dataset][yaml_keys.kDAnnotation]
        dataset_db_name = ''
        if (dataset_annotated_name in datasets_db):
            dataset_db_name = dataset_annotated_name
        elif (dataset_annotated_name + '_r' in datasets_db):
            dataset_db_name = dataset_annotated_name + '_r'
        else:  # dataset not found in db (which means we already gave an error/warning above)
            continue
        if (db_utils.is_region_dataset(dataset_db_name) !=
                yaml_utils.is_region_dataset(dataset, yaml_commands)):
            error = 'error: dataset ' + str(
                dataset_db_name
            ) + ' in db does not match with ' + str(
                dataset
            ) + ' in datasets.yml. One is specified as region and the other is specified as point. Either change datasets.yml or remove and reimport the dataset into the database.'
            errors.append(error)
            print error
        # 2-2a) check column names in yaml vs db
        yaml_cols = datasets_yaml[dataset][yaml_keys.kDColumnHeaders]
        db_cols = db_utils.getColumnNamesOfTable(c, dataset_db_name)
        for col in yaml_cols:
            if (col != '' and col not in db_cols
                    and col not in stmp_annotation_util.START_COLUMN_HEADERS
                    and col not in stmp_annotation_util.STOP_COLUMN_HEADERS
                    and col not in stmp_annotation_util.CHR_HEADERS):
                # TODO check for whether col is start, stop, or chr col and then check whether these cols are in the imported table with standard names (for true db integrity check)
                error = 'error: column ' + str(col) + ' of dataset ' + str(
                    dataset) + ' found in datasets.yml but not database'
                errors.append(error)
                print error
        # 2-2b) check column names in db vs yaml
        for col in db_cols:
            if (col not in yaml_cols and col.lower()
                    not in stmp_annotation_util.START_COLUMN_HEADERS and
                    col.lower() not in stmp_annotation_util.STOP_COLUMN_HEADERS
                    and col.lower() not in stmp_annotation_util.CHR_HEADERS
                    and col.lower() not in vcfHeaders.kVCFColTypes
                    and col != vcfHeaders.kClinvarStarHeader):
                error = 'error: column ' + str(col) + ' of dataset ' + str(
                    dataset
                ) + ' found in database but not datasets.yml (and is not a standard VCF column, clinvar star rating, chr, start, or stop column)'
                errors.append(error)
                print error

    # (End of checks) summarize with all the errors, warnings, etc.
    err_warning_summary = 'Warnings:\n' + '\n'.join(
        warnings) + '\n\n' + 'Errors:\n' + "\n".join(errors) + '\n\n' + str(
            len(warnings)) + ' warnings and ' + str(
                len(errors)) + ' errors in preflight checks.'
    if (len(errors) > 0):
        if (raise_exception):
            raise ValueError(
                err_warning_summary +
                ' Please correct the errors and then re-run STMP.')
        #else
        return False
    #else
    print "\n" + err_warning_summary
    print '+(OK) passed pre-flight checks\n'
    return True
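
Rather than failing on the first problem, preflight_checks accumulates every error and warning, prints each as it is found, and only decides at the end whether to raise, return False, or pass. A stripped-down sketch of that accumulate-then-summarize pattern (standalone, not STMP code):

def run_checks(checks, raise_exception=True):
    # checks: list of (message, passed, severity) tuples
    errors, warnings = [], []
    for message, passed, severity in checks:
        if not passed:
            (errors if severity == 'error' else warnings).append(message)
    summary = ('Warnings:\n' + '\n'.join(warnings) + '\n\nErrors:\n' +
               '\n'.join(errors) + '\n\n' + str(len(warnings)) +
               ' warnings and ' + str(len(errors)) + ' errors.')
    if len(errors) > 0:
        if raise_exception:
            raise ValueError(summary)
        return False
    print(summary)
    return True

# hypothetical results: one passing check, one warning-level failure
print(run_checks([('table clinvar present in db', True, 'error'),
                  ('db table foo not in datasets.yml', False, 'warning')]))
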
Example 10
def preflight_checks(yaml_commands, db_conn, raise_exception=True):
    errors = []
    warnings = []
    c = db_conn.cursor()
    modules_yaml = yaml_commands[yaml_keys.kModules]
    
    # TODO: 
    # - add more checks for any currently unchecked yaml keys, etc. in modules.yml and datasets.yml (e.g. throw an error if a yaml key like "Consensus_Columns" is missing) 
    # - if there are any currently unchecked dataset column references in modules.yml, check those (cross-reference against datasets.yml by using yaml_utils.convertColumns() to ensure the appropriate columns can be found in datasets.yml)
    # - add checks for all external files (e.g. target gene lists for tiering). Warn/error if target gene list files are empty/missing.
    # - add checks for snpeff and annovar path
    # - check that summary column (if used, e.g. as gene name column for tiering) is specified with appropriate suffix (same as the suffix specified as Consensus_Column_Suffix)
    # - perhaps even do a simulated annotation run (just the header) and compare absolute headers in modules.yml against that, too
    
    # WARNING: currently not checking absolute columns (only columns specified with dot (.) notation)
    # 1) check downstream column references in modules.yml against datasets.yml
    # 1-1) check consensus columns
    consensus_cols_yaml = modules_yaml[yaml_keys.kAnnotation][yaml_keys.kAConsensusColumns]
    for consensus_col in consensus_cols_yaml:
        if consensus_col != yaml_keys.kAConsensusColumnsOrder:
            yaml_utils.convertColumns(consensus_cols_yaml[consensus_col], yaml_commands) # will raise exception if can't find one of the columns with dot (.) notation
    # 1-2) check tiering columns
    tiering_cols_yaml = get_all_tiering_cols(yaml_commands)
    yaml_utils.convertColumns(tiering_cols_yaml, yaml_commands) # will raise exception if can't find one of the columns with dot (.) notation
    
    # 2) check datasets.yml against database (and vice-versa)
    # 2-1) check table names to see if any are missing from db/datasets.yml
    datasets_yaml = yaml_utils.get_datasets(yaml_commands)
#     dataset_annotation_names_dict = {}
    dataset_annotation_names = []
    dataset_names = []
    for dataset in datasets_yaml:
        dataset_names.append(dataset)
        dataset_annotation_names.append(datasets_yaml[dataset][yaml_keys.kDAnnotation])
#         dataset_annotation_names_dict[datasets_yaml[dataset][yaml_keys.kDAnnotation]] = 1
    datasets_db = db_utils.get_table_names(c)
    # check db against datasets.yml
    for dataset in datasets_db:
        if re.sub('_r$', '', dataset) not in dataset_annotation_names:
            warning = 'warning: ' + str(dataset) + ' in db but not datasets.yml (or commented out)'
            warnings.append(warning)
            print warning
    # check datasets.yml against db
    for idx,dataset in enumerate(dataset_annotation_names):
        dataset_yaml_name = dataset_names[idx]
        if(dataset not in datasets_db and dataset+'_r' not in datasets_db):
            if(datasets_yaml[dataset_yaml_name][yaml_keys.kDImportIfMissing]):
                error = 'error: ' + str(dataset) + ' in datasets.yml but not database. Run stmp.py --update_db to load it into the database, or comment it out (using # before each line) or change Import_If_Missing to False in datasets.yml to ignore it.'
                errors.append(error)
                print error
            else: # Import_If_Missing is False
                warning = 'warning: ' + str(dataset) + ' in datasets.yml but not database, and is currently not being imported (Import_If_Missing = False). If desired, change Import_If_Missing to True for this dataset in datasets.yml and run stmp.py --update_db to load it into the database. Otherwise this dataset will not be used for annotations.'
                warnings.append(warning)
                print warning
        
    # 2-2) check dataset columns in yaml vs db (both directions)
    # WARNING: we do not currently check the delimiter for multiple matches, if stored in the db (specified in the defaults section of datasets.yml)
    for dataset in datasets_yaml:
        dataset_annotated_name = datasets_yaml[dataset][yaml_keys.kDAnnotation]
        dataset_db_name = ''
        if(dataset_annotated_name in datasets_db):
            dataset_db_name = dataset_annotated_name
        elif(dataset_annotated_name+'_r' in datasets_db):
            dataset_db_name=dataset_annotated_name+'_r'
        else: # dataset not found in db (which means we already gave an error/warning above)
            continue
        if(db_utils.is_region_dataset(dataset_db_name) != yaml_utils.is_region_dataset(dataset, yaml_commands)):
            error = 'error: dataset ' + str(dataset_db_name) + ' in db does not match with ' + str(dataset) + ' in datasets.yml. One is specified as region and the other is specified as point. Either change datasets.yml or remove and reimport the dataset into the database.'
            errors.append(error)
            print error
        # 2-2a) check column names in yaml vs db
        yaml_cols = datasets_yaml[dataset][yaml_keys.kDColumnHeaders]
        db_cols = db_utils.getColumnNamesOfTable(c, dataset_db_name)
        for col in yaml_cols:
            if(col != '' and col not in db_cols 
               and col not in stmp_annotation_util.START_COLUMN_HEADERS and col not in stmp_annotation_util.STOP_COLUMN_HEADERS and col not in stmp_annotation_util.CHR_HEADERS):
                # TODO check for whether col is start, stop, or chr col and then check whether these cols are in the imported table with standard names (for true db integrity check)
                error = 'error: column ' + str(col) + ' of dataset ' + str(dataset) + ' found in datasets.yml but not database'
                errors.append(error)
                print error
        # 2-2b) check column names in db vs yaml
        for col in db_cols:
            if(col not in yaml_cols 
               and col.lower() not in stmp_annotation_util.START_COLUMN_HEADERS and col.lower() not in stmp_annotation_util.STOP_COLUMN_HEADERS and col.lower() not in stmp_annotation_util.CHR_HEADERS 
               and col.lower() not in vcfHeaders.kVCFColTypes 
               and col != vcfHeaders.kClinvarStarHeader):
                error = 'error: column ' + str(col) + ' of dataset ' + str(dataset) + ' found in database but not datasets.yml (and is not a standard VCF column, clinvar star rating, chr, start, or stop column)'
                errors.append(error)
                print error
    
    # (End of checks) summarize with all the errors, warnings, etc.
    err_warning_summary = 'Warnings:\n' + '\n'.join(warnings) + '\n\n' + 'Errors:\n' + "\n".join(errors) + '\n\n' + str(len(warnings)) + ' warnings and ' + str(len(errors)) + ' errors in preflight checks.'
    if(len(errors) > 0):
        if(raise_exception):
            raise ValueError(err_warning_summary + ' Please correct the errors and then re-run STMP.')
        #else
        return False
    #else
    print "\n" + err_warning_summary
    print '+(OK) passed pre-flight checks\n'
    return True
Example 11
def tiers_allvars(in_file, out_stem, gene_file, pop, yaml_cmds):
    # populate parameters from YAML module specifications
    freq = yaml_cmds[yaml_keys.kModules][yaml_keys.kTiering][yaml_keys.kTRareAlleleFreqCutoff]
    gene_name_col_header = yaml_cmds[yaml_keys.kModules][yaml_keys.kTiering][yaml_keys.kTGeneNameCol]
    functional_column_headers = yaml_utils.convertColumns(yaml_cmds[yaml_keys.kModules][yaml_keys.kTiering][yaml_keys.kTFunctionalCols], yaml_cmds)
    skip_filter_pass_check = yaml_cmds[yaml_keys.kModules][yaml_keys.kTiering][yaml_keys.kTSkipFilterPassCheck]
    
    # check for whether any variant contains "PASS"
    cmd = 'grep "PASS" {infile}|grep -v "#"|wc -l'.format(infile=in_file)
    process = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout,stderr = process.communicate()
    returncode = process.returncode
    if(returncode != 0):
        raise ValueError('Failed to search annotated VCF for "PASS" prior to tiering.\ncmd: ' + str(cmd) + '\nErr: ' + str(stderr) + '\nReturncode: ' + str(returncode) + '\nOutput: ' + str(stdout))
    #else
    numHits = int(stdout)
    # TODO move this to pre-checks (prior to even annotation)
    if(numHits == 0 and not skip_filter_pass_check):
        raise ValueError('No variants detected that passed filtering. Re-run STMP with "Skip_Filter_Pass_Check: True" in modules.yml to prioritize all variants anyway.')
    elif(numHits == 0):
        print 'NOTICE: no variants detected that passed filtering. Skipping filter PASS check and prioritizing all variants anyway.'
    else:
        print 'Found ' + str(numHits) + ' variants that passed filtering.' + ' Tiering just these variants.'
        skip_filter_pass_check = False
    
    #open input and output files and initialize counters and lists for background populations
    filein = open(in_file, "r")
    output_log = open(out_stem+".metrics", "w")
    output_log.write("Metrics for stmp filtering, all variants from reference\n")
    header = filein.readline().rstrip("\n")
    headlist = header.split("\t")
    if(gene_file != None):
        g_file = open(gene_file, "r")
    fileoutrare = open(out_stem+'.rare.txt', 'w')
    fileout0 = open(out_stem+".tier0.txt", 'w')
    fileout1 = open(out_stem+".tier1.txt", "w")
    fileout2 = open(out_stem+".tier2.txt", "w")
    fileout3 = open(out_stem+".tier3.txt", "w")
    fileout4 = open(out_stem+".tier4.txt", "w")
    fileoutrare.write(header+"\n")
    fileout0.write("tier\t"+header + "\n")
    fileout1.write("tier\t"+header + "\n")
    fileout2.write("tier\t"+header + "\n")
    fileout3.write("tier\t"+header + "\n")
    fileout4.write("tier\t"+header + "\n")
    total = 0
    damaging0 = 0
    damaging1 = 0
    damaging2 = 0
    damaging3 = 0
    damaging4 = 0
    target_genes = 0
    rarevars = 0
    
    allele_freq_cols = yaml_utils.convertColumns(yaml_cmds[yaml_keys.kModules][yaml_keys.kTiering][yaml_keys.kTAlleleFreqCols], yaml_cmds) #convertTieringColumns(yaml_cmds)
    #debug
    print 'allele freq cols: ' + str(allele_freq_cols)
    
    backpoplist = vcfUtils.get_listindex(headlist, allele_freq_cols)
    #debug
#     print 'backpoplist: ' + str(backpoplist)

    #initialize gene list for region prioritization
    if(gene_file != None):
        genes = {}
        for line in g_file:
            if line.startswith('#'):
                continue
            linelist = line.rstrip("\n").split("\t")
            gene = linelist[0]
            
            if gene not in genes:
                genes[gene] = 1
            else:
                # debug: uncomment to warn about duplicate genes in the gene list
#                 print 'warning: duplicate gene ' + gene + ' in gene list ' + gene_file
                pass
    
    #iterate over input file and parse into tiers
    for line in filein:
        total+=1
        if ((skip_filter_pass_check or "PASS" in line) and ("#" not in line) and vcfUtils.is_rare(line, freq, backpoplist, yaml_cmds)
            and not vcfUtils.contains_text('MT', line, [stmp_consts.vcf_col_header_chrom], headlist, yaml_cmds, case_sensitive=False)
            and not vcfUtils.contains_text('ncRNA', line, functional_column_headers, headlist, yaml_cmds, case_sensitive=True)
            ):
            rarevars+=1
            fileoutrare.write(line)
            linelist = line.rstrip("\n").split("\t")
            # for now
            tmp = linelist[headlist.index(gene_name_col_header)].split(',')
            gene = tmp[0]

            if gene_file is None or gene in genes:
                target_genes+=1
                # tier 0: clinvar
                if(vcfUtils.isClinvarPathogenicOrLikelyPathogenic(line, headlist, yaml_cmds) and not vcfUtils.contains_text('0', line, [yaml_utils.get_datasets(yaml_cmds)['clinvar'][yaml_keys.kDAnnotation]+'_'+vcfHeaders.kClinvarStarHeader], headlist, yaml_cmds, case_sensitive=False)):
                    fileout0.write("0\t"+line)
                    damaging0+=1
                # tier 1
                elif vcfUtils.is_functional(line, "stoploss stopgain splicing frameshift", functional_column_headers, headlist):
                    fileout1.write("1\t"+line)
                    damaging1+=1
                # tier 2
                elif ((vcfUtils.is_functional(line, "nonsynonymous", functional_column_headers, headlist) and vcfUtils.is_conserved(line, headlist, yaml_cmds)) or vcfUtils.is_functional(line, "nonframeshift", functional_column_headers, headlist)):
                    fileout2.write("2\t"+line)
                    damaging2+=1
                # tier 3
                elif vcfUtils.is_functional(line, "nonsynonymous", functional_column_headers, headlist) and vcfUtils.is_pathogenic(line, headlist, yaml_cmds):
                    fileout3.write("3\t"+line)
                    damaging3+=1
                # tier 4
                elif vcfUtils.tolerance_pass(line, headlist, yaml_cmds) and vcfUtils.is_functional(line, "exonic splicing", functional_column_headers, headlist):
                    fileout4.write("4\t"+line)
                    damaging4+=1
                # else ignore variant

    output_log.write("Total variants queried: "+str(total)+"\n")
    output_log.write("Rare variants (allele freq < {freq}) queried: ".format(freq=str(freq))+str(rarevars)+"\n")
    output_log.write("Rare variants in {num} target genes: ".format(num=str(len(genes)) if gene_file != None else '')+str(target_genes)+"\n")
    output_log.write("Candidate variants, tier 0 (rare clinvar pathogenic or likely pathogenic variants with rating > 0 stars): "+str(damaging0)+"\n")
    output_log.write("Candidate variants, tier 1 (rare LOF variants -- stoploss, stopgain, splicing, and frameshift): "+str(damaging1)+"\n")
    output_log.write("Candidate variants, tier 2 (rare nonframeshift or (nonsynonymous and conserved) variants): "+str(damaging2)+"\n")
    output_log.write("Candidate variants, tier 3 (rare nonsynonymous pathogenic variants): "+str(damaging3)+"\n")
    output_log.write("Candidate variants, tier 4 (all other rare exonic/splicing variants with ExAC tolerance z-score (syn_z or mis_z or lof_z) > 2): "+str(damaging4)+"\n")

    filein.close()
    if(gene_file != None):
        g_file.close()
    fileoutrare.close()
    fileout0.close()
    fileout1.close()
    fileout2.close()
    fileout3.close()
    fileout4.close()
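
The tier assignment is an elif cascade, so each rare variant lands in exactly one bucket: the highest tier whose predicate it satisfies. A minimal sketch of that dispatch shape (the lambdas are stand-ins for the vcfUtils calls, not real criteria):

def assign_tier(line, tier_tests):
    # tier_tests: (tier_number, predicate) pairs in priority order
    for tier, test in tier_tests:
        if test(line):
            return tier
    return None  # variant matched no tier and is ignored

tier_tests = [(0, lambda l: 'clinvar_pathogenic' in l),
              (1, lambda l: 'stopgain' in l),
              (2, lambda l: 'nonsynonymous' in l)]
print(assign_tier('nonsynonymous;stopgain', tier_tests))  # 1: first match wins
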
Example 12
def tiers_allvars(in_file, out_stem, gene_file, pop, yaml_cmds):
    # populate parameters from YAML module specifications
    freq = yaml_cmds[yaml_keys.kModules][yaml_keys.kTiering][yaml_keys.kTRareAlleleFreqCutoff]
    gene_name_col_header = yaml_cmds[yaml_keys.kModules][yaml_keys.kTiering][yaml_keys.kTGeneNameCol]
    functional_column_headers = yaml_utils.convertColumns(yaml_cmds[yaml_keys.kModules][yaml_keys.kTiering][yaml_keys.kTFunctionalCols], yaml_cmds)
    
    #open input and output files and initialize counters and lists for background populations
    filein = open(in_file, "r")
    output_log = open(out_stem+".metrics", "w")
    output_log.write("Metrics for stmp filtering, all variants from reference\n")
    header = filein.readline().rstrip("\n")
    headlist = header.split("\t")
    if(gene_file != None):
        g_file = open(gene_file, "r")
    fileoutrare = open(out_stem+'.rare.txt', 'w')
    fileout0 = open(out_stem+".tier0.txt", 'w')
    fileout1 = open(out_stem+".tier1.txt", "w")
    fileout2 = open(out_stem+".tier2.txt", "w")
    fileout3 = open(out_stem+".tier3.txt", "w")
    fileout4 = open(out_stem+".tier4.txt", "w")
    fileoutrare.write(header+"\n")
    fileout0.write("tier\t"+header + "\n")
    fileout1.write("tier\t"+header + "\n")
    fileout2.write("tier\t"+header + "\n")
    fileout3.write("tier\t"+header + "\n")
    fileout4.write("tier\t"+header + "\n")
    total = 0
    damaging0 = 0
    damaging1 = 0
    damaging2 = 0
    damaging3 = 0
    damaging4 = 0
    target_genes = 0
    rarevars = 0
    
    allele_freq_cols = yaml_utils.convertColumns(yaml_cmds[yaml_keys.kModules][yaml_keys.kTiering][yaml_keys.kTAlleleFreqCols], yaml_cmds) #convertTieringColumns(yaml_cmds)
    
    backpoplist = vcfUtils.get_listindex(headlist, allele_freq_cols)

    #initialize gene list for region prioritization
    if(gene_file != None):
        genes = {}
        for line in g_file:
            if line.startswith('#'):
                continue
            linelist = line.rstrip("\n").split("\t")
            gene = linelist[0]
            
            if gene not in genes:
                genes[gene] = 1
            else:
                # debug: uncomment to warn about duplicate genes in the gene list
#                 print 'warning: duplicate gene ' + gene + ' in gene list ' + gene_file
                pass
    
    #iterate over input file and parse into tiers
    for line in filein:
        total+=1
        if (("PASS" in line) and ("#" in line) == 0 and vcfUtils.is_rare(line, freq, backpoplist)
            and not vcfUtils.contains_text('MT', line, [stmp_consts.vcf_col_header_chrom], headlist, yaml_cmds, case_sensitive=False)
            and not vcfUtils.contains_text('ncRNA', line, functional_column_headers, headlist, yaml_cmds, case_sensitive=True)
            ):
            rarevars+=1
            fileoutrare.write(line)
            linelist = line.rstrip("\n").split("\t")
            # for now
            tmp = linelist[headlist.index(gene_name_col_header)].split(',')
            gene = tmp[0]

            if gene_file is None or gene in genes:
                target_genes+=1
                # tier 0: clinvar
                if(vcfUtils.isClinvarPathogenicOrLikelyPathogenic(line, headlist, yaml_cmds) and not vcfUtils.contains_text('0', line, [yaml_cmds['clinvar'][yaml_keys.kDAnnotation]+'_'+vcfHeaders.kClinvarStarHeader], headlist, yaml_cmds, case_sensitive=False)):
                    fileout0.write("0\t"+line)
                    damaging0+=1
                elif vcfUtils.is_functional(line, "stoploss stopgain splicing frameshift", functional_column_headers, headlist):
                    fileout1.write("1\t"+line)
                    damaging1+=1
                elif ((vcfUtils.is_functional(line, "nonsynonymous", functional_column_headers, headlist) and vcfUtils.is_conserved(line, headlist, yaml_cmds)) or vcfUtils.is_functional(line, "nonframeshift", functional_column_headers, headlist)):
                    fileout2.write("2\t"+line)
                    damaging2+=1
                elif vcfUtils.is_functional(line, "nonsynonymous", functional_column_headers, headlist) and vcfUtils.is_pathogenic(line, headlist, yaml_cmds):
                    fileout3.write("3\t"+line)
                    damaging3+=1
                elif vcfUtils.tolerance_pass(line, headlist, yaml_cmds):
                    fileout4.write("4\t"+line)
                    damaging4+=1
                # else ignore variant

    output_log.write("Total variants queried: "+str(total)+"\n")
    output_log.write("Rare variants (allele freq < {freq}) queried: ".format(freq=str(freq))+str(rarevars)+"\n")
    output_log.write("Rare variants in {num} target genes: ".format(num=str(len(genes)) if gene_file != None else '')+str(target_genes)+"\n")
    output_log.write("Candidate variants, tier 0 (rare clinvar pathogenic or likely pathogenic variants): "+str(damaging0)+"\n")
    output_log.write("Candidate variants, tier 1 (rare LOF variants -- stoploss, stopgain, splicing, and frameshift): "+str(damaging1)+"\n")
    output_log.write("Candidate variants, tier 2 (rare nonframeshift or (nonsynonymous and conserved) variants): "+str(damaging2)+"\n")
    output_log.write("Candidate variants, tier 3 (rare nonsynonymous pathogenic variants): "+str(damaging3)+"\n")
    output_log.write("Candidate variants, tier 4 (all other rare variants with ExAC tolerance z-score (syn_z or mis_z or lof_z) > 2): "+str(damaging4)+"\n")

    filein.close()
    if(gene_file != None):
        g_file.close()
    fileoutrare.close()
    fileout0.close()
    fileout1.close()
    fileout2.close()
    fileout3.close()
    fileout4.close()
Example 13
def tiers_allvars(in_file, out_stem, gene_file, pop, yaml_cmds):
    # populate parameters from YAML module specifications
    freq = yaml_cmds[yaml_keys.kModules][yaml_keys.kTiering][
        yaml_keys.kTRareAlleleFreqCutoff]
    gene_name_col_header = yaml_cmds[yaml_keys.kModules][yaml_keys.kTiering][
        yaml_keys.kTGeneNameCol]
    functional_column_headers = yaml_utils.convertColumns(
        yaml_cmds[yaml_keys.kModules][yaml_keys.kTiering][
            yaml_keys.kTFunctionalCols], yaml_cmds)
    skip_filter_pass_check = yaml_cmds[yaml_keys.kModules][yaml_keys.kTiering][
        yaml_keys.kTSkipFilterPassCheck]

    # check for whether any variant contains "PASS"
    cmd = 'grep "PASS" {infile}|grep -v "#"|wc -l'.format(infile=in_file)
    process = subprocess.Popen(cmd,
                               shell=True,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
    stdout, stderr = process.communicate()
    returncode = process.returncode
    if (returncode != 0):
        raise ValueError(
            'Failed to search annotated VCF for "PASS" prior to tiering.\ncmd: '
            + str(cmd) + '\nErr: ' + str(stderr) + '\nReturncode: ' +
            str(returncode) + '\nOutput: ' + str(stdout))
    #else
    numHits = int(stdout)
    # TODO move this to pre-checks (prior to even annotation)
    if (numHits == 0 and not skip_filter_pass_check):
        raise ValueError(
            'No variants detected that passed filtering. Re-run STMP with "Skip_Filter_Pass_Check: True" in modules.yml to prioritize all variants anyway.'
        )
    elif (numHits == 0):
        print 'NOTICE: no variants detected that passed filtering. Skipping filter PASS check and prioritizing all variants anyway.'
    else:
        print 'Found ' + str(
            numHits
        ) + ' variants that passed filtering.' + ' Tiering just these variants.'
        skip_filter_pass_check = False

    #open input and output files and initialize counters and lists for background populations
    filein = open(in_file, "r")
    output_log = open(out_stem + ".metrics", "w")
    output_log.write(
        "Metrics for stmp filtering, all variants from reference\n")
    header = filein.readline().rstrip("\n")
    headlist = header.split("\t")
    if (gene_file != None):
        g_file = open(gene_file, "r")
    fileoutrare = open(out_stem + '.rare.txt', 'w')
    fileout0 = open(out_stem + ".tier0.txt", 'w')
    fileout1 = open(out_stem + ".tier1.txt", "w")
    fileout2 = open(out_stem + ".tier2.txt", "w")
    fileout3 = open(out_stem + ".tier3.txt", "w")
    fileout4 = open(out_stem + ".tier4.txt", "w")
    fileoutrare.write(header + "\n")
    fileout0.write("tier\t" + header + "\n")
    fileout1.write("tier\t" + header + "\n")
    fileout2.write("tier\t" + header + "\n")
    fileout3.write("tier\t" + header + "\n")
    fileout4.write("tier\t" + header + "\n")
    total = 0
    damaging0 = 0
    damaging1 = 0
    damaging2 = 0
    damaging3 = 0
    damaging4 = 0
    target_genes = 0
    rarevars = 0

    allele_freq_cols = yaml_utils.convertColumns(
        yaml_cmds[yaml_keys.kModules][yaml_keys.kTiering][
            yaml_keys.kTAlleleFreqCols],
        yaml_cmds)  #convertTieringColumns(yaml_cmds)
    #debug
    print 'allele freq cols: ' + str(allele_freq_cols)

    backpoplist = vcfUtils.get_listindex(headlist, allele_freq_cols)
    #debug
    #     print 'backpoplist: ' + str(backpoplist)

    #initialize gene list for region prioritization
    if (gene_file != None):
        genes = {}
        for line in g_file:
            if line.startswith('#'):
                continue
            linelist = line.rstrip("\n").split("\t")
            gene = linelist[0]

            if gene not in genes:
                genes[gene] = 1
            else:
                # debug: uncomment to warn about duplicate genes in the gene list
                # print 'warning: duplicate gene ' + gene + ' in gene list ' + gene_file
                pass

    #iterate over input file and parse into tiers
    for line in filein:
        total += 1
        if ((skip_filter_pass_check or "PASS" in line) and ("#" not in line)
                and vcfUtils.is_rare(line, freq, backpoplist, yaml_cmds)
                and not vcfUtils.contains_text(
                    'MT',
                    line, [stmp_consts.vcf_col_header_chrom],
                    headlist,
                    yaml_cmds,
                    case_sensitive=False)
                and not vcfUtils.contains_text('ncRNA',
                                               line,
                                               functional_column_headers,
                                               headlist,
                                               yaml_cmds,
                                               case_sensitive=True)):
            rarevars += 1
            fileoutrare.write(line)
            linelist = line.rstrip("\n").split("\t")
            # for now
            tmp = linelist[headlist.index(gene_name_col_header)].split(',')
            gene = tmp[0]

            if gene_file is None or gene in genes:
                target_genes += 1
                # tier 0: clinvar
                if (vcfUtils.isClinvarPathogenicOrLikelyPathogenic(
                        line, headlist, yaml_cmds)
                        and not vcfUtils.contains_text(
                            '0',
                            line, [
                                yaml_utils.get_datasets(yaml_cmds)['clinvar'][
                                    yaml_keys.kDAnnotation] + '_' +
                                vcfHeaders.kClinvarStarHeader
                            ],
                            headlist,
                            yaml_cmds,
                            case_sensitive=False)):
                    fileout0.write("0\t" + line)
                    damaging0 += 1
                # tier 1
                elif vcfUtils.is_functional(
                        line, "stoploss stopgain splicing frameshift",
                        functional_column_headers, headlist):
                    fileout1.write("1\t" + line)
                    damaging1 += 1
                # tier 2
                elif ((vcfUtils.is_functional(line, "nonsynonymous",
                                              functional_column_headers,
                                              headlist)
                       and vcfUtils.is_conserved(line, headlist, yaml_cmds))
                      or vcfUtils.is_functional(line, "nonframeshift",
                                                functional_column_headers,
                                                headlist)):
                    fileout2.write("2\t" + line)
                    damaging2 += 1
                # tier 3
                elif vcfUtils.is_functional(
                        line, "nonsynonymous", functional_column_headers,
                        headlist) and vcfUtils.is_pathogenic(
                            line, headlist, yaml_cmds):
                    fileout3.write("3\t" + line)
                    damaging3 += 1
                # tier 4
                elif vcfUtils.tolerance_pass(
                        line, headlist, yaml_cmds) and vcfUtils.is_functional(
                            line, "exonic splicing", functional_column_headers,
                            headlist):
                    fileout4.write("4\t" + line)
                    damaging4 += 1
                # else ignore variant

    output_log.write("Total variants queried: " + str(total) + "\n")
    output_log.write("Rare variants (allele freq < {freq}) queried: ".format(
        freq=str(freq)) + str(rarevars) + "\n")
    output_log.write("Rare variants in {num} target genes: ".format(
        num=str(len(genes)) if gene_file != None else '') + str(target_genes) +
                     "\n")
    output_log.write(
        "Candidate variants, tier 0 (rare clinvar pathogenic or likely pathogenic variants with rating > 0 stars): "
        + str(damaging0) + "\n")
    output_log.write(
        "Candidate variants, tier 1 (rare LOF variants -- stoploss, stopgain, splicing, and frameshift): "
        + str(damaging1) + "\n")
    output_log.write(
        "Candidate variants, tier 2 (rare nonframeshift or (nonsynonymous and conserved) variants): "
        + str(damaging2) + "\n")
    output_log.write(
        "Candidate variants, tier 3 (rare nonsynonymous pathogenic variants): "
        + str(damaging3) + "\n")
    output_log.write(
        "Candidate variants, tier 4 (all other rare exonic/splicing variants with ExAC tolerance z-score (syn_z or mis_z or lof_z) > 2): "
        + str(damaging4) + "\n")

    filein.close()
    if (gene_file != None):
        g_file.close()
    fileoutrare.close()
    fileout0.close()
    fileout1.close()
    fileout2.close()
    fileout3.close()
    fileout4.close()
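
The subprocess call in Examples 11 and 13 shells out to grep "PASS" | grep -v "#" | wc -l just to count the variant lines that passed filtering. The same count can be done in pure Python without spawning a shell (a sketch, not how STMP does it):

def count_pass_lines(path):
    # equivalent to: grep "PASS" <path> | grep -v "#" | wc -l
    with open(path) as f:
        return sum(1 for line in f if 'PASS' in line and '#' not in line)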