Esempio n. 1
0
def gen_dcna(exports_config: Config.Config, study_config: Config.Config, verb):
    """Derive the discrete CNA (dCNA) table from the continuous CNA table.

    The continuous copy-number file must already exist in the study output
    folder. Every column except the first is passed value-by-value through
    ``collapse`` and the result is written as the discrete data file.

    The comma-separated ``thresholds`` setting of *exports_config* is parsed
    into floats and stored in the module-level ``thresholds`` variable
    (presumably read by ``collapse`` -- confirm against its definition).

    If the continuous file is missing, an error is printed and the process
    exits with status 1.
    """
    global thresholds

    helper.working_on(verb, message='Gathering files ...')
    output_folder = study_config.config_map['output_folder']
    continuous_path = os.path.join(
        output_folder,
        'data_{}.txt'.format(
            constants.config2name_map['CONTINUOUS_COPY_NUMBER']))
    discrete_path = os.path.join(
        output_folder,
        'data_{}.txt'.format(
            constants.config2name_map[exports_config.type_config]))
    thresholds = list(
        map(float, exports_config.config_map['thresholds'].split(',')))

    # Without the continuous table there is nothing to discretise.
    if not os.path.exists(continuous_path):
        print(
            'ERROR:: Cannot generate dCNA file because log2CNA file does not exist ...'
        )
        print(
            'ERROR:: Either remove the DISCRETE data config file, or add a CONTINUOUS data config file '
        )
        helper.stars()
        helper.stars()
        exit(1)
        return

    helper.working_on(verb, message='Generating dCNA (CNA)...')
    table = pd.read_csv(continuous_path, sep='\t')

    # Column-wise apply; the first column is left untouched.
    for column in table.columns.values.tolist()[1:]:
        table[column] = table[column].apply(collapse)

    table.to_csv(discrete_path, sep='\t', index=None)
Esempio n. 2
0
def gen_log2cna(exports_config: Config.Config, study_config: Config.Config,
                janus_path, verb):
    """Generate the log2CNA data file by running the ``seg2gene.r`` R script.

    The script is looked up under ``R_SCRIPT_DIRNAME`` next to this module and
    invoked as ``Rscript <script> <seg_file> <bed_file> <output_file>``, where
    the SEG and output files live in the study output folder and the bed file
    comes from ``exports_config.config_map['bed_file']``.

    Args:
        exports_config: config of the data type being generated.
        study_config: study-level config providing ``output_folder``.
        janus_path: unused; kept for interface compatibility.
        verb: verbosity switch forwarded to ``helper.working_on``.

    Raises:
        FileNotFoundError: if the R script is not at the expected path.
        ValueError: if the R script exits with a non-zero status.
    """
    # TODO janus_path argument is not used, can remove; replace verb with logger
    helper.working_on(verb, message='Gathering files ...')
    output_folder = study_config.config_map['output_folder']
    seg_file = os.path.join(
        output_folder,
        'data_{}.txt'.format(constants.config2name_map['SEG']))
    bed_file = exports_config.config_map['bed_file']
    l_o_file = os.path.join(
        output_folder,
        'data_{}.txt'.format(
            constants.config2name_map[exports_config.type_config]))

    helper.working_on(verb, message='Generating log2CNA...')

    r_script_path = os.path.join(os.path.dirname(__file__), R_SCRIPT_DIRNAME,
                                 'seg2gene.r')
    if not os.path.exists(r_script_path):
        raise FileNotFoundError(
            'Cannot find R script path {}'.format(r_script_path))

    # Pass the arguments as a list: without shell=True a single string is
    # treated as the program name, so a joined command can never execute.
    cmd = ['Rscript', r_script_path, seg_file, bed_file, l_o_file]
    cmd_text = ' '.join(cmd)
    logger.debug('Running R script command: ' + cmd_text)
    rc = subprocess.call(cmd)
    if rc != 0:
        msg = "Non-zero exit code %i from R script command '%s'" % (rc,
                                                                    cmd_text)
        raise ValueError(msg)
def main():
    """Entry point for the discrete CNA step.

    Reads the module-level ``meta_config``, ``study_config`` and ``verb`` and
    delegates to ``discrete_copy_number_data.gen_dcna``, bracketed by
    ``helper.working_on`` calls.
    """
    global meta_config, study_config, janus_path, verb

    helper.working_on(verb, message='Generating CNA files ...')
    discrete_copy_number_data.gen_dcna(
        meta_config,
        study_config,
        verb,
    )
    helper.working_on(verb)
Esempio n. 4
0
def main():
    """Entry point for the continuous (log2) CNA step.

    Reads the module-level ``meta_config``, ``study_config``, ``janus_path``
    and ``verb`` and delegates to ``continuous_copy_number_data.gen_log2cna``,
    bracketed by ``helper.working_on`` calls.
    """
    global meta_config, study_config, janus_path, verb

    helper.working_on(verb, message='Generating log2CNA files ...')
    continuous_copy_number_data.gen_log2cna(
        meta_config,
        study_config,
        janus_path,
        verb,
    )
    helper.working_on(verb)
Esempio n. 5
0
def get_sample_ids(exports_config: Config.Config, verb) -> pd.Series:
    """Return the distinct values of the ``ID`` column of the first input file.

    Only the ``ID`` column of the tab-separated file named by the first
    ``FILE_NAME`` entry (inside ``input_folder``) is read; the first
    occurrence of each value is kept, in file order.
    """
    first_file = os.path.join(exports_config.config_map['input_folder'],
                              exports_config.data_frame['FILE_NAME'][0])
    id_frame = pd.read_csv(first_file, sep='\t', usecols=['ID'])

    progress = 'Parsing importable {} file ...'.format(
        exports_config.type_config)
    helper.working_on(verb, message=progress)

    # Series.drop_duplicates defaults to keep='first' and returns a new Series.
    return id_frame['ID'].drop_duplicates()
Esempio n. 6
0
def fix_hmmcopy_tsv(exports_config: Config.Config, study_config: Config.Config,
                    verb):
    """Rewrite every HMMCopy TSV into SEG column layout inside a temp folder.

    For each row of ``exports_config.data_frame`` a shell pipeline is started
    that:

    1. writes the SEG header to ``<output>.temp``;
    2. keeps only the input lines whose second field is one of the values in
       the first column of the ``bed_file`` (with or without a ``chr``
       prefix);
    3. emits ``SAMPLE_ID, $2, $3, $4, <num.mark placeholder>, $5``;
    4. moves the temp file to its final name in the ``seg`` temp folder.

    All pipelines are started first and then awaited. Afterwards
    ``exports_config.config_map['input_folder']`` points at the temp folder.

    Raises:
        ValueError: if any pipeline returns a non-zero exit status.
    """
    # TODO get rid of this ugly & fragile bash script, rewrite using Python
    # See comments by LEH in earlier commit
    # NOTE(review): paths and sample IDs are interpolated into the shell
    # command unquoted; values containing spaces or shell metacharacters will
    # break it.
    output_folder = study_config.config_map['output_folder']
    input_folder = exports_config.config_map['input_folder']
    export_data = exports_config.data_frame
    seg_temp = helper.get_temp_folder(output_folder, 'seg')

    # Distinct first-column values of the .bed file, skipping its first line.
    bed_filter = subprocess.check_output([
        'awk "NR>1" {} | '
        'awk -F"\\t" \'{{print $1}}\' | '
        'uniq'.format(exports_config.config_map['bed_file'])
    ],
                                         shell=True).decode("utf-8")
    bed_filter = bed_filter.strip().split('\n')
    bed_filter = bed_filter + ['chr' + a for a in bed_filter]
    # Literal backslash-t on both sides; awk turns these into tabs inside its
    # string literal.
    bed_filter = ['\\t' + a + '\\t' for a in bed_filter]
    chrom_list = '|'.join(bed_filter)

    header = 'ID\\tchrom\\tloc.start\\tloc.end\\tnum.mark\\tseg.mean'
    columns = '1'  # placeholder for num.mark columns

    calls = []
    for i in range(len(export_data)):
        file_name = export_data['FILE_NAME'][i]
        input_file = os.path.join(input_folder, file_name)
        output_file = os.path.join(seg_temp, file_name)
        sample_id = export_data['SAMPLE_ID'][i]

        helper.working_on(verb, 'Refactoring cols: {}'.format(file_name))
        output_temp = output_file + '.temp'

        # Each .format() must cover the complete awk program it belongs to:
        # adjacent literals are concatenated before the call, so the doubled
        # braces are un-escaped and the placeholders are filled in order.
        cmd = (
            'echo "{}" > {}; '.format(header, output_temp) +
            'cat  {} | '.format(input_file) +
            'awk \'BEGIN{{split("{}",t); '
            'for (i in t) vals[t[i]]}} ($2 in vals)\' | '.format(chrom_list) +
            'awk -F"\\t" \'{{ OFS="\\t"; '
            'print "{}", $2, $3, $4, {}, $5}}\' >> {}; '.format(
                sample_id, columns, output_temp) +
            'mv {} {}'.format(output_temp, output_file)
        )
        calls.append(subprocess.Popen(cmd, shell=True))

    # Later steps read from the rewritten files.
    exports_config.config_map['input_folder'] = seg_temp

    # Wait once, after everything has been launched; this also keeps
    # exit_codes defined when there are no rows at all.
    # NOTE(review): the status of each command is that of its final `mv`, so
    # a failing awk stage is not detected here.
    exit_codes = [p.wait() for p in calls]
    if any(exit_codes):
        raise ValueError(
            'ERROR:: Something went wrong when parsing HMMCopy format file? Please resolve the issue'
        )
    if verb:
        print(exit_codes)
Esempio n. 7
0
def verify_final_seg_file(exports_config: Config.Config, verb):
    """Check that the first SEG input file has all required header columns.

    Reads the first line of the file named by the first ``FILE_NAME`` entry
    (inside ``input_folder``) and splits it on tabs. If any of ``ID``,
    ``chrom``, ``loc.start``, ``loc.end``, ``num.mark`` or ``seg.mean`` is
    absent, the missing names are printed and the process exits with
    status 1.
    """
    seg_path = os.path.join(exports_config.config_map['input_folder'],
                            exports_config.data_frame['FILE_NAME'][0])

    # Open read-only: mode 'w' would truncate the file being verified and
    # make readline() fail. The context manager closes the handle.
    with open(seg_path, 'r') as seg:
        header = seg.readline().strip().split('\t')

    minimum_header = [
        'ID', 'chrom', 'loc.start', 'loc.end', 'num.mark', 'seg.mean'
    ]

    helper.working_on(verb, message='Asserting minimum header is in SEG file.')
    if not all(a in header for a in minimum_header):
        # Present columns are shown as empty strings, missing ones by name.
        print([a if a not in header else '' for a in minimum_header])
        print(
            'Missing headers from SEG file have been printed above, please ensure the data is not missing.'
        )
        exit(1)
Esempio n. 8
0
def verify_final_discrete_file(exports_config: Config.Config, verb):
    """Check that the first input file has at least one gene-identifier column.

    Reads the first line of the file named by the first ``FILE_NAME`` entry
    (inside ``input_folder``) and splits it on tabs. The check passes when
    ``Entrez_Gene_Id`` or ``Hugo_Symbol`` (either one) is present; otherwise
    the missing names are printed and the process exits with status 1.
    """
    data_path = os.path.join(exports_config.config_map['input_folder'],
                             exports_config.data_frame['FILE_NAME'][0])

    # Open read-only: mode 'w' would truncate the file being verified and
    # make readline() fail. The context manager closes the handle.
    with open(data_path, 'r') as data:
        header = data.readline().strip().split('\t')

    t_config = exports_config.type_config
    minimum_header = ['Entrez_Gene_Id', 'Hugo_Symbol']

    helper.working_on(
        verb,
        message='Asserting minimum header is in {} file.'.format(t_config))
    if not any(a in header for a in minimum_header):
        # Present columns are shown as empty strings, missing ones by name.
        print([a if a not in header else '' for a in minimum_header])
        print(
            'Missing header(s) from {} file have been printed above, ensure data isn\'t missing.'
            .format(t_config))
        exit(1)
Esempio n. 9
0
def generate_expression_zscore(meta_config: Config.Config, input_file, outputPath, gepcomp, tcga, verb):
    """Compute per-gene z-scores of an expression matrix and write them out.

    *input_file* is a tab-separated matrix with a ``Hugo_Symbol`` column and
    one column per sample. Each row is centred on its own mean and divided by
    its own standard deviation; undefined results become 0 and values are
    rounded to 4 decimals.

    Output location depends on the flags (``gepcomp`` wins over ``tcga``):

    * ``gepcomp``: ``<outputPath>/supplementary_data/data_<name>_comparison.txt``
    * ``tcga``: ``<outputPath>/supplementary_data/data_<name>_tcga.txt``
    * neither: ``<outputPath>/data_<name>.txt``

    For the two supplementary variants the columns are first restricted to
    ``Hugo_Symbol`` plus the study's ``SAMPLE_ID`` values, and *input_file*
    is deleted after writing.

    Raises:
        FileNotFoundError: re-raised after printing a message when
            *input_file* cannot be read.
    """
    # Z-Scores written by Dr. L Heisler
    helper.working_on(verb, message='Reading FPKM Matrix ...')
    try:
        fpkm = pd.read_csv(input_file, sep='\t')
    except FileNotFoundError:
        print('{} wrong file or file path'.format(input_file))
        raise

    helper.working_on(verb, message='Processing FPKM Matrix ...')
    expression = fpkm.drop(['Hugo_Symbol'], axis=1)
    row_means = expression.mean(axis=1)
    row_sds = expression.std(axis=1)

    # Transposing lets the per-row statistics align with the columns.
    centred = expression.transpose() - row_means
    z_table = (centred / row_sds).transpose().fillna(0).round(decimals=4)
    z_table = pd.concat([fpkm['Hugo_Symbol'], z_table], axis=1)

    helper.working_on(verb, message='Writing FPKM Z-Scores Matrix ...')

    supplementary = gepcomp or tcga
    if supplementary:
        # Keep only the columns used in the study's continuous expression data.
        sample_ids = meta_config.data_frame['SAMPLE_ID']
        wanted = ['Hugo_Symbol'] + [
            sample_ids[k] for k in range(meta_config.data_frame.shape[0])
        ]
        z_table = z_table[wanted]

        # Make sure the supplementary_data directory is there.
        supplementary_dir = os.path.join(outputPath, 'supplementary_data')
        if not os.path.exists(supplementary_dir):
            os.makedirs(supplementary_dir, exist_ok=True)

    z_name = config2name_map[meta_config.alterationtype + ":" + 'Z-SCORE']
    if gepcomp:
        destination = os.path.join(outputPath, 'supplementary_data',
                                   'data_{}_comparison.txt'.format(z_name))
    elif tcga:
        destination = os.path.join(outputPath, 'supplementary_data',
                                   'data_{}_tcga.txt'.format(z_name))
    else:
        destination = os.path.join(outputPath, 'data_{}.txt'.format(z_name))

    z_table.to_csv(destination, sep="\t", index=False)

    if supplementary:
        # The comparison/TCGA input is removed once its z-scores are written.
        os.remove(input_file)
Esempio n. 10
0
def main():
    """Entry point for the HMMCopy SEG pipeline.

    Runs four stages in order against the module-level ``meta_config`` and
    ``study_config``: decompress inputs, fix HMMCopy layout / chromosome
    naming / chromosome max-length, fix SEG IDs, and concatenate the files
    into the export folder. Every stage is bracketed by ``helper.working_on``
    calls.
    """
    global meta_config, study_config, janus_path, verb

    def fix_hmmcopy_layout():
        fix_hmmcopy_tsv(meta_config, study_config, verb)
        fix_chrom(meta_config, study_config, verb)
        # fix_hmmcopy_max_chrom fixes the maximum chromosome length AND
        # imputes the num.mark value
        fix_hmmcopy_max_chrom(meta_config, study_config, janus_path, verb)

    stages = (
        ('Gathering and decompressing SEG files into temporary folder',
         lambda: helper.decompress_to_temp(meta_config, study_config, verb)),
        ('Fixing HMMCopy formatting, chromosome, and chromosome max-length ...',
         fix_hmmcopy_layout),
        ('Fixing .SEG IDs',
         lambda: fix_seg_id(meta_config, study_config, verb)),
        ('Concating SEG Files to export folder',
         lambda: helper.concat_files(meta_config, study_config, verb)),
    )

    for message, run_stage in stages:
        helper.working_on(verb, message=message)
        run_stage()
        helper.working_on(verb)
Esempio n. 11
0
def main():
    """Entry point for the MRNA_EXPRESSION pipeline.

    Decompresses the inputs, alpha-sorts each file and generates the
    expression matrix. When the ``zscores`` setting of ``meta_config`` equals
    ``'true'`` (case-insensitive), the z-score meta and data files are
    generated as well. Every step is bracketed by ``helper.working_on`` calls.
    """
    global meta_config, study_config, janus_path, verb

    helper.working_on(
        verb,
        message='Gathering and decompressing MRNA_EXPRESSION files into temporary folder',
    )
    helper.decompress_to_temp(meta_config, study_config, verb)
    helper.working_on(verb)

    helper.working_on(verb, message='Alpha sorting each file ...')
    alpha_sort(meta_config, verb)
    helper.working_on(verb)

    helper.working_on(verb, message='Generating expression matrix ...')
    generate_expression_matrix(meta_config, study_config, verb)
    helper.working_on(verb)

    # `and` short-circuits, so the lookup only happens when the key exists.
    settings = meta_config.config_map
    wants_zscores = ('zscores' in settings
                     and settings['zscores'].lower() == 'true')
    if not wants_zscores:
        return

    helper.working_on(verb, message='Generating expression Z-Score Meta ...')
    meta.generate_meta_type(meta_config, study_config, verb)
    helper.working_on(verb)

    helper.working_on(verb, message='Generating expression Z-Score Data ...')
    generate_expression_zscore(meta_config, study_config, verb)
    helper.working_on(verb)
Esempio n. 12
0
def main():
    """Entry point for the CAP_expression pipeline.

    Using the module-level ``meta_config``, ``study_config`` and ``logger``:

    1. relocate the inputs (``meta_config`` is replaced by the returned
       config), alpha-sort them and build the expression matrix;
    2. run ``preProcRNA`` on the comparison (``_gepcomp``) and study matrices;
    3. if the ``zscores`` setting is truthy, generate z-score and percentile
       files for the study, comparison and TCGA data;
    4. generate the meta files (continuous, then z-score if enabled).

    The ``genelist`` and ``enscon`` settings fall back to files in
    ``DATA_DIRNAME`` next to this module when unset or empty.
    """
    global meta_config
    global study_config
    global janus_path
    global logger

    # imports are moved into the main (and only) method to work with the legacy component class
    import logging
    import os
    from support import helper
    from generate import meta
    from generate.analysis_pipelines.MRNA_EXPRESSION.support_functions import alpha_sort, generate_expression_matrix, generate_expression_percentile, generate_expression_zscore, preProcRNA
    from constants.constants import config2name_map
    from utilities.constants import DATA_DIRNAME

    verb = logger.isEnabledFor(logging.INFO) # TODO replace the 'verb' switch with logger

    genelist = meta_config.config_map.get('genelist') or os.path.join(
        os.path.dirname(__file__), DATA_DIRNAME, 'targeted_genelist.txt')
    enscon = meta_config.config_map.get('enscon') or os.path.join(
        os.path.dirname(__file__), DATA_DIRNAME, 'ensemble_conversion.txt')

    logger.info('Started processing data for CAP_expression pipeline')

    logger.info('Decompressing MRNA_EXPRESSION files to temporary folder')
    meta_config = helper.relocate_inputs(meta_config, study_config, verb)

    logger.info('Alpha sorting each file ...')
    alpha_sort(meta_config, verb)

    logger.info('Generating expression matrix ...')
    generate_expression_matrix(meta_config, study_config, verb)

    # File-name stem of the continuous expression data for this handler.
    continuous_name = config2name_map[meta_config.alterationtype + ":" + meta_config.datahandler]

    # preProcRNA - generate processed continuous data from the generated
    # expression matrices (comparison matrix first, then the study matrix).
    preProcRNA(meta_config, study_config, '/data_{}_gepcomp.txt'.format(continuous_name), enscon, genelist, True, False)
    preProcRNA(meta_config, study_config, '/data_{}.txt'.format(continuous_name), enscon, genelist, False, True)

    # NOTE(review): any non-empty value enables z-scores here, including the
    # string 'false' -- confirm the expected config format.
    zscores_enabled = meta_config.config_map.get('zscores')

    if zscores_enabled:
        output_folder = study_config.config_map['output_folder']
        zscore_name = config2name_map[meta_config.alterationtype + ":" + 'Z-SCORE']
        study_zscores = os.path.join(output_folder, 'data_{}.txt'.format(zscore_name))

        # Generate the z-scores for mRNA expression data
        logger.info('Generating expression Z-Score Data ...')
        generate_expression_zscore(
            meta_config,
            os.path.join(output_folder, 'data_{}.txt'.format(continuous_name)),
            output_folder, False, False, verb)

        # Generate the mRNA expression percentile data
        logger.info('Generating expression Percentile Data ...')
        generate_expression_percentile(
            meta_config, study_zscores, output_folder, False, False, verb)

        # Generate the z-scores for mRNA expression comparison data
        logger.info('Generating expression Z-Score comparison Data ...')
        generate_expression_zscore(
            meta_config,
            os.path.join(output_folder, 'data_{}_gepcomp.txt'.format(continuous_name)),
            output_folder, True, False, verb)

        # Generate the mRNA expression comparison percentile data
        # NOTE(review): this passes the study z-score file, not a file under
        # supplementary_data as the TCGA call below does -- confirm against
        # generate_expression_percentile.
        logger.info('Generating expression Percentile comparison Data ...')
        generate_expression_percentile(
            meta_config, study_zscores, output_folder, True, False, verb)

        # Generate the z-scores for mRNA expression TCGA data
        # (logged through the logger like every other step in this function)
        logger.info('Generating expression TCGA Z-Score Data ...')
        generate_expression_zscore(
            meta_config,
            os.path.join(output_folder, 'data_{}_tcga.txt'.format(continuous_name)),
            output_folder, False, True, verb)

        # Generate the TCGA mRNA expression percentile data
        logger.info('Generating expression TCGA Percentile Data ...')
        generate_expression_percentile(
            meta_config,
            os.path.join(output_folder, 'supplementary_data',
                         'data_{}_tcga.txt'.format(zscore_name)),
            output_folder, False, True, verb)

    # Generate meta data within the handler and not in generator.py
    # Generate metadata for mRNA expression continuous data
    logger.info('Generating expression Meta ...')
    meta.generate_meta_type(meta_config, study_config, logger)

    # Generate metadata for mRNA expression z-score data
    if zscores_enabled:
        logger.info('Generating expression Z-Score Meta ...')
        # The handler is switched so the meta generator describes the z-score file.
        meta_config.datahandler = 'Z-SCORE'
        meta.generate_meta_type(meta_config, study_config, logger)

    logger.info('Finished processing data for CAP_expression pipeline')
Esempio n. 13
0
def _read_fpkm_column(path, column_name):
    """Read ``gene_id``/``FPKM`` from *path* as ``Hugo_Symbol``/*column_name*, keeping the last row per gene."""
    frame = pd.read_csv(path, sep='\t', usecols=['gene_id', 'FPKM'])
    frame = frame.rename(columns={'FPKM': column_name,
                                  'gene_id': 'Hugo_Symbol'})
    return frame.drop_duplicates(subset='Hugo_Symbol', keep='last', inplace=False)


def _merge_fpkm_frames(frames, how):
    """Merge *frames* on ``Hugo_Symbol`` (left to right, join type *how*) and replace NaN with 0."""
    if not frames:
        raise ImportError('Attempting to import zero expression data, please remove expression data from study.')
    merged = frames[0]
    for frame in frames[1:]:
        merged = pd.merge(merged, frame, how=how, on='Hugo_Symbol')
        merged.drop_duplicates(subset='Hugo_Symbol', keep='last', inplace=True)
    # With a single frame this fills that frame in place, as before.
    merged.replace(np.nan, 0, inplace=True)
    return merged


def generate_expression_matrix(exports_config: Config.Config, study_config: Config.Config, verb):
    """Build the study expression matrix and, optionally, a comparison matrix.

    One ``gene_id``/``FPKM`` table is read per row of
    ``exports_config.data_frame`` and outer-merged on ``Hugo_Symbol`` into
    ``data_<name>.txt`` in the study output folder, with missing values
    written as 0.

    If the ``gepfile`` setting names an existing comma-separated file of
    ``patient_id, file_name`` rows, every listed file whose patient ID does
    not contain one of the study's ``PATIENT_ID`` values is appended and all
    frames are left-merged (so only genes of the first study frame are kept)
    into ``data_<name>_gepcomp.txt``.

    Raises:
        ImportError: if the study has no expression files at all.
    """
    data_frame = exports_config.data_frame
    matrix_name = config2name_map[exports_config.alterationtype + ":" + exports_config.datahandler]

    # Output for data_expression_continuous_expression.txt data file
    output_file = os.path.join(study_config.config_map['output_folder'],
                               'data_{}.txt'.format(matrix_name))

    helper.working_on(verb, message='Reading FPKM data ...')
    info = [
        _read_fpkm_column(
            os.path.join(exports_config.config_map['input_folder'],
                         data_frame['FILE_NAME'][i]),
            data_frame['SAMPLE_ID'][i])
        for i in range(data_frame.shape[0])
    ]

    helper.working_on(verb, message='Merging all FPKM data ...')
    result = _merge_fpkm_frames(info, how='outer')

    helper.working_on(verb, message='Writing all FPKM data ...')
    result.to_csv(output_file, sep='\t', index=None)

    # Append the gepcomp datafiles (if any)
    gep_file = exports_config.config_map.get('gepfile')
    if gep_file is None or not os.path.exists(gep_file):
        return

    geplist = pd.read_csv(gep_file, sep=',')
    geplist.columns = ['patient_id', 'file_name']

    # Filter out the patient IDs that have already been included in the study
    # (substring match of the study PATIENT_ID inside the listed patient_id).
    study_patients = [data_frame['PATIENT_ID'][i]
                      for i in range(data_frame.shape[0])]
    already_included = [
        position
        for position, listed_id in enumerate(geplist.patient_id.tolist())
        if any(patient in listed_id for patient in study_patients)
    ]
    geplist = geplist.drop(already_included)

    for _, row in geplist.iterrows():
        info.append(_read_fpkm_column(row.file_name, row.patient_id))

    helper.working_on(verb, message='Merging all FPKM data ...')
    result = _merge_fpkm_frames(info, how='left')

    helper.working_on(verb, message='Writing all FPKM data ...')

    # Output the gepcomp data
    output_file_comp = os.path.join(study_config.config_map['output_folder'],
                                    'data_{}_gepcomp.txt'.format(matrix_name))
    result.to_csv(output_file_comp, sep='\t', index=None)
Esempio n. 14
0
def main():
    """Entry point for the SEG pipeline.

    Runs four stages in order against the module-level ``meta_config`` and
    ``study_config``: decompress inputs, fix chromosome numbering, fix SEG
    IDs, and concatenate the files into the export folder. Every stage is
    bracketed by ``helper.working_on`` calls.
    """
    global meta_config, study_config, janus_path, verb

    stages = (
        ('Gathering and decompressing SEG files into temporary folder',
         lambda: helper.decompress_to_temp(meta_config, study_config, verb)),
        ('Fixing Chromosome numbering ...',
         lambda: fix_chrom(meta_config, study_config, verb)),
        ('Fixing .SEG IDs',
         lambda: fix_seg_id(meta_config, study_config, verb)),
        ('Concating SEG Files to export folder',
         lambda: helper.concat_files(meta_config, study_config, verb)),
    )

    for message, run_stage in stages:
        helper.working_on(verb, message=message)
        run_stage()
        helper.working_on(verb)