コード例 #1
0
def subset_df_for_each_comparison(df: pd.DataFrame, base_df: pd.DataFrame,
                                  comparisons_dict: dict):
    """Export one CSV per comparison, combining base descriptor columns with
    the abundance columns of the reference and condition groups.

    Args:
        df: full dataframe; its abundance columns start with the prefix
            configured in ``rule_params["all"]["values_cols_prefix"]``.
        base_df: descriptor columns prepended to every exported subset.
        comparisons_dict: mapping ``{comparison_name: {"reference": regex,
            "condition": regex}}`` used to select abundance columns.

    Relies on module-level ``rule_params``, ``args``, ``h`` and ``logger``.
    Returns None; each subset is written next to ``args.output_file``.
    """
    values_prefix = rule_params["all"]["values_cols_prefix"]
    # Restrict once to abundance columns so the per-comparison regexes
    # cannot accidentally match metadata/descriptor columns.
    abundance_df = df.filter(regex=values_prefix)

    for comparison, groups in comparisons_dict.items():
        # Abundance values for the reference group. Suffix the column names
        # with '_reference' so several different controls can coexist
        # downstream — more flexible than encoding this in the config file.
        reference_abundances_col = abundance_df.filter(
            regex=groups["reference"]).add_suffix('_reference')

        # Abundance values for the condition compared with the reference.
        # Filtered from abundance_df (not the raw df) so the selection is
        # consistent with the reference columns above and cannot pick up
        # non-abundance columns.
        condition_abundances_col = abundance_df.filter(
            regex=groups["condition"])

        # Complete dataframe: descriptors + reference + condition columns.
        result_df = pd.concat(
            [base_df, reference_abundances_col, condition_abundances_col],
            axis=1)

        # One output file per comparison, in the same directory as the
        # configured output file.
        output_result = os.path.join(os.path.dirname(args.output_file),
                                     '{}'.format(comparison))

        logger.debug('Path to file: {}'.format(output_result))
        h.export_result_to_csv(result_df, output_result)
        logger.info('Data for {} exported to csv'.format(comparison))

    return
コード例 #2
0
ファイル: mapping.py プロジェクト: dagopian/ProteomX
    # Parse command-line arguments and load the per-analysis rule parameters
    # (a JSON keyed by the analysis/file id).
    args = get_args()
    filename = h.filename(args.input_file)
    rule_params = h.load_json_parameter(args.file_id)

    # One log file per analysis id, under the global data directory.
    logpath = os.path.join(paths.global_data_dir, args.file_id,
                           'log/mapping.log')
    logger = h.get_logger(logpath)
    # NOTE(review): this uses the root `logging` module although a dedicated
    # `logger` was just created on the previous line — confirm whether the
    # message is meant to reach the per-analysis log file as well.
    logging.info('Starting mapping file: ' + args.input_file)

    # get data: the measurements and the sample-metadata mapping table
    data_df = pd.read_csv(args.input_file, header=0, index_col=None)
    mapping_df = pd.read_csv(args.mapping_file, header=0, index_col=None)

    # get parameters controlling which columns are renamed and how
    values_cols_prefix = rule_params['all']['values_cols_prefix']
    col_for_mapping = rule_params['mapping']['col_for_mapping']
    col_label = rule_params['mapping']['col_label']

    # rename columns with abundances values in the data frame based on metadata in the mapping df
    result_df = fpreprocessing.rename_col_abundance_withjson(
        mapping_df, data_df, values_cols_prefix, col_for_mapping, col_label)

    # build json corresponding to new column name, stored alongside the
    # analysis data so later steps can recover the group structure
    json_for_groups = "metadata_{}.json".format(filename)
    path_to_json = os.path.join(paths.global_data_dir, args.file_id,
                                json_for_groups)
    d = fpreprocessing.build_json(mapping_df, path_to_json, col_for_mapping)

    # export results (renamed dataframe) to the configured output path
    h.export_result_to_csv(result_df, args.output_file)
コード例 #3
0
    # create json with information on % of NaN for samples
    out = os.path.join(paths.global_data_dir, args.file_id, 'missing_values',
                       'samples_{}.json'.format(filename))
    fqc.export_json_sample(stats_per_sample, out, values_cols_prefix)

    # filter dataframe for following analysis
    # remove row to discard: rows whose 'exclude_na' flag equals 1
    filtered_df = fqc.remove_flagged_rows(result_df, 'exclude_na', 1)
    # remove samples to discard AND keep only base df (as defined in the config file) and abundances values columns
    filtered_df = fqc.remove_flagged_samples(
        df=filtered_df,
        boolean_mask=stats_per_sample['to_exclude'],
        metadata_col=rule_params['all']['metadata_col'],
        values_col_prefix=rule_params['all']['values_cols_prefix'],
        keep_specific=keep_specific,
        col_name=col_name)

    # Export dataframe with only proteins/samples compliant with threshold
    h.export_result_to_csv(filtered_df, args.output_file_filtered)

    # Export dataframe with all data and information on nan percentage per group and protein
    h.export_result_to_csv(result_df, args.output_file_complete)

    # Log survivor counts: rows remaining after filtering, and samples whose
    # 'to_exclude' flag is False (i.e. the samples that were kept).
    logging.info("Keeping " + str(len(filtered_df)) +
                 " proteins with current parameters.")
    logging.info(
        "Keeping " +
        str(len(stats_per_sample[stats_per_sample['to_exclude'] == False])) +
        " samples with current parameters.")
コード例 #4
0
ファイル: filter_data.py プロジェクト: cbib/ProteomX
    # load data: either a single input file or every CSV in a directory
    if args.input_file:
        data_df = pd.read_csv(args.input_file, header=0, index_col=None)
        # NOTE(review): `result` is not used in this visible span — confirm
        # whether update_overlap mutates data_df in place or the return value
        # is consumed later.
        result = update_overlap(data_df)

        # get subset of data compliant with criterion defined in config file - returns dictionary with dataframe
        subsets_data = h.subset_data(data_df, subset_filters)

        for subset_name in subsets_data:
            df = subsets_data[subset_name]

            # remove extension in path file, then append the subset name so
            # each subset is exported to its own CSV
            output_file = re.sub(
                '.csv', '', args.output_file) + '_{}.csv'.format(subset_name)
            h.export_result_to_csv(df, output_file)

    elif args.input_directory:
        # Directory mode: apply the same overlap update + subsetting to each
        # CSV file found in the input directory.
        files = list_files_in_dir(args.input_directory, '.csv')
        for file in files:
            data_df = pd.read_csv(file, header=0, index_col=None)

            # update overlap
            result = update_overlap(data_df)

            # get subset of data on which to perform enrichment - returns dictionary with dataframe
            subsets_data = h.subset_data(data_df, subset_filters)

            for subset_name in subsets_data:
                df = subsets_data[subset_name]
コード例 #5
0
ファイル: distribution.py プロジェクト: dagopian/ProteomX
        # Divide data frame in specific and aspecific proteins rows
        # (this branch runs only when specific-protein handling is enabled)
        result, specific_proteins_df = extract_specific_proteins(data_df)

        # Add arbitrary p-value for specific proteins, since they cannot be
        # tested against the fitted distribution
        reference = rule_params["all"]["reference"]
        specific_proteins_pval = update_res_with_specific_proteins(specific_proteins_df, reference, test)

    else:
        # No specific-protein handling: analyse the whole dataframe
        result = data_df.copy()

    # Compute z-score on the (possibly reduced) dataframe
    res_zscore = compute_z_score(result)

    # Now find which distribution fits the best; optionally plots a histogram
    # of the fit to args.histogramm_distribution
    best_dist, args_param = find_best_distribution(res_zscore, args.histogramm_distribution)

    # compute p-value from the distribution
    res_pval = compute_p_value(result, rule_params['distribution']['test_type'], best_dist, args_param)

    # Concatenate results on aspecific et specific proteins:
    # (specific_proteins_pval only exists when this flag is set, matching the
    # branch at the top of this span)
    if rule_params['all']['specific_proteins']:
        res_pval = pd.concat([res_pval, specific_proteins_pval], axis=0)

    # log results: count proteins below the conventional 0.05 threshold
    significant = len(res_pval[res_pval['pvalue'] < 0.05])

    logger.info("{} proteins are significant (p-value < 0.05).".format(str(significant)))

    # export results
    h.export_result_to_csv(res_pval, args.output_file)
コード例 #6
0
                                        args.analysis_id)
    # Create the per-analysis folder; tolerate it already existing.
    try:
        os.mkdir(path2analysis_folder)
        logger.debug("Creating folder for this analysis")
    except FileExistsError:
        logger.debug("Analysis folder already created")

    path2error_file = os.path.join(paths.global_data_dir, args.analysis_id,
                                   args.error_file)

    logger.debug("Exporting import_error file to: {}".format(path2error_file))
    # NOTE(review): the JSON is written to args.error_file (relative to the
    # working directory), not to path2error_file logged just above — confirm
    # which path is intended; path2error_file is otherwise unused here.
    with open(args.error_file, 'w+') as json_file:
        json.dump(errors, json_file, indent=True)

    # export header (sample names) next to the analysis data
    output_sample_name = os.path.join(paths.global_data_dir, args.analysis_id,
                                      args.output_sample_name)

    logger.debug("Exporting header file to: {}".format(output_sample_name))
    fi.get_sample_name(df, output_sample_name)

    # export data: explicit output path if given, otherwise a csv/ subfolder
    # named after the input file
    if args.output_file:
        output_csv = args.output_file
    else:
        output_csv = os.path.join(
            paths.global_data_dir, args.analysis_id,
            "csv/{}.csv".format(h.filename(args.input_file)))
    logger.debug("Exporting converted file to: {}".format(output_csv))
    h.export_result_to_csv(df, output_csv, index_col=True)