Example #1
def main(args):

    # Parse input gcts
    external_gct = parse.parse(args.external_gct_path)
    internal_gct = parse.parse(args.internal_gct_path)
    bg_gct = parse.parse(args.bg_gct_path)

    # Meat of the script
    (sim_gct, conn_gct) = do_steep_and_sip(
        external_gct, internal_gct, bg_gct, args.similarity_metric,
        args.connectivity_metric,
        args.fields_to_aggregate_for_external_profiles,
        args.fields_to_aggregate_for_internal_profiles)

    # Write output gcts
    wg.write(sim_gct,
             args.out_steep_name,
             data_null="NaN",
             metadata_null="NaN",
             filler_null="NaN")
    wg.write(conn_gct,
             args.out_sip_name,
             data_null="NaN",
             filler_null="NaN",
             metadata_null="NaN")
Example #2
def main(args):

    # Parse gct file
    gct = parse.parse(args.path_to_gct)

    # Parse mapping tsv file
    mapping = pd.read_csv(args.path_to_mapping_tsv, sep="\t", index_col=0)

    # Make sure the ids from the mapping file are unique
    duplicated_bool_array = mapping.index.duplicated()
    assert sum(duplicated_bool_array) == 0, (
        "ids in mapping file must be unique. duplicated ids in mapping:\n{}".format(
            mapping.index[duplicated_bool_array]))

    for col in mapping.columns:

        if args.row_and_or_col == "both":
            annotate_meta_df(gct.row_metadata_df, mapping.loc[:, col],
                             args.gct_from_field, args.missing_entry)
            annotate_meta_df(gct.col_metadata_df, mapping.loc[:, col],
                             args.gct_from_field, args.missing_entry)
        elif args.row_and_or_col == "row":
            annotate_meta_df(gct.row_metadata_df, mapping.loc[:, col],
                             args.gct_from_field, args.missing_entry)
        elif args.row_and_or_col == "col":
            annotate_meta_df(gct.col_metadata_df, mapping.loc[:, col],
                             args.gct_from_field, args.missing_entry)

    wg.write(gct,
             args.out_name,
             filler_null="NA",
             data_null="NaN",
             metadata_null="NA")
Example #3
def main():
    # Get args
    args = build_parser().parse_args(sys.argv[1:])
    setup_logger.setup(verbose=args.verbose)

    # Read the input gct
    in_gct = parse.parse(args.in_gct_path)

    # Read in each of the command line arguments
    rid = _read_arg(args.rid)
    cid = _read_arg(args.cid)
    exclude_rid = _read_arg(args.exclude_rid)
    exclude_cid = _read_arg(args.exclude_cid)

    # Slice the gct
    out_gct = sg.slice_gctoo(in_gct,
                             rid=rid,
                             cid=cid,
                             exclude_rid=exclude_rid,
                             exclude_cid=exclude_cid)
    assert out_gct.data_df.size > 0, "Slicing yielded an empty gct!"

    # Write the output gct
    if args.use_gctx:
        wgx.write(out_gct, args.out_name)
    else:
        wg.write(out_gct,
                 args.out_name,
                 data_null="NaN",
                 metadata_null="NA",
                 filler_null="NA")
Example #4
def write_output_gct(out_gct, out_gct_name, data_null, filler_null):

    wg.write(out_gct,
             out_gct_name,
             data_null=data_null,
             filler_null=filler_null,
             data_float_format=None)
Example #5
def main(args):

    # Import data
    assert os.path.exists(args.in_gct_path), (
        "in_gct_path could not be found: {}".format(args.in_gct_path))
    in_gct = parse(args.in_gct_path)

    # First, check if any rows are all NaN; if so, remove them
    dropped_df = in_gct.data_df.dropna(how="all")
    bools_of_remaining = in_gct.data_df.index.isin(dropped_df.index.values)
    in_gct = sg.slice_gctoo(in_gct, row_bool=bools_of_remaining)

    if args.replace_with == "zero":
        in_gct.data_df.fillna(0, inplace=True)

    elif args.replace_with == "median":
        probe_medians = in_gct.data_df.median(axis=1)

        for row_idx, row in enumerate(in_gct.data_df.values):
            this_row = in_gct.data_df.iloc[row_idx, :]
            this_row[this_row.isnull()] = probe_medians[row_idx]
            in_gct.data_df.iloc[row_idx, :] = this_row

    elif args.replace_with == "mean":
        probe_means = in_gct.data_df.mean(axis=1)

        for row_idx, row in enumerate(in_gct.data_df.values):
            this_row = in_gct.data_df.iloc[row_idx, :]
            this_row[this_row.isnull()] = probe_means[row_idx]
            in_gct.data_df.iloc[row_idx, :] = this_row

    wg.write(in_gct, args.out_name, filler_null="NA")
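The per-row imputation loops above can be collapsed into a single vectorized pandas call with the same semantics (fill each row's NaNs with that row's median or mean); a sketch of the equivalent:

    # Vectorized equivalent of the "median" branch
    in_gct.data_df = in_gct.data_df.apply(
        lambda row: row.fillna(row.median()), axis=1)

    # Vectorized equivalent of the "mean" branch
    in_gct.data_df = in_gct.data_df.apply(
        lambda row: row.fillna(row.mean()), axis=1)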
Example #6
    def test_main(self):
        out_name = os.path.join(FUNCTIONAL_TESTS_PATH, "test_main_out.gct")

        gctoo = GCToo.GCToo(data_df=self.data_df,
                            row_metadata_df=self.row_metadata_df,
                            col_metadata_df=self.col_metadata_df)
        wg.write(gctoo,
                 out_name,
                 data_null="NaN",
                 metadata_null="-666",
                 filler_null="-666")

        # Read in the gct and verify that it's the same as gctoo
        new_gct = pg.parse(out_name)

        pd.util.testing.assert_frame_equal(new_gct.data_df, gctoo.data_df)
        pd.util.testing.assert_frame_equal(new_gct.row_metadata_df,
                                           gctoo.row_metadata_df)
        pd.util.testing.assert_frame_equal(new_gct.col_metadata_df,
                                           gctoo.col_metadata_df)

        # Also check that missing values were written to the file as expected
        in_df = pd.read_csv(out_name,
                            sep="\t",
                            skiprows=2,
                            keep_default_na=False)
        self.assertEqual(in_df.iloc[0, 1], "-666")
        self.assertEqual(in_df.iloc[5, 6], "NaN")

        # Cleanup
        os.remove(out_name)
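For reference, a minimal sketch of constructing a GCToo like the one this test writes, assuming the standard cmapPy convention that row metadata is indexed by the data matrix's row ids and column metadata by its column ids (all ids and field names here are hypothetical):

import pandas as pd
from cmapPy.pandasGEXpress import GCToo

data_df = pd.DataFrame([[1.0, 2.0], [3.0, float("nan")]],
                       index=["rid1", "rid2"],     # row ids
                       columns=["cid1", "cid2"])   # column ids
row_metadata_df = pd.DataFrame({"pr_name": ["a", "b"]}, index=["rid1", "rid2"])
col_metadata_df = pd.DataFrame({"cell_id": ["x", "y"]}, index=["cid1", "cid2"])

gctoo = GCToo.GCToo(data_df=data_df,
                    row_metadata_df=row_metadata_df,
                    col_metadata_df=col_metadata_df)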
Example #7
def main(args):
    """ The main method. """

    # Import gct
    in_gct = parse.parse(args.in_gct_path)

    # Create the separated gcts
    (out_gcts, out_gct_prefixes) = separate(in_gct, args.separate_field,
                                            args.row_or_col)

    # Save the returned gcts
    for gct, name in zip(out_gcts, out_gct_prefixes):
        full_out_name = os.path.join(
            args.out_dir,
            args.out_name_prefix + str(name) + args.out_name_suffix)

        # Write to GCT or GCTX depending on extension
        if str.lower(os.path.splitext(full_out_name)[1]) == ".gct":
            wg.write(gct,
                     full_out_name,
                     data_null="NaN",
                     metadata_null="NA",
                     filler_null="NA")
        elif str.lower(os.path.splitext(full_out_name)[1]) == ".gctx":
            wgx.write(gct, full_out_name)
        else:
            raise Exception(
                "out_name_suffix must end in either .gct or .gctx. "
                "out_name_suffix: {}".format(args.out_name_suffix))
Example #8
File: card.py Project: cmap/merino
def reader_writer(input_file, output_file, function, check_size=False):
    plate_failure = False
    # Read in input file
    gctoo = pe.parse(input_file)
    # Call normalizing function on gctoo
    new_gctoo = function(gctoo)

    new_gctoo = drop_nans(new_gctoo)
    if new_gctoo == 'empty_plate':
        logger.debug("{} has no usable data and has not been written.".format(
            os.path.basename(output_file)))
        plate_failure = True
        return plate_failure

    # If told to, check size of new_gctoo and flag if too small
    if check_size and new_gctoo.data_df.shape[1] <= 349:
        logger.debug('{} Plate Failure With {} Failed Wells'.format(
            os.path.basename(os.path.dirname(input_file)),
            384 - new_gctoo.data_df.shape[1]))
        plate_failure = True

    # write out new gctoo
    wgx.write(new_gctoo, out_fname=output_file)
    logger.debug("{} file written.".format(output_file))

    return plate_failure
Example #9
def main():
    # get args
    args = build_parser().parse_args(sys.argv[1:])
    setup_logger.setup(verbose=args.verbose)
    logger.debug("args:  {}".format(args))

    # Get files directly
    if args.input_filepaths is not None:
        files = args.input_filepaths

    # Or find them
    else:
        files = get_file_list(args.file_wildcard)

        # No files found
        if len(files) == 0:
            msg = "No files were found. args.file_wildcard: {}".format(
                args.file_wildcard)
            logger.error(msg)
            raise Exception(msg)

    # Only 1 file found
    if len(files) == 1:
        logger.warning(
            "Only 1 file found. No concatenation needs to be done, exiting")
        return

    # More than 1 file found
    else:
        # Parse each file and append to a list
        gctoos = []
        for f in files:
            gctoos.append(parse(f))

        # Create concatenated gctoo object
        if args.concat_direction == "horiz":
            out_gctoo = hstack(gctoos, args.remove_all_metadata_fields,
                               args.error_report_output_file,
                               args.fields_to_remove, args.reset_ids)

        elif args.concat_direction == "vert":
            out_gctoo = vstack(gctoos, args.remove_all_metadata_fields,
                               args.error_report_output_file,
                               args.fields_to_remove, args.reset_ids)

    # Write out_gctoo to file
    logger.info("Writing to output file args.out_name:  {}".format(
        args.out_name))

    if args.out_type == "gctx":
        write_gctx.write(out_gctoo, args.out_name)

    elif args.out_type == "gct":
        write_gct.write(out_gctoo,
                        args.out_name,
                        filler_null=args.filler_null,
                        metadata_null=args.metadata_null,
                        data_null=args.data_null)
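Example #11 below is essentially this same concatenation routine factored out as concat_main(), so that the logic can be tested and reused without going through argument parsing; the file discovery, stacking, and writing steps are otherwise unchanged.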
Example #10
File: steep.py Project: yuanjun/psp
def main(args):

    # Read in the first gct
    gct1 = parse(args.in_gct_path)

    # If second gct provided, compute similarity between 2 gcts
    if args.in_gct2_path is not None:
        logger.info(
            "in_gct2_path was provided. Will compute pairwise similarities " +
            "between the columns of in_gct and in_gct2.")

        # Read in the second gct
        gct2 = parse(args.in_gct2_path)

        # Compute similarities between gct1 and gct2
        out_df = compute_similarity_bw_two_dfs(gct1.data_df, gct2.data_df,
                                               args.similarity_metric)

        # Output rows correspond to the columns of gct1 and output columns to
        # the columns of gct2, so both metadata dfs come from column metadata
        row_metadata_df = gct1.col_metadata_df
        col_metadata_df = gct2.col_metadata_df

        # Append column to both metadata_dfs indicating which similarity_metric was used
        row_metadata_df[SIMILARITY_METRIC_FIELD] = args.similarity_metric
        col_metadata_df[SIMILARITY_METRIC_FIELD] = args.similarity_metric

        # Assemble output gct
        out_gct = GCToo.GCToo(out_df, row_metadata_df, col_metadata_df)

    # If only 1 gct provided, compute similarities between the columns of gct1
    else:
        out_df = compute_similarity_within_df(gct1.data_df,
                                              args.similarity_metric)

        # Row and column metadata are both from gct1
        metadata_df = gct1.col_metadata_df

        # Append column to metadata_df indicating which similarity_metric was used
        metadata_df[SIMILARITY_METRIC_FIELD] = args.similarity_metric

        # Assemble output gct
        out_gct = GCToo.GCToo(out_df, metadata_df, metadata_df)

    # Write output gct
    if os.path.splitext(args.out_name)[1] == ".gct":
        wg.write(out_gct,
                 args.out_name,
                 data_null="NaN",
                 metadata_null="NA",
                 filler_null="NA")
    elif os.path.splitext(args.out_name)[1] == ".gctx":
        wgx.write(out_gct, args.out_name)
    else:
        raise Exception(
            "out_name must end in .gct or .gctx. out_name: {}".format(
                args.out_name))
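Example #17 below repeats this routine with two differences: the gcts are parsed with convert_neg_666=False and make_multiindex=True, and the result is always written as a GCT rather than branching on the output extension.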
Example #11
def concat_main(args):
    """ Separate method from main() in order to make testing easier and to
    enable command-line access. """

    # Get files directly
    if args.input_filepaths is not None:
        files = args.input_filepaths

    # Or find them
    else:
        files = get_file_list(args.file_wildcard)

        # No files found
        if len(files) == 0:
            msg = "No files were found. args.file_wildcard: {}".format(
                args.file_wildcard)
            logger.error(msg)
            raise Exception(msg)

    # Only 1 file found
    if len(files) == 1:
        logger.warning(
            "Only 1 file found. No concatenation needs to be done, exiting")
        return

    # More than 1 file found
    else:
        # Parse each file and append to a list
        gctoos = []
        for f in files:
            gctoos.append(parse.parse(f))

        # Create concatenated gctoo object
        if args.concat_direction == "horiz":
            out_gctoo = hstack(gctoos, args.remove_all_metadata_fields,
                               args.error_report_output_file,
                               args.fields_to_remove, args.reset_ids)

        elif args.concat_direction == "vert":
            out_gctoo = vstack(gctoos, args.remove_all_metadata_fields,
                               args.error_report_output_file,
                               args.fields_to_remove, args.reset_ids)

    # Write out_gctoo to file
    logger.info("Writing to output file args.out_name:  {}".format(
        args.out_name))

    if args.out_type == "gctx":
        write_gctx.write(out_gctoo, args.out_name)

    elif args.out_type == "gct":
        write_gct.write(out_gctoo,
                        args.out_name,
                        filler_null=args.filler_null,
                        metadata_null=args.metadata_null,
                        data_null=args.data_null)
Example #12
def main(args):

    gct = parse.parse(args.in_gct_path)

    (_, conn_gct) = do_steep_and_sip(
        gct, args.similarity_metric,
        args.connectivity_metric, args.fields_to_aggregate)

    # Write output gct
    wg.write(conn_gct, args.out_sip_name, data_null="NaN", filler_null="NaN",
             metadata_null="NaN")
Example #13
def save_data(adj_ds, adj_list):
    """ Write batch-adjusted data to files
    """
    wg.write(adj_ds, 'batch_adjusted_values.gct')
    for ctr, this_ds in enumerate(adj_list):
        if this_ds.src is not None:
            out_file = '{}.COMBAT.gct'.format(os.path.splitext(os.path.basename(this_ds.src))[0])
        else:
            out_file = 'batch_adjusted_values_X{}.gct'.format(ctr)
        wg.write(this_ds, out_file)
Example #14
def main():
    args = build_parser().parse_args(sys.argv[1:])
    setup_logger.setup(verbose=args.verbose)
    in_gctoo = parse_gctx.parse(args.filename, convert_neg_666=False)
    if args.output_filepath is None:
        basename = os.path.basename(args.filename)
        out_name = ".".join(basename.split(".")[:-1])
    else:
        out_name = args.output_filepath

    write_gct.write(in_gctoo, out_name)
Example #15
def gctx2gct_main(args):
    """ Separate from main() in order to make command-line tool. """

    in_gctoo = parse_gctx.parse(args.filename, convert_neg_666=False)

    if args.output_filepath is None:
        basename = os.path.basename(args.filename)
        out_name = os.path.splitext(basename)[0] + ".gct"
    else:
        out_name = args.output_filepath

    write_gct.write(in_gctoo, out_name)
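Examples #14 and #15 both presuppose a build_parser() helper. A hypothetical sketch of that wiring, with flag names inferred from the attributes the examples access (args.filename, args.output_filepath, args.verbose):

import argparse

def build_parser():
    # Hypothetical parser matching the attributes used in Examples #14-15
    parser = argparse.ArgumentParser(description="Convert a GCTX file to GCT.")
    parser.add_argument("filename", help="path to the input .gctx file")
    parser.add_argument("-o", "--output_filepath", default=None,
                        help="output path (default: derived from the input name)")
    parser.add_argument("-v", "--verbose", action="store_true",
                        help="enable debug logging")
    return parser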
Example #16
def main(args):

    # Find files
    full_path_wildcard = args.in_dir + args.file_wildcard
    gct_paths = glob.glob(full_path_wildcard)

    assert len(gct_paths) > 1, (
        "Fewer than 2 gct files were found. full_path_wildcard: {}".format(
            full_path_wildcard))

    # Extract prefixes in order to use them later for saving
    prefixes = [(os.path.basename(path)).split(args.prefix_separator)[0]
                for path in gct_paths]

    for path, prefix in zip(gct_paths, prefixes):
        print "path: {}".format(path)
        print "prefix: {}".format(prefix)

    # Import gcts
    gctoos = [parse(x) for x in gct_paths]

    assert len(gctoos) > 1, "gct_paths: {}".format(gct_paths)

    # Compute & save ranks
    for g, prefix in zip(gctoos, prefixes):

        # Extract data_df
        score_df = g.data_df

        # Must be square
        assert score_df.shape[0] == score_df.shape[1], (
            "Input dataframe must be square.")

        # Set diagonal to NaN
        np.fill_diagonal(score_df.values, np.nan)

        # Rank the matrix (percentile score or not)
        if args.do_percentile_rank:
            rank_df = score_df.rank(ascending=False, pct=True) * 100
        else:
            rank_df = score_df.rank(ascending=False)

        # Make a GCToo
        rank_gctoo = GCToo.GCToo(data_df=rank_df,
                                 row_metadata_df=g.row_metadata_df,
                                 col_metadata_df=g.col_metadata_df)

        # Save the rank_df to file
        out_name = args.out_dir + prefix + args.output_suffix
        wg.write(rank_gctoo,
                 out_name,
                 filler_null="NaN",
                 data_null="NaN",
                 metadata_null="NaN")
Example #17
def main(args):

    # Read in the first gct
    gct1 = parse(args.in_gct_path, convert_neg_666=False, make_multiindex=True)

    # If second gct provided, compute similarity between 2 gcts
    if args.in_gct2_path is not None:
        logger.info(
            "in_gct2_path was provided. Will compute pairwise similarities " +
            "between the columns of in_gct and in_gct2.")

        # Read in the second gct
        gct2 = parse(args.in_gct2_path,
                     convert_neg_666=False,
                     make_multiindex=True)

        # Compute similarities between gct1 and gct2
        out_df = compute_similarity_bw_two_dfs(gct1.data_df, gct2.data_df,
                                               args.similarity_metric)

        # Output rows correspond to the columns of gct1 and output columns to
        # the columns of gct2, so both metadata dfs come from column metadata
        row_metadata_df = gct1.col_metadata_df
        col_metadata_df = gct2.col_metadata_df

        # Append column to both metadata_dfs indicating which similarity_metric was used
        row_metadata_df[SIMILARITY_METRIC_FIELD] = args.similarity_metric
        col_metadata_df[SIMILARITY_METRIC_FIELD] = args.similarity_metric

        # Assemble output gct
        out_gct = GCToo.GCToo(out_df, row_metadata_df, col_metadata_df)

    # If only 1 gct provided, compute similarities between the columns of gct1
    else:
        out_df = compute_similarity_within_df(gct1.data_df,
                                              args.similarity_metric)

        # Row and column metadata are both from gct1
        metadata_df = gct1.col_metadata_df

        # Append column to metadata_df indicating which similarity_metric was used
        metadata_df[SIMILARITY_METRIC_FIELD] = args.similarity_metric

        # Assemble output gct
        out_gct = GCToo.GCToo(out_df, metadata_df, metadata_df)

    # Write output gct
    wg.write(out_gct,
             args.out_name,
             data_null="NaN",
             metadata_null="NA",
             filler_null="NA")
Example #18
File: weave.py Project: cmap/merino
def write_outputs(top_level_dir, weights, cb_weights, modZ_GCT, cb_modZ_GCT, cc_q75_df, cb_cc_q75_df, rep_set, input_type):

    if not os.path.exists(top_level_dir):
        os.mkdir(top_level_dir)


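    # NOTE: the two cb_weights writes below reuse the exact filenames of the
    # weights writes directly above them, so each ComBat-adjusted file
    # overwrites its unadjusted counterpart; this looks unintentional.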
    weights[0].to_csv(os.path.join(top_level_dir,rep_set + '_'+input_type+ '_norm_weights.txt'), sep='\t')
    cb_weights[0].to_csv(os.path.join(top_level_dir,rep_set + '_'+input_type+ '_norm_weights.txt'), sep='\t')
    weights[1].to_csv(os.path.join(top_level_dir,rep_set + '_'+input_type+ '_raw_weights.txt'), sep='\t')
    cb_weights[1].to_csv(os.path.join(top_level_dir,rep_set + '_'+input_type+ '_raw_weights.txt'), sep='\t')
    wg.write(modZ_GCT, os.path.join(top_level_dir,rep_set + '_MODZ.{}'.format(input_type.split('.')[0])))
    wg.write(cb_modZ_GCT, os.path.join(top_level_dir,rep_set + '_MODZ.{}.COMBAT'.format(input_type.split('.')[0])))
    cc_q75_df.to_csv(os.path.join(top_level_dir,rep_set + '_' + 'MODZ.{}_cc_q75.txt'.format(input_type.split('.')[0])), sep='\t')
    cb_cc_q75_df.to_csv(os.path.join(top_level_dir,rep_set + '_' + 'MODZ.' + input_type + '.COMBAT' + '_cc_q75.txt'), sep='\t')
Example #19
File: weave.py Project: cmap/merino
def weave(proj_dir, replicate_set_name, args, input_type='ZSPC', nprofile_drop=True):

    gct_list = define_replicate_set_files_and_parse(proj_dir, input_type, replicate_set_name)

    if gct_list is False:
        return

    if args.aggregate_output:
        top_level_dir = os.path.join(proj_dir, "weave")
    else:
        top_level_dir = os.path.join(proj_dir, 'weave', replicate_set_name)

    reload(distil)

    group_by_list = args.group_by.split(',')

    #Perform ComBat adjustment
    if args.davepool_combat:

        all_ds, pre_list = batch_adjust.combat_by_group(gct_list, col_group=group_by_list, batch_field='davepool_id')
        all_ds, combat_adjusted_gct_list = batch_adjust.combat_by_group(pre_list, col_group=group_by_list, batch_field='pool_id')

    else:
        all_ds, combat_adjusted_gct_list = batch_adjust.combat_by_group(gct_list, col_group=group_by_list, batch_field='pool_id')
        logger.debug("sample combat adjusted gct shape {}".format(combat_adjusted_gct_list[0].data_df.shape))

    # Write out ComBat adjusted GCTs
    for combat_adjusted_gct in combat_adjusted_gct_list:
        replicate_name = combat_adjusted_gct.col_metadata_df['prism_replicate'].unique()[0]
        wg.write(combat_adjusted_gct, os.path.join(proj_dir, 'card', replicate_name,replicate_name + '_' + input_type + '.COMBAT.gct'))


    if args.skip is not None:
        modZ_GCT, cc_q75_df, weights = distil.calculate_modz(gct_list, group_by=group_by_list, skip=json.loads(args.skip))
        cb_modZ_GCT, cb_cc_q75_df, cb_weights = distil.calculate_modz(combat_adjusted_gct_list, group_by=group_by_list, skip=json.loads(args.skip))

    else:
        modZ_GCT, cc_q75_df, weights = distil.calculate_modz(gct_list, group_by=group_by_list)
        cb_modZ_GCT, cb_cc_q75_df, cb_weights = distil.calculate_modz(combat_adjusted_gct_list, group_by=group_by_list)

    # Filter out signatures where nprofile = 1
    if nprofile_drop:
        (modZ_GCT, cc_q75_df, cb_modZ_GCT, cb_cc_q75_df) = drop_less_than_2_replicates(modZ_GCT, cc_q75_df, cb_modZ_GCT, cb_cc_q75_df)

    # outfile = os.path.join(top_level_dir, 'MODZ.{}'.format(input_type), replicate_set_name)
    # cb_outfile = os.path.join(top_level_dir, 'MODZ.{}.COMBAT'.format(input_type), replicate_set_name)

    write_outputs(top_level_dir,weights, cb_weights, modZ_GCT, cb_modZ_GCT, cc_q75_df, cb_cc_q75_df, replicate_set_name, input_type)
Example #20
def main(args):

    # Import data
    in_gct = parse(args.in_gct_path)

    # Compute distance df
    dist_df = 1 - in_gct.data_df

    # Create distance gct
    dist_gct = GCToo.GCToo(dist_df, in_gct.row_metadata_df,
                           in_gct.col_metadata_df)

    # Write dist_gct to file
    wg.write(dist_gct, args.out_name, filler_null="NA")
Example #21
def contin_renorm_main(args):
    """ Separate method from main() in order to make testing easier and to
    enable command-line access. """

    gct = parse.parse(args.in_gct_path)
    data_df = gct.data_df.copy(deep=True)
    row_metadata_df = gct.row_metadata_df.copy(deep=True)
    col_metadata_df = gct.col_metadata_df.copy(deep=True)

    ### hack to remove rows that are all NA values

    data_df = data_df.loc[(
        data_df.isnull().apply(np.sum, axis=1) < data_df.shape[1]), :]

    # enrichment_score
    es = col_metadata_df.loc[:, "det_well_enrichment_score"].copy(deep=True)

    # calculate lim as x approaches 1 for non-median normalized data
    pep_y_offsets = calc_y_offsets(data_df, es)

    # calculate the fit_params
    fit_params = calc_fit(data_df, es, pep_y_offsets)

    # annotate which need to be renormalized
    row_metadata_df["is_log_renormed"] = is_log_renormed(
        fit_params.loc[:, "deg1"].apply(get_slope), args.slope_cutoff)

    # calculate the offset matrix
    offset_mat = calc_pep_samp_offsets(data_df, row_metadata_df, es,
                                       fit_params, pep_y_offsets)

    # calculate the output data
    out_data_df = calc_out_mat(data_df, offset_mat)

    # add the metadata field
    col_metadata_df["renorm_correction"] = calc_tot_samp_offsets(offset_mat)

    # write the file
    write_gct.write(
        GCToo.GCToo(data_df=out_data_df,
                    col_metadata_df=col_metadata_df,
                    row_metadata_df=row_metadata_df), args.out_name)
Example #22
def continuous_renormalization(args):

    # Read in GCT, if path provided, and make deep copies of all DataFrames
    if args.in_gct_path:
        gct = parse.parse(args.in_gct_path)
    else:
        gct = args.in_gct
    data_df = gct.data_df.copy(deep=True)
    row_metadata_df = gct.row_metadata_df.copy(deep=True)
    col_metadata_df = gct.col_metadata_df.copy(deep=True)

    # Remove rows that are all NA values
    data_df = data_df.loc[(data_df.isnull().apply(np.sum, axis=1) < data_df.shape[1]), :]

    # Pull out enrichment scores from column metadata dataframe
    enrichment_scores = col_metadata_df.loc[:, "det_well_enrichment_score"].copy(deep=True)

    # Calculate limit as x approaches 1 for non-median normalized data
    pep_y_offsets = calculate_y_offsets(data_df, enrichment_scores)
    
    # Calculate the fit parameters
    fit_parameters = calculate_fit(data_df, enrichment_scores, pep_y_offsets)

    # Annotate which rows will be renormalized based on slope_cutoff argument (default 0.2)
    row_metadata_df["is_log_renormed"] = is_log_renormed(fit_parameters.loc[:, "deg1"].apply(get_slope),
                                                         args.slope_cutoff)
    
    # Calculate the offset matrix
    offset_mat = calculate_peptide_sample_offsets(data_df, row_metadata_df, enrichment_scores, fit_parameters,
                                                  pep_y_offsets)

    # Calculate the output DataFrame
    out_data_df = calculate_out_matrix(data_df, offset_mat)

    # Add the 'renorm_correction' metadata field with total sample offset values
    col_metadata_df["renorm_correction"] = calculate_total_sample_offsets(offset_mat)

    # Output
    if args.write_gct:
        write_gct.write(GCToo.GCToo(data_df=out_data_df,
                                col_metadata_df=col_metadata_df,
                                row_metadata_df=row_metadata_df),
                        args.out_name)
    else:
        return GCToo.GCToo(data_df=out_data_df,
                           col_metadata_df=col_metadata_df,
                           row_metadata_df=row_metadata_df)
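Example #22 is a revised version of Example #21: the helpers get spelled-out names (calculate_y_offsets instead of calc_y_offsets, and so on), the gct can be passed in memory via args.in_gct instead of a path, and the result is either written to file or returned as a GCToo depending on args.write_gct.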
Example #23
    def test_p100_functional(self):
        p100_in_path = os.path.join(FUNCTIONAL_TESTS_PATH, "test_p100.gct")
        p100_out_path = os.path.join(FUNCTIONAL_TESTS_PATH, "test_p100_writing.gct")

        # Read in original gct file
        p100_in_gct = pg.parse(p100_in_path)

        # Read in new gct file
        wg.write(p100_in_gct, p100_out_path)
        p100_out_gct = pg.parse(p100_out_path)

        self.assertTrue(p100_in_gct.data_df.equals(p100_out_gct.data_df))
        self.assertTrue(p100_in_gct.row_metadata_df.equals(p100_out_gct.row_metadata_df))
        self.assertTrue(p100_in_gct.col_metadata_df.equals(p100_out_gct.col_metadata_df))

        # Clean up
        os.remove(p100_out_path)
Example #24
def main(prism_replicate_name, outfile, all_perturbagens,
         davepool_data_objects, prism_cell_list):
    # Build one-to-many mapping between davepool ID and the multiple PRISM cell lines that are within that davepool
    davepool_id_to_cells_map = build_davepool_id_to_cells_map(prism_cell_list)

    # Put all the data in gct-able form
    (all_median_data_by_cell,
     all_count_data_by_cell) = process_data(davepool_data_objects,
                                            davepool_id_to_cells_map)

    # Create full outfile, build the gct, and write it out!
    median_outfile = os.path.join(outfile, "assemble", prism_replicate_name,
                                  prism_replicate_name + "_MEDIAN.gct")
    median_gctoo = build_gctoo(prism_replicate_name, all_perturbagens,
                               all_median_data_by_cell)
    write_gct.write(median_gctoo,
                    median_outfile,
                    data_null=_NaN,
                    filler_null=_null)

    # Write Inst info file
    instinfo_outfile = os.path.join(outfile, "assemble", prism_replicate_name,
                                    prism_replicate_name + "_inst_info.txt")
    inst = median_gctoo.col_metadata_df

    logger.info("Formatting instinfo pert_dose")
    # cast pert_dose field to str
    inst['pert_dose'] = inst['pert_dose'].apply(process_pert_doses)

    if 'pert_idose' in inst.columns:
        logger.info("Formatting instinfo pert_idose")
        inst['pert_idose'] = inst['pert_idose'].apply(process_pert_idoses)

    inst.to_csv(instinfo_outfile, sep='\t')
    logger.info("Instinfo has been written to {}".format(instinfo_outfile))

    count_outfile = os.path.join(outfile, "assemble", prism_replicate_name,
                                 prism_replicate_name + "_COUNT.gct")
    count_gctoo = build_gctoo(prism_replicate_name, all_perturbagens,
                              all_count_data_by_cell)
    write_gct.write(count_gctoo,
                    count_outfile,
                    data_null=_NaN,
                    filler_null=_null)
Example #25
def mk_cell_metadata(args, failed_plates):
    if args.aggregate_out:
        paths = glob.glob(
            os.path.join(args.proj_dir, args.search_pattern, 'card', '*',
                         '*NORM.gct'))
        mfi_paths = glob.glob(
            os.path.join(args.proj_dir, args.search_pattern, 'assemble', '*',
                         '*MEDIAN.gct'))
    else:
        paths = glob.glob(
            os.path.join(args.proj_dir, 'card', args.search_pattern,
                         '*NORM.gct'))
        mfi_paths = glob.glob(
            os.path.join(args.proj_dir, 'assemble', args.search_pattern,
                         '*MEDIAN.gct'))

    cell_temp = pe.parse(mfi_paths[0])
    cell_temp.row_metadata_df.to_csv(os.path.join(
        args.build_folder, args.cohort_name + '_cell_info.txt'),
                                     sep='\t')

    # Calculate SSMD matrix using paths that were just grabbed and write out
    ssmd_mat = ssmd.ssmd_matrix(cut_to_l2.cut_l1(paths))

    ssmd_gct = GCToo.GCToo(
        data_df=ssmd_mat,
        col_metadata_df=pd.DataFrame(index=ssmd_mat.columns),
        row_metadata_df=pd.DataFrame(index=ssmd_mat.index))
    wg.write(
        ssmd_gct,
        os.path.join(
            args.build_folder,
            args.cohort_name + '_ssmd_matrix_n{}_{}.gct'.format(
                ssmd_gct.data_df.shape[1], ssmd_gct.data_df.shape[0])))

    # Flag plates whose median SSMD falls below 2
    plate_medians = ssmd_gct.data_df.median()
    ssmd_failures = plate_medians[plate_medians < 2].index.tolist()
    fails_dict = {
        'dropout_failures': failed_plates,
        'ssmd_failures': ssmd_failures
    }
    fails_df = pd.DataFrame(
        dict([(k, pd.Series(v)) for k, v in fails_dict.items()]))
    fails_df.to_csv(os.path.join(args.build_folder, 'failed_plates.txt'),
                    sep='\t',
                    index=False)
Example #26
def main(args):
    """ The main method. """

    # Import gct
    in_gct = parse(args.in_gct_path)

    # Create the separated gcts
    (out_gcts, out_gct_prefixes) = separate(in_gct, args.separate_field,
                                            args.row_or_col)

    # Save the returned gcts
    for gct, name in zip(out_gcts, out_gct_prefixes):
        full_out_name = os.path.join(
            args.out_dir,
            args.out_name_prefix + str(name) + args.out_name_suffix)
        wg.write(gct,
                 full_out_name,
                 data_null="NaN",
                 metadata_null="NA",
                 filler_null="NA")
Example #27
File: dry.py Project: hrk2109/psp
def write_output_gct(gct, out_dir, out_gct_name, data_null, filler_null):
    """Write output gct file.

    Args:
        gct (GCToo object)
        out_dir (string): path to save directory
        out_gct_name (string): name of output gct
        data_null (string): string with which to represent NaN in data
        filler_null (string): string with which to fill the empty top-left quadrant in the output gct

    Returns:
        None

    """
    out_fname = os.path.join(out_dir, out_gct_name)
    wg.write(gct,
             out_fname,
             data_null=data_null,
             filler_null=filler_null,
             data_float_format=None)
Example #28
def main(args):
    """ The main method. """

    # Read test gct
    test_gct = parse(args.test_gct_path, convert_neg_666=False, make_multiindex=True)

    # Read bg_gct
    bg_gct = parse(args.bg_gct_path, convert_neg_666=False, make_multiindex=True)

    # Create an aggregated metadata field for index and columns of both gcts
    # and sort by that field
    (test_df, bg_df) = prepare_multi_index_dfs(
        test_gct.multi_index_df, bg_gct.multi_index_df,
        args.fields_to_aggregate_in_test_gct_queries,
        args.fields_to_aggregate_in_test_gct_targets,
        args.fields_to_aggregate_in_bg_gct,
        QUERY_FIELD_NAME,
        TARGET_FIELD_NAME,
        args.separator)

    # Check symmetry
    (is_test_df_sym, _) = check_symmetry(test_gct.multi_index_df, bg_gct.multi_index_df)

    # Compute connectivity
    (conn_mi_df, signed_conn_mi_df) = compute_connectivities(
        test_df, bg_df, QUERY_FIELD_NAME, TARGET_FIELD_NAME, TARGET_FIELD_NAME,
        args.connectivity_metric, is_test_df_sym)

    # Convert multi-index to component dfs in order to write output gct
    (signed_data_df, signed_row_metadata_df, signed_col_metadata_df) = (
        GCToo.multi_index_df_to_component_dfs(
            signed_conn_mi_df, rid=TARGET_FIELD_NAME, cid=QUERY_FIELD_NAME))

    # Append to queries a new column saying what connectivity metric was used
    add_connectivity_metric_to_metadata(signed_col_metadata_df, args.connectivity_metric, CONNECTIVITY_METRIC_FIELD)
    add_connectivity_metric_to_metadata(signed_row_metadata_df, args.connectivity_metric, CONNECTIVITY_METRIC_FIELD)

    # Create gct and write it to file
    conn_gct = GCToo.GCToo(data_df=signed_data_df, row_metadata_df=signed_row_metadata_df, col_metadata_df=signed_col_metadata_df)
    wg.write(conn_gct, args.out_name, data_null="NaN", filler_null="NaN", metadata_null="NaN")
Example #29
def main(args):
    """ The main method. """

    # Read test gct
    test_gct = parse(args.test_gct_path)

    # Read bg_gct
    bg_gct = parse(args.bg_gct_path)

    # Check symmetry
    (is_test_df_sym, _) = check_symmetry(test_gct.data_df, bg_gct.data_df)

    # Create an aggregated metadata field in test and background GCTs
    # that will be used to aggregate replicates
    (test_gct, bg_gct) = create_aggregated_fields_in_GCTs(
        test_gct, bg_gct, args.fields_to_aggregate_in_test_gct_queries,
        args.fields_to_aggregate_in_test_gct_targets,
        args.fields_to_aggregate_in_bg_gct, QUERY_FIELD_NAME,
        TARGET_FIELD_NAME, args.separator)

    # Compute connectivity
    (conn_gct, signed_conn_gct) = compute_connectivities(
        test_gct, bg_gct, QUERY_FIELD_NAME, TARGET_FIELD_NAME,
        TARGET_FIELD_NAME, args.connectivity_metric, is_test_df_sym,
        args.separator)

    # Append to queries a new column saying what connectivity metric was used
    add_connectivity_metric_to_metadata(signed_conn_gct.col_metadata_df,
                                        args.connectivity_metric,
                                        CONNECTIVITY_METRIC_FIELD)
    add_connectivity_metric_to_metadata(signed_conn_gct.row_metadata_df,
                                        args.connectivity_metric,
                                        CONNECTIVITY_METRIC_FIELD)

    # Write signed result to file
    wg.write(signed_conn_gct,
             args.out_name,
             data_null="NaN",
             filler_null="NaN",
             metadata_null="NaN")
Example #30
def subset_main(args):
    """ Separate method from main() in order to make testing easier and to
    enable command-line access. """

    # Read in each of the command line arguments
    rid = _read_arg(args.rid)
    cid = _read_arg(args.cid)
    exclude_rid = _read_arg(args.exclude_rid)
    exclude_cid = _read_arg(args.exclude_cid)

    # If GCT, use subset_gctoo
    if args.in_path.endswith(".gct"):

        in_gct = parse_gct.parse(args.in_path)
        out_gct = sg.subset_gctoo(in_gct,
                                  rid=rid,
                                  cid=cid,
                                  exclude_rid=exclude_rid,
                                  exclude_cid=exclude_cid)

    # If GCTx, use parse_gctx
    else:

        if (exclude_rid is not None) or (exclude_cid is not None):
            msg = "exclude_{rid,cid} args not currently supported for parse_gctx."
            raise Exception(msg)

        logger.info("Using hyperslab selection functionality of parse_gctx...")
        out_gct = parse_gctx.parse(args.in_path, rid=rid, cid=cid)

    # Write the output gct
    if args.out_type == "gctx":
        wgx.write(out_gct, args.out_name)
    else:
        wg.write(out_gct,
                 args.out_name,
                 data_null="NaN",
                 metadata_null="NA",
                 filler_null="NA")