Example #1
    def test_main2(self):
        input_gct_path = os.path.join(FUNCTIONAL_TESTS_DIR,
                                      "test_introspect_main.gct")
        output_gct_path = os.path.join(FUNCTIONAL_TESTS_DIR,
                                       "test_introspect_main_out2.gct")
        expected_gct_path = os.path.join(FUNCTIONAL_TESTS_DIR,
                                         "test_introspect_main_expected2.gct")

        args_string = "-i {} -o {} -fa moa".format(input_gct_path,
                                                   output_gct_path)
        args = introspect.build_parser().parse_args(args_string.split())

        introspect.main(args)

        # Read in output and expected gcts and confirm that they're equal
        output_gct = parse.parse(output_gct_path)
        expected_gct = parse.parse(expected_gct_path)

        pd.util.testing.assert_almost_equal(expected_gct.data_df,
                                            output_gct.data_df,
                                            check_less_precise=True)
        pd.util.testing.assert_frame_equal(expected_gct.row_metadata_df,
                                           output_gct.row_metadata_df)
        pd.util.testing.assert_frame_equal(expected_gct.col_metadata_df,
                                           output_gct.col_metadata_df)

        # Clean up
        os.remove(output_gct_path)
Example #2
def main(args):

    # Parse input gcts
    external_gct = parse.parse(args.external_gct_path)
    internal_gct = parse.parse(args.internal_gct_path)
    bg_gct = parse.parse(args.bg_gct_path)

    # Meat of the script
    (sim_gct, conn_gct) = do_steep_and_sip(
        external_gct, internal_gct, bg_gct, args.similarity_metric,
        args.connectivity_metric,
        args.fields_to_aggregate_for_external_profiles,
        args.fields_to_aggregate_for_internal_profiles)

    # Write output gcts
    wg.write(sim_gct,
             args.out_steep_name,
             data_null="NaN",
             metadata_null="NaN",
             filler_null="NaN")
    wg.write(conn_gct,
             args.out_sip_name,
             data_null="NaN",
             filler_null="NaN",
             metadata_null="NaN")
Example #3
    def test_main(self):
        gct_path = os.path.join(functional_tests_dir,
                                "test_annotate_gct_from_mapping_in.gct")
        mapping_path = os.path.join(functional_tests_dir,
                                    "test_annotate_gct_from_mapping.tsv")
        expected_gct_path = os.path.join(
            functional_tests_dir,
            "test_annotate_gct_from_mapping_expected.gct")
        out_path = os.path.join(functional_tests_dir,
                                "test_annotate_gct_from_mapping_out.gct")

        args_string = "-i {} -m {} -o {} -f {}".format(gct_path, mapping_path,
                                                       out_path, "pert_iname")
        args = agfm.build_parser().parse_args(args_string.split())

        agfm.main(args)

        # Read in expected and actual outputs
        e_gct = parse.parse(expected_gct_path)
        out_gct = parse.parse(out_path)

        pd.util.testing.assert_frame_equal(e_gct.data_df, out_gct.data_df)
        pd.util.testing.assert_frame_equal(e_gct.row_metadata_df,
                                           out_gct.row_metadata_df)
        pd.util.testing.assert_frame_equal(e_gct.col_metadata_df,
                                           out_gct.col_metadata_df)

        # Clean up
        os.remove(out_path)
Example #4
def read_gctx(fname, col_meta=True, row_meta=True, ignore_data_df=False):
  print("  Parsing GCTX file.")

  if ignore_data_df:
    print("  IGNORING EXPRESSION SIGNATURES; ONLY LOADING METADATA.")
    print("  If you did not intend to do this, re-run with different arguments.")
    print("  Loading row metadata.")
    rm = parse(fname, row_meta_only=True)
    fix_mangled_byte_literals(rm)
    print("  Loading column metadata.")
    cm = parse(fname, col_meta_only=True)
    fix_mangled_byte_literals(cm)
    cm['sig_num'] = list(range(cm.shape[0]))

    return (None, cm, rm)

  else:
    # Load everything
    cm = rm = None  # keep cm/rm defined even if the metadata flags are off
    tmp = parse(fname)
    data_df = tmp.data_df

    print("  Fixing mangled byte literals.")
    fix_mangled_byte_literals(data_df)

    if col_meta:
      print("  Loading column metadata.")
      cm = tmp.col_metadata_df
      fix_mangled_byte_literals(cm)
      cm['sig_num'] = list(range(cm.shape[0]))
    if row_meta:
      print("  Loading row metadata.")
      rm = tmp.row_metadata_df
      fix_mangled_byte_literals(rm)

    return (data_df, cm, rm)
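
A minimal usage sketch for read_gctx above, assuming fix_mangled_byte_literals is defined in the same module; the file name is hypothetical:

# Hedged sketch: "my_signatures.gctx" is a placeholder path.
data_df, cm, rm = read_gctx("my_signatures.gctx")
print(cm['sig_num'].head())  # column metadata gains a sig_num counter column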
Example #5
def main():
    """Parse args, then read and write expression files for paired metadata."""
    args_dict = main_parse_args()
    
    # get list of probes
    probeset_df = pd.read_table(args_dict['probeset_infile'], sep='\t')
    probeset = np.array(list(map(str, probeset_df['pr_gene_id'].values)))
    
    # get list of experiments
    expid_df = pd.read_table(args_dict['expid_infile'], sep='\t', header=None)
    myexpids = np.array(list(map(str, expid_df[0].values)))
    
    # get info about gctx file
    col_metadata = parse.parse(args_dict['gctx_infile'], col_meta_only=True)
    geoexpset = set(col_metadata.index.values)
    
    
    # keep only ids in gctx file ("chunk" is presumably defined elsewhere in the original script)
    print("Filtering exp ids for chunk " + str(chunk) + "...")
    validexp_ids = np.array(list(set(myexpids) & geoexpset))
    
    # fetch data from gctx
    print("Fetching chunk " + str(chunk) + "...")
    allexps_gct = parse.parse(args_dict['gctx_infile'], rid=probeset, cid=validexp_ids)
    #returns rows and columns in different order
    print("Gene Ids Order: " + str(list(allexps_gct.data_df.index)))
    
    # merge and write outfile
    print("Writing outfile: "+ args_dict['outfile'])
    write_gctx.write(allexps_gct, args_dict['outfile'])
Example #6
    def test_gct_parsing(self):
        # parse in gct, no other arguments
        mg1 = mini_gctoo_for_testing.make()
        mg2 = parse.parse(
            "cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_for_testing.gct"
        )

        pandas_testing.assert_frame_equal(mg1.data_df, mg2.data_df)
        pandas_testing.assert_frame_equal(mg1.row_metadata_df,
                                          mg2.row_metadata_df)
        pandas_testing.assert_frame_equal(mg1.col_metadata_df,
                                          mg2.col_metadata_df)

        # check convert_neg_666 worked correctly
        self.assertTrue(mg2.col_metadata_df["mfc_plate_id"].isnull().all())

        # parse w/o convert_neg_666
        mg2_alt = parse.parse(
            "cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_for_testing.gct",
            convert_neg_666=False)
        self.assertCountEqual(
            mg2_alt.col_metadata_df["mfc_plate_id"].values.tolist(),
            [-666] * 6)

        # parse in gct with subsetting
        my_rid = "LJP007_MCF10A_24H:TRT_CP:BRD-K93918653:3.33"
        mg3 = parse.parse(
            "cmapPy/pandasGEXpress/tests/functional_tests//mini_gctoo_for_testing.gct",
            cidx=[0, 2],
            rid=[my_rid])

        self.assertEqual(mg3.data_df.shape, (1, 2))
        self.assertCountEqual(mg3.data_df.values.flatten().tolist(), [1., 3.])
        self.assertEqual(mg3.row_metadata_df.index[0], my_rid)
Example #7
    def test_subset_main(self):

        in_gct_path = os.path.join("cmapPy/pandasGEXpress/tests/functional_tests/", "test_subset_in.gct")
        rid_grp_path = os.path.join("cmapPy/pandasGEXpress/tests/functional_tests/", "test_subset_rid.grp")
        out_name = os.path.join("cmapPy/pandasGEXpress/tests/functional_tests/", "test_subset_out.gct")
        expected_out_path = os.path.join("cmapPy/pandasGEXpress/tests/functional_tests/", "test_subset_expected.gct")

        args_string = "-i {} --rid {} -ec {} -o {}".format(
            in_gct_path, rid_grp_path, "f", out_name)
        args = sg.build_parser().parse_args(args_string.split())

        # Run main method
        sg.subset_main(args)

        # Compare output to expected
        out_gct = parse.parse(out_name)
        expected_gct = parse.parse(expected_out_path)

        pd.util.testing.assert_frame_equal(out_gct.data_df, expected_gct.data_df)
        pd.util.testing.assert_frame_equal(out_gct.row_metadata_df, expected_gct.row_metadata_df)
        pd.util.testing.assert_frame_equal(out_gct.col_metadata_df, expected_gct.col_metadata_df)

        # Clean up
        os.remove(out_name)

        # gctx with exclude_rid should fail
        args_string2 = "-i {} --rid {} -ec {} -o {}".format(
            "FAKE.gctx", rid_grp_path, "f", out_name)
        args2 = sg.build_parser().parse_args(args_string2.split())

        with self.assertRaises(Exception) as e:
            sg.subset_main(args2)
        self.assertIn("exclude_{rid,cid} args not currently supported",
                      str(e.exception))
Example #8
def main(args):

    # Read in the first gct
    gct1 = parse.parse(args.in_gct_path)

    # If second gct provided, compute similarity between 2 gcts
    if args.in_gct2_path is not None:
        logger.info(
            "in_gct2_path was provided. Will compute pairwise similarities " +
            "between the columns of in_gct and in_gct2.")

        # Read in the second gct
        gct2 = parse.parse(args.in_gct2_path)

        # Compute similarities between gct1 and gct2
        out_df = compute_similarity_bw_two_dfs(gct1.data_df, gct2.data_df,
                                               args.similarity_metric)

        # Row metadata is from gct1, column metadata is from gct2
        row_metadata_df = gct1.col_metadata_df
        col_metadata_df = gct2.col_metadata_df

        # Append column to both metadata_dfs indicating which similarity_metric was used
        row_metadata_df[SIMILARITY_METRIC_FIELD] = args.similarity_metric
        col_metadata_df[SIMILARITY_METRIC_FIELD] = args.similarity_metric

        # Assemble output gct
        out_gct = GCToo.GCToo(out_df, row_metadata_df, col_metadata_df)

    # If only 1 gct provided, compute similarities between the columns of gct1
    else:
        out_df = compute_similarity_within_df(gct1.data_df,
                                              args.similarity_metric)

        # Row and column metadata are both from gct1
        metadata_df = gct1.col_metadata_df

        # Append column to metadata_df indicating which similarity_metric was used
        metadata_df[SIMILARITY_METRIC_FIELD] = args.similarity_metric

        # Assemble output gct
        out_gct = GCToo.GCToo(out_df, metadata_df, metadata_df)

    # Write output gct
    if os.path.splitext(args.out_name)[1] == ".gct":
        wg.write(out_gct,
                 args.out_name,
                 data_null="NaN",
                 metadata_null="NA",
                 filler_null="NA")
    elif os.path.splitext(args.out_name)[1] == ".gctx":
        wgx.write(out_gct, args.out_name)
    else:
        raise Exception(
            "out_name must end in .gct or .gctx. out_name: {}".format(
                args.out_name))
Example #9
    def setUpClass(cls):
        external_gct_path = os.path.join(FUNCTIONAL_TESTS_DIR,
                                         "test_external_query_external.gct")
        cls.external_gct = parse.parse(external_gct_path)

        internal_gct_path = os.path.join(FUNCTIONAL_TESTS_DIR,
                                         "test_external_query_internal.gct")
        cls.internal_gct = parse.parse(internal_gct_path)

        bg_gct_path = os.path.join(FUNCTIONAL_TESTS_DIR,
                                   "test_external_query_bg.gct")
        cls.bg_gct = parse.parse(bg_gct_path)
Example #10
def main(args):
    data = pe.parse(args.gct)
    meta = pd.read_table(args.meta, index_col=args.index_col)
    if args.sense is not None:
        wtks(data, meta, args.out, args.sense, group_col=args.prefix_name)
    else:
        wtks(data, meta, args.out, group_col=args.prefix_name)
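
A hedged sketch of the argument parser this main() appears to expect; the flag names are assumptions inferred from the attribute accesses above, not the project's actual CLI:

import argparse

def build_parser():
    # Hypothetical flags matching args.gct, args.meta, args.index_col,
    # args.out, args.sense, and args.prefix_name used by main() above.
    parser = argparse.ArgumentParser()
    parser.add_argument("--gct", required=True, help="input GCT/GCTX file")
    parser.add_argument("--meta", required=True, help="metadata table read with pd.read_table")
    parser.add_argument("--index_col", default=0, help="index column of the metadata table")
    parser.add_argument("--out", required=True, help="output destination passed to wtks")
    parser.add_argument("--sense", default=None, help="optional value forwarded to wtks")
    parser.add_argument("--prefix_name", default=None, help="metadata column used as group_col")
    return parser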
Example #11
def main():
    # Get args
    args = build_parser().parse_args(sys.argv[1:])
    setup_logger.setup(verbose=args.verbose)

    # Read the input gct
    in_gct = parse.parse(args.in_gct_path)

    # Read in each of the command line arguments
    rid = _read_arg(args.rid)
    cid = _read_arg(args.cid)
    exclude_rid = _read_arg(args.exclude_rid)
    exclude_cid = _read_arg(args.exclude_cid)

    # Slice the gct
    out_gct = sg.slice_gctoo(in_gct,
                             rid=rid,
                             cid=cid,
                             exclude_rid=exclude_rid,
                             exclude_cid=exclude_cid)
    assert out_gct.data_df.size > 0, "Slicing yielded an empty gct!"

    # Write the output gct
    if args.use_gctx:
        wgx.write(out_gct, args.out_name)
    else:
        wg.write(out_gct,
                 args.out_name,
                 data_null="NaN",
                 metadata_null="NA",
                 filler_null="NA")
Example #12
def convert_gct_to_config(assay_type, gct_path, output_path):
    """
    Use custom parameters embedded within a GCT to create a config file for processing

    Args -
        assay_type (string) - assay used to specify parameters in config
        gct_path - where to read in GCT for custom parameters
        output_path - where to write output config
    Returns -
        Nothing - writes output to output path
    """
    gct = parse.parse(gct_path)

    # All rows have same parameters embedded, choose any
    try:
        custom_params = gct.row_metadata_df.loc[:, "pr_processing_params"].any()
    except Exception:
        print("GCT does not contain pr_processing_params field")
        return None

    if custom_params == "{}":
        print("GCT contains pr_processing_params field, but it is empty")
        return None

    custom_params = create_dict_from_pseudojson(custom_params)
    differential_parameters = check_custom_parameters_against_defaults(assay_type, custom_params, json=True)
    if differential_parameters is not None:
        write_config(differential_parameters, output_path)

    return differential_parameters
Example #13
    def load_expr_data(self, phase):
        """
        Load differential gene expression profiles (in a dataframe) from one of the two phases of the L1000 dataset
        """
        assert phase in ["phase1", "phase2"]
        if phase == "phase1":
            df_path = os.path.join(self.raw_dir, "dataframe_phase1.pkl")
            file_name = self.raw_file_names[5]
        else:
            df_path = os.path.join(self.raw_dir, "dataframe_phase2.pkl")
            file_name = self.raw_file_names[0]
        if os.path.isfile(df_path):
            with open(df_path, "rb") as pickle_in:
                expr_data = pickle.load(pickle_in)
        else:  # If the data has not been saved yet, parse the original file and save dataframe
            print("Parsing original data, only happens the first time...")
            from cmapPy.pandasGEXpress.parse import parse

            expr_data = parse(os.path.join(self.raw_dir, file_name),
                              rid=self.landmark_gene_list).data_df.T
            # Ensure that the order of columns corresponds to landmark_gene_list
            expr_data = expr_data[self.landmark_gene_list]
            # Remove rows that are not in sig_info
            expr_data = expr_data[expr_data.index.isin(self.sig_info.index)]

            # Save data
            with open(df_path, "wb") as pickle_out:
                pickle.dump(expr_data, pickle_out, protocol=2)
        return expr_data
Example #14
def main(args):

    # Parse gct file
    gct = parse.parse(args.path_to_gct)

    # Parse mapping tsv file
    mapping = pd.read_csv(args.path_to_mapping_tsv, sep="\t", index_col=0)

    # Make sure the ids from the mapping file are unique
    duplicated_bool_array = mapping.index.duplicated()
    assert sum(duplicated_bool_array) == 0, (
        "ids in mapping file must be unique. duplicated ids in mapping:\n{}".
        format(mapping.index[duplicated_bool_array]))

    for col in mapping.columns:

        if args.row_and_or_col == "both":
            annotate_meta_df(gct.row_metadata_df, mapping.loc[:, col],
                             args.gct_from_field, args.missing_entry)
            annotate_meta_df(gct.col_metadata_df, mapping.loc[:, col],
                             args.gct_from_field, args.missing_entry)
        elif args.row_and_or_col == "row":
            annotate_meta_df(gct.row_metadata_df, mapping.loc[:, col],
                             args.gct_from_field, args.missing_entry)
        elif args.row_and_or_col == "col":
            annotate_meta_df(gct.col_metadata_df, mapping.loc[:, col],
                             args.gct_from_field, args.missing_entry)

    wg.write(gct,
             args.out_name,
             filler_null="NA",
             data_null="NaN",
             metadata_null="NA")
Example #15
def get_CCLE_Exp_gct_from_selected_genes(selected_gene_list, result_addr=None):
    ccle_GCToo = parse(
        "/Users/woochanghwang/PycharmProjects/LifeArc/General/data/CCLE/CCLE_RNAseq_genes_rpkm_20180929.gct"
    )
    # print(ccle_GCToo.row_metadata_df[:10])
    # print(ccle_GCToo.col_metadata_df[:10])
    # print(ccle_GCToo.data_df.head())

    selected_gene_rid_list = []
    for target_gene in selected_gene_list:
        target_gene_rid = find_gene_rid(ccle_GCToo, target_gene)
        selected_gene_rid_list.append(target_gene_rid)

    selected_gene_expression_df = ccle_GCToo.data_df.loc[
        selected_gene_rid_list, :]
    selected_gene_expression_df_T = selected_gene_expression_df.T
    print(selected_gene_expression_df_T)

    gene_map_dict = dict()
    for gene, rid in zip(selected_gene_list, selected_gene_rid_list):
        gene_map_dict[rid] = gene

    # rename
    selected_gene_expression_df_T = selected_gene_expression_df_T.rename(
        index=str, columns=gene_map_dict)
    selected_gene_expression_df = selected_gene_expression_df_T.T

    if result_addr is not None:
        selected_gene_expression_df.to_csv(result_addr,
                                           sep='\t',
                                           quoting=csv.QUOTE_NONE,
                                           index=False)
    return selected_gene_expression_df
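
A hedged usage sketch for the function above; the gene symbols and output path are illustrative placeholders:

# Hedged sketch: gene names and the output path are hypothetical.
selected_df = get_CCLE_Exp_gct_from_selected_genes(
    ["TP53", "EGFR", "MYC"], result_addr="selected_ccle_rpkm.tsv")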
Example #16
def read_gct_and_config_file(gct_path, config_path):
    """Read gct and config file.

    The config file has three sections: io, metadata, and parameters.
    These are returned as dictionaries.

    Args:
        gct_path (string): filepath to gct file
        config_path (string): filepath to config file

    Returns:
        gct (GCToo object)
        config_io (dictionary)
        config_metadata (dictionary)
        config_parameters (dictionary)
    """
    assert os.path.exists(os.path.expanduser(config_path))

    # Read config file
    config_parser = ConfigParser.RawConfigParser()
    config_parser.read(os.path.expanduser(config_path))

    # Return config fields as dictionaries
    config_io = dict(config_parser.items("io"))
    config_metadata = dict(config_parser.items("metadata"))
    config_parameters = dict(config_parser.items("parameters"))

    # Parse the gct file and return GCToo object
    gct = parse.parse(gct_path)

    return gct, config_io, config_metadata, config_parameters
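
The config file must contain io, metadata, and parameters sections. A minimal sketch of building such a file with the same Python 2 ConfigParser module and calling the function; the section keys and file paths are hypothetical:

import ConfigParser  # Python 2 module, matching the code above

config = ConfigParser.RawConfigParser()
for section in ("io", "metadata", "parameters"):
    config.add_section(section)
config.set("io", "out_dir", "./output")                 # hypothetical key
config.set("metadata", "row_field", "pr_gene_symbol")   # hypothetical key
config.set("parameters", "num_iterations", "10")        # hypothetical key
with open("example.cfg", "w") as f:
    config.write(f)

gct, config_io, config_meta, config_params = read_gct_and_config_file(
    "my_data.gct", "example.cfg")  # "my_data.gct" is a placeholder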
Example #17
def main(args):
    """ The main method. """

    # Import gct
    in_gct = parse.parse(args.in_gct_path)

    # Create the separated gcts
    (out_gcts, out_gct_prefixes) = separate(in_gct, args.separate_field,
                                            args.row_or_col)

    # Save the returned gcts
    for gct, name in zip(out_gcts, out_gct_prefixes):
        full_out_name = os.path.join(
            args.out_dir,
            args.out_name_prefix + str(name) + args.out_name_suffix)

        # Write to GCT or GCTX depending on extension
        if str.lower(os.path.splitext(full_out_name)[1]) == ".gct":
            wg.write(gct,
                     full_out_name,
                     data_null="NaN",
                     metadata_null="NA",
                     filler_null="NA")
        elif str.lower(os.path.splitext(full_out_name)[1]) == ".gctx":
            wgx.write(gct, full_out_name)
        else:
            raise Exception(
                "out_name_suffix must end in either .gct or .gctx. out_name_suffix: {}"
                .format(args.out_name_suffix))
Example #18
File: card.py Project: cmap/merino
def reader_writer(input_file, output_file, function, check_size=False):
    plate_failure = False
    # Read in input file
    gctoo = pe.parse(input_file)
    # Call normalizing function on gctoo
    new_gctoo = function(gctoo)

    new_gctoo = drop_nans(new_gctoo)
    if new_gctoo == 'empty_plate':
        logger.debug("{} has no usable data and has not been written.".format(
            os.path.basename(output_file)))
        plate_failure = True
        return plate_failure

    # If told to, check size of new_gctoo and flag if too small
    if check_size and new_gctoo.data_df.shape[1] <= 349:
        logger.debug('{} Plate Failure With {} Failed Wells'.format(
            os.path.basename(os.path.dirname(input_file)),
            384 - new_gctoo.data_df.shape[1]))
        plate_failure = True

    # write out new gctoo
    wgx.write(new_gctoo, out_fname=output_file)
    logger.debug("{} file written.".format(output_file))

    return plate_failure
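
A hedged usage sketch for reader_writer above; any GCToo-to-GCToo function can be passed, and the pass-through lambda and file paths here are placeholders:

# Hedged sketch: paths and the identity "normalizer" are hypothetical.
plate_failed = reader_writer("assemble/PLATE1/PLATE1_MEDIAN.gct",
                             "card/PLATE1/PLATE1_NORM.gctx",
                             function=lambda gctoo: gctoo,
                             check_size=True)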
Example #19
def main(args):

    # Import data
    assert os.path.exists(
        args.in_gct_path), ("in_gct_path could not be found: {}").format(
            args.in_gct_path)
    in_gct = parse.parse(args.in_gct_path)

    # First, check if any rows are all NaN; if so, remove them
    dropped_df = in_gct.data_df.dropna(how="all")
    bools_of_remaining = in_gct.data_df.index.isin(dropped_df.index.values)
    in_gct = sg.subset_gctoo(in_gct, row_bool=bools_of_remaining)

    if args.replace_with == "zero":
        in_gct.data_df.fillna(0, inplace=True)

    elif args.replace_with == "median":
        probe_medians = in_gct.data_df.median(axis=1)

        for row_idx, row in enumerate(in_gct.data_df.values):
            this_row = in_gct.data_df.iloc[row_idx, :]
            this_row[this_row.isnull()] = probe_medians[row_idx]
            in_gct.data_df.iloc[row_idx, :] = this_row

    elif args.replace_with == "mean":
        probe_means = in_gct.data_df.mean(axis=1)

        for row_idx, row in enumerate(in_gct.data_df.values):
            this_row = in_gct.data_df.iloc[row_idx, :]
            this_row[this_row.isnull()] = probe_means[row_idx]
            in_gct.data_df.iloc[row_idx, :] = this_row

    wg.write(in_gct, args.out_name, filler_null="NA")
Example #20
File: card.py Project: cmap/merino
def check_ssmds(norm_path, plate_failure):
    norm_gct = pe.parse(norm_path)
    ssmds = ssmd_analysis.get_ssmd(norm_gct, unlog=True)
    ssmd_failures = ssmds[ssmds < 2].count()
    if ssmd_failures > len(ssmds) / 3:
        plate_failure = True
    return plate_failure
Example #21
def gctx_to_pandas(filename, columnlist):
    gctToo = parse(filename, make_multiindex=True)
    df = gctToo.data_df
    df = df.T
    if len(columnlist) > 0:
        df = df[columnlist]
    return df
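
A minimal usage sketch; because of the transpose, columnlist selects row (probe) ids of the original matrix. The file name and ids are hypothetical:

# Hedged sketch: file name and probe ids are placeholders.
df = gctx_to_pandas("level5_modz.gctx", ["5720", "466"])
print(df.shape)  # samples x selected probes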
Example #22
    def load_data(self):
        self.df = parse(self.data_path).data_df.T
        # Map gene names
        eh_map = ensg_to_hugo_map()

        def ensg(col):
            # Bare Ensembl id: take from 'ENS' onward and strip the version suffix
            s = str(col)
            return s[s.find('ENS'):].split('.')[0]

        # Drop columns whose gene is not covered by the map, then rename the rest
        self.df = self.df.drop(
            [i for i in self.df.columns if ensg(i) not in eh_map], axis=1)
        self.df.columns = [eh_map[ensg(i)] for i in self.df.columns]
Example #23
def main():
    # get args
    args = build_parser().parse_args(sys.argv[1:])
    setup_logger.setup(verbose=args.verbose)
    logger.debug("args:  {}".format(args))

    # Get files directly
    if args.input_filepaths is not None:
        files = args.input_filepaths

    # Or find them
    else:
        files = get_file_list(args.file_wildcard)

        # No files found
        if len(files) == 0:
            msg = "No files were found. args.file_wildcard: {}".format(
                args.file_wildcard)
            logger.error(msg)
            raise Exception(msg)

    # Only 1 file found
    if len(files) == 1:
        logger.warning(
            "Only 1 file found. No concatenation needs to be done, exiting")
        return

    # More than 1 file found
    else:
        # Parse each file and append to a list
        gctoos = []
        for f in files:
            gctoos.append(parse.parse(f))

        # Create concatenated gctoo object
        if args.concat_direction == "horiz":
            out_gctoo = hstack(gctoos, args.remove_all_metadata_fields,
                               args.error_report_output_file,
                               args.fields_to_remove, args.reset_ids)

        elif args.concat_direction == "vert":
            out_gctoo = vstack(gctoos, args.remove_all_metadata_fields,
                               args.error_report_output_file,
                               args.fields_to_remove, args.reset_ids)

    # Write out_gctoo to file
    logger.info("Writing to output file args.out_name:  {}".format(
        args.out_name))

    if args.out_type == "gctx":
        write_gctx.write(out_gctoo, args.out_name)

    elif args.out_type == "gct":
        write_gct.write(out_gctoo,
                        args.out_name,
                        filler_null=args.filler_null,
                        metadata_null=args.metadata_null,
                        data_null=args.data_null)
Example #24
def mk_report(proj_folder, out_folder):
    mar_sense = pd.read_table(os.path.join(out_folder,'sense/expected_sensitivity_ranks.txt'),
                              index_col='det_plate')
    mar_sense = mar_sense / 384
    mar_sense = mar_sense * 100
    for x in pd.Series([y.split('_')[0] for y in mar_sense.columns]).unique():
        mar_sense[x] = mar_sense[[p for p in mar_sense.columns if p.startswith(x)]].median(axis=1)
    mar_sense = mar_sense[pd.Series([y.split('_')[0] for y in mar_sense.columns]).unique()]

    gct_list = [pe.parse(y) for y in
                [x for x in glob.glob(os.path.join(proj_folder,'card/*/*NORM*'))]]
    norm_gct = concat.hstack(gct_list)

    gct_list = [pe.parse(y) for y in
                [x for x in glob.glob(os.path.join(proj_folder,'assemble/*/*MEDIAN*'))]]
    mfi_gct = concat.hstack(gct_list)

    n_recovered = []
    invs = []
    beadsets = []
    plate = []
    med_rank = []
    dropouts = []
    for det_plate in mar_sense.index:
        temp = norm_gct.data_df[[x for x in norm_gct.data_df.columns if x.startswith(det_plate)]]
        dropouts.append(384 - temp.shape[1])
        sigs_recovered = mar_sense.loc[det_plate].dropna()[mar_sense.loc[det_plate].dropna() < 50].count()
        median_rank = mar_sense.loc[det_plate].median()
        temp = mfi_gct.data_df[[x for x in mfi_gct.data_df.columns if x.startswith(det_plate)]]
        median_inv = temp.loc[['c-661', 'c-662', 'c-663', 'c-664']].median(axis=1).median()
        beadset = det_plate.split('_')[-1].split(':')[0]
        n_recovered.append(sigs_recovered)
        invs.append(median_inv)
        beadsets.append(beadset)
        plate.append(det_plate)
        med_rank.append(median_rank)

    mar_df = pd.concat([pd.Series(plate).rename('det_plate'), pd.Series(n_recovered).rename('sigs_recovered_core'),
                        pd.Series(med_rank).rename('median_rank_core'), pd.Series(invs).rename('median_inv'),
                        pd.Series(dropouts).rename('n_dropouts'), pd.Series(beadsets).rename('beadset')], axis=1)
    mar_df.set_index('det_plate', inplace=True)
    mar_ssmd = ssmd_an.ssmd_matrix(
        norm_paths=glob.glob(os.path.join(proj_folder,'card/*/*NORM*')))
    mar_df = mar_df.join(mar_ssmd[mar_ssmd < 2].count().rename('ssmd_failures'))
    return mar_df
Example #25
def load_data(gct_files):
    """ Read a list of GCT files and returns a list
    """
    gct_list = []
    for gct_path in gct_files:
        LOGGER.info('Reading {}'.format(gct_path))
        gct = pe.parse(gct_path)
        gct_list.append(gct)
    return gct_list
Example #26
def main(args):
    # Read GCTs into a list
    gctoo_list = [parse.parse(gct) for gct in args.list_of_gcts]

    # Create superset of all probes in GCTs
    probe_superset = create_probe_superset(gctoo_list)

    # Create pdf in which each page is a probe of the superset
    create_output_pdf(probe_superset, gctoo_list, args.metadata_field, args.output_name)
Example #27
def concat_main(args):
    """ Separate method from main() in order to make testing easier and to
    enable command-line access. """

    # Get files directly
    if args.input_filepaths is not None:
        files = args.input_filepaths

    # Or find them
    else:
        files = get_file_list(args.file_wildcard)

        # No files found
        if len(files) == 0:
            msg = "No files were found. args.file_wildcard: {}".format(
                args.file_wildcard)
            logger.error(msg)
            raise Exception(msg)

    # Only 1 file found
    if len(files) == 1:
        logger.warning(
            "Only 1 file found. No concatenation needs to be done, exiting")
        return

    # More than 1 file found
    else:
        # Parse each file and append to a list
        gctoos = []
        for f in files:
            gctoos.append(parse.parse(f))

        # Create concatenated gctoo object
        if args.concat_direction == "horiz":
            out_gctoo = hstack(gctoos, args.remove_all_metadata_fields,
                               args.error_report_output_file,
                               args.fields_to_remove, args.reset_ids)

        elif args.concat_direction == "vert":
            out_gctoo = vstack(gctoos, args.remove_all_metadata_fields,
                               args.error_report_output_file,
                               args.fields_to_remove, args.reset_ids)

    # Write out_gctoo to file
    logger.info("Writing to output file args.out_name:  {}".format(
        args.out_name))

    if args.out_type == "gctx":
        write_gctx.write(out_gctoo, args.out_name)

    elif args.out_type == "gct":
        write_gct.write(out_gctoo,
                        args.out_name,
                        filler_null=args.filler_null,
                        metadata_null=args.metadata_null,
                        data_null=args.data_null)
Example #28
def run_sensitivities(proj_folder, gmt_path, out_folder):

    gct_list = [pe.parse(y) for y in [x for x in glob.glob(os.path.join(proj_folder,'card/*/*ZSPC.gct'))]]
    fail_gct = concat.hstack(gct_list)
    if not os.path.exists(os.path.join(out_folder, 'sense')):
        os.mkdir(os.path.join(out_folder, 'sense'))

    sense.wtks(gct=fail_gct, metadata=fail_gct.col_metadata_df,
               outfolder=os.path.join(out_folder, 'sense'), group_col='prism_replicate',
               gmt_path=gmt_path)
Example #29
def main(args):

    gct = parse.parse(args.in_gct_path)

    (_, conn_gct) = do_steep_and_sip(
        gct, args.similarity_metric,
        args.connectivity_metric, args.fields_to_aggregate)

    # Write output gct
    wg.write(conn_gct, args.out_sip_name, data_null="NaN",
             filler_null="NaN", metadata_null="NaN")
Example #30
    def test_main(self):
        test_gct_path = os.path.join(FUNCTIONAL_TESTS_DIR,
                                     "test_sip_in_test.gct")
        bg_gct_path = os.path.join(FUNCTIONAL_TESTS_DIR, "test_sip_in_bg.gct")
        out_path = os.path.join(FUNCTIONAL_TESTS_DIR, "test_sip_main_out.gct")

        args_string = "-t {} -b {} -o {} -tfq {} -tft {} -bf {} -s {}".format(
            test_gct_path, bg_gct_path, out_path, "pert_iname", "pert_iname",
            "pert_iname", "|")
        args = sip.build_parser().parse_args(args_string.split())

        # Run main method
        sip.main(args)

        # Compare the output of main with the expected output
        e_out_path = os.path.join(FUNCTIONAL_TESTS_DIR,
                                  "test_sip_expected_conn.gct")
        e_out_gct = parse.parse(e_out_path)
        out_gct = parse.parse(out_path)

        logger.debug("e_out_gct.data_df:\n{}".format(e_out_gct.data_df))
        logger.debug("out_gct.data_df:\n{}".format(out_gct.data_df))
        pd.util.testing.assert_frame_equal(e_out_gct.data_df,
                                           out_gct.data_df,
                                           check_less_precise=3)

        logger.debug("e_out_gct.row_metadata_df:\n{}".format(
            e_out_gct.row_metadata_df))
        logger.debug("out_gct.row_metadata_df:\n{}".format(
            out_gct.row_metadata_df))
        pd.util.testing.assert_frame_equal(e_out_gct.row_metadata_df,
                                           out_gct.row_metadata_df)

        logger.debug("e_out_gct.col_metadata_df:\n{}".format(
            e_out_gct.col_metadata_df))
        logger.debug("out_gct.col_metadata_df:\n{}".format(
            out_gct.col_metadata_df))
        pd.util.testing.assert_frame_equal(e_out_gct.col_metadata_df,
                                           out_gct.col_metadata_df)

        # Remove the created file
        os.remove(out_path)