Esempio n. 1
0
def main(args):
    """Compute an embedding for every pair returned for ``args.data`` and write them out.

    Seeds NumPy, PyTorch (CPU and CUDA) and ``random`` with 17, restores the
    network weights from ``./trained_model_best.pth``, runs each sample through
    ``get_embedding`` without gradient tracking, and hands the resulting table
    (one column per sample key) to ``write`` under the name ``'embeddings'``.
    """
    seed = 17
    for apply_seed in (np.random.seed, torch.manual_seed, random.seed,
                       torch.cuda.manual_seed):
        apply_seed(seed)

    model = nm.Net()
    checkpoint = torch.load('./trained_model_best.pth')
    model.load_state_dict(checkpoint['state_dict'])

    # Inference only: put the network in evaluation mode.
    model.eval()

    vectors_by_key = {}
    for position, pair in enumerate(dl.get_all_pairs(args.data)):
        # Progress marker every 10000 samples.
        if position % 10000 == 0:
            print(position)
        with torch.no_grad():
            # pair[0] is used as the key, pair[1] as the NumPy feature array;
            # the array is flattened into a single-row batch.
            batch = torch.from_numpy(pair[1]).view(1, -1)
            result = get_embedding(model, batch)
            vectors_by_key[pair[0]] = result.data.numpy()[0]

    frame = pandas.DataFrame(data=vectors_by_key)
    write(cmapPy.pandasGEXpress.GCToo.GCToo(frame), 'embeddings')
def main():
    """Parse args, then read and write expression files for paired metadata.

    Reads the probe list (``probeset_infile``, column ``pr_gene_id``) and the
    experiment-id list (``expid_infile``, first column, no header), keeps only
    the experiment ids that appear in the column metadata of ``gctx_infile``,
    extracts that row/column subset and writes it to ``outfile``.
    """
    args_dict = main_parse_args()

    # get list of probes
    probeset_df = pd.read_table(args_dict['probeset_infile'], sep='\t')
    # Build the list eagerly: on Python 3 a bare map() is a lazy iterator and
    # np.array() would wrap it in a 0-d object array instead of a string array.
    probeset = np.array([str(v) for v in probeset_df['pr_gene_id'].values])

    # get list of experiments
    expid_df = pd.read_table(args_dict['expid_infile'], sep='\t', header=None)
    myexpids = np.array([str(v) for v in expid_df[0].values])

    # get info about gctx file
    col_metadata = parse.parse(args_dict['gctx_infile'], col_meta_only=True)
    geoexpset = set(col_metadata.index.values)

    # keep only ids in gctx file
    # NOTE(review): `chunk` is not defined inside this function; it is assumed
    # to be a module-level name -- confirm, otherwise this raises NameError.
    print("Filtering exp ids for chunk " + str(chunk) + "...")
    validexp_ids = np.array(list(set(myexpids) & geoexpset))

    # fetch data from gctx
    print("Fetching chunk " + str(chunk) + "...")
    allexps_gct = parse.parse(args_dict['gctx_infile'], rid=probeset, cid=validexp_ids)
    # returns rows and columns in different order
    print("Gene Ids Order: " + str(list(allexps_gct.data_df.index)))

    # merge and write outfile
    print("Writing outfile: "+ args_dict['outfile'])
    write_gctx.write(allexps_gct, args_dict['outfile'])
Esempio n. 3
0
def main(args):
    """Parse ``args.in_gct_path``, split it with ``separate`` and write each piece.

    Every output path is ``args.out_dir`` joined with
    ``args.out_name_prefix + str(name) + args.out_name_suffix``; the
    (case-insensitive) extension of that path selects the GCT or GCTX writer.

    Raises:
        Exception: if an output path ends in neither ``.gct`` nor ``.gctx``.
    """
    source_gct = parse.parse(args.in_gct_path)

    # separate() hands back the sub-GCTs and the name fragment for each one.
    pieces, piece_names = separate(source_gct, args.separate_field,
                                   args.row_or_col)

    for piece, piece_name in zip(pieces, piece_names):
        file_name = args.out_name_prefix + str(piece_name) + args.out_name_suffix
        full_out_name = os.path.join(args.out_dir, file_name)
        extension = str.lower(os.path.splitext(full_out_name)[1])

        if extension == ".gct":
            wg.write(piece, full_out_name,
                     data_null="NaN", metadata_null="NA", filler_null="NA")
            continue

        if extension == ".gctx":
            wgx.write(piece, full_out_name)
            continue

        message = "out_name_suffix must end in either .gct or .gctx. out_name_suffix: {}"
        raise Exception(message.format(args.out_name_suffix))
Esempio n. 4
0
def main():
    """Command-line entry point: concatenate several GCT(X) files into one.

    The input files come from ``args.input_filepaths`` when given, otherwise
    from ``get_file_list(args.file_wildcard)``. With exactly one input file
    nothing is written and the function returns early. Otherwise every file is
    parsed, stacked with ``hstack`` ("horiz") or ``vstack`` ("vert"), and
    written as GCTX or GCT according to ``args.out_type``.

    Raises:
        Exception: if the wildcard search finds no files.
        ValueError: if ``args.concat_direction`` is neither "horiz" nor "vert"
            (previously this surfaced later as an unbound ``out_gctoo``).
    """
    # get args
    args = build_parser().parse_args(sys.argv[1:])
    setup_logger.setup(verbose=args.verbose)
    logger.debug("args:  {}".format(args))

    # Get files directly
    if args.input_filepaths is not None:
        files = args.input_filepaths

    # Or find them
    else:
        files = get_file_list(args.file_wildcard)

        # No files found
        if len(files) == 0:
            msg = "No files were found. args.file_wildcard: {}".format(
                args.file_wildcard)
            logger.error(msg)
            raise Exception(msg)

    # Only 1 file found
    if len(files) == 1:
        logger.warning(
            "Only 1 file found. No concatenation needs to be done, exiting")
        return

    # More than 1 file found: parse each file
    gctoos = [parse(f) for f in files]

    # Pick the stacking routine; fail loudly on an unknown direction instead
    # of leaving out_gctoo undefined.
    if args.concat_direction == "horiz":
        stack = hstack
    elif args.concat_direction == "vert":
        stack = vstack
    else:
        msg = ("concat_direction must be 'horiz' or 'vert'. "
               "args.concat_direction: {}").format(args.concat_direction)
        logger.error(msg)
        raise ValueError(msg)

    # Create concatenated gctoo object
    out_gctoo = stack(gctoos, args.remove_all_metadata_fields,
                      args.error_report_output_file,
                      args.fields_to_remove, args.reset_ids)

    # Write out_gctoo to file
    logger.info("Writing to output file args.out_name:  {}".format(
        args.out_name))

    if args.out_type == "gctx":
        write_gctx.write(out_gctoo, args.out_name)

    elif args.out_type == "gct":
        write_gct.write(out_gctoo,
                        args.out_name,
                        filler_null=args.filler_null,
                        metadata_null=args.metadata_null,
                        data_null=args.data_null)
Esempio n. 5
0
File: steep.py Progetto: yuanjun/psp
def main(args):
    """Compute column-wise similarities for one or two GCTs and write the result.

    With only ``args.in_gct_path`` the similarities are computed among the
    columns of that GCT; when ``args.in_gct2_path`` is also given they are
    computed between the two inputs. The metadata frames used for the output
    receive a ``SIMILARITY_METRIC_FIELD`` column holding the metric name.
    The extension of ``args.out_name`` selects the GCT or GCTX writer.

    Raises:
        Exception: if ``args.out_name`` ends in neither ``.gct`` nor ``.gctx``.
    """
    gct1 = parse(args.in_gct_path)
    metric = args.similarity_metric

    if args.in_gct2_path is None:
        # Single input: compare the columns of gct1 with each other.
        out_df = compute_similarity_within_df(gct1.data_df, metric)

        # The same column-metadata frame describes both output axes.
        metadata_df = gct1.col_metadata_df
        metadata_df[SIMILARITY_METRIC_FIELD] = metric

        out_gct = GCToo.GCToo(out_df, metadata_df, metadata_df)
    else:
        logger.info(
            "in_gct2_path was provided. Will compute pairwise similarities "
            "between the columns of in_gct and in_gct2.")

        gct2 = parse(args.in_gct2_path)
        out_df = compute_similarity_bw_two_dfs(gct1.data_df, gct2.data_df,
                                               metric)

        # Output rows are described by gct1's columns, output columns by gct2's.
        row_metadata_df = gct1.col_metadata_df
        col_metadata_df = gct2.col_metadata_df

        # Record which metric produced the values on both axes.
        row_metadata_df[SIMILARITY_METRIC_FIELD] = metric
        col_metadata_df[SIMILARITY_METRIC_FIELD] = metric

        out_gct = GCToo.GCToo(out_df, row_metadata_df, col_metadata_df)

    # Choose the writer from the output extension (case-sensitive).
    extension = os.path.splitext(args.out_name)[1]
    if extension == ".gct":
        wg.write(out_gct, args.out_name,
                 data_null="NaN", metadata_null="NA", filler_null="NA")
        return
    if extension == ".gctx":
        wgx.write(out_gct, args.out_name)
        return

    raise Exception(
        "out_name must end in .gct or .gctx. out_name: {}".format(args.out_name))
Esempio n. 6
0
def concat_main(args):
    """ Separate method from main() in order to make testing easier and to
    enable command-line access.

    The input files come from ``args.input_filepaths`` when given, otherwise
    from ``get_file_list(args.file_wildcard)``. With exactly one input file
    nothing is written and the function returns early. Otherwise every file is
    parsed, stacked with ``hstack`` ("horiz") or ``vstack`` ("vert"), and
    written as GCTX or GCT according to ``args.out_type``.

    Raises:
        Exception: if the wildcard search finds no files.
        ValueError: if ``args.concat_direction`` is neither "horiz" nor "vert"
            (previously this surfaced later as an unbound ``out_gctoo``).
    """

    # Get files directly
    if args.input_filepaths is not None:
        files = args.input_filepaths

    # Or find them
    else:
        files = get_file_list(args.file_wildcard)

        # No files found
        if len(files) == 0:
            msg = "No files were found. args.file_wildcard: {}".format(
                args.file_wildcard)
            logger.error(msg)
            raise Exception(msg)

    # Only 1 file found
    if len(files) == 1:
        logger.warning(
            "Only 1 file found. No concatenation needs to be done, exiting")
        return

    # More than 1 file found: parse each file
    gctoos = [parse.parse(f) for f in files]

    # Pick the stacking routine; fail loudly on an unknown direction instead
    # of leaving out_gctoo undefined.
    if args.concat_direction == "horiz":
        stack = hstack
    elif args.concat_direction == "vert":
        stack = vstack
    else:
        msg = ("concat_direction must be 'horiz' or 'vert'. "
               "args.concat_direction: {}").format(args.concat_direction)
        logger.error(msg)
        raise ValueError(msg)

    # Create concatenated gctoo object
    out_gctoo = stack(gctoos, args.remove_all_metadata_fields,
                      args.error_report_output_file,
                      args.fields_to_remove, args.reset_ids)

    # Write out_gctoo to file
    logger.info("Writing to output file args.out_name:  {}".format(
        args.out_name))

    if args.out_type == "gctx":
        write_gctx.write(out_gctoo, args.out_name)

    elif args.out_type == "gct":
        write_gct.write(out_gctoo,
                        args.out_name,
                        filler_null=args.filler_null,
                        metadata_null=args.metadata_null,
                        data_null=args.data_null)
Esempio n. 7
0
def main():
    """Command-line entry point: parse a GCT file and write it through write_gctx.

    When no output path is given, the output name is the last "/"-separated
    component of the parsed object's ``src`` up to its first ".".
    """
    args = build_parser().parse_args(sys.argv[1:])
    setup_logger.setup(verbose=args.verbose)
    in_gctoo = parse_gct.parse(args.filename, convert_neg_666=False)
    logger.debug("Original out name: {}".format(in_gctoo.src))

    # Identity test for None (PEP 8) rather than equality.
    if args.output_filepath is None:
        out_name = str.split(in_gctoo.src, "/")[-1].split(".")[0]
    else:
        out_name = args.output_filepath

    write_gctx.write(in_gctoo, out_name)
Esempio n. 8
0
def gct2gctx_main(args):
    """Parse the GCT at ``args.filename`` and write it out via write_gctx.

    Kept separate from main() so it can back a command-line tool. Without an
    explicit ``args.output_filepath`` the output is named after the input's
    base name with its extension replaced by ``.gctx``.
    """
    in_gctoo = parse_gct.parse(args.filename, convert_neg_666=False)

    out_name = args.output_filepath
    if out_name is None:
        stem = os.path.splitext(os.path.basename(args.filename))[0]
        out_name = stem + ".gctx"

    write_gctx.write(in_gctoo, out_name)
Esempio n. 9
0
def build(search_pattern, outfile, file_suffix, cut=True, check_size=False):
    """Concatenate every GCT matching ``search_pattern`` and write a metadata-free GCTX.

    Args:
        search_pattern: glob pattern selecting the input GCT files.
        outfile: prefix of the output path; ``n<cols>x<rows>`` and
            ``file_suffix`` are appended to it.
        file_suffix: trailing part of the output path.
        cut: when equal to True, the file list is first passed through
            ``cut_to_l2.cut_l1``.
        check_size: when equal to True, plates with at most 349 columns are
            recorded in the failure list.

    Returns:
        None when no files remain after filtering; otherwise a tuple
        ``(concatenated GCToo with metadata, failure_list)``.
    """
    gct_paths = glob.glob(search_pattern)
    n_found = len(gct_paths)

    if cut == True:
        gct_paths = cut_to_l2.cut_l1(gct_paths)

    n_kept = len(gct_paths)
    logger.info(
        'Number of old lysate plates removed = {}'.format(n_found - n_kept))

    if n_kept == 0:
        return None

    gcts = []
    failure_list = []
    for path in gct_paths:
        plate = pe.parse(path)
        gcts.append(plate)
        too_small = plate.data_df.shape[1] <= 349
        if too_small and check_size == True:
            plate_name = os.path.basename(path).replace('_NORM.gct', '')
            failure_list.append(plate_name)

    # Give every plate the first plate's row metadata before stacking.
    for plate in gcts:
        plate.row_metadata_df = gcts[0].row_metadata_df

    # Plate-specific row fields are dropped, but only those actually present.
    plate_specific = ['det_plate', 'det_plate_scan_time', 'assay_plate_barcode']
    fields_to_remove = []
    for column in gcts[0].row_metadata_df.columns:
        if column in plate_specific:
            fields_to_remove.append(column)

    concat_gct = cg.hstack(gcts, False, None,
                           fields_to_remove=fields_to_remove)

    # Same matrix, but with empty metadata frames that only carry the ids.
    empty_row_meta = pd.DataFrame(index=concat_gct.data_df.index)
    empty_col_meta = pd.DataFrame(index=concat_gct.col_metadata_df.index)
    concat_gct_wo_meta = GCToo.GCToo(data_df=concat_gct.data_df,
                                     row_metadata_df=empty_row_meta,
                                     col_metadata_df=empty_col_meta)

    logger.debug("gct shape without metadata: {}".format(
        concat_gct_wo_meta.data_df.shape))

    n_rows, n_cols = concat_gct.data_df.shape[0], concat_gct.data_df.shape[1]
    out_path = outfile + 'n{}x{}'.format(n_cols, n_rows) + file_suffix
    wgx.write(concat_gct_wo_meta, out_path)

    return concat_gct, failure_list
Esempio n. 10
0
    def test_write_src(self):
        """Check the src attribute stored in the written GCTX file.

        Case 1 expects the output file name when the GCToo has no src;
        case 2 expects the GCToo's own src. The HDF5 handle is closed and the
        temporary file removed even when an assertion fails.
        """
        # case 1: gctoo obj doesn't have src
        mini1 = mini_gctoo_for_testing.make()
        mini1.src = None
        try:
            write_gctx.write(mini1, "no_src_example")
            # Open read-only explicitly; the default mode differs between
            # h5py versions.
            with h5py.File("no_src_example.gctx", "r") as hdf5_file:
                hdf5_src1 = hdf5_file.attrs[write_gctx.src_attr]
            self.assertEqual(hdf5_src1, "no_src_example.gctx")
        finally:
            if os.path.exists("no_src_example.gctx"):
                os.remove("no_src_example.gctx")

        # case 2: gctoo obj does have src
        mini2 = mini_gctoo_for_testing.make()
        try:
            write_gctx.write(mini2, "with_src_example.gctx")
            with h5py.File("with_src_example.gctx", "r") as hdf5_file:
                hdf5_src2 = hdf5_file.attrs[write_gctx.src_attr]
            self.assertEqual(hdf5_src2, "mini_gctoo.gctx")
        finally:
            if os.path.exists("with_src_example.gctx"):
                os.remove("with_src_example.gctx")
Esempio n. 11
0
def reduce_and_save():
    """
    Read the level 5 data and write a file holding only the landmark-gene
    z-scores (rows) for the small-molecule perturbagen signatures (columns).
    """
    # Signature table. Its columns, as noted by the original author:
    #   sig_id, pert_id, pert_iname, pert_type, cell_id, pert_dose,
    #   pert_dose_unit, pert_idose, pert_time, pert_time_unit, pert_itime,
    #   distil_id
    sig_table_path = join(FILE_PATH, "GSE92742_Broad_LINCS_sig_info.txt")
    sig_info = pd.read_csv(sig_table_path, sep="\t")

    # Signature ids whose perturbagen type is "trt_cp" (small molecules);
    # the original author reports 205034 of them.
    is_small_molecule = sig_info['pert_type'] == "trt_cp"
    small_mol_sigs = sig_info['sig_id'][is_small_molecule]

    # Gene table. Its columns, as noted by the original author:
    #   pr_gene_id, pr_gene_symbol, pr_gene_title, pr_is_lm, pr_is_bing
    gene_table_path = join(FILE_PATH, "GSE92742_Broad_LINCS_gene_info.txt")
    gene_info = pd.read_csv(gene_table_path, sep='\t')

    # Directly measured (landmark) transcripts; reported as 978 gene ids.
    is_landmark = gene_info['pr_is_lm'] == 1
    landmark_gene_ids = gene_info['pr_gene_id'][is_landmark]

    # Load only the selected signatures (columns) and landmark genes (rows)
    # from the main matrix into a GCToo container.
    matrix_path = join(
        FILE_PATH,
        "GSE92742_Broad_LINCS_Level5_COMPZ.MODZ_n473647x12328.gctx")
    relevent_sigs_gctoo = parse(matrix_path,
                                rid=landmark_gene_ids,
                                cid=small_mol_sigs)

    # Intermediate file; expected dimensions per the original note: (978, 205034)
    write_gctx.write(relevent_sigs_gctoo, join(FILE_PATH, "lm_sm_aggz"))
Esempio n. 12
0
def gct2gctx_main(args):
    """Parse a GCT file, optionally replace its metadata, and write it as GCTX.

    Kept separate from main() so it can back a command-line tool. Without an
    explicit ``args.output_filepath`` the output is named after the input's
    base name with its extension replaced by ``.gctx``. When
    ``args.row_annot_path`` / ``args.col_annot_path`` are given, the
    tab-separated tables are read (first column as index) and the rows whose
    ids occur in the matrix become the new row / column metadata.

    Raises:
        AssertionError: if a matrix id is missing from a supplied annotation
            table.
    """
    in_gctoo = parse_gct.parse(args.filename, convert_neg_666=False)

    out_name = args.output_filepath
    if out_name is None:
        stem = os.path.splitext(os.path.basename(args.filename))[0]
        out_name = stem + ".gctx"

    # If annotations are supplied, parse table and set metadata_df
    if args.row_annot_path is not None:
        row_metadata = pd.read_csv(args.row_annot_path,
                                   sep='\t', index_col=0, header=0,
                                   low_memory=False)
        rows_covered = in_gctoo.data_df.index.isin(row_metadata.index)
        assert all(rows_covered), \
            "Row ids in matrix missing from annotations file"
        wanted_rows = row_metadata.index.isin(in_gctoo.data_df.index)
        in_gctoo.row_metadata_df = row_metadata.loc[wanted_rows]

    if args.col_annot_path is not None:
        col_metadata = pd.read_csv(args.col_annot_path,
                                   sep='\t', index_col=0, header=0,
                                   low_memory=False)
        cols_covered = in_gctoo.data_df.columns.isin(col_metadata.index)
        assert all(cols_covered), \
            "Column ids in matrix missing from annotations file"
        wanted_cols = col_metadata.index.isin(in_gctoo.data_df.columns)
        in_gctoo.col_metadata_df = col_metadata.loc[wanted_cols]

    write_gctx.write(in_gctoo, out_name)
Esempio n. 13
0
    def test_write_version(self):
        """Check the version attribute stored in the written GCTX file.

        Both cases expect ``write_gctx.version_number``, whether or not the
        GCToo carries its own version. The HDF5 handle is closed and the
        temporary file removed even when an assertion fails.
        """
        # TODO @oana refactor this test so it just calls the write_version method
        # case 1: gctoo obj doesn't have version
        mini1 = mini_gctoo_for_testing.make()
        mini1.version = None
        fn = "no_version_provided_example.gctx"
        try:
            write_gctx.write(mini1, fn)
            # Open read-only explicitly; the default mode differs between
            # h5py versions.
            with h5py.File(fn, "r") as hdf5_file:
                hdf5_v1 = hdf5_file.attrs[write_gctx.version_attr]
            self.assertEqual(hdf5_v1, write_gctx.version_number)
        finally:
            if os.path.exists(fn):
                os.remove(fn)

        # case 2: gctoo obj does have version, but it is not used when writing
        mini2 = mini_gctoo_for_testing.make()
        mini2.version = "MY_VERSION"
        fn = "with_version_provided_example.gctx"
        try:
            write_gctx.write(mini2, fn)
            with h5py.File(fn, "r") as hdf5_file:
                hdf5_v2 = hdf5_file.attrs[write_gctx.version_attr]
            self.assertEqual(hdf5_v2, write_gctx.version_number)
        finally:
            if os.path.exists(fn):
                os.remove(fn)
    def test_write_gctx(self):
        """Round-trip the fixture frames through write_gctx.write and parse_gctx.parse.

        The output file is removed even when one of the comparisons fails.
        """
        out_name = os.path.join(FUNCTIONAL_TESTS_PATH, 'test_write_out_py2py3.gctx')

        gctoo = GCToo.GCToo(data_df=self.data_df,
                            row_metadata_df=self.row_metadata_df,
                            col_metadata_df=self.col_metadata_df)
        try:
            write_gctx.write(gctoo, out_name,
                             convert_back_to_neg_666=True, gzip_compression_level=6,
                             max_chunk_kb=1024, matrix_dtype=np.float32)

            # Read the file back and verify that it matches the fixture frames.
            # The GCToo is rebuilt first because, per the original author's
            # note, write_gctx changes the dtype of one column of
            # col_metadata_df.
            gctoo = GCToo.GCToo(data_df=self.data_df,
                                row_metadata_df=self.row_metadata_df,
                                col_metadata_df=self.col_metadata_df)

            new_gctx = parse_gctx.parse(out_name)

            pd.testing.assert_frame_equal(new_gctx.data_df, gctoo.data_df)
            pd.testing.assert_frame_equal(new_gctx.row_metadata_df, gctoo.row_metadata_df)
            pd.testing.assert_frame_equal(new_gctx.col_metadata_df, gctoo.col_metadata_df)
        finally:
            # Cleanup, also on failure
            if os.path.exists(out_name):
                os.remove(out_name)
Esempio n. 15
0
def write_gctx(data, ofile):
    """Write the binary fingerprints in ``data`` as a GCTX matrix.

    Rows are fingerprint bits (ids ``bit1``, ``bit2``, ...), columns are the
    records (ids taken from each record's ``'pert_id'``). Row and column
    metadata are empty frames that only carry the ids.

    Args:
        data: collection indexable by ``0 .. len(data) - 1``; every record
            provides ``'binary_fpt'`` and ``'pert_id'``. ``'pert_iname'`` is
            no longer read: it only fed a dict that was never used.
        ofile: output path handed to ``wgx.write``.
    """
    # Fingerprints stacked record-by-record, then transposed to bits x records.
    # (The former deep copy of the whole input served only dead code and has
    # been dropped.)
    bfpt = [data[x]['binary_fpt'] for x in range(len(data))]
    bfpta = numpy.transpose(numpy.array(bfpt, dtype='i8'))
    print(bfpta.shape)

    # Create cid and rid
    cid = [data[x]['pert_id'] for x in range(bfpta.shape[1])]
    rid = ["bit" + str(x + 1) for x in range(bfpta.shape[0])]
    data_df = pd.DataFrame(bfpta,
                           index=pd.Index(rid, name="rid"),
                           columns=pd.Index(cid, name="cid"))

    # TODO: populate the column metadata (e.g. with pert_iname)
    row_df = pd.DataFrame(index=rid)
    col_df = pd.DataFrame(index=cid)

    gco = gctoo.GCToo(data_df=data_df,
                      row_metadata_df=row_df,
                      col_metadata_df=col_df)
    wgx.write(gco, ofile)
Esempio n. 16
0
    def test_parse(self):
        """Exercise parse_gctx.parse against the mini_gctoo_for_testing fixture.

        Covers a full parse, subsetting by rid/cid and by ridx/cidx (and mixes
        of both), string ids that look numeric, the row_meta_only /
        col_meta_only modes, and the sort_row_meta / sort_col_meta flags.
        Expected values come from mini_gctoo_for_testing.make() and from
        subset_gctoo.subset_gctoo applied to it.
        """
        # parse whole thing
        mg1 = mini_gctoo_for_testing.make()
        mg2 = parse_gctx.parse(
            "cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_for_testing.gctx"
        )

        pandas_testing.assert_frame_equal(mg1.data_df, mg2.data_df)
        pandas_testing.assert_frame_equal(mg1.row_metadata_df,
                                          mg2.row_metadata_df)
        pandas_testing.assert_frame_equal(mg1.col_metadata_df,
                                          mg2.col_metadata_df)

        # test with string rid/cid
        test_rids = [
            'LJP007_MCF10A_24H:TRT_CP:BRD-K93918653:3.33',
            'LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666'
        ]
        test_cids = ['LJP007_MCF7_24H:TRT_POSCON:BRD-A61304759:10']
        mg3 = subset_gctoo.subset_gctoo(mg1, rid=test_rids, cid=test_cids)
        mg4 = parse_gctx.parse(
            "cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_for_testing.gctx",
            rid=test_rids,
            cid=test_cids)
        pandas_testing.assert_frame_equal(mg3.data_df, mg4.data_df)
        pandas_testing.assert_frame_equal(mg3.row_metadata_df,
                                          mg4.row_metadata_df)
        pandas_testing.assert_frame_equal(mg3.col_metadata_df,
                                          mg4.col_metadata_df)

        # first, make & write out temp version of mini_gctoo whose rids/cids
        # are integers represented as strings ("0".."5" and "10".."15")
        new_mg = mini_gctoo_for_testing.make(convert_neg_666=False)
        int_indexed_data_df = new_mg.data_df.copy()
        int_indexed_data_df.index = [str(i) for i in range(0, 6)]
        int_indexed_data_df.columns = [str(i) for i in range(10, 16)]

        int_indexed_row_meta = new_mg.row_metadata_df.copy()
        int_indexed_row_meta.index = int_indexed_data_df.index

        int_indexed_col_meta = new_mg.col_metadata_df.copy()
        int_indexed_col_meta.index = int_indexed_data_df.columns

        int_indexed_gctoo = GCToo.GCToo(data_df=int_indexed_data_df,
                                        row_metadata_df=int_indexed_row_meta,
                                        col_metadata_df=int_indexed_col_meta)

        write_gctx.write(int_indexed_gctoo, "int_indexed_mini_gctoo.gctx")

        # test with numeric (repr as string) rid/cid
        mg5 = GCToo.GCToo(data_df=int_indexed_data_df,
                          row_metadata_df=int_indexed_row_meta,
                          col_metadata_df=int_indexed_col_meta)
        mg5 = subset_gctoo.subset_gctoo(
            mg5,
            row_bool=[True, False, True, False, True, False],
            col_bool=[True, False, False, True, True, True])

        # name the axes of the expected frames so they compare equal to the
        # parsed ones below
        mg5.data_df.index.name = "rid"
        mg5.data_df.columns.name = "cid"

        mg5.row_metadata_df.index.name = "rid"
        mg5.row_metadata_df.columns.name = "rhd"

        mg5.col_metadata_df.index.name = "cid"
        mg5.col_metadata_df.columns.name = "chd"

        mg6 = parse_gctx.parse("int_indexed_mini_gctoo.gctx",
                               rid=["0", "2", "4"],
                               cid=["10", "13", "14", "15"],
                               convert_neg_666=False)

        # NOTE(review): the temp file is only removed when the parse above
        # succeeds; a failure there leaves it on disk.
        os.remove("int_indexed_mini_gctoo.gctx")

        pandas_testing.assert_frame_equal(mg5.data_df, mg6.data_df)
        pandas_testing.assert_frame_equal(mg5.row_metadata_df,
                                          mg6.row_metadata_df)
        pandas_testing.assert_frame_equal(mg5.col_metadata_df,
                                          mg6.col_metadata_df)

        # test with ridx/cidx
        mg7 = subset_gctoo.subset_gctoo(
            mg1,
            rid=['LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666'],
            cid=['LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666'])
        mg8 = parse_gctx.parse(
            "cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_for_testing.gctx",
            ridx=[4],
            cidx=[4])

        pandas_testing.assert_frame_equal(mg7.data_df, mg8.data_df)
        pandas_testing.assert_frame_equal(mg7.row_metadata_df,
                                          mg8.row_metadata_df)
        pandas_testing.assert_frame_equal(mg7.col_metadata_df,
                                          mg8.col_metadata_df)

        # test with rid/cidx
        mg9 = parse_gctx.parse(
            "cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_for_testing.gctx",
            rid=['LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666'],
            cidx=[4])

        pandas_testing.assert_frame_equal(mg7.data_df, mg9.data_df)
        pandas_testing.assert_frame_equal(mg7.row_metadata_df,
                                          mg9.row_metadata_df)
        pandas_testing.assert_frame_equal(mg7.col_metadata_df,
                                          mg9.col_metadata_df)

        # test with ridx/cid
        mg10 = parse_gctx.parse(
            "cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_for_testing.gctx",
            ridx=[4],
            cid=['LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666'])

        pandas_testing.assert_frame_equal(mg7.data_df, mg10.data_df)
        pandas_testing.assert_frame_equal(mg7.row_metadata_df,
                                          mg10.row_metadata_df)
        pandas_testing.assert_frame_equal(mg7.col_metadata_df,
                                          mg10.col_metadata_df)

        # test with row_meta_only (parse returns the metadata frame itself)
        mg11 = parse_gctx.parse(
            "cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_for_testing.gctx",
            row_meta_only=True)
        pandas_testing.assert_frame_equal(mg11, mg1.row_metadata_df)

        # test with col_meta_only (parse returns the metadata frame itself)
        mg12 = parse_gctx.parse(
            "cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_for_testing.gctx",
            col_meta_only=True)
        pandas_testing.assert_frame_equal(mg12, mg1.col_metadata_df)

        # NOTE(review): this plain full parse is never checked; mg13 is
        # reassigned by the next statement. Looks like a leftover.
        mg13 = parse_gctx.parse(
            "cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_for_testing.gctx",
        )

        # test with sort_col_meta False and cidx
        mg13 = parse_gctx.parse(
            "cmapPy/pandasGEXpress/tests/functional_tests//mini_gctoo_for_testing.gctx",
            cidx=[4, 1, 3],
            sort_col_meta=False)

        pandas_testing.assert_frame_equal(mg13.data_df,
                                          mg1.data_df.iloc[:, [4, 1, 3]])
        pandas_testing.assert_frame_equal(
            mg13.col_metadata_df, mg1.col_metadata_df.iloc[[4, 1, 3], :])
        pandas_testing.assert_frame_equal(mg13.row_metadata_df,
                                          mg1.row_metadata_df)

        # test with sort_row_meta False and ridx
        mg14 = parse_gctx.parse(
            "cmapPy/pandasGEXpress/tests/functional_tests//mini_gctoo_for_testing.gctx",
            ridx=[3, 0, 1],
            sort_row_meta=False)

        pandas_testing.assert_frame_equal(mg14.data_df,
                                          mg1.data_df.iloc[[3, 0, 1], :])
        pandas_testing.assert_frame_equal(mg14.col_metadata_df,
                                          mg1.col_metadata_df)
        pandas_testing.assert_frame_equal(
            mg14.row_metadata_df, mg1.row_metadata_df.iloc[[3, 0, 1], :])

        # test with sort_col_meta False and cidx and col_meta_only
        mg15 = parse_gctx.parse(
            "cmapPy/pandasGEXpress/tests/functional_tests//mini_gctoo_for_testing.gctx",
            cidx=[4, 1, 3],
            sort_col_meta=False,
            col_meta_only=True)
        pandas_testing.assert_frame_equal(
            mg15, mg1.col_metadata_df.iloc[[4, 1, 3], :])

        # test with sort_row_meta False and ridx and row_meta_only
        mg16 = parse_gctx.parse(
            "cmapPy/pandasGEXpress/tests/functional_tests//mini_gctoo_for_testing.gctx",
            ridx=[3, 0, 1],
            sort_row_meta=False,
            row_meta_only=True)
        pandas_testing.assert_frame_equal(
            mg16, mg1.row_metadata_df.iloc[[3, 0, 1], :])

        # test with sort_col_meta False and cid (ids given out of file order)
        cid_unsorted = [
            'LJP007_MCF7_24H:TRT_POSCON:BRD-K81418486:10',
            'LJP007_MCF10A_24H:TRT_CP:BRD-K93918653:3.33'
        ]
        mg17 = parse_gctx.parse(
            "cmapPy/pandasGEXpress/tests/functional_tests//mini_gctoo_for_testing.gctx",
            cid=cid_unsorted,
            sort_col_meta=False)
        pandas_testing.assert_frame_equal(mg17.data_df,
                                          mg1.data_df.iloc[:, [2, 0]])
        pandas_testing.assert_frame_equal(mg17.col_metadata_df,
                                          mg1.col_metadata_df.iloc[[2, 0], :])
        pandas_testing.assert_frame_equal(mg17.row_metadata_df,
                                          mg1.row_metadata_df)

        # test with sort_row_meta False and rid (ids given out of file order)
        rid_unsorted = [
            'LJP007_MCF7_24H:TRT_CP:BRD-K64857848:10',
            'MISC003_A375_24H:TRT_CP:BRD-K93918653:3.33'
        ]
        mg18 = parse_gctx.parse(
            "cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_for_testing.gctx",
            rid=rid_unsorted,
            sort_row_meta=False)
        pandas_testing.assert_frame_equal(mg18.data_df,
                                          mg1.data_df.iloc[[5, 1], :])
        pandas_testing.assert_frame_equal(mg18.col_metadata_df,
                                          mg1.col_metadata_df)
        pandas_testing.assert_frame_equal(mg18.row_metadata_df,
                                          mg1.row_metadata_df.iloc[[5, 1], :])
Esempio n. 17
0
	def test_parse(self):
		"""Compare parse_gctx.parse results with the in-memory mini GCToo fixture.

		Covers a full parse and subsetting by rid/cid, by integer ids, and by
		ridx/cidx (alone and mixed with ids).
		"""
		gctx_path = "functional_tests/mini_gctoo_for_testing.gctx"
		vehicle_id = 'LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666'

		def check_same(expected, actual):
			# data, row metadata and column metadata must all match
			assert_frame_equal(expected.data_df, actual.data_df)
			assert_frame_equal(expected.row_metadata_df, actual.row_metadata_df)
			assert_frame_equal(expected.col_metadata_df, actual.col_metadata_df)

		# whole file
		mg1 = mini_gctoo_for_testing.make()
		mg2 = parse_gctx.parse(gctx_path)
		check_same(mg1, mg2)

		# string rid/cid
		test_rids = ['LJP007_MCF10A_24H:TRT_CP:BRD-K93918653:3.33', vehicle_id]
		test_cids = ['LJP007_MCF7_24H:TRT_POSCON:BRD-A61304759:10']
		mg3 = slice_gct.slice_gctoo(mg1, rid=test_rids, cid=test_cids)
		mg4 = parse_gctx.parse(gctx_path, rid=test_rids, cid=test_cids)
		check_same(mg3, mg4)

		# temporary copy of the fixture with integer rids/cids, written to disk
		new_mg = mini_gctoo_for_testing.make(convert_neg_666=False)
		int_indexed_data_df = new_mg.data_df.copy()
		int_indexed_data_df.index = range(0, 6)
		int_indexed_data_df.columns = range(10, 16)

		int_indexed_row_meta = new_mg.row_metadata_df.copy()
		int_indexed_row_meta.index = range(0, 6)

		int_indexed_col_meta = new_mg.col_metadata_df.copy()
		int_indexed_col_meta.index = range(10, 16)

		int_indexed_gctoo = GCToo.GCToo(
			data_df=int_indexed_data_df,
			row_metadata_df=int_indexed_row_meta,
			col_metadata_df=int_indexed_col_meta)
		write_gctx.write(int_indexed_gctoo, "int_indexed_mini_gctoo.gctx")

		# numeric rid/cid: expected value built by boolean slicing
		mg5 = GCToo.GCToo(
			data_df=int_indexed_data_df,
			row_metadata_df=int_indexed_row_meta,
			col_metadata_df=int_indexed_col_meta)
		mg5 = slice_gct.slice_gctoo(
			mg5,
			row_bool=[True, False, True, False, True, False],
			col_bool=[True, False, False, True, True, True])

		# name the axes of the expected frames
		axis_names = (
			(mg5.data_df, "rid", "cid"),
			(mg5.row_metadata_df, "rid", "rhd"),
			(mg5.col_metadata_df, "cid", "chd"),
		)
		for frame, index_name, columns_name in axis_names:
			frame.index.name = index_name
			frame.columns.name = columns_name

		mg6 = parse_gctx.parse(
			"int_indexed_mini_gctoo.gctx",
			rid=[0, 2, 4],
			cid=[10, 13, 14, 15],
			convert_neg_666=False)

		os.remove("int_indexed_mini_gctoo.gctx")

		check_same(mg5, mg6)

		# ridx/cidx
		mg7 = slice_gct.slice_gctoo(mg1, rid=[vehicle_id], cid=vehicle_id)
		mg8 = parse_gctx.parse(gctx_path, ridx=[4], cidx=[4])
		check_same(mg7, mg8)

		# rid/cidx
		mg9 = parse_gctx.parse(gctx_path, rid=[vehicle_id], cidx=[4])
		check_same(mg7, mg9)

		# ridx/cid
		mg10 = parse_gctx.parse(gctx_path, ridx=[4], cid=[vehicle_id])
		check_same(mg7, mg10)
Esempio n. 18
0
# Input tables and data matrix of the GSE92742 release.
gene_path = 'GSE92742/GSE92742_Broad_LINCS_gene_info.txt'
sig_path = 'GSE92742/GSE92742_Broad_LINCS_sig_info.txt'
data_path = 'GSE92742/GSE92742_Broad_LINCS_Level5_COMPZ.MODZ_n473647x12328.gctx'

genesymbol = raw_input("Enter Gene Symbol: ")
geneid = raw_input("Enter Gene ID: ")

# Pull the rows for the requested gene symbol out of the data matrix, unless
# a previous run already left the extracted file in the working directory.
raw_gene_file = "raw_%s.gctx" % genesymbol
if not os.path.exists(raw_gene_file):
    print("Extract Gene Raw Data... ")
    gene_info = pd.read_csv(gene_path, sep='\t', dtype=str)
    symbol_matches = gene_info["pr_gene_symbol"] == genesymbol
    landmark_gene_row_ids = gene_info["pr_gene_id"][symbol_matches]
    landmark_only_gctoo = parse(data_path, rid=landmark_gene_row_ids)
    wg.write(landmark_only_gctoo, raw_gene_file)
    print("Finished!")

print("Analysing Data... ")
# Signature annotations: one row of `annolist` per field in `itemlist`,
# one column per signature.
sig_info = pd.read_csv(sig_path, sep='\t', dtype=str)
itemlist = [
    u'sig_id', u'pert_id', u'pert_iname', u'pert_type', u'cell_id',
    u'pert_idose', u'pert_itime'
]
annolist = np.vstack([sig_info[item].values for item in itemlist])

# Map each sig_id (first row) to that signature's full annotation column.
index = {
    annolist[0, i]: annolist[:, i]
    for i in range(len(annolist[0, :]))
}
Esempio n. 19
0
from cmapPy.pandasGEXpress.parse import parse
from cmapPy.pandasGEXpress.write_gctx import write
from cmapPy.pandasGEXpress.GCToo import GCToo

# Locations of the pre-extracted column- and row-metadata tables (HDF5).
cm_file = "C:/data/out_cm.h5"
rm_file = "C:/data/out_rm.h5"


def _trim_label(label):
    """Return *label* without its first two characters and its last one."""
    # NOTE(review): presumably strips a b'...' wrapper left around the ids
    # in the parsed file -- confirm against the actual labels.
    return label[2:-1]


print("Load data")
dset_full = parse(
    "C:/users/jdr2160/venomseq_data/annotated_GSE92742_Broad_LINCS_Level5_COMPZ_n473647x12328.gctx"
)
cm_full = pd.read_hdf(cm_file)
rm_full = pd.read_hdf(rm_file)

print("Set mask")
# True for the columns whose perturbation type is one of the two kept kinds.
mask = cm_full["pert_type"].isin(['trt_cp', 'ctl_untrt'])

print("Subset data")
# The mask is applied by position, so this assumes the data matrix columns
# are in the same order as the rows of cm_full -- TODO confirm.
keep = np.array(mask)
df_sub = dset_full.data_df.iloc[:, keep]
cm_sub = cm_full.iloc[keep, :]
df_sub.columns = list(map(_trim_label, df_sub.columns))
df_sub.index = list(map(_trim_label, df_sub.index))

print("Write to disk")
# Row metadata is passed through unfiltered; only columns were subset.
out_gctx = GCToo(
    data_df=df_sub,
    row_metadata_df=rm_full,
    col_metadata_df=cm_sub,
)
write(out_gctx, "C:/data/GSE92742_cps_level5.gctx")
Esempio n. 20
0
# Row counts to benchmark; the column counts come from col_spaces.
row_spaces = [978, 10174]

# time.clock() was deprecated in Python 3.3 and removed in 3.8. Prefer
# time.perf_counter() (wall-clock, highest available resolution) and fall
# back to time.clock() only on interpreters that predate it.
try:
    _timer = time.perf_counter
except AttributeError:
    _timer = time.clock


def _timed_write(write_fn, gctoo, out_fname):
    """Write *gctoo* to *out_fname* with *write_fn* and return the elapsed time.

    Only the write call itself is timed. The output file is deleted
    afterwards so the benchmark leaves nothing behind.

    Args:
        write_fn: callable taking (gctoo, out_fname).
        gctoo: object to be written.
        out_fname: path of the temporary output file.

    Returns:
        Elapsed time of the write call as reported by the timer, in seconds.
    """
    start = _timer()
    write_fn(gctoo, out_fname)
    elapsed_time = _timer() - start
    os.remove(out_fname)
    return elapsed_time


for c in col_spaces:
    for r in row_spaces:
        # range() is a list on Python 2 but a lazy object on Python 3;
        # materialize it so the index arguments are lists on both.
        curr_gctoo = sg.subset_gctoo(big_gctoo,
                                     ridx=list(range(0, r)),
                                     cidx=list(range(0, c)))
        out_stem = "write_test_n" + str(c) + "x" + str(r)

        # gct writing
        gct_fname = out_stem + ".gct"
        gct_times[gct_fname] = _timed_write(write_gct.write, curr_gctoo,
                                            gct_fname)

        # gctx writing
        gctx_fname = out_stem + ".gctx"
        gctx_times[gctx_fname] = _timed_write(write_gctx.write, curr_gctoo,
                                              gctx_fname)

# write results to file: one row per output file name, gct rows first
gct_df = pd.DataFrame(pd.Series(gct_times))
gctx_df = pd.DataFrame(pd.Series(gctx_times))
write_times_df = pd.concat([gct_df, gctx_df])
write_times_df.columns = ["write_time"]
write_times_df.to_csv("python_writing_results.txt", sep="\t")
Esempio n. 21
0
            #idx_both= intersect(idx_idose,idx_itime)

            #if( len(idx_both) != 0):
            #	ctl_ids = sig_info_cell_vehicle["sig_id"][idx_both]
            #elif( len (idx_idose) != 0):
            #	ctl_ids = sig_info_cell_vehicle["sig_id"][idx_idose]
            #else:
            ctl_ids = sig_info_cell_vehicle["sig_id"]

            tot_ids = ctl_ids.tolist() + pert_ids.tolist()
            print("cell = %s, pert_id= %s \nN_pert=%d, N_ctl=%d" %
                  (cell, "nothing", len(pert_ids), len(ctl_ids)))
            data = parse(
                "../Data/GSE92742_Broad_LINCS_Level5_COMPZ.MODZ_n473647x12328.gctx",
                cid=tot_ids)
            wg.write(data, "Subset_GCT/GSE92742_Level5_%s_%d" % (cell, i))

            count = 0
            for pert_id in pert_ids_names:
                for dose in dose_cp_list:
                    for time in time_cp_list:
                        pert_ids_cls = sig_info_cell_cp["sig_id"][
                            (sig_info_cell_cp["pert_id"] == pert_id)
                            & (sig_info_cell_cp["pert_idose"] == dose) &
                            (sig_info_cell_cp["pert_itime"] == time)]
                        if (len(pert_ids_cls) != 0):
                            ctl_ids_cls = sig_info_cell_vehicle["sig_id"][
                                (sig_info_cell_vehicle["pert_id"] == pert_id)
                                & (sig_info_cell_vehicle["pert_idose"] == dose)
                                &
                                (sig_info_cell_vehicle["pert_itime"] == time)]