def main(args):
    """Compute an embedding for every pair from ``dl.get_all_pairs`` and write them out.

    Seeds the numpy / torch / stdlib RNGs with a fixed value, restores the
    ``state_dict`` stored in ``./trained_model_best.pth`` into an ``nm.Net``
    put in eval mode, then feeds each pair's array (element 1) through
    ``get_embedding`` with gradients disabled.  Results are keyed by the
    pair's element 0, turned into a DataFrame, wrapped in a GCToo and handed
    to ``write`` under the name ``'embeddings'``.

    Args:
        args: object exposing ``data``, which is forwarded to
            ``dl.get_all_pairs``.
    """
    seed = 17
    np.random.seed(seed)
    torch.manual_seed(seed)
    random.seed(seed)
    torch.cuda.manual_seed(seed)

    # Restore the trained weights and switch to inference mode.
    model = nm.Net()
    checkpoint = torch.load('./trained_model_best.pth')
    model.load_state_dict(checkpoint['state_dict'])
    model.eval()

    embeddings = {}
    for position, pair in enumerate(dl.get_all_pairs(args.data)):
        # Progress indicator every 10000 pairs.
        if position % 10000 == 0:
            print(position)
        with torch.no_grad():
            # Flatten the array into a single-row batch.
            batch = torch.from_numpy(pair[1]).view(1, -1)
            vector = get_embedding(model, batch)
            embeddings[pair[0]] = vector.data.numpy()[0]

    # One DataFrame column per key of ``embeddings``.
    frame = pandas.DataFrame(data=embeddings)
    out = cmapPy.pandasGEXpress.GCToo.GCToo(frame)
    write(out, 'embeddings')
def main():
    """Parse args, then read and write expression files for paired metadata.

    Reads the probe list (``pr_gene_id`` column of ``probeset_infile``) and
    the experiment-id list (first column of the header-less ``expid_infile``),
    keeps only the experiment ids that are present in the column metadata of
    ``gctx_infile``, extracts that row/column subset from the GCTX file and
    writes it to ``outfile``.
    """
    args_dict = main_parse_args()

    # get list of probes
    probeset_df = pd.read_table(args_dict['probeset_infile'], sep='\t')
    # list(...) is required: on Python 3 ``map`` returns an iterator and
    # np.array(map(...)) would produce a useless 0-d object array.
    probeset = np.array(list(map(str, probeset_df['pr_gene_id'].values)))

    # get list of experiments
    expid_df = pd.read_table(args_dict['expid_infile'], sep='\t', header=None)
    myexpids = np.array(list(map(str, expid_df[0].values)))

    # get info about gctx file
    col_metadata = parse.parse(args_dict['gctx_infile'], col_meta_only=True)
    geoexpset = set(col_metadata.index.values)

    # keep only ids in gctx file
    # NOTE(review): ``chunk`` is not defined in this function; presumably a
    # module-level name -- confirm, otherwise this raises NameError.
    print("Filtering exp ids for chunk " + str(chunk) + "...")
    validexp_ids = np.array(list(set(myexpids) & geoexpset))

    # fetch data from gctx
    print("Fetching chunk " + str(chunk) + "...")
    allexps_gct = parse.parse(args_dict['gctx_infile'],
                              rid=probeset,
                              cid=validexp_ids)
    # returns rows and columns in different order
    print("Gene Ids Order: " + str(list(allexps_gct.data_df.index)))

    # merge and write outfile
    print("Writing outfile: " + args_dict['outfile'])
    write_gctx.write(allexps_gct, args_dict['outfile'])
def main(args):
    """Split one GCT into several and write each piece to ``args.out_dir``.

    The input at ``args.in_gct_path`` is parsed and passed to ``separate``
    together with ``args.separate_field`` and ``args.row_or_col``.  Every
    returned GCT is saved as ``out_name_prefix + name + out_name_suffix``;
    the (case-insensitive) extension of that name selects the writer.

    Raises:
        Exception: if the output name ends in neither ``.gct`` nor ``.gctx``.
    """
    source_gct = parse.parse(args.in_gct_path)

    pieces, piece_names = separate(source_gct, args.separate_field,
                                   args.row_or_col)

    for piece, piece_name in zip(pieces, piece_names):
        file_name = args.out_name_prefix + str(piece_name) + args.out_name_suffix
        full_out_name = os.path.join(args.out_dir, file_name)

        # Pick the writer from the lower-cased file extension.
        extension = str.lower(os.path.splitext(full_out_name)[1])
        if extension == ".gct":
            wg.write(piece, full_out_name, data_null="NaN",
                     metadata_null="NA", filler_null="NA")
        elif extension == ".gctx":
            wgx.write(piece, full_out_name)
        else:
            raise (Exception(
                "out_name_suffix must end in either .gct or .gctx. out_name_suffix: {}"
                .format((args.out_name_suffix))))
def main():
    """Command-line entry point: concatenate several GCT(X) files into one.

    Input files come from ``args.input_filepaths`` when given, otherwise from
    ``get_file_list(args.file_wildcard)``.  With exactly one file nothing is
    written; with more, the files are parsed, stacked with ``hstack``
    (``"horiz"``) or ``vstack`` (``"vert"``) and written as gctx or gct
    according to ``args.out_type``.

    Raises:
        Exception: if no input files are found.
    """
    args = build_parser().parse_args(sys.argv[1:])
    setup_logger.setup(verbose=args.verbose)
    logger.debug("args: {}".format(args))

    # Explicit file list wins over the wildcard search.
    if args.input_filepaths is None:
        files = get_file_list(args.file_wildcard)
    else:
        files = args.input_filepaths

    num_files = len(files)
    if num_files == 0:
        msg = "No files were found. args.file_wildcard: {}".format(
            args.file_wildcard)
        logger.error(msg)
        raise Exception(msg)

    if num_files == 1:
        logger.warning(
            "Only 1 file found. No concatenation needs to be done, exiting")
        return

    # More than one file: parse them all, then stack.
    gctoos = [parse(path) for path in files]

    if args.concat_direction == "horiz":
        out_gctoo = hstack(gctoos, args.remove_all_metadata_fields,
                           args.error_report_output_file,
                           args.fields_to_remove, args.reset_ids)
    elif args.concat_direction == "vert":
        out_gctoo = vstack(gctoos, args.remove_all_metadata_fields,
                           args.error_report_output_file,
                           args.fields_to_remove, args.reset_ids)

    logger.info("Writing to output file args.out_name: {}".format(
        args.out_name))

    if args.out_type == "gctx":
        write_gctx.write(out_gctoo, args.out_name)
    elif args.out_type == "gct":
        write_gct.write(out_gctoo,
                        args.out_name,
                        filler_null=args.filler_null,
                        metadata_null=args.metadata_null,
                        data_null=args.data_null)
def main(args):
    """Compute column-wise similarities for one or two GCTs and write the result.

    With only ``args.in_gct_path`` the similarities are computed among the
    columns of that file; when ``args.in_gct2_path`` is also given they are
    computed between the columns of the two files.  The column metadata of
    the inputs becomes the row/column metadata of the output, with an extra
    ``SIMILARITY_METRIC_FIELD`` column recording ``args.similarity_metric``
    (the input metadata frames are modified in place).

    Raises:
        Exception: if ``args.out_name`` ends in neither ``.gct`` nor ``.gctx``.
    """
    gct1 = parse(args.in_gct_path)

    if args.in_gct2_path is None:
        # Single input: similarities within gct1's columns.
        out_df = compute_similarity_within_df(gct1.data_df,
                                              args.similarity_metric)

        # The same metadata frame serves as both row and column metadata.
        metadata_df = gct1.col_metadata_df
        metadata_df[SIMILARITY_METRIC_FIELD] = args.similarity_metric

        out_gct = GCToo.GCToo(out_df, metadata_df, metadata_df)
    else:
        logger.info(
            "in_gct2_path was provided. Will compute pairwise similarities " +
            "between the columns of in_gct and in_gct2.")

        gct2 = parse(args.in_gct2_path)

        out_df = compute_similarity_bw_two_dfs(gct1.data_df, gct2.data_df,
                                               args.similarity_metric)

        # Rows of the output describe gct1's columns, columns describe gct2's.
        row_metadata_df = gct1.col_metadata_df
        col_metadata_df = gct2.col_metadata_df

        row_metadata_df[SIMILARITY_METRIC_FIELD] = args.similarity_metric
        col_metadata_df[SIMILARITY_METRIC_FIELD] = args.similarity_metric

        out_gct = GCToo.GCToo(out_df, row_metadata_df, col_metadata_df)

    # Choose the writer from the output extension.
    extension = os.path.splitext(args.out_name)[1]
    if extension == ".gct":
        wg.write(out_gct, args.out_name, data_null="NaN",
                 metadata_null="NA", filler_null="NA")
    elif extension == ".gctx":
        wgx.write(out_gct, args.out_name)
    else:
        raise (Exception(
            "out_name must end in .gct or .gctx. out_name: {}".format(
                args.out_name)))
def concat_main(args):
    """Concatenate several GCT(X) files according to already-parsed ``args``.

    Kept separate from main() so it can be tested and called directly.
    Files come from ``args.input_filepaths`` or, failing that, from
    ``get_file_list(args.file_wildcard)``.  A single file is a no-op; more
    than one are parsed, stacked horizontally or vertically per
    ``args.concat_direction`` and written as gctx or gct per ``args.out_type``.

    Raises:
        Exception: if no input files are found.
    """
    # Explicit file list wins over the wildcard search.
    if args.input_filepaths is None:
        files = get_file_list(args.file_wildcard)
    else:
        files = args.input_filepaths

    num_files = len(files)
    if num_files == 0:
        msg = "No files were found. args.file_wildcard: {}".format(
            args.file_wildcard)
        logger.error(msg)
        raise Exception(msg)

    if num_files == 1:
        logger.warning(
            "Only 1 file found. No concatenation needs to be done, exiting")
        return

    # More than one file: parse them all, then stack.
    gctoos = [parse.parse(path) for path in files]

    if args.concat_direction == "horiz":
        out_gctoo = hstack(gctoos, args.remove_all_metadata_fields,
                           args.error_report_output_file,
                           args.fields_to_remove, args.reset_ids)
    elif args.concat_direction == "vert":
        out_gctoo = vstack(gctoos, args.remove_all_metadata_fields,
                           args.error_report_output_file,
                           args.fields_to_remove, args.reset_ids)

    logger.info("Writing to output file args.out_name: {}".format(
        args.out_name))

    if args.out_type == "gctx":
        write_gctx.write(out_gctoo, args.out_name)
    elif args.out_type == "gct":
        write_gct.write(out_gctoo,
                        args.out_name,
                        filler_null=args.filler_null,
                        metadata_null=args.metadata_null,
                        data_null=args.data_null)
def main():
    """Command-line entry point: convert a GCT file to GCTX.

    Parses ``sys.argv``, reads ``args.filename`` with ``convert_neg_666=False``
    and writes it with ``write_gctx.write``.  When no output path is given,
    the output name is the base name of the parsed object's ``src`` up to its
    first ``"."``.
    """
    # Local import so the block does not rely on a file-level ``import os``.
    import os

    args = build_parser().parse_args(sys.argv[1:])
    setup_logger.setup(verbose=args.verbose)

    in_gctoo = parse_gct.parse(args.filename, convert_neg_666=False)
    logger.debug("Original out name: {}".format(in_gctoo.src))

    if args.output_filepath is None:
        # os.path.basename honours the platform's path separators, unlike a
        # hard-coded split on "/".
        out_name = os.path.basename(in_gctoo.src).split(".")[0]
    else:
        out_name = args.output_filepath

    write_gctx.write(in_gctoo, out_name)
def gct2gctx_main(args):
    """Convert the GCT at ``args.filename`` to GCTX.

    Kept separate from main() so it can back a command-line tool.  When
    ``args.output_filepath`` is None the output name is the input's base name
    with its extension replaced by ``.gctx``.
    """
    in_gctoo = parse_gct.parse(args.filename, convert_neg_666=False)

    out_name = args.output_filepath
    if out_name is None:
        # Default: same stem as the input file, written to the working dir.
        stem = os.path.splitext(os.path.basename(args.filename))[0]
        out_name = stem + ".gctx"

    write_gctx.write(in_gctoo, out_name)
def build(search_pattern, outfile, file_suffix, cut=True, check_size=False):
    """Concatenate all GCTs matching ``search_pattern`` and write a metadata-free GCTX.

    Args:
        search_pattern: glob pattern selecting the input GCT files.
        outfile: output path prefix; ``n{cols}x{rows}`` and ``file_suffix``
            are appended to it.
        file_suffix: trailing part of the output file name.
        cut: when equal to True, the file list is first filtered through
            ``cut_to_l2.cut_l1``.
        check_size: when equal to True, files with 349 or fewer data columns
            are recorded in the failure list.

    Returns:
        None if no files remain after filtering; otherwise a tuple
        ``(concat_gct, failure_list)`` where ``concat_gct`` is the
        concatenated GCToo (with metadata) and ``failure_list`` holds the
        base names (minus ``_NORM.gct``) of undersized files.
    """
    gct_list = glob.glob(search_pattern)
    old_len = len(gct_list)

    if cut == True:
        gct_list = cut_to_l2.cut_l1(gct_list)

    new_len = len(gct_list)
    logger.info('Number of old lysate plates removed = {}'.format(old_len - new_len))

    if new_len == 0:
        return

    gcts = []
    failure_list = []
    for gct_path in gct_list:
        parsed = pe.parse(gct_path)
        gcts.append(parsed)
        if parsed.data_df.shape[1] <= 349 and check_size == True:
            plate_name = os.path.basename(gct_path).replace('_NORM.gct', '')
            failure_list.append(plate_name)

    # Give every GCT the first file's row metadata before stacking.
    for parsed in gcts:
        parsed.row_metadata_df = gcts[0].row_metadata_df

    removable = ['det_plate', 'det_plate_scan_time', 'assay_plate_barcode']
    fields_to_remove = [
        column for column in gcts[0].row_metadata_df.columns
        if column in removable
    ]

    concat_gct = cg.hstack(gcts, False, None, fields_to_remove=fields_to_remove)

    # The written file keeps only the ids: both metadata frames are empty.
    empty_row_meta = pd.DataFrame(index=concat_gct.data_df.index)
    empty_col_meta = pd.DataFrame(index=concat_gct.col_metadata_df.index)
    concat_gct_wo_meta = GCToo.GCToo(data_df=concat_gct.data_df,
                                     row_metadata_df=empty_row_meta,
                                     col_metadata_df=empty_col_meta)

    logger.debug("gct shape without metadata: {}".format(
        concat_gct_wo_meta.data_df.shape))

    n_rows, n_cols = concat_gct.data_df.shape[0], concat_gct.data_df.shape[1]
    wgx.write(concat_gct_wo_meta,
              outfile + 'n{}x{}'.format(n_cols, n_rows) + file_suffix)

    return concat_gct, failure_list
def test_write_src(self):
    """Check the ``src`` attribute stored in a written GCTX file.

    Case 1: the GCToo has ``src = None`` -> the stored value is the output
    file name.  Case 2: the GCToo keeps the ``src`` it was made with -> that
    value is stored.  Temp files are removed even when a check fails.
    """
    # case 1: gctoo obj doesn't have src
    mini1 = mini_gctoo_for_testing.make()
    mini1.src = None
    no_src_path = "no_src_example.gctx"
    try:
        write_gctx.write(mini1, "no_src_example")
        # Read-only + context manager: the handle is closed even on error.
        with h5py.File(no_src_path, "r") as hdf5_file:
            hdf5_src1 = hdf5_file.attrs[write_gctx.src_attr]
        self.assertEqual(hdf5_src1, "no_src_example.gctx")
    finally:
        if os.path.exists(no_src_path):
            os.remove(no_src_path)

    # case 2: gctoo obj does have src
    mini2 = mini_gctoo_for_testing.make()
    with_src_path = "with_src_example.gctx"
    try:
        write_gctx.write(mini2, with_src_path)
        with h5py.File(with_src_path, "r") as hdf5_file:
            hdf5_src2 = hdf5_file.attrs[write_gctx.src_attr]
        self.assertEqual(hdf5_src2, "mini_gctoo.gctx")
    finally:
        if os.path.exists(with_src_path):
            os.remove(with_src_path)
def reduce_and_save():
    """Write the level 5 data restricted to landmark genes and small molecules.

    Reads the signature and gene info tables from ``FILE_PATH``, selects the
    signature ids whose ``pert_type`` is ``"trt_cp"`` (columns) and the gene
    ids whose ``pr_is_lm`` equals 1 (rows), loads only that subset of the
    level 5 GCTX file and writes it to ``lm_sm_aggz`` under ``FILE_PATH``.
    """
    # Signature table columns (per the original notes): sig_id, pert_id,
    # pert_iname, pert_type, cell_id, pert_dose, pert_dose_unit, pert_idose,
    # pert_time, pert_time_unit, pert_itime, distil_id.
    sig_info = pd.read_csv(join(FILE_PATH,
                                "GSE92742_Broad_LINCS_sig_info.txt"),
                           sep="\t")

    # Signature ids of small-molecule perturbagens
    # (205034 signatures according to the original notes).
    is_small_molecule = sig_info['pert_type'] == "trt_cp"
    small_mol_sigs = sig_info.loc[is_small_molecule, 'sig_id']

    # Gene table columns (per the original notes): pr_gene_id,
    # pr_gene_symbol, pr_gene_title, pr_is_lm, pr_is_bing.
    gene_info = pd.read_csv(join(FILE_PATH,
                                 "GSE92742_Broad_LINCS_gene_info.txt"),
                            sep='\t')

    # Directly measured (landmark) transcripts
    # (978 ids according to the original notes).
    is_landmark = gene_info['pr_is_lm'] == 1
    landmark_gene_ids = gene_info.loc[is_landmark, 'pr_gene_id']

    # Load only the selected columns and rows of the main matrix.
    level5_path = join(
        FILE_PATH,
        "GSE92742_Broad_LINCS_Level5_COMPZ.MODZ_n473647x12328.gctx")
    relevent_sigs_gctoo = parse(level5_path,
                                cid=small_mol_sigs,
                                rid=landmark_gene_ids)

    # Intermediate file; expected dimensions (978, 205034) per the notes.
    write_gctx.write(relevent_sigs_gctoo, join(FILE_PATH, "lm_sm_aggz"))
def gct2gctx_main(args):
    """Convert a GCT to GCTX, optionally replacing its metadata from annotation tables.

    Kept separate from main() so it can back a command-line tool.  When
    ``args.output_filepath`` is None the output name is the input's base name
    with its extension replaced by ``.gctx``.  If ``args.row_annot_path`` /
    ``args.col_annot_path`` are given, the tab-separated tables are read
    (first column as index) and the rows whose index appears among the
    matrix ids become the new row / column metadata.

    Raises:
        AssertionError: if a matrix row or column id is absent from the
            corresponding annotation table.
    """
    in_gctoo = parse_gct.parse(args.filename, convert_neg_666=False)

    out_name = args.output_filepath
    if out_name is None:
        stem = os.path.splitext(os.path.basename(args.filename))[0]
        out_name = stem + ".gctx"

    # Row annotations, if supplied.
    if args.row_annot_path is not None:
        row_metadata = pd.read_csv(args.row_annot_path,
                                   sep='\t',
                                   index_col=0,
                                   header=0,
                                   low_memory=False)
        matrix_rids = in_gctoo.data_df.index
        assert all(matrix_rids.isin(row_metadata.index)), \
            "Row ids in matrix missing from annotations file"
        keep_rows = row_metadata.index.isin(matrix_rids)
        in_gctoo.row_metadata_df = row_metadata.loc[keep_rows]

    # Column annotations, if supplied.
    if args.col_annot_path is not None:
        col_metadata = pd.read_csv(args.col_annot_path,
                                   sep='\t',
                                   index_col=0,
                                   header=0,
                                   low_memory=False)
        matrix_cids = in_gctoo.data_df.columns
        assert all(matrix_cids.isin(col_metadata.index)), \
            "Column ids in matrix missing from annotations file"
        keep_cols = col_metadata.index.isin(matrix_cids)
        in_gctoo.col_metadata_df = col_metadata.loc[keep_cols]

    write_gctx.write(in_gctoo, out_name)
def test_write_version(self):
    """Check that the written version attribute is always ``write_gctx.version_number``.

    Case 1: the GCToo has ``version = None``.  Case 2: the GCToo carries its
    own version string, which must be ignored when writing.  Temp files are
    removed even when a check fails.
    """
    # TODO @oana refactor this test so it just calls the write_version method
    # case 1: gctoo obj doesn't have version
    mini1 = mini_gctoo_for_testing.make()
    mini1.version = None
    fn = "no_version_provided_example.gctx"
    try:
        write_gctx.write(mini1, fn)
        # Read-only + context manager: the handle is closed even on error.
        with h5py.File(fn, "r") as hdf5_file:
            hdf5_v1 = hdf5_file.attrs[write_gctx.version_attr]
        self.assertEqual(hdf5_v1, write_gctx.version_number)
    finally:
        if os.path.exists(fn):
            os.remove(fn)

    # case 2: gctoo obj does have version, but it is not used when writing
    mini2 = mini_gctoo_for_testing.make()
    mini2.version = "MY_VERSION"
    fn = "with_version_provided_example.gctx"
    try:
        write_gctx.write(mini2, fn)
        with h5py.File(fn, "r") as hdf5_file:
            hdf5_v2 = hdf5_file.attrs[write_gctx.version_attr]
        self.assertEqual(hdf5_v2, write_gctx.version_number)
    finally:
        if os.path.exists(fn):
            os.remove(fn)
def test_write_gctx(self):
    """Round-trip a GCToo through ``write_gctx.write`` and ``parse_gctx.parse``.

    The GCToo is built from the fixture frames on ``self``, written with
    explicit compression / chunking / dtype options, read back, and all three
    frames are compared against a freshly built GCToo.
    """
    def build_gctoo():
        # Fresh object built from the fixture frames.
        return GCToo.GCToo(data_df=self.data_df,
                           row_metadata_df=self.row_metadata_df,
                           col_metadata_df=self.col_metadata_df)

    out_name = os.path.join(FUNCTIONAL_TESTS_PATH,
                            'test_write_out_py2py3.gctx')

    write_gctx.write(build_gctoo(),
                     out_name,
                     convert_back_to_neg_666=True,
                     gzip_compression_level=6,
                     max_chunk_kb=1024,
                     matrix_dtype=np.float32)

    # Rebuild the expected object: per the original note, writing changes
    # the dtype of one column of col_metadata_df.
    expected = build_gctoo()

    new_gctx = parse_gctx.parse(out_name)

    for frame_name in ("data_df", "row_metadata_df", "col_metadata_df"):
        pd.testing.assert_frame_equal(getattr(new_gctx, frame_name),
                                      getattr(expected, frame_name))

    # Cleanup
    os.remove(out_name)
def write_gctx(data, ofile):
    """Write the ``binary_fpt`` vectors of ``data`` as a GCTX matrix.

    Args:
        data: integer-indexable collection (``data[0] .. data[len-1]``) of
            records providing ``'binary_fpt'``, ``'pert_id'`` and
            ``'pert_iname'``.
        ofile: output path passed to ``wgx.write``.

    The matrix has one row per fingerprint position (``bit1``, ``bit2``, ...)
    and one column per record, labelled with the record's ``pert_id``.  Row
    and column metadata are empty apart from their indexes.
    """
    col_fields = {}

    # Stack the fingerprints, then transpose so that records become columns.
    fingerprints = [data[pos]['binary_fpt'] for pos in range(len(data))]
    bfpta = numpy.array(fingerprints, dtype='i8').T
    print(bfpta.shape)

    # Collected from a deep copy of the input; not used for the output yet
    # (column metadata is still to be added).
    records_copy = copy.deepcopy(data)
    col_fields['pert_iname'] = [
        records_copy[pos]['pert_iname'] for pos in range(len(records_copy))
    ]

    n_rows, n_cols = bfpta.shape[0], bfpta.shape[1]

    # Column ids are the perturbagen ids, row ids are 1-based bit labels.
    cid = [data[pos]['pert_id'] for pos in range(n_cols)]
    rid = ["bit" + str(pos + 1) for pos in range(n_rows)]

    data_df = pd.DataFrame(bfpta,
                           index=pd.Index(rid, name="rid"),
                           columns=pd.Index(cid, name="cid"))

    # TOADD Column metadata df
    row_df = pd.DataFrame(index=rid)
    col_df = pd.DataFrame(index=cid)

    gco = gctoo.GCToo(data_df=data_df,
                      row_metadata_df=row_df,
                      col_metadata_df=col_df)
    wgx.write(gco, ofile)
def test_parse(self):
    """Exercise ``parse_gctx.parse`` against the mini GCToo fixture.

    Covers: full parse; subsetting by string rid/cid; numeric-looking string
    ids; ridx/cidx and their mixes with rid/cid; row_meta_only /
    col_meta_only; and the ``sort_row_meta`` / ``sort_col_meta`` = False
    variants with index- and id-based selection.
    """
    frame_eq = pandas_testing.assert_frame_equal

    def gctoos_match(left, right):
        # Compare data, row metadata and column metadata, in that order.
        frame_eq(left.data_df, right.data_df)
        frame_eq(left.row_metadata_df, right.row_metadata_df)
        frame_eq(left.col_metadata_df, right.col_metadata_df)

    # The fixture is addressed through two spellings of the same path.
    gctx_path = "cmapPy/pandasGEXpress/tests/functional_tests/mini_gctoo_for_testing.gctx"
    gctx_path_double_slash = "cmapPy/pandasGEXpress/tests/functional_tests//mini_gctoo_for_testing.gctx"
    vehicle_id = 'LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666'
    int_indexed_path = "int_indexed_mini_gctoo.gctx"

    # parse whole thing
    mg1 = mini_gctoo_for_testing.make()
    mg2 = parse_gctx.parse(gctx_path)
    gctoos_match(mg1, mg2)

    # test with string rid/cid
    test_rids = [
        'LJP007_MCF10A_24H:TRT_CP:BRD-K93918653:3.33',
        'LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666'
    ]
    test_cids = ['LJP007_MCF7_24H:TRT_POSCON:BRD-A61304759:10']
    mg3 = subset_gctoo.subset_gctoo(mg1, rid=test_rids, cid=test_cids)
    mg4 = parse_gctx.parse(gctx_path, rid=test_rids, cid=test_cids)
    gctoos_match(mg3, mg4)

    # first, make & write out temp version of mini_gctoo with int rids/cids
    new_mg = mini_gctoo_for_testing.make(convert_neg_666=False)
    int_indexed_data_df = new_mg.data_df.copy()
    int_indexed_data_df.index = [str(i) for i in range(0, 6)]
    int_indexed_data_df.columns = [str(i) for i in range(10, 16)]
    int_indexed_row_meta = new_mg.row_metadata_df.copy()
    int_indexed_row_meta.index = int_indexed_data_df.index
    int_indexed_col_meta = new_mg.col_metadata_df.copy()
    int_indexed_col_meta.index = int_indexed_data_df.columns

    int_indexed_gctoo = GCToo.GCToo(data_df=int_indexed_data_df,
                                    row_metadata_df=int_indexed_row_meta,
                                    col_metadata_df=int_indexed_col_meta)
    write_gctx.write(int_indexed_gctoo, int_indexed_path)

    # test with numeric (repr as string) rid/cid
    mg5 = GCToo.GCToo(data_df=int_indexed_data_df,
                      row_metadata_df=int_indexed_row_meta,
                      col_metadata_df=int_indexed_col_meta)
    mg5 = subset_gctoo.subset_gctoo(
        mg5,
        row_bool=[True, False, True, False, True, False],
        col_bool=[True, False, False, True, True, True])

    # Name the axes the way the parsed frames name them.
    mg5.data_df.index.name = "rid"
    mg5.data_df.columns.name = "cid"
    mg5.row_metadata_df.index.name = "rid"
    mg5.row_metadata_df.columns.name = "rhd"
    mg5.col_metadata_df.index.name = "cid"
    mg5.col_metadata_df.columns.name = "chd"

    mg6 = parse_gctx.parse(int_indexed_path,
                           rid=["0", "2", "4"],
                           cid=["10", "13", "14", "15"],
                           convert_neg_666=False)
    os.remove(int_indexed_path)
    gctoos_match(mg5, mg6)

    # test with ridx/cidx
    mg7 = subset_gctoo.subset_gctoo(mg1, rid=[vehicle_id], cid=[vehicle_id])
    mg8 = parse_gctx.parse(gctx_path, ridx=[4], cidx=[4])
    gctoos_match(mg7, mg8)

    # test with rid/cidx
    mg9 = parse_gctx.parse(gctx_path, rid=[vehicle_id], cidx=[4])
    gctoos_match(mg7, mg9)

    # test with ridx/cid
    mg10 = parse_gctx.parse(gctx_path, ridx=[4], cid=[vehicle_id])
    gctoos_match(mg7, mg10)

    # test with row_meta_only
    mg11 = parse_gctx.parse(gctx_path, row_meta_only=True)
    frame_eq(mg11, mg1.row_metadata_df)

    # test with col_meta_only
    mg12 = parse_gctx.parse(gctx_path, col_meta_only=True)
    frame_eq(mg12, mg1.col_metadata_df)

    # plain parse whose result is immediately replaced below
    mg13 = parse_gctx.parse(gctx_path)

    # test with sort_col_meta False and cidx
    col_order = [4, 1, 3]
    mg13 = parse_gctx.parse(gctx_path_double_slash,
                            cidx=[4, 1, 3],
                            sort_col_meta=False)
    frame_eq(mg13.data_df, mg1.data_df.iloc[:, col_order])
    frame_eq(mg13.col_metadata_df, mg1.col_metadata_df.iloc[col_order, :])
    frame_eq(mg13.row_metadata_df, mg1.row_metadata_df)

    # test with sort_row_meta False and ridx
    row_order = [3, 0, 1]
    mg14 = parse_gctx.parse(gctx_path_double_slash,
                            ridx=[3, 0, 1],
                            sort_row_meta=False)
    frame_eq(mg14.data_df, mg1.data_df.iloc[row_order, :])
    frame_eq(mg14.col_metadata_df, mg1.col_metadata_df)
    frame_eq(mg14.row_metadata_df, mg1.row_metadata_df.iloc[row_order, :])

    # test with sort_col_meta False and cidx and col_meta_only
    mg15 = parse_gctx.parse(gctx_path_double_slash,
                            cidx=[4, 1, 3],
                            sort_col_meta=False,
                            col_meta_only=True)
    frame_eq(mg15, mg1.col_metadata_df.iloc[col_order, :])

    # test with sort_row_meta False and ridx and row_meta_only
    mg16 = parse_gctx.parse(gctx_path_double_slash,
                            ridx=[3, 0, 1],
                            sort_row_meta=False,
                            row_meta_only=True)
    frame_eq(mg16, mg1.row_metadata_df.iloc[row_order, :])

    # test with sort_col_meta False and cid
    cid_unsorted = [
        'LJP007_MCF7_24H:TRT_POSCON:BRD-K81418486:10',
        'LJP007_MCF10A_24H:TRT_CP:BRD-K93918653:3.33'
    ]
    mg17 = parse_gctx.parse(gctx_path_double_slash,
                            cid=cid_unsorted,
                            sort_col_meta=False)
    frame_eq(mg17.data_df, mg1.data_df.iloc[:, [2, 0]])
    frame_eq(mg17.col_metadata_df, mg1.col_metadata_df.iloc[[2, 0], :])
    frame_eq(mg17.row_metadata_df, mg1.row_metadata_df)

    # test with sort_row_meta False and rid
    rid_unsorted = [
        'LJP007_MCF7_24H:TRT_CP:BRD-K64857848:10',
        'MISC003_A375_24H:TRT_CP:BRD-K93918653:3.33'
    ]
    mg18 = parse_gctx.parse(gctx_path,
                            rid=rid_unsorted,
                            sort_row_meta=False)
    frame_eq(mg18.data_df, mg1.data_df.iloc[[5, 1], :])
    frame_eq(mg18.col_metadata_df, mg1.col_metadata_df)
    frame_eq(mg18.row_metadata_df, mg1.row_metadata_df.iloc[[5, 1], :])
def test_parse(self):
    """Exercise ``parse_gctx.parse`` against the mini GCToo fixture.

    Covers: full parse; subsetting by string rid/cid; integer rid/cid on a
    temporary int-indexed file; and ridx/cidx plus their mixes with rid/cid.
    """
    def gctoos_match(left, right):
        # Compare data, row metadata and column metadata, in that order.
        assert_frame_equal(left.data_df, right.data_df)
        assert_frame_equal(left.row_metadata_df, right.row_metadata_df)
        assert_frame_equal(left.col_metadata_df, right.col_metadata_df)

    gctx_path = "functional_tests/mini_gctoo_for_testing.gctx"
    int_indexed_path = "int_indexed_mini_gctoo.gctx"
    vehicle_id = 'LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666'

    # parse whole thing
    mg1 = mini_gctoo_for_testing.make()
    mg2 = parse_gctx.parse(gctx_path)
    gctoos_match(mg1, mg2)

    # test with string rid/cid
    test_rids = [
        'LJP007_MCF10A_24H:TRT_CP:BRD-K93918653:3.33',
        'LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666',
    ]
    test_cids = ['LJP007_MCF7_24H:TRT_POSCON:BRD-A61304759:10']
    mg3 = slice_gct.slice_gctoo(mg1, rid=test_rids, cid=test_cids)
    mg4 = parse_gctx.parse(gctx_path, rid=test_rids, cid=test_cids)
    gctoos_match(mg3, mg4)

    # first, make & write out temp version of mini_gctoo with int rids/cids
    new_mg = mini_gctoo_for_testing.make(convert_neg_666=False)
    int_indexed_data_df = new_mg.data_df.copy()
    int_indexed_data_df.index = range(0, 6)
    int_indexed_data_df.columns = range(10, 16)
    int_indexed_row_meta = new_mg.row_metadata_df.copy()
    int_indexed_row_meta.index = range(0, 6)
    int_indexed_col_meta = new_mg.col_metadata_df.copy()
    int_indexed_col_meta.index = range(10, 16)

    int_indexed_gctoo = GCToo.GCToo(data_df=int_indexed_data_df,
                                    row_metadata_df=int_indexed_row_meta,
                                    col_metadata_df=int_indexed_col_meta)
    write_gctx.write(int_indexed_gctoo, int_indexed_path)

    # test with numeric (repr as string) rid/cid
    mg5 = GCToo.GCToo(data_df=int_indexed_data_df,
                      row_metadata_df=int_indexed_row_meta,
                      col_metadata_df=int_indexed_col_meta)
    mg5 = slice_gct.slice_gctoo(
        mg5,
        row_bool=[True, False, True, False, True, False],
        col_bool=[True, False, False, True, True, True])

    # Name the axes the way the parsed frames name them.
    mg5.data_df.index.name = "rid"
    mg5.data_df.columns.name = "cid"
    mg5.row_metadata_df.index.name = "rid"
    mg5.row_metadata_df.columns.name = "rhd"
    mg5.col_metadata_df.index.name = "cid"
    mg5.col_metadata_df.columns.name = "chd"

    mg6 = parse_gctx.parse(int_indexed_path,
                           rid=[0, 2, 4],
                           cid=[10, 13, 14, 15],
                           convert_neg_666=False)
    os.remove(int_indexed_path)
    gctoos_match(mg5, mg6)

    # test with ridx/cidx (note: cid is passed as a bare string here)
    mg7 = slice_gct.slice_gctoo(mg1, rid=[vehicle_id], cid=vehicle_id)
    mg8 = parse_gctx.parse(gctx_path, ridx=[4], cidx=[4])
    gctoos_match(mg7, mg8)

    # test with rid/cidx
    mg9 = parse_gctx.parse(gctx_path, rid=[vehicle_id], cidx=[4])
    gctoos_match(mg7, mg9)

    # test with ridx/cid
    mg10 = parse_gctx.parse(gctx_path, ridx=[4], cid=[vehicle_id])
    gctoos_match(mg7, mg10)
# Input files of the GSE92742 release.
gene_path = 'GSE92742/GSE92742_Broad_LINCS_gene_info.txt'
sig_path = 'GSE92742/GSE92742_Broad_LINCS_sig_info.txt'
data_path = 'GSE92742/GSE92742_Broad_LINCS_Level5_COMPZ.MODZ_n473647x12328.gctx'

# Ask the user which gene to analyse (geneid is not used in this section).
genesymbol = raw_input("Enter Gene Symbol: ")
geneid = raw_input("Enter Gene ID: ")

# GET gene data from datamatrix: extract the rows for this gene symbol
# unless a previously written raw_<symbol>.gctx already exists.
if not os.path.exists("raw_%s.gctx" % genesymbol):
    print("Extract Gene Raw Data... ")
    gene_info = pd.read_csv(gene_path, sep='\t', dtype=str)
    landmark_gene_row_ids = gene_info["pr_gene_id"][
        gene_info["pr_gene_symbol"] == genesymbol]
    landmark_only_gctoo = parse(data_path, rid=landmark_gene_row_ids)
    wg.write(landmark_only_gctoo, "raw_%s.gctx" % genesymbol)
    print("Finished!")

print("Analysing Data... ")

# GET sig info: one row of ``annolist`` per field listed in ``itemlist``.
sig_info = pd.read_csv(sig_path, sep='\t', dtype=str)
itemlist = [
    u'sig_id', u'pert_id', u'pert_iname', u'pert_type', u'cell_id',
    u'pert_idose', u'pert_itime'
]
annolist = np.vstack([sig_info[item].values for item in itemlist])

# Map each sig_id (row 0 of annolist) to its full annotation column.
index = {}
for i in range(len(annolist[0, :])):
    index[annolist[0, i]] = annolist[:, i]
from cmapPy.pandasGEXpress.parse import parse
from cmapPy.pandasGEXpress.write_gctx import write
from cmapPy.pandasGEXpress.GCToo import GCToo

# Load the full annotated matrix plus the separately stored column (cm) and
# row (rm) metadata tables.
print("Load data")
dset_full = parse(
    "C:/users/jdr2160/venomseq_data/annotated_GSE92742_Broad_LINCS_Level5_COMPZ_n473647x12328.gctx"
)
cm_file = "C:/data/out_cm.h5"
rm_file = "C:/data/out_rm.h5"
cm_full = pd.read_hdf(cm_file)
rm_full = pd.read_hdf(rm_file)

# Keep only signatures whose pert_type is 'trt_cp' or 'ctl_untrt'.
print("Set mask")
mask = cm_full.pert_type.isin(['trt_cp', 'ctl_untrt'])

# Apply the same positional mask to the data columns and to the column
# metadata rows.
print("Subset data")
df_sub = dset_full.data_df.iloc[:, np.array(mask)]
cm_sub = cm_full.iloc[np.array(mask), :]

# Drop the first two and the last character of every label.
# NOTE(review): presumably strips a "b'...'" bytes-repr wrapper -- confirm.
df_sub.columns = [label[2:-1] for label in df_sub.columns]
df_sub.index = [label[2:-1] for label in df_sub.index]

print("Write to disk")
out_gctx = GCToo(data_df=df_sub,
                 row_metadata_df=rm_full,
                 col_metadata_df=cm_sub)
write(out_gctx, "C:/data/GSE92742_cps_level5.gctx")
row_spaces = [978, 10174]

# time.clock() was removed in Python 3.8.  Use the high-resolution wall
# clock where it exists and fall back to time.clock() on interpreters that
# predate perf_counter.
_timer = getattr(time, "perf_counter", None) or time.clock

# For every (columns, rows) size, time writing the same subset as GCT and
# as GCTX; each output file is deleted right after it is timed.
for c in col_spaces:
    for r in row_spaces:
        curr_gctoo = sg.subset_gctoo(big_gctoo,
                                     ridx=range(0, r),
                                     cidx=range(0, c))

        # gct writing
        out_fname = "write_test_n" + str(c) + "x" + str(r) + ".gct"
        start = _timer()
        write_gct.write(curr_gctoo, out_fname)
        end = _timer()
        elapsed_time = end - start
        gct_times[out_fname] = elapsed_time
        os.remove(out_fname)

        # gctx writing
        out_fname = "write_test_n" + str(c) + "x" + str(r) + ".gctx"
        start = _timer()
        write_gctx.write(curr_gctoo, out_fname)
        end = _timer()
        elapsed_time = end - start
        gctx_times[out_fname] = elapsed_time
        os.remove(out_fname)

# write results to file: one row per output file name, gct rows first
gct_df = pd.DataFrame(pd.Series(gct_times))
gctx_df = pd.DataFrame(pd.Series(gctx_times))
write_times_df = pd.concat([gct_df, gctx_df])
write_times_df.columns = ["write_time"]
write_times_df.to_csv("python_writing_results.txt", sep="\t")
#idx_both= intersect(idx_idose,idx_itime) #if( len(idx_both) != 0): # ctl_ids = sig_info_cell_vehicle["sig_id"][idx_both] #elif( len (idx_idose) != 0): # ctl_ids = sig_info_cell_vehicle["sig_id"][idx_idose] #else: ctl_ids = sig_info_cell_vehicle["sig_id"] tot_ids = ctl_ids.tolist() + pert_ids.tolist() print("cell = %s, pert_id= %s \nN_pert=%d, N_ctl=%d" % (cell, "nothing", len(pert_ids), len(ctl_ids))) data = parse( "../Data/GSE92742_Broad_LINCS_Level5_COMPZ.MODZ_n473647x12328.gctx", cid=tot_ids) wg.write(data, "Subset_GCT/GSE92742_Level5_%s_%d" % (cell, i)) count = 0 for pert_id in pert_ids_names: for dose in dose_cp_list: for time in time_cp_list: pert_ids_cls = sig_info_cell_cp["sig_id"][ (sig_info_cell_cp["pert_id"] == pert_id) & (sig_info_cell_cp["pert_idose"] == dose) & (sig_info_cell_cp["pert_itime"] == time)] if (len(pert_ids_cls) != 0): ctl_ids_cls = sig_info_cell_vehicle["sig_id"][ (sig_info_cell_vehicle["pert_id"] == pert_id) & (sig_info_cell_vehicle["pert_idose"] == dose) & (sig_info_cell_vehicle["pert_itime"] == time)]