def test_get_data(self):
    """
    Test get_data.
    """
    mapped_sid = 11110959
    unmapped_sid = 11111313

    def first_row(frame, sid):
        # First record whose 'sid' column equals the requested SID.
        return frame[frame['sid'] == sid].iloc[0]

    settings = {'target': '1296534', 'r2': 'Fit_R2', 'phenotype': 'in'}
    extractor = PcbaDataExtractor(self.aid998, settings, with_aid=True)

    # SID->CID mapping: no 'cid' column is expected unless a map is given,
    # and SIDs absent from the map are expected to get a null CID.
    without_map = extractor.get_data()
    assert 'cid' not in without_map.columns
    with_map = extractor.get_data(sid_cid={mapped_sid: 1730})
    assert first_row(with_map, mapped_sid)['cid'] == 1730
    assert pd.isnull(first_row(with_map, unmapped_sid)['cid'])

    # Lowercasing affects the phenotype label but not the potency value.
    for lower, expected_phenotype in ((True, 'inhibitor'),
                                      (False, 'Inhibitor')):
        frame = extractor.get_data(lower=lower)
        record = first_row(frame, mapped_sid)
        assert record['phenotype'] == expected_phenotype
        assert record['potency'] == 4.4668
def test_check_config(self):
    """
    Test _check_config.
    """
    expected_target = 'gi_1296534'
    config = dict(target='1296534',
                  potency='Activity at 10 uM',
                  phenotype='in')

    # Assay without common fields: the phenotype is expected to be stored
    # on the extractor itself rather than in its config.
    plain = PcbaDataExtractor(self.aid540325, config, with_aid=True)
    assert plain.config['target'] == expected_target
    assert 'phenotype' not in plain.config
    assert plain.phenotype == 'inhibitor'

    # Assay with common fields: the config is expected to point at the
    # standard column names.
    common = PcbaDataExtractor(self.aid998, config, with_aid=True)
    assert common.config['target'] == expected_target
    assert 'phenotype' in common.config
    for key, column in (('phenotype', 'Phenotype'),
                        ('potency', 'Potency'),
                        ('efficacy', 'Efficacy')):
        assert common.config[key] == column
def main(dirs, config_filename, map_filename=None, summary_filename=None,
         with_aid=True, with_target=True, phenotype=False, id_prefix='CID',
         output_format='.pkl.gz'):
    """
    Extract data for each configured assay and write one dataframe per AID.

    Parameters
    ----------
    dirs : list
        Directories searched for '*.json.gz' assay files. The AID is taken
        from the part of each file name before the first '.'.
    config_filename : str
        CSV file read with pandas. Must contain an 'aid' column with unique
        values; each row is passed to PcbaDataExtractor as the assay config.
    map_filename : str, optional
        If given, it is read with read_sid_cid_map and the result is passed
        to the extractor's get_data as sid_cid.
    summary_filename : str, optional
        If given, a per-assay summary (aid, target, filename, size) is
        written there with write_dataframe.
    with_aid : bool, optional (default True)
        If False, 'aid' is removed from the assay config. If True, an
        'assay_id' column ('PCBA-<aid>') is added to the output.
    with_target : bool, optional (default True)
        If False, 'target' is removed from the assay config.
    phenotype : bool, optional (default False)
        If True, assays whose extractor config has no 'phenotype' entry are
        skipped with a warning.
    id_prefix : str, optional (default 'CID')
        'CID' or 'SID'. Selects the 'cid' or 'sid' column used to build the
        'mol_id' column.
    output_format : str, optional (default '.pkl.gz')
        Output file extension, with or without a leading dot.

    Raises
    ------
    ValueError
        If the configuration file has no 'aid' column.
    NotImplementedError
        If id_prefix is not 'CID' or 'SID' (raised when the first matching
        assay is processed).
    """
    aids = set()
    targets = set()
    total = 0
    config = pd.read_csv(config_filename)
    summary = []
    sid_cid = None
    if map_filename is not None:
        sid_cid = read_sid_cid_map(map_filename)
    if 'aid' not in config.columns:
        raise ValueError('Configuration file must contain "aid" column.')
    assert len(config) == len(pd.unique(config['aid']))

    # Build the lookup set once instead of scanning the column per file.
    configured_aids = set(config['aid'])
    # Accept both 'pkl.gz' and '.pkl.gz' so the file name never gets '..'.
    extension = output_format.lstrip('.')

    for this_dir in dirs:
        for filename in glob.glob(os.path.join(this_dir, '*.json.gz')):
            # get AID from filename so we only have to load relevant assays
            aid = int(os.path.basename(filename).split('.')[0])
            if aid not in configured_aids:
                continue

            # get configuration for this AID
            this_config = config[config['aid'] == aid].iloc[0]
            if not with_aid and 'aid' in this_config:
                del this_config['aid']
            if not with_target and 'target' in this_config:
                del this_config['target']

            # get data
            try:
                extractor = PcbaDataExtractor(filename, this_config,
                                              with_aid=with_aid)
            except NotImplementedError as e:
                # str(e) rather than e.message, which Python 3 removed.
                warnings.warn(str(e))
                continue
            if phenotype and 'phenotype' not in extractor.config:
                warnings.warn('{} has no phenotype'.format(aid))
                continue
            assert aid == extractor.parser.get_aid()  # sanity check
            aids.add(aid)
            target = extractor.config.get('target')
            targets.add(target)
            data = extractor.get_data(sid_cid=sid_cid)
            total += len(data)

            # add generic molecule ID column
            if id_prefix == 'CID':
                col = 'cid'
            elif id_prefix == 'SID':
                col = 'sid'
            else:
                raise NotImplementedError(
                    'Unrecognized ID prefix "{}"'.format(id_prefix))
            ids = []
            for i, mol_id in enumerate(data[col]):
                try:
                    ids.append(id_prefix + str(int(mol_id)))
                except (TypeError, ValueError):
                    # i is a position from enumerate, not an index label,
                    # so the row has to be fetched with iloc.
                    warnings.warn(
                        'No ID for the following row:\n{}'.format(
                            data.iloc[i]))
                    ids.append(None)  # can be found with pd.isnull

            # skip this assay if there are no valid IDs
            if np.all(pd.isnull(ids)):
                warnings.warn(
                    'No valid IDs for AID {}. Skipping.'.format(aid))
                continue
            data.loc[:, 'mol_id'] = pd.Series(ids, index=data.index)

            # add generic assay ID column
            assay_id = 'PCBA-' + str(aid)
            if with_aid:
                data.loc[:, 'assay_id'] = assay_id

            # save dataframe
            output_filename = '{}.{}'.format(assay_id, extension)
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print('{}\t{}\t{}\t{}'.format(aid, target, output_filename,
                                          len(data)))
            write_dataframe(data, output_filename)
            summary.append({'aid': aid, 'target': target,
                            'filename': output_filename,
                            'size': len(data)})

    # make sure we found everything
    missing = configured_aids.difference(aids)
    if len(missing):
        warnings.warn('Missed AIDs {}'.format(missing))

    # save a summary
    summary = pd.DataFrame(summary)
    if summary_filename is not None:
        write_dataframe(summary, summary_filename)
    warnings.warn(
        'Found {} assays for {} targets ({} total data points)'.format(
            len(aids), len(targets), total))
def main(dirs,
         config_filename,
         map_filename=None,
         summary_filename=None,
         with_aid=True,
         with_target=True,
         phenotype=False,
         id_prefix='CID',
         output_format='.pkl.gz'):
    """
    Extract PCBA assay data and save one dataframe per configured assay.

    Every '*.json.gz' file in dirs whose leading file-name component is an
    AID listed in the configuration is loaded with PcbaDataExtractor, given
    'mol_id' (and optionally 'assay_id') columns, and written with
    write_dataframe to 'PCBA-<aid>.<output_format>'.

    Parameters
    ----------
    dirs : list
        Directories to search for assay files.
    config_filename : str
        CSV configuration file; must have an 'aid' column with unique values.
    map_filename : str, optional
        File read with read_sid_cid_map; the result is passed to get_data as
        sid_cid.
    summary_filename : str, optional
        Where to write the summary table (aid, target, filename, size).
    with_aid : bool, optional (default True)
        Keep 'aid' in the assay config and add an 'assay_id' output column.
    with_target : bool, optional (default True)
        Keep 'target' in the assay config.
    phenotype : bool, optional (default False)
        Skip, with a warning, assays whose extractor config lacks
        'phenotype'.
    id_prefix : str, optional (default 'CID')
        'CID' or 'SID'; chooses the 'cid' or 'sid' column for 'mol_id'.
    output_format : str, optional (default '.pkl.gz')
        Output extension; a leading dot is optional.

    Raises
    ------
    ValueError
        If the configuration has no 'aid' column.
    NotImplementedError
        If id_prefix is unrecognized (raised once an assay is processed).
    """
    aids = set()
    targets = set()
    total = 0
    config = pd.read_csv(config_filename)
    summary = []
    sid_cid = None
    if map_filename is not None:
        sid_cid = read_sid_cid_map(map_filename)
    if 'aid' not in config.columns:
        raise ValueError('Configuration file must contain "aid" column.')
    assert len(config) == len(pd.unique(config['aid']))

    # One set lookup per file instead of a scan of the whole column.
    wanted = set(config['aid'])
    # Strip a leading dot so the default '.pkl.gz' does not produce '..'.
    suffix = output_format.lstrip('.')

    for this_dir in dirs:
        for filename in glob.glob(os.path.join(this_dir, '*.json.gz')):
            # get AID from filename so we only have to load relevant assays
            aid = int(os.path.basename(filename).split('.')[0])
            if aid not in wanted:
                continue

            # get configuration for this AID
            this_config = config[config['aid'] == aid].iloc[0]
            if not with_aid and 'aid' in this_config:
                del this_config['aid']
            if not with_target and 'target' in this_config:
                del this_config['target']

            # get data
            try:
                extractor = PcbaDataExtractor(
                    filename, this_config, with_aid=with_aid)
            except NotImplementedError as e:
                # e.message does not exist on Python 3; str(e) is portable.
                warnings.warn(str(e))
                continue
            if phenotype and 'phenotype' not in extractor.config:
                warnings.warn('{} has no phenotype'.format(aid))
                continue
            # sanity check for AID match
            assert aid == extractor.parser.get_aid()
            aids.add(aid)
            target = extractor.config.get('target')
            targets.add(target)
            data = extractor.get_data(sid_cid=sid_cid)
            total += len(data)

            # add generic molecule ID column
            if id_prefix == 'CID':
                col = 'cid'
            elif id_prefix == 'SID':
                col = 'sid'
            else:
                raise NotImplementedError(
                    'Unrecognized ID prefix "{}"'.format(id_prefix))
            ids = _make_mol_ids(data, col, id_prefix)

            # skip this assay if there are no valid IDs
            if np.all(pd.isnull(ids)):
                warnings.warn(
                    'No valid IDs for AID {}. Skipping.'.format(aid))
                continue
            data.loc[:, 'mol_id'] = pd.Series(ids, index=data.index)

            # add generic assay ID column
            assay_id = 'PCBA-' + str(aid)
            if with_aid:
                data.loc[:, 'assay_id'] = assay_id

            # save dataframe
            output_filename = '{}.{}'.format(assay_id, suffix)
            # Parenthesized single-argument print works on Python 2 and 3.
            print('{}\t{}\t{}\t{}'.format(aid, target, output_filename,
                                          len(data)))
            write_dataframe(data, output_filename)
            summary.append({
                'aid': aid,
                'target': target,
                'filename': output_filename,
                'size': len(data)
            })

    # make sure we found everything
    missing = wanted.difference(aids)
    if len(missing):
        warnings.warn('Missed AIDs {}'.format(missing))

    # save a summary
    summary = pd.DataFrame(summary)
    if summary_filename is not None:
        write_dataframe(summary, summary_filename)
    warnings.warn(
        'Found {} assays for {} targets ({} total data points)'.format(
            len(aids), len(targets), total))


def _make_mol_ids(data, col, id_prefix):
    """
    Build one prefixed molecule ID per row of data, or None where missing.

    Parameters
    ----------
    data : pandas.DataFrame
        Assay data containing the column named by col.
    col : str
        Column holding the numeric molecule IDs.
    id_prefix : str
        Prefix prepended to each integer ID.

    Returns
    -------
    list
        One entry per row: id_prefix followed by the integer ID, or None
        (with a warning) when the value cannot be converted to an integer.
    """
    ids = []
    for position, mol_id in enumerate(data[col]):
        try:
            ids.append(id_prefix + str(int(mol_id)))
        except (TypeError, ValueError):
            # position comes from enumerate, so it is a row offset rather
            # than an index label; iloc reports the right row.
            warnings.warn('No ID for the following row:\n{}'.format(
                data.iloc[position]))
            ids.append(None)  # can be found with pd.isnull
    return ids