def main(input_filename, map_filename, output_filename, column_indices=None):
    """
    Get regression targets.

    Builds a PcbaParser for the given files, extracts SMILES strings and
    targets, reports how many records were matched, and pickles the result
    as a dict with the keys 'smiles' and 'targets'.

    Parameters
    ----------
    input_filename : str
        PCBA data filename.
    map_filename : str
        ID->SMILES map filename.
    output_filename : str
        Output filename.
    column_indices : list, optional
        Column indices to include. If None, compounds are classified by
        activity.
    """
    parser = PcbaParser(input_filename, map_filename,
                        column_indices=column_indices)

    # NOTE: every print below takes exactly one parenthesised argument so the
    # code parses, and emits the same text, on both Python 2 and Python 3.
    if column_indices is not None:
        print("Extracting data from the following columns:")
        for col in parser.get_column_names():
            # The Python 2 statement `print '\t', col` emits no space after
            # the tab, so the column name is formatted directly behind it.
            print('\t%s' % (col,))

    smiles, targets = parser.get_targets()

    # print the fraction of valid assay records that were found in the map;
    # a record counts as valid when its PUBCHEM_CID is not NaN
    total = np.count_nonzero(~np.isnan(parser.read_data().PUBCHEM_CID))
    print('{}/{} records matched'.format(len(targets), total))

    # save SMILES and targets
    write_pickle({'smiles': smiles, 'targets': targets}, output_filename)
def test_read_data(self):
    """
    Test Nci60Parser.read_data.

    Reads the same data once through self.engine and once through a plain
    PcbaParser, then checks that self.engine yields strictly fewer non-NaN
    cells.
    """
    # total number of non-NaN cells reported by the parser under test
    fixed_count = self.engine.read_data().count().values.sum()

    # read the same files with PcbaParser (w/o proper NaN handling)
    baseline = PcbaParser(
        self.data_filename,
        self.map_filename,
        delimiter="\t",
        primary_key="NSC",
        id_prefix="NSC",
    )
    broken_count = baseline.read_data().count().values.sum()

    assert fixed_count < broken_count
def test_read_data(self):
    """
    Test Nci60Parser.read_data.

    The frame returned by self.engine must contain strictly fewer non-NaN
    cells than the frame a plain PcbaParser returns for the same files.
    """
    def non_nan_cells(frame):
        # DataFrame.count() gives per-column counts that exclude NaNs
        return frame.count().values.sum()

    fixed_count = non_nan_cells(self.engine.read_data())

    # PcbaParser reads the same data w/o proper NaN handling
    options = {'delimiter': '\t', 'primary_key': 'NSC', 'id_prefix': 'NSC'}
    naive_parser = PcbaParser(self.data_filename, self.map_filename,
                              **options)
    broken_count = non_nan_cells(naive_parser.read_data())

    assert fixed_count < broken_count