def main(input_filename, map_filename, output_filename, column_indices=None):
    """
    Get regression targets.

    Builds a PcbaParser for the given files, extracts SMILES strings and
    targets, prints how many records were matched, and pickles the result
    as a dict with keys 'smiles' and 'targets'.

    Parameters
    ----------
    input_filename : str
        PCBA data filename.
    map_filename : str
        ID->SMILES map filename.
    output_filename : str
        Output filename.
    column_indices : list, optional
        Column indices to include. If None, compounds are classified by
        activity.
    """
    parser = PcbaParser(input_filename, map_filename,
                        column_indices=column_indices)

    # NOTE: every print below uses a single parenthesized argument so the
    # line is valid (and prints the same text) as both a Python 2 print
    # statement and a Python 3 print() call.
    if column_indices is not None:
        print("Extracting data from the following columns:")
        for col in parser.get_column_names():
            # tab-indented column name, no space between tab and name
            print('\t%s' % (col,))
    smiles, targets = parser.get_targets()

    # print the fraction of valid assay records that were found in the map;
    # "valid" here means the PUBCHEM_CID field is not NaN
    total = np.count_nonzero(~np.isnan(parser.read_data().PUBCHEM_CID))
    print('{}/{} records matched'.format(len(targets), total))

    # save SMILES and targets
    write_pickle({'smiles': smiles, 'targets': targets}, output_filename)
 def test_read_data(self):
     """
     Test Nci60Parser.read_data.

     Reads the data once through self.engine and once through a plain
     PcbaParser, then checks that self.engine yields strictly fewer
     non-NaN cells.
     """
     def count_non_null(frame):
         # total number of cells, excluding NaNs, across all columns
         return frame.count().values.sum()

     fixed_count = count_non_null(self.engine.read_data())

     # use PcbaParser to read data (w/o proper NaN handling)
     parser_options = {
         'delimiter': "\t",
         'primary_key': "NSC",
         'id_prefix': "NSC",
     }
     baseline = PcbaParser(self.data_filename, self.map_filename,
                           **parser_options)
     broken_count = count_non_null(baseline.read_data())

     assert fixed_count < broken_count
 def test_read_data(self):
     """
     Test Nci60Parser.read_data.

     The count of non-NaN cells read by self.engine must be strictly
     smaller than the count obtained by reading the same files with
     PcbaParser.
     """
     # cells read by the engine under test (count excludes NaNs)
     nan_aware_total = self.engine.read_data().count().values.sum()

     # same files read with PcbaParser (w/o proper NaN handling)
     naive_parser = PcbaParser(
         self.data_filename, self.map_filename,
         delimiter='\t', primary_key='NSC', id_prefix='NSC')
     naive_total = naive_parser.read_data().count().values.sum()

     assert nan_aware_total < naive_total