Example #1
import numpy as np
import pandas as pd

# serial, get_rows, and write_dataframe are project-specific helpers
# assumed to be importable from the surrounding package (a sketch of
# get_rows follows this example).

def main(active_filename, decoy_filename, assay_id, target, with_assay_id=True,
         with_target=True, phenotype=None, output_filename=None,
         mol_id_prefix=None, output_format='.pkl.gz'):
  rows = []
  for outcome, filename in zip(['active', 'inactive'],
                               [active_filename, decoy_filename]):
    this_phenotype = phenotype
    if outcome == 'inactive' and phenotype is not None:
      this_phenotype = 'inactive'
    with serial.MolReader().open(filename) as reader:
      this_rows = get_rows(reader, outcome, this_phenotype, mol_id_prefix)
      rows.extend(this_rows)

  # create dataframe
  df = pd.DataFrame(rows)

  # sanity check for duplicate mol_ids
  assert len(np.unique(df['mol_id'])) == len(df)

  # add assay_id and target columns
  if with_assay_id:
    df.loc[:, 'assay_id'] = assay_id
  if with_target:
    df.loc[:, 'target'] = target

  if output_filename is None:
    # output_format already includes the leading dot, so no '.' separator
    output_filename = '{}{}'.format(assay_id, output_format)
  print('{}\t{}\t{}\t{}'.format(assay_id, target, output_filename, len(df)))
  write_dataframe(df, output_filename)
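
The get_rows helper is not shown in any of these examples. A minimal sketch of what it might look like, assuming the reader yields RDKit Mol objects whose title (_Name) property carries the molecule ID:

from rdkit import Chem

def get_rows(mols, outcome, phenotype=None, mol_id_prefix=None):
  # Hypothetical reconstruction: one record per molecule, holding a
  # generic ID, the canonical SMILES, the assay outcome, and
  # (optionally) the observed phenotype.
  rows = []
  for mol in mols:
    mol_id = mol.GetProp('_Name') if mol.HasProp('_Name') else None
    if mol_id_prefix is not None and mol_id is not None:
      mol_id = mol_id_prefix + mol_id
    row = {'mol_id': mol_id, 'smiles': Chem.MolToSmiles(mol),
           'outcome': outcome}
    if phenotype is not None:
      row['phenotype'] = phenotype
    rows.append(row)
  return rows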
Example #2
import joblib

# write_dataframe is a project-specific helper assumed to be in scope
# (a possible implementation is sketched after this example).

def write_output_file(data, filename, compression_level=3):
    """
    Pickle output data, possibly to a compressed file.

    Parameters
    ----------
    data : object
        Object to pickle in output file.
    filename : str
        Output filename. Should end with .joblib, .pkl, or .pkl.gz.
    compression_level : int, optional (default 3)
        Compression level (0-9) to use with joblib.dump.
    """
    if filename.endswith('.joblib'):
        joblib.dump(data, filename, compress=compression_level)
    else:
        write_dataframe(data, filename)
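
write_dataframe itself is not shown. A minimal sketch, assuming it simply pickles the frame with pandas, which infers gzip compression from a .gz suffix and so covers both .pkl and .pkl.gz:

import pandas as pd

def write_dataframe(df, filename):
    # Hypothetical implementation: to_pickle handles .pkl directly and
    # .pkl.gz via inferred gzip compression.
    if not filename.endswith(('.pkl', '.pkl.gz')):
        raise NotImplementedError('Unsupported extension: {}'.format(filename))
    df.to_pickle(filename)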
Example #3
def write_output_file(data, filename, compression_level=3):
    """
    Pickle output data, possibly to a compressed file.

    Parameters
    ----------
    data : object
        Object to pickle in output file.
    filename : str
        Output filename. Should end with .joblib, .pkl, or .pkl.gz.
    compression_level : int, optional (default 3)
        Compression level (0-9) to use with joblib.dump.
    """
    if filename.endswith('.joblib'):
        joblib.dump(data, filename, compress=compression_level)
    else:
        write_dataframe(data, filename)
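
Dispatch is then driven entirely by the filename extension. For instance, assuming the definitions above are in scope:

import numpy as np
import pandas as pd

# Arbitrary Python objects (e.g. fitted models) go to .joblib; tabular
# data goes through write_dataframe to .pkl or .pkl.gz.
write_output_file({'weights': np.zeros(10)}, 'model.joblib')
write_output_file(pd.DataFrame({'mol_id': ['CID1', 'CID2']}), 'data.pkl.gz')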
Example #4
def main(active_filename,
         decoy_filename,
         assay_id,
         target,
         with_assay_id=True,
         with_target=True,
         phenotype=None,
         output_filename=None,
         mol_id_prefix=None,
         output_format='.pkl.gz'):
    rows = []
    for outcome, filename in zip(['active', 'inactive'],
                                 [active_filename, decoy_filename]):
        this_phenotype = phenotype
        if outcome == 'inactive' and phenotype is not None:
            this_phenotype = 'inactive'
        with serial.MolReader().open(filename) as reader:
            this_rows = get_rows(reader, outcome, this_phenotype,
                                 mol_id_prefix)
            rows.extend(this_rows)

    # create dataframe
    df = pd.DataFrame(rows)

    # sanity check for duplicate mol_ids
    assert len(np.unique(df['mol_id'])) == len(df)

    # add assay_id and target columns
    if with_assay_id:
        df.loc[:, 'assay_id'] = assay_id
    if with_target:
        df.loc[:, 'target'] = target

    if output_filename is None:
        # output_format already includes the leading dot, so no '.' separator
        output_filename = '{}{}'.format(assay_id, output_format)
    print('{}\t{}\t{}\t{}'.format(assay_id, target, output_filename, len(df)))
    write_dataframe(df, output_filename)
Example #5
import glob
import os
import warnings

import numpy as np
import pandas as pd

# PcbaDataExtractor, read_sid_cid_map, and write_dataframe are
# project-specific helpers assumed to be in scope.

def main(dirs, config_filename, map_filename=None, summary_filename=None,
         with_aid=True, with_target=True, phenotype=False, id_prefix='CID',
         output_format='.pkl.gz'):
  aids = set()
  targets = set()
  total = 0
  config = pd.read_csv(config_filename)
  summary = []
  sid_cid = None
  if map_filename is not None:
    sid_cid = read_sid_cid_map(map_filename)
  if 'aid' not in config.columns:
    raise ValueError('Configuration file must contain "aid" column.')
  assert len(config) == len(pd.unique(config['aid']))
  for this_dir in dirs:
    for filename in glob.glob(os.path.join(this_dir, '*.json.gz')):

      # get AID from filename so we only have to load relevant assays
      aid = int(os.path.basename(filename).split('.')[0])
      if aid not in config['aid'].values:
        continue

      # get configuration for this AID
      this_config = config[config['aid'] == aid].iloc[0]
      if not with_aid and 'aid' in this_config:
        del this_config['aid']
      if not with_target and 'target' in this_config:
        del this_config['target']

      # get data
      try:
        extractor = PcbaDataExtractor(filename, this_config, with_aid=with_aid)
      except NotImplementedError as e:
        warnings.warn(str(e))  # e.message does not exist in Python 3
        continue
      if phenotype and 'phenotype' not in extractor.config:
        warnings.warn('{} has no phenotype'.format(aid))
        continue
      assert aid == extractor.parser.get_aid()  # sanity check for AID match
      aids.add(aid)
      target = extractor.config.get('target')
      targets.add(target)
      data = extractor.get_data(sid_cid=sid_cid)
      total += len(data)

      # add generic molecule ID column
      if id_prefix == 'CID':
        col = 'cid'
      elif id_prefix == 'SID':
        col = 'sid'
      else:
        raise NotImplementedError('Unrecognized ID prefix "{}"'.format(
            id_prefix))
      ids = []
      for i, mol_id in enumerate(data[col]):
        try:
          ids.append(id_prefix + str(int(mol_id)))
        except (TypeError, ValueError):
          warnings.warn('No ID for the following row:\n{}'.format(data.loc[i]))
          ids.append(None)  # can be found with pd.isnull

      # skip this assay if there are no valid IDs
      if np.all(pd.isnull(ids)):
        warnings.warn('No valid IDs for AID {}. Skipping.'.format(aid))
        continue
      data.loc[:, 'mol_id'] = pd.Series(ids, index=data.index)

      # add generic assay ID column
      assay_id = 'PCBA-' + str(aid)
      if with_aid:
        data.loc[:, 'assay_id'] = assay_id

      # save dataframe
      # output_format already includes the leading dot, so no '.' separator
      output_filename = '{}{}'.format(assay_id, output_format)
      print('{}\t{}\t{}\t{}'.format(aid, target, output_filename, len(data)))
      write_dataframe(data, output_filename)
      summary.append({'aid': aid, 'target': target,
                      'filename': output_filename, 'size': len(data)})

  # make sure we found everything
  missing = set(config['aid']).difference(aids)
  if len(missing):
    warnings.warn('Missed AIDs {}'.format(missing))

  # save a summary
  summary = pd.DataFrame(summary)
  if summary_filename is not None:
    write_dataframe(summary, summary_filename)
  warnings.warn('Found {} assays for {} targets ({} total data points)'.format(
      len(aids), len(targets), total))
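
read_sid_cid_map is not shown either. A minimal sketch, assuming a whitespace-separated two-column file of substance ID (SID) / compound ID (CID) pairs, optionally gzip-compressed, as in PubChem's SID-to-CID mapping dumps:

import gzip

def read_sid_cid_map(map_filename):
  # Hypothetical reader: each line holds "SID CID" as integers, and a
  # .gz suffix signals gzip compression.
  opener = gzip.open if map_filename.endswith('.gz') else open
  sid_cid = {}
  with opener(map_filename, 'rt') as f:
    for line in f:
      sid, cid = line.split()
      sid_cid[int(sid)] = int(cid)
  return sid_cid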
Example #6
def main(dirs,
         config_filename,
         map_filename=None,
         summary_filename=None,
         with_aid=True,
         with_target=True,
         phenotype=False,
         id_prefix='CID',
         output_format='.pkl.gz'):
    aids = set()
    targets = set()
    total = 0
    config = pd.read_csv(config_filename)
    summary = []
    sid_cid = None
    if map_filename is not None:
        sid_cid = read_sid_cid_map(map_filename)
    if 'aid' not in config.columns:
        raise ValueError('Configuration file must contain "aid" column.')
    assert len(config) == len(pd.unique(config['aid']))
    for this_dir in dirs:
        for filename in glob.glob(os.path.join(this_dir, '*.json.gz')):

            # get AID from filename so we only have to load relevant assays
            aid = int(os.path.basename(filename).split('.')[0])
            if aid not in config['aid'].values:
                continue

            # get configuration for this AID
            this_config = config[config['aid'] == aid].iloc[0]
            if not with_aid and 'aid' in this_config:
                del this_config['aid']
            if not with_target and 'target' in this_config:
                del this_config['target']

            # get data
            try:
                extractor = PcbaDataExtractor(filename,
                                              this_config,
                                              with_aid=with_aid)
            except NotImplementedError as e:
                warnings.warn(str(e))  # e.message does not exist in Python 3
                continue
            if phenotype and 'phenotype' not in extractor.config:
                warnings.warn('{} has no phenotype'.format(aid))
                continue
            # sanity check for AID match
            assert aid == extractor.parser.get_aid()
            aids.add(aid)
            target = extractor.config.get('target')
            targets.add(target)
            data = extractor.get_data(sid_cid=sid_cid)
            total += len(data)

            # add generic molecule ID column
            if id_prefix == 'CID':
                col = 'cid'
            elif id_prefix == 'SID':
                col = 'sid'
            else:
                raise NotImplementedError(
                    'Unrecognized ID prefix "{}"'.format(id_prefix))
            ids = []
            for i, mol_id in enumerate(data[col]):
                try:
                    ids.append(id_prefix + str(int(mol_id)))
                except (TypeError, ValueError):
                    warnings.warn('No ID for the following row:\n{}'.format(
                        data.loc[i]))
                    ids.append(None)  # can be found with pd.isnull

            # skip this assay if there are no valid IDs
            if np.all(pd.isnull(ids)):
                warnings.warn('No valid IDs for AID {}. Skipping.'.format(aid))
                continue
            data.loc[:, 'mol_id'] = pd.Series(ids, index=data.index)

            # add generic assay ID column
            assay_id = 'PCBA-' + str(aid)
            if with_aid:
                data.loc[:, 'assay_id'] = assay_id

            # save dataframe
            # output_format already includes the leading dot, so no '.' separator
            output_filename = '{}{}'.format(assay_id, output_format)
            print('{}\t{}\t{}\t{}'.format(aid, target, output_filename,
                                          len(data)))
            write_dataframe(data, output_filename)
            summary.append({
                'aid': aid,
                'target': target,
                'filename': output_filename,
                'size': len(data)
            })

    # make sure we found everything
    missing = set(config['aid']).difference(aids)
    if len(missing):
        warnings.warn('Missed AIDs {}'.format(missing))

    # save a summary
    summary = pd.DataFrame(summary)
    if summary_filename is not None:
        write_dataframe(summary, summary_filename)
    warnings.warn(
        'Found {} assays for {} targets ({} total data points)'.format(
            len(aids), len(targets), total))
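
A hypothetical invocation (all filenames here are made up): scan every *.json.gz PubChem BioAssay dump under data/, keep only the assays listed in config.csv, and write one .pkl.gz per assay plus a summary frame:

main(dirs=['data'],
     config_filename='config.csv',
     map_filename='sid_cid_map.txt.gz',
     summary_filename='summary.pkl.gz',
     phenotype=True)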