Ejemplo n.º 1
0
def test_read_sid_cid_map():
    """
    Test read_sid_cid_map.

    Writes the same single tab-separated record to a plain text file and to
    a gzipped file, then checks that read_sid_cid_map returns the expected
    integer mapping for both.
    """
    # Bytes literal: both files are opened in binary mode, which only
    # accepts bytes on Python 3 (and is equivalent to str on Python 2).
    record = b'123456\t7890\n'
    expected = {123456: 7890}

    # mkstemp returns an open descriptor; close it right away since the
    # files are reopened by name below.
    fd, filename = tempfile.mkstemp(suffix='.txt')
    os.close(fd)
    gz_fd, gz_filename = tempfile.mkstemp(suffix='.txt.gz')
    os.close(gz_fd)
    try:
        with open(filename, 'wb') as plain_file:
            plain_file.write(record)
        with gzip.open(gz_filename, 'wb') as gz_file:
            gz_file.write(record)
        assert read_sid_cid_map(filename) == expected
        assert read_sid_cid_map(gz_filename) == expected
    finally:
        # Always clean up the temporary files, even if an assertion fails.
        os.remove(filename)
        os.remove(gz_filename)
Ejemplo n.º 2
0
def test_read_sid_cid_map():
  """
  Test read_sid_cid_map.

  Writes the same single tab-separated record to a plain text file and to
  a gzipped file, then checks that read_sid_cid_map returns the expected
  integer mapping for both.
  """
  # Bytes literal: both files are opened in binary mode, which only
  # accepts bytes on Python 3 (and is equivalent to str on Python 2).
  record = b'123456\t7890\n'
  expected = {123456: 7890}

  # mkstemp returns an open descriptor; close it right away since the
  # files are reopened by name below.
  fd, filename = tempfile.mkstemp(suffix='.txt')
  os.close(fd)
  gz_fd, gz_filename = tempfile.mkstemp(suffix='.txt.gz')
  os.close(gz_fd)
  try:
    with open(filename, 'wb') as plain_file:
      plain_file.write(record)
    with gzip.open(gz_filename, 'wb') as gz_file:
      gz_file.write(record)
    assert read_sid_cid_map(filename) == expected
    assert read_sid_cid_map(gz_filename) == expected
  finally:
    # Always clean up the temporary files, even if an assertion fails.
    os.remove(filename)
    os.remove(gz_filename)
Ejemplo n.º 3
0
def main(dirs, config_filename, map_filename=None, summary_filename=None,
         with_aid=True, with_target=True, phenotype=False, id_prefix='CID',
         output_format='.pkl.gz'):
  """
  Extract data for configured assays and write one dataframe per assay.

  Every '*.json.gz' file found directly inside each directory in `dirs` is
  considered; the AID is taken from the part of the file name before the
  first '.'. Files whose AID is not listed in the configuration are
  ignored. Progress lines are printed to stdout and a final summary is
  emitted as a warning.

  Parameters
  ----------
  dirs : iterable of str
      Directories to search for '*.json.gz' files.
  config_filename : str
      CSV file read with pandas. Must contain an "aid" column with unique
      values.
  map_filename : str, optional
      If given, passed to read_sid_cid_map; the result is forwarded to the
      extractor's get_data as `sid_cid`.
  summary_filename : str, optional
      If given, the per-assay summary dataframe is written to this file.
  with_aid : bool, optional
      If False, "aid" is removed from the per-assay configuration. If
      True, an "assay_id" column is added to the output. Also forwarded to
      PcbaDataExtractor.
  with_target : bool, optional
      If False, "target" is removed from the per-assay configuration.
  phenotype : bool, optional
      If True, assays whose extractor config has no "phenotype" entry are
      skipped with a warning.
  id_prefix : str, optional
      'CID' or 'SID'. Selects the source column ('cid' or 'sid') and the
      prefix of the generated "mol_id" values.
  output_format : str, optional
      Output file extension, with or without a leading dot.

  Raises
  ------
  ValueError
      If the configuration has no "aid" column.
  NotImplementedError
      If `id_prefix` is not recognized (raised when the first matching
      assay is processed).
  AssertionError
      If configured AIDs are not unique, or a file's AID does not match
      the AID reported by its parser.
  """
  aids = set()
  targets = set()
  total = 0
  config = pd.read_csv(config_filename)
  summary = []
  sid_cid = None
  if map_filename is not None:
    sid_cid = read_sid_cid_map(map_filename)
  if 'aid' not in config.columns:
    raise ValueError('Configuration file must contain "aid" column.')
  assert len(config) == len(pd.unique(config['aid']))

  # Loop invariants: build the AID lookup once, and normalize the extension
  # so that both 'pkl.gz' and '.pkl.gz' produce 'PCBA-<aid>.pkl.gz' rather
  # than a double dot.
  config_aids = set(config['aid'])
  extension = output_format.lstrip('.')

  for this_dir in dirs:
    for filename in glob.glob(os.path.join(this_dir, '*.json.gz')):

      # get AID from filename so we only have to load relevant assays
      aid = int(os.path.basename(filename).split('.')[0])
      if aid not in config_aids:
        continue

      # get configuration for this AID
      this_config = config[config['aid'] == aid].iloc[0]
      if not with_aid and 'aid' in this_config:
        del this_config['aid']
      if not with_target and 'target' in this_config:
        del this_config['target']

      # get data
      try:
        extractor = PcbaDataExtractor(filename, this_config, with_aid=with_aid)
      except NotImplementedError as e:
        # str(e) instead of e.message, which does not exist on Python 3
        warnings.warn(str(e))
        continue
      if phenotype and 'phenotype' not in extractor.config:
        warnings.warn('{} has no phenotype'.format(aid))
        continue
      assert aid == extractor.parser.get_aid()  # sanity check for AID match
      # NOTE(review): the assay is counted here, before the "no valid IDs"
      # skip below, so skipped assays still appear in the final totals.
      aids.add(aid)
      target = extractor.config.get('target')
      targets.add(target)
      data = extractor.get_data(sid_cid=sid_cid)
      total += len(data)

      # add generic molecule ID column
      if id_prefix == 'CID':
        col = 'cid'
      elif id_prefix == 'SID':
        col = 'sid'
      else:
        raise NotImplementedError('Unrecognized ID prefix "{}"'.format(
            id_prefix))
      ids = []
      for i, mol_id in enumerate(data[col]):
        try:
          ids.append(id_prefix + str(int(mol_id)))
        except (TypeError, ValueError):
          # i is a position from enumerate, not an index label, so the row
          # must be looked up positionally.
          warnings.warn('No ID for the following row:\n{}'.format(
              data.iloc[i]))
          ids.append(None)  # can be found with pd.isnull

      # skip this assay if there are no valid IDs
      if np.all(pd.isnull(ids)):
        warnings.warn('No valid IDs for AID {}. Skipping.'.format(aid))
        continue
      data.loc[:, 'mol_id'] = pd.Series(ids, index=data.index)

      # add generic assay ID column
      assay_id = 'PCBA-' + str(aid)
      if with_aid:
        data.loc[:, 'assay_id'] = assay_id

      # save dataframe
      output_filename = '{}.{}'.format(assay_id, extension)
      # single-argument call form works as a statement on Python 2 and as a
      # function on Python 3
      print('{}\t{}\t{}\t{}'.format(aid, target, output_filename, len(data)))
      write_dataframe(data, output_filename)
      summary.append({'aid': aid, 'target': target,
                      'filename': output_filename, 'size': len(data)})

  # make sure we found everything
  missing = config_aids.difference(aids)
  if missing:
    warnings.warn('Missed AIDs {}'.format(missing))

  # save a summary
  summary = pd.DataFrame(summary)
  if summary_filename is not None:
    write_dataframe(summary, summary_filename)
  warnings.warn('Found {} assays for {} targets ({} total data points)'.format(
      len(aids), len(targets), total))
Ejemplo n.º 4
0
def main(dirs,
         config_filename,
         map_filename=None,
         summary_filename=None,
         with_aid=True,
         with_target=True,
         phenotype=False,
         id_prefix='CID',
         output_format='.pkl.gz'):
    """
    Extract data for configured assays and write one dataframe per assay.

    Every '*.json.gz' file found directly inside each directory in `dirs`
    is considered; the AID is taken from the part of the file name before
    the first '.'. Files whose AID is not listed in the configuration are
    ignored. Progress lines are printed to stdout and a final summary is
    emitted as a warning.

    Parameters
    ----------
    dirs : iterable of str
        Directories to search for '*.json.gz' files.
    config_filename : str
        CSV file read with pandas. Must contain an "aid" column with
        unique values.
    map_filename : str, optional
        If given, passed to read_sid_cid_map; the result is forwarded to
        the extractor's get_data as `sid_cid`.
    summary_filename : str, optional
        If given, the per-assay summary dataframe is written to this file.
    with_aid : bool, optional
        If False, "aid" is removed from the per-assay configuration. If
        True, an "assay_id" column is added to the output. Also forwarded
        to PcbaDataExtractor.
    with_target : bool, optional
        If False, "target" is removed from the per-assay configuration.
    phenotype : bool, optional
        If True, assays whose extractor config has no "phenotype" entry
        are skipped with a warning.
    id_prefix : str, optional
        'CID' or 'SID'. Selects the source column ('cid' or 'sid') and the
        prefix of the generated "mol_id" values.
    output_format : str, optional
        Output file extension, with or without a leading dot.

    Raises
    ------
    ValueError
        If the configuration has no "aid" column.
    NotImplementedError
        If `id_prefix` is not recognized (raised when the first matching
        assay is processed).
    AssertionError
        If configured AIDs are not unique, or a file's AID does not match
        the AID reported by its parser.
    """
    aids = set()
    targets = set()
    total = 0
    config = pd.read_csv(config_filename)
    summary = []
    sid_cid = None
    if map_filename is not None:
        sid_cid = read_sid_cid_map(map_filename)
    if 'aid' not in config.columns:
        raise ValueError('Configuration file must contain "aid" column.')
    assert len(config) == len(pd.unique(config['aid']))

    # Loop invariants: build the AID lookup once, and normalize the
    # extension so that both 'pkl.gz' and '.pkl.gz' produce
    # 'PCBA-<aid>.pkl.gz' rather than a double dot.
    config_aids = set(config['aid'])
    extension = output_format.lstrip('.')

    for this_dir in dirs:
        for filename in glob.glob(os.path.join(this_dir, '*.json.gz')):

            # get AID from filename so we only have to load relevant assays
            aid = int(os.path.basename(filename).split('.')[0])
            if aid not in config_aids:
                continue

            # get configuration for this AID
            this_config = config[config['aid'] == aid].iloc[0]
            if not with_aid and 'aid' in this_config:
                del this_config['aid']
            if not with_target and 'target' in this_config:
                del this_config['target']

            # get data
            try:
                extractor = PcbaDataExtractor(filename,
                                              this_config,
                                              with_aid=with_aid)
            except NotImplementedError as e:
                # str(e) instead of e.message, which does not exist on
                # Python 3
                warnings.warn(str(e))
                continue
            if phenotype and 'phenotype' not in extractor.config:
                warnings.warn('{} has no phenotype'.format(aid))
                continue
            # sanity check for AID match
            assert aid == extractor.parser.get_aid()
            # NOTE(review): the assay is counted here, before the "no valid
            # IDs" skip below, so skipped assays still appear in the final
            # totals.
            aids.add(aid)
            target = extractor.config.get('target')
            targets.add(target)
            data = extractor.get_data(sid_cid=sid_cid)
            total += len(data)

            # add generic molecule ID column
            if id_prefix == 'CID':
                col = 'cid'
            elif id_prefix == 'SID':
                col = 'sid'
            else:
                raise NotImplementedError(
                    'Unrecognized ID prefix "{}"'.format(id_prefix))
            ids = []
            for i, mol_id in enumerate(data[col]):
                try:
                    ids.append(id_prefix + str(int(mol_id)))
                except (TypeError, ValueError):
                    # i is a position from enumerate, not an index label,
                    # so the row must be looked up positionally.
                    warnings.warn('No ID for the following row:\n{}'.format(
                        data.iloc[i]))
                    ids.append(None)  # can be found with pd.isnull

            # skip this assay if there are no valid IDs
            if np.all(pd.isnull(ids)):
                warnings.warn('No valid IDs for AID {}. Skipping.'.format(aid))
                continue
            data.loc[:, 'mol_id'] = pd.Series(ids, index=data.index)

            # add generic assay ID column
            assay_id = 'PCBA-' + str(aid)
            if with_aid:
                data.loc[:, 'assay_id'] = assay_id

            # save dataframe
            output_filename = '{}.{}'.format(assay_id, extension)
            # single-argument call form works as a statement on Python 2
            # and as a function on Python 3
            print('{}\t{}\t{}\t{}'.format(aid, target, output_filename,
                                          len(data)))
            write_dataframe(data, output_filename)
            summary.append({
                'aid': aid,
                'target': target,
                'filename': output_filename,
                'size': len(data)
            })

    # make sure we found everything
    missing = config_aids.difference(aids)
    if missing:
        warnings.warn('Missed AIDs {}'.format(missing))

    # save a summary
    summary = pd.DataFrame(summary)
    if summary_filename is not None:
        write_dataframe(summary, summary_filename)
    warnings.warn(
        'Found {} assays for {} targets ({} total data points)'.format(
            len(aids), len(targets), total))