Example no. 1
def tokenize_extra(ds, store_path):
    """
  Additional feature extraction for features that are not provided by the dataset
  implementation.

  @param ds dataset to read from
  @param store_path path of store to write to
  """
    class_space = 'ebmcat'

    #print >>sys.stderr,  "=== tokenize_extra for {0} ===".format(store_path)
    with closing(Store(store_path, 'a', recursive_close=False)) as store:
        proxy = DataProxy(ds, store=store)

        proxy.tokenstream_name = 'treetaggerlemmapos'
        proxy.tokenize(ext.bigram)

        proxy.tokenstream_name = 'treetaggerpos'
        proxy.tokenize(ext.trigram)

    # Hackish workaround for store being unexpectedly closed
    with closing(Store(store_path, 'a', recursive_close=False)) as store:
        proxy = DataProxy(ds, store=store)

        proxy.tokenstream_name = 'treetaggerpos'
        proxy.tokenize(ext.bigram)
Example no. 2
    def classify_batch(self, abstracts):
        """
    Only batch-mode classify is supported due to the underlying operations.
    Single-item classification is expected to be slow.
    @param abstracts a mapping from docid to a list of lines
    """
        self.__unpack()
        ts = {}
        for filename, lines in abstracts.iteritems():
            for i, line in enumerate(lines):
                docid = "{0}-{1}".format(filename, i + 1)
                ts[docid] = line

        try:
            handle, store_path = tempfile.mkstemp(dir=self.tempdir)
            os.close(handle)
            logger.debug("temporary store at {}".format(store_path))

            with Timer() as feat_timer:
                induce(ts, store_path, self.model.features, self.model.spaces)
                logger.info("computing features took {0:.2f}s".format(feat_timer.elapsed))

            store = Store(store_path, "r")

            with Timer() as cl_timer:
                L0_preds = []
                for feat, cl in zip(self.model.features, self.model.L0_cl):
                    fm = store.get_FeatureMap("NewDocuments", feat)
                    # We need to trim the fv as the feature space may have grown when we tokenized more documents.
                    # Hydrat's design is such that new features are appended to the end of a feature space, so
                    # we can safely truncate the feature map.
                    train_feat_count = cl.metadata["train_feat_count"]
                    assert train_feat_count <= fm.raw.shape[1]
                    L0_preds.append(cl(fm.raw[:, :train_feat_count]))

                L0_preds = sp.csr_matrix(np.hstack(L0_preds))
                L1_preds = self.model.L1_cl(L0_preds)

                logger.info(
                    "classification took {0:.2f}s ({1:.2f} inst/s)".format(
                        cl_timer.elapsed, cl_timer.rate(L0_preds.shape[0])
                    )
                )

            cl_space = store.get_Space("ebmcat")
            instance_ids = store.get_Space("NewDocuments")
        finally:
            logger.debug("unlinking {}".format(store_path))
            os.unlink(store_path)

        return PIBOSOOutput(instance_ids, cl_space, L1_preds)
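
A minimal usage sketch for classify_batch, assuming clf is an instance of the surrounding classifier class; the document id and abstract text are made up for illustration:

# Hypothetical usage: `clf` is assumed to be an instance of the class that
# defines classify_batch above; the abstract content is illustrative only.
abstracts = {
    "abstract_001": [
        "We conducted a randomised controlled trial of treatment X.",
        "Patients were followed up for twelve months.",
    ],
}
output = clf.classify_batch(abstracts)  # PIBOSOOutput(instance_ids, cl_space, L1_preds)
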
Example no. 3
def induce(ts, store_path, features, spaces):
    """
  Induce features for a list of abstracts.

  @param ts TokenStream (mapping from id to line)
  """
    ds = NewDocuments(ts)

    # Merge feature spaces into the store
    with closing(Store(store_path, 'a')) as store:
        for space in spaces:
            md = {'name': space, 'type': 'feature'}
            store.add_Space(spaces[space], md)

    # Feature induction was originally done in a subprocess so that Python would not
    # hold on to the memory; the direct call is used here and the subprocess variant
    # is left commented out below.
    for feature in features:
        tokenize(ds, [feature], store_path)
        #p = mp.Process(target=tokenize, args=(ds, [feature], store_path))
        #p.start()
        #p.join()
        #p.terminate()

    tokenize_extra(ds, store_path)
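
A minimal call sketch for induce, assuming a model object exposing the same features and spaces attributes used in classify_batch above; the token stream and store path are made up:

# Hypothetical call: docid, text and store path are illustrative; `model` is
# assumed to carry the feature names and spaces the classifier was trained with.
ts = {"abstract_001-1": "We conducted a randomised controlled trial."}
induce(ts, "/tmp/new_docs_store.h5", model.features, model.spaces)
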
Example no. 4
def tokenize(ds, features, store_path):
    """
  Compute feature values and save them in a hydrat store.

  @param ds dataset to read from
  @param features names of features to read
  @param store_path path of store to write to
  """
    class_space = 'ebmcat'

    #print >>sys.stderr,  "=== opening store at {0} ===".format(store_path)
    with closing(Store(store_path, 'a', recursive_close=False)) as store:

        #print >>sys.stderr,  "=== inducing features ({0}) ===".format(features)
        # Induce all the features for the new test data
        proxy = DataProxy(ds, store=store)
        proxy.inducer.process(
            proxy.dataset,
            fms=features,
            sqs=[
                'abstract',
            ],
        )
Example no. 5
import argparse
import sys

import numpy as np
import scipy.sparse as sp

from common import Timer

if __name__ == "__main__":
  parser = argparse.ArgumentParser()
  parser.add_argument("model", help="read model from")
  parser.add_argument("data", help="store containing pre-tokenized data")
  parser.add_argument("feat", help="store containing feature data")
  parser.add_argument("output", help="write output to PATH", metavar="PATH")
  args = parser.parse_args()

  features, L0_cl, L1_cl = load(open(args.model))
  fallback = Store(args.feat, 'r')
  store = Store(args.data, 'r', fallback=fallback)

  with Timer() as overall_timer:
    L0_preds = []
    for feat, cl in zip(features, L0_cl):
      fm = store.get_FeatureMap('NewDocuments', feat)
      # We need to trim the fv as the feature space may have grown when we tokenized more documents.
      # Hydrat's design is such that new features are appended to the end of a feature space, so
      # we can safely truncate the feature map.
      train_feat_count = cl.metadata['train_feat_count']
      assert train_feat_count <= fm.raw.shape[1]
      fv = fm.raw[:,:train_feat_count]
      with Timer() as cl_timer:
        pred = cl(fv)
        print >>sys.stderr, "== L1 feat for {0} took {1:.2f}s ({2:.2f} inst/s) ==".format(feat, cl_timer.elapsed, cl_timer.rate(fv.shape[0]))
      L0_preds.append(pred)
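
The column trimming commented on above can be shown in isolation; this standalone sketch uses a made-up matrix and feature count, not data from the pipeline:

# Standalone illustration of trimming a feature map to the training feature
# count: hydrat appends features discovered later to the end of a space, so
# columns beyond train_feat_count can simply be dropped.
import numpy as np
import scipy.sparse as sp

fm_raw = sp.csr_matrix(np.arange(12).reshape(3, 4))  # 3 instances, 4 features
train_feat_count = 3                                  # features known at training time
fv = fm_raw[:, :train_feat_count]                     # 3 x 3 matrix, extra column dropped
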
Example no. 6
  ('nltkword_unigram',),
  feats_content,
  feats_content + ['headingord', 'headingvec'],
  feats_local,
  ['headingord', 'headingvec'] + feats_struct,
  feats_struct,
  feats_local + feats_prev,
  feats_local + feats_post,
  feats_local + feats_window,
  feats_local + feats_prev + feats_post,
  feats_local + feats_prev + feats_post + feats_window,
]

features = [ tuple(sorted(x)) for x in features ]
  
if __name__ == "__main__":
  fallback = Store('store/features.h5' )
  store = Store.from_caller(fallback=fallback)

  learner = repeat.RepeatLearner(liblinear.liblinearL(svm_type=0, output_probability=True))
  ds = ALTA2012Full()

  proxy = StackingProxy(ds, learner, store=store)
  proxy.class_space = 'ebmcat'
  for feats in features:
    print "DOING:", len(feats), feats
    proxy.feature_spaces = feats
    e = Experiment(proxy, learner)
    proxy.store.new_TaskSetResult(e)

Example no. 7
feats_position = ['positionabs','positionrel','positionrelbyte']


core = ( 'nltkword_unigram', 'treetaggerpos_bigram', 'treetaggerlemmapos_bigram', )
core += ('headingprev', 'headingvec', 'positionrel')
struct_best = core + ('bowpost1', 'bowprev3', 'headingpost', 'isstructured', 'sentlenrel',)
unstruct_best = core + ('bowprev','treetaggerpos_trigram','ttbprev')

feats_all = tuple(sorted(set(
  sum(map(tuple, [struct_best, unstruct_best, feats_bow, feats_pos,
                  feats_struct, feats_heading, feats_position]), tuple())
)))

datasets = [
  ALTA2012Full(),
  ]

if __name__ == "__main__":
  store = Store.from_caller()

  for ds in datasets:
    proxy = DataProxy(ds, store=store)
    proxy.inducer.process(proxy.dataset, 
      fms=feats_all,
      cms=['ebmcat',], 
      sqs=['abstract',],
    )

    proxy.tokenstream_name = 'treetaggerlemmapos'
    proxy.tokenize(ext.bigram)

    proxy.tokenstream_name = 'treetaggerpos'
    proxy.tokenize(ext.bigram)
    proxy.tokenize(ext.trigram)
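
The feats_all expression above flattens several feature lists into a single sorted, de-duplicated tuple; the same idiom is shown here with made-up feature groups:

# Illustration of the feats_all construction using toy feature lists.
group_a = ['nltkword_unigram', 'headingvec']
group_b = ('headingvec', 'positionrel')
flat = sum(map(tuple, [group_a, group_b]), tuple())  # concatenate into one tuple
combined = tuple(sorted(set(flat)))                  # de-duplicate and sort
# combined == ('headingvec', 'nltkword_unigram', 'positionrel')
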
Example no. 8
                        help='existing hydrat Store generated by features.py')
    parser.add_argument("output",
                        help="produce output in PATH",
                        metavar="PATH")
    args = parser.parse_args()

    class_space = 'ebmcat'

    try:
        features = features.feature_sets[args.feats]
    except KeyError:
        parser.error("unknown feature group: {0}".format(args.feats))

    l = repeat.RepeatLearner(
        liblinear.liblinearL(svm_type=0, output_probability=True))
    store = Store(args.feat_store, 'r')  # TODO: Do we want this read-only?

    for feature in features:
        spaces[feature] = store.get_Space(feature)
    spaces['ebmcat'] = store.get_Space('ebmcat')

    proxy = DataProxy(ALTA2012Full(), store=store)
    proxy.class_space = class_space

    L0_cl = []
    L1_fv = []
    L1_gs = None
    for feat in features:
        proxy.feature_spaces = feat
        proxy.split_name = 'crossvalidation'
Example no. 9
feature_sets = {
  'all': feats_all,
  'core': core,
  'dev': ('headingprev', 'headingvec', 'positionrel'),
}

datasets = [
  ALTA2012Full(),
  ]

if __name__ == "__main__":
  parser = argparse.ArgumentParser()
  parser.add_argument('output', help='write output to PATH', metavar='PATH')
  args = parser.parse_args()

  store = Store(args.output, 'a')

  for ds in datasets:
    proxy = DataProxy(ds, store=store)
    proxy.inducer.process(proxy.dataset, 
      fms=feats_all,
      cms=['ebmcat',], 
      sqs=['abstract',],
      sps=['crossvalidation'],
    )

    proxy.tokenstream_name = 'treetaggerlemmapos'
    proxy.tokenize(ext.bigram)

    proxy.tokenstream_name = 'treetaggerpos'
    proxy.tokenize(ext.bigram)
Example no. 10
if __name__ == "__main__":
  parser = argparse.ArgumentParser()
  parser.add_argument('--feats', default='all', help="feature group to process")
  parser.add_argument('feat_store', help='existing hydrat Store generated by features.py')
  parser.add_argument("output", help="produce output in PATH", metavar="PATH")
  args = parser.parse_args()

  class_space = 'ebmcat'

  try:
    features = features.feature_sets[args.feats]
  except KeyError:
    parser.error("unknown feature group: {0}".format(args.feats))

  l = repeat.RepeatLearner(liblinear.liblinearL(svm_type=0, output_probability=True))
  store = Store(args.feat_store, 'r') # TODO: Do we want this read-only?

  for feature in features:
    spaces[feature] = store.get_Space(feature)
  spaces['ebmcat'] = store.get_Space('ebmcat')

  proxy = DataProxy(ALTA2012Full(), store=store)
  proxy.class_space = class_space
  
  L0_cl = []
  L1_fv = []
  L1_gs = None
  for feat in features: 
    proxy.feature_spaces = feat
    proxy.split_name = 'crossvalidation'
Example no. 11
import argparse
import sys

import numpy as np
import scipy.sparse as sp

from common import Timer

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("model", help="read model from")
    parser.add_argument("data", help="store containing pre-tokenized data")
    parser.add_argument("feat", help="store containing feature data")
    parser.add_argument("output", help="write output to PATH", metavar="PATH")
    args = parser.parse_args()

    features, L0_cl, L1_cl = load(open(args.model))
    fallback = Store(args.feat, 'r')
    store = Store(args.data, 'r', fallback=fallback)

    with Timer() as overall_timer:
        L0_preds = []
        for feat, cl in zip(features, L0_cl):
            fm = store.get_FeatureMap('NewDocuments', feat)
            # We need to trim the fv as the feature space may have grown when we tokenized more documents.
            # Hydrat's design is such that new features are appended to the end of a feature space, so
            # we can safely truncate the feature map.
            train_feat_count = cl.metadata['train_feat_count']
            assert train_feat_count <= fm.raw.shape[1]
            fv = fm.raw[:, :train_feat_count]
            with Timer() as cl_timer:
                pred = cl(fv)
                print >> sys.stderr, "== L1 feat for {0} took {1:.2f}s ({2:.2f} inst/s) ==".format(