Example 1
    def classify_batch(self, abstracts):
        """
    Only batch-mode classify is supported due to the underlying operations.
    Single-item classification is expected to be slow.
    @param abstracts a mapping from docid to a list of lines
    """
        self.__unpack()
        ts = {}
        for filename, lines in abstracts.iteritems():
            for i, line in enumerate(lines):
                docid = "{0}-{1}".format(filename, i + 1)
                ts[docid] = line

        # Create the temporary store before entering the try block, so the
        # finally clause never sees an undefined store_path.
        handle, store_path = tempfile.mkstemp(dir=self.tempdir)
        os.close(handle)
        try:
            logger.debug("temporary store at {}".format(store_path))

            with Timer() as feat_timer:
                induce(ts, store_path, self.model.features, self.model.spaces)
                logger.info("computing features took {0:.2f}s".format(feat_timer.elapsed))

            store = Store(store_path, "r")

            with Timer() as cl_timer:
                L0_preds = []
                for feat, cl in zip(self.model.features, self.model.L0_cl):
                    fm = store.get_FeatureMap("NewDocuments", feat)
                    # We need to trim the fv as the feature space may have grown when we tokenized more documents.
                    # Hydrat's design is such that new features are appended to the end of a feature space, so
                    # we can safely truncate the feature map.
                    train_feat_count = cl.metadata["train_feat_count"]
                    assert train_feat_count <= fm.raw.shape[1]
                    L0_preds.append(cl(fm.raw[:, :train_feat_count]))

                L0_preds = sp.csr_matrix(np.hstack(L0_preds))
                L1_preds = self.model.L1_cl(L0_preds)

                logger.info(
                    "classification took {0:.2f}s ({1:.2f} inst/s)".format(
                        cl_timer.elapsed, cl_timer.rate(L0_preds.shape[0])
                    )
                )

            cl_space = store.get_Space("ebmcat")
            instance_ids = store.get_Space("NewDocuments")
        finally:
            logger.debug("unlinking {}".format(store_path))
            os.unlink(store_path)

        return PIBOSOOutput(instance_ids, cl_space, L1_preds)
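
The truncation in the loop above deserves a note. hydrat appends newly seen features to the end of a feature space, so a feature map induced over new documents can have more columns than the classifier was trained on; slicing back to the training-time column count realigns the columns with the trained weights. A minimal self-contained sketch of the same idea (the shapes and counts below are invented for illustration):

    import numpy as np
    import scipy.sparse as sp

    # Feature map as it looked at training time: 3 documents x 5 features.
    train_fm = sp.csr_matrix(np.ones((3, 5)))
    train_feat_count = train_fm.shape[1]

    # Tokenizing new documents grew the space to 7 features. The 2 new
    # columns were appended at the end, so columns 0..4 still line up
    # with the features the classifier was trained on.
    new_fm = sp.csr_matrix(np.ones((4, 7)))
    assert train_feat_count <= new_fm.shape[1]

    # Truncate back to the columns the classifier knows about.
    trimmed = new_fm[:, :train_feat_count]
    assert trimmed.shape == (4, 5)
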
Example 2
    for feat, cl in zip(features, L0_cl):
      fm = store.get_FeatureMap('NewDocuments', feat)
      # We need to trim the fv as the feature space may have grown when we tokenized more documents.
      # Hydrat's design is such that new features are appended to the end of a feature space, so
      # we can safely truncate the feature map.
      train_feat_count = cl.metadata['train_feat_count']
      assert train_feat_count <= fm.raw.shape[1]
      fv = fm.raw[:,:train_feat_count]
      with Timer() as cl_timer:
        pred = cl(fv)
        print >>sys.stderr, "== L1 feat for {0} took {1:.2f}s ({2:.2f} inst/s) ==".format(feat, cl_timer.elapsed, cl_timer.rate(fv.shape[0]))
      L0_preds.append(pred)

    L0_preds = sp.csr_matrix(np.hstack(L0_preds))

    with Timer() as cl_timer:
      L1_preds = L1_cl(L0_preds)
      print >>sys.stderr, "== L1 classify took {0:.2f}s ({1:.2f} inst/s)==".format(cl_timer.elapsed, cl_timer.rate(L0_preds.shape[0]))
      
    print >>sys.stderr, "== classification took {0:.2f}s ({1:.2f} inst/s)==".format(overall_timer.elapsed, overall_timer.rate(L0_preds.shape[0]))

  cl_space = store.get_Space('ebmcat')
  instance_ids = store.get_Space('NewDocuments')

  with open(args.output,'w') as f:
    writer = csv.writer(f)
    for inst_id, cl_id in zip(instance_ids, L1_preds.argmax(axis=1)):
      cl_name = cl_space[cl_id]
      writer.writerow((inst_id, cl_name))
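
Both examples lean on a Timer context manager that exposes an elapsed value and a rate(n) helper, and both read elapsed from inside the with-block. The real Timer ships with the project; a minimal stand-in with the same observable interface might look like the following (this class body is an assumption, not the project's code):

    import time

    class Timer(object):
        """Context manager exposing elapsed seconds and an instances/sec rate."""

        def __enter__(self):
            self.start = time.time()
            self.end = None
            return self

        def __exit__(self, exc_type, exc_value, traceback):
            self.end = time.time()

        @property
        def elapsed(self):
            # Computed on demand, so it works both inside the with-block
            # (as the snippets above do) and after it exits.
            end = self.end if self.end is not None else time.time()
            return end - self.start

        def rate(self, count):
            # Instances processed per second.
            return count / self.elapsed
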
  
Example 3
  parser.add_argument('feat_store', help='existing hydrat Store generated by features.py')
  parser.add_argument("output", help="produce output in PATH", metavar="PATH")
  args = parser.parse_args()

  class_space = 'ebmcat'

  try:
    features = features.feature_sets[args.feats]
  except KeyError:
    parser.error("unknown feature group: {0}".format(args.feats))

  l = repeat.RepeatLearner(liblinear.liblinearL(svm_type=0, output_probability=True))
  store = Store(args.feat_store, 'r') # TODO: Do we want this read-only?

  spaces = {}
  for feature in features:
    spaces[feature] = store.get_Space(feature)
  spaces['ebmcat'] = store.get_Space('ebmcat')

  proxy = DataProxy(ALTA2012Full(), store=store)
  proxy.class_space = class_space
  
  L0_cl = []
  L1_fv = []
  L1_gs = None
  for feat in features: 
    proxy.feature_spaces = feat
    proxy.split_name = 'crossvalidation'

    with Timer() as L0_timer:
      L0_cl.append( l(proxy.featuremap.raw, proxy.classmap.raw) )
      print >>sys.stderr, "== training L0 for {0} took {1:.2f}s ==".format(feat, L0_timer.elapsed)
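
Taken together, the examples implement two-level stacking: one liblinear model per feature set is trained at level 0 (L0), the per-class probability outputs of those models are horizontally stacked into a new feature matrix, and the level-1 (L1) classifier is trained and applied on that matrix. A toy sketch of just the data flow, with invented shapes and random numbers standing in for real L0 probability outputs:

    import numpy as np
    import scipy.sparse as sp

    # Pretend probability outputs of three L0 classifiers over
    # 4 instances and 6 classes each (in the real pipeline these come
    # from liblinear models with output_probability=True).
    rng = np.random.RandomState(0)
    L0_preds = [rng.dirichlet(np.ones(6), size=4) for _ in range(3)]

    # Stacked side by side, the L1 feature space has
    # n_feature_sets * n_classes columns: here 3 * 6 = 18.
    L1_fv = sp.csr_matrix(np.hstack(L0_preds))
    assert L1_fv.shape == (4, 18)

    # The L1 classifier is then trained on L1_fv against the same gold
    # standard labels used for the L0 models.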