def classify_batch(self, abstracts):
    """
    Classify a batch of abstracts.

    Only batch-mode classify is supported due to the underlying
    operations. Single-item classification is expected to be slow.

    @param abstracts a mapping from docid to a list of lines
    @return a PIBOSOOutput over every line of every abstract
    """
    self.__unpack()

    # Flatten the per-document line lists into a single mapping; docids
    # are "<filename>-<1-based line number>" so each line is classified
    # as its own instance.
    ts = {}
    for filename, lines in abstracts.iteritems():
        for i, line in enumerate(lines):
            docid = "{0}-{1}".format(filename, i + 1)
            ts[docid] = line

    # BUGFIX: create the temporary store *before* entering the try block.
    # Previously mkstemp was inside the try, so a failure there made the
    # finally clause raise NameError on the undefined store_path, masking
    # the original exception.
    handle, store_path = tempfile.mkstemp(dir=self.tempdir)
    os.close(handle)
    logger.debug("temporary store at {}".format(store_path))
    try:
        # Induce features for the new documents into the temporary store.
        with Timer() as feat_timer:
            induce(ts, store_path, self.model.features, self.model.spaces)
        logger.info("computing features took {0:.2f}s".format(feat_timer.elapsed))

        store = Store(store_path, "r")
        with Timer() as cl_timer:
            # Stage 0: one classifier per feature set; their outputs are
            # stacked to form the input of the stage-1 classifier.
            L0_preds = []
            for feat, cl in zip(self.model.features, self.model.L0_cl):
                fm = store.get_FeatureMap("NewDocuments", feat)
                # We need to trim the fv as the feature space may have grown
                # when we tokenized more documents. Hydrat's design is such
                # that new features are appended to the end of a feature
                # space, so we can safely truncate the feature map.
                train_feat_count = cl.metadata["train_feat_count"]
                assert train_feat_count <= fm.raw.shape[1]
                L0_preds.append(cl(fm.raw[:, :train_feat_count]))

            L0_preds = sp.csr_matrix(np.hstack(L0_preds))
            L1_preds = self.model.L1_cl(L0_preds)

        logger.info(
            "classification took {0:.2f}s ({1:.2f} inst/s)".format(
                cl_timer.elapsed, cl_timer.rate(L0_preds.shape[0])
            )
        )
        cl_space = store.get_Space("ebmcat")
        instance_ids = store.get_Space("NewDocuments")
    finally:
        # Always remove the temporary store, even on failure.
        logger.debug("unlinking {}".format(store_path))
        os.unlink(store_path)
    return PIBOSOOutput(instance_ids, cl_space, L1_preds)
for feat, cl in zip(features, L0_cl): fm = store.get_FeatureMap('NewDocuments', feat) # We need to trim the fv as the feature space may have grown when we tokenized more documents. # Hydrat's design is such that new features are appended to the end of a feature space, so # we can safely truncate the feature map. train_feat_count = cl.metadata['train_feat_count'] assert(train_feat_count <= fm.raw.shape[1]) fv = fm.raw[:,:train_feat_count] with Timer() as cl_timer: pred = cl(fv) print >>sys.stderr, "== L1 feat for {0} took {1:.2f}s ({2:.2f} inst/s) ==".format(feat, cl_timer.elapsed, cl_timer.rate(fv.shape[0])) L0_preds.append(pred) L0_preds = sp.csr_matrix(np.hstack(L0_preds)) with Timer() as cl_timer: L1_preds = L1_cl(L0_preds) print >>sys.stderr, "== L1 classify took {0:.2f}s ({1:.2f} inst/s)==".format(cl_timer.elapsed, cl_timer.rate(L0_preds.shape[0])) print >>sys.stderr, "== classification took {0:.2f}s ({1:.2f} inst/s)==".format(overall_timer.elapsed, overall_timer.rate(L0_preds.shape[0])) cl_space = store.get_Space('ebmcat') instance_ids = store.get_Space('NewDocuments') with open(args.output,'w') as f: writer = csv.writer(f) for inst_id, cl_id in zip(instance_ids, L1_preds.argmax(axis=1)): cl_name = cl_space[cl_id] writer.writerow((inst_id, cl_name))
# Command-line interface: the prebuilt hydrat feature store to read, and
# the path the classification CSV will be written to.
# NOTE(review): `parser` is constructed earlier in the file, before this chunk.
parser.add_argument('feat_store', help='existing hydrat Store generated by features.py')
parser.add_argument("output", help="produce output in PATH", metavar="PATH")
args = parser.parse_args()

class_space = 'ebmcat'

# Resolve the requested feature-group name into its list of feature spaces.
try:
    features = features.feature_sets[args.feats]
except KeyError:
    parser.error("unknown feature group: {0}".format(args.feats))

# Base (L0) learner: liblinear with probability outputs, wrapped so it can
# be instantiated repeatedly — once per feature set.
l = repeat.RepeatLearner(liblinear.liblinearL(svm_type=0, output_probability=True))

store = Store(args.feat_store, 'r') # TODO: Do we want this read-only?

# NOTE(review): `spaces` is defined earlier in the file, before this chunk.
for feature in features:
    spaces[feature] = store.get_Space(feature)
spaces['ebmcat'] = store.get_Space('ebmcat')

proxy = DataProxy(ALTA2012Full(), store=store)
proxy.class_space = class_space

# Train one L0 classifier per feature set over the crossvalidation split.
L0_cl = []
L1_fv = []
L1_gs = None
for feat in features:
    proxy.feature_spaces = feat
    proxy.split_name = 'crossvalidation'
    with Timer() as L0_timer:
        L0_cl.append( l(proxy.featuremap.raw, proxy.classmap.raw) )
    print >>sys.stderr, "== training L0 for {0} took {1:.2f}s ==".format(feat, L0_timer.elapsed)
metavar="PATH") args = parser.parse_args() class_space = 'ebmcat' try: features = features.feature_sets[args.feats] except KeyError: parser.error("unknown feature group: {0}".format(args.feats)) l = repeat.RepeatLearner( liblinear.liblinearL(svm_type=0, output_probability=True)) store = Store(args.feat_store, 'r') # TODO: Do we want this read-only? for feature in features: spaces[feature] = store.get_Space(feature) spaces['ebmcat'] = store.get_Space('ebmcat') proxy = DataProxy(ALTA2012Full(), store=store) proxy.class_space = class_space L0_cl = [] L1_fv = [] L1_gs = None for feat in features: proxy.feature_spaces = feat proxy.split_name = 'crossvalidation' with Timer() as L0_timer: L0_cl.append(l(proxy.featuremap.raw, proxy.classmap.raw)) print >> sys.stderr, "== training L0 for {0} took {1:.2f}s ==".format(
# We need to trim the fv as the feature space may have grown when we tokenized more documents. # Hydrat's design is such that new features are appended to the end of a feature space, so # we can safely truncate the feature map. train_feat_count = cl.metadata['train_feat_count'] assert (train_feat_count <= fm.raw.shape[1]) fv = fm.raw[:, :train_feat_count] with Timer() as cl_timer: pred = cl(fv) print >> sys.stderr, "== L1 feat for {0} took {1:.2f}s ({2:.2f} inst/s) ==".format( feat, cl_timer.elapsed, cl_timer.rate(fv.shape[0])) L0_preds.append(pred) L0_preds = sp.csr_matrix(np.hstack(L0_preds)) with Timer() as cl_timer: L1_preds = L1_cl(L0_preds) print >> sys.stderr, "== L1 classify took {0:.2f}s ({1:.2f} inst/s)==".format( cl_timer.elapsed, cl_timer.rate(L0_preds.shape[0])) print >> sys.stderr, "== classification took {0:.2f}s ({1:.2f} inst/s)==".format( overall_timer.elapsed, overall_timer.rate(L0_preds.shape[0])) cl_space = store.get_Space('ebmcat') instance_ids = store.get_Space('NewDocuments') with open(args.output, 'w') as f: writer = csv.writer(f) for inst_id, cl_id in zip(instance_ids, L1_preds.argmax(axis=1)): cl_name = cl_space[cl_id] writer.writerow((inst_id, cl_name))