def tokenize_extra(ds, store_path):
    """
    Additional feature extraction for features that are not provided
    by the dataset implementation.

    @param ds dataset to read from
    @param store_path path of store to write to
    """
    class_space = 'ebmcat'
    #print >>sys.stderr, "=== tokenize_extra for {0} ===".format(store_path)
    with closing(Store(store_path, 'a', recursive_close=False)) as store:
        proxy = DataProxy(ds, store=store)
        proxy.tokenstream_name = 'treetaggerlemmapos'
        proxy.tokenize(ext.bigram)

        proxy.tokenstream_name = 'treetaggerpos'
        proxy.tokenize(ext.trigram)

    # Hackish workaround for the store being unexpectedly closed.
    with closing(Store(store_path, 'a', recursive_close=False)) as store:
        proxy = DataProxy(ds, store=store)
        proxy.tokenstream_name = 'treetaggerpos'
        proxy.tokenize(ext.bigram)
def classify_batch(self, abstracts):
    """
    Only batch-mode classification is supported due to the underlying
    operations; classifying single items is expected to be slow.

    @param abstracts a mapping from docid to a list of lines
    """
    self.__unpack()

    # Build a sentence-level TokenStream, keyed by "<docid>-<line number>".
    ts = {}
    for filename, lines in abstracts.iteritems():
        for i, line in enumerate(lines):
            docid = "{0}-{1}".format(filename, i + 1)
            ts[docid] = line

    # Create the temporary store before entering the try block so the finally
    # clause always has a valid path to unlink.
    handle, store_path = tempfile.mkstemp(dir=self.tempdir)
    os.close(handle)
    logger.debug("temporary store at {0}".format(store_path))

    try:
        with Timer() as feat_timer:
            induce(ts, store_path, self.model.features, self.model.spaces)
        logger.info("computing features took {0:.2f}s".format(feat_timer.elapsed))

        store = Store(store_path, "r")
        with Timer() as cl_timer:
            L0_preds = []
            for feat, cl in zip(self.model.features, self.model.L0_cl):
                fm = store.get_FeatureMap("NewDocuments", feat)
                # We need to trim the feature map as the feature space may have
                # grown when we tokenized more documents. Hydrat appends new
                # features to the end of a feature space, so we can safely
                # truncate the feature map.
                train_feat_count = cl.metadata["train_feat_count"]
                assert train_feat_count <= fm.raw.shape[1]
                L0_preds.append(cl(fm.raw[:, :train_feat_count]))

            L0_preds = sp.csr_matrix(np.hstack(L0_preds))
            L1_preds = self.model.L1_cl(L0_preds)
        logger.info("classification took {0:.2f}s ({1:.2f} inst/s)".format(
            cl_timer.elapsed, cl_timer.rate(L0_preds.shape[0])))

        cl_space = store.get_Space("ebmcat")
        instance_ids = store.get_Space("NewDocuments")
    finally:
        logger.debug("unlinking {0}".format(store_path))
        os.unlink(store_path)

    return PIBOSOOutput(instance_ids, cl_space, L1_preds)
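# Hedged usage sketch (the object and file names below are assumptions, not
# from the original code): classify_batch takes a mapping from document id to
# that document's lines, classifies every sentence in a single batch, and
# returns a PIBOSOOutput covering all of them.
#
#   abstracts = {
#       'abstract01': ["OBJECTIVE: ...", "We examined ...", "We found ..."],
#       'abstract02': ["Previous work ...", "In this study ..."],
#   }
#   output = classifier.classify_batch(abstracts)  # classifier: hypothetical instance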
def induce(ts, store_path, features, spaces):
    """
    Induce features for a list of abstracts.

    @param ts TokenStream (mapping from docid to line)
    @param store_path path of the hydrat store to write features to
    @param features names of the features to induce
    @param spaces mapping from space name to the space itself
    """
    ds = NewDocuments(ts)

    # Merge the model's feature spaces into the new store.
    with closing(Store(store_path, 'a')) as store:
        for space in spaces:
            md = {'name': space, 'type': 'feature'}
            store.add_Space(spaces[space], md)

    # Induce one feature at a time. This used to run in a subprocess (see the
    # commented-out lines) to stop Python holding on to memory.
    for feature in features:
        tokenize(ds, [feature], store_path)
        #p = mp.Process(target=tokenize, args=(ds, [feature], store_path))
        #p.start()
        #p.join()
        #p.terminate()

    tokenize_extra(ds, store_path)
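# Hedged sketch (not used by the code above; the function name is illustrative):
# if memory growth during induction becomes a problem again, the commented-out
# subprocess variant can be restored so each feature is induced in a throwaway
# worker process whose memory is released when it exits.
import multiprocessing as mp

def induce_isolated(ds, features, store_path):
    for feature in features:
        p = mp.Process(target=tokenize, args=(ds, [feature], store_path))
        p.start()
        p.join()
        if p.exitcode != 0:
            raise RuntimeError("tokenize failed for feature {0}".format(feature))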
def tokenize(ds, features, store_path):
    """
    Compute feature values and save them in a hydrat store.

    @param ds dataset to read from
    @param features names of the features to induce
    @param store_path path of store to write to
    """
    class_space = 'ebmcat'
    #print >>sys.stderr, "=== opening store at {0} ===".format(store_path)
    with closing(Store(store_path, 'a', recursive_close=False)) as store:
        #print >>sys.stderr, "=== inducing features ({0}) ===".format(features)
        # Induce all the features for the new test data.
        proxy = DataProxy(ds, store=store)
        proxy.inducer.process(
            proxy.dataset,
            fms=features,
            sqs=['abstract'],
        )
import argparse
import sys

import numpy as np
import scipy.sparse as sp

from common import Timer

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("model", help="read model from")
    parser.add_argument("data", help="store containing pre-tokenized data")
    parser.add_argument("feat", help="store containing feature data")
    parser.add_argument("output", help="write output to PATH", metavar="PATH")
    args = parser.parse_args()

    features, L0_cl, L1_cl = load(open(args.model))

    # Open the pre-tokenized data store, falling back to the feature store for
    # anything not found there.
    fallback = Store(args.feat, 'r')
    store = Store(args.data, 'r', fallback=fallback)

    with Timer() as overall_timer:
        L0_preds = []
        for feat, cl in zip(features, L0_cl):
            fm = store.get_FeatureMap('NewDocuments', feat)
            # We need to trim the feature map as the feature space may have grown
            # when we tokenized more documents. Hydrat appends new features to the
            # end of a feature space, so we can safely truncate the feature map.
            train_feat_count = cl.metadata['train_feat_count']
            assert train_feat_count <= fm.raw.shape[1]
            fv = fm.raw[:, :train_feat_count]

            with Timer() as cl_timer:
                pred = cl(fv)
            print >>sys.stderr, "== L1 feat for {0} took {1:.2f}s ({2:.2f} inst/s) ==".format(
                feat, cl_timer.elapsed, cl_timer.rate(fv.shape[0]))
            L0_preds.append(pred)
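        # Hedged sketch (not in the original excerpt): the stacking step that
        # presumably follows, mirroring classify_batch above -- assumes the
        # loaded L1_cl accepts the horizontally stacked L0 prediction matrix.
        L0_preds = sp.csr_matrix(np.hstack(L0_preds))
        L1_preds = L1_cl(L0_preds)

    print >>sys.stderr, "== overall took {0:.2f}s ({1:.2f} inst/s) ==".format(
        overall_timer.elapsed, overall_timer.rate(L0_preds.shape[0]))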
    ('nltkword_unigram',),
    feats_content,
    feats_content + ['headingord', 'headingvec'],
    feats_local,
    ['headingord', 'headingvec'] + feats_struct,
    feats_struct,
    feats_local + feats_prev,
    feats_local + feats_post,
    feats_local + feats_window,
    feats_local + feats_prev + feats_post,
    feats_local + feats_prev + feats_post + feats_window,
]
features = [tuple(sorted(x)) for x in features]

if __name__ == "__main__":
    fallback = Store('store/features.h5')
    store = Store.from_caller(fallback=fallback)
    learner = repeat.RepeatLearner(liblinear.liblinearL(svm_type=0, output_probability=True))

    ds = ALTA2012Full()
    proxy = StackingProxy(ds, learner, store=store)
    proxy.class_space = 'ebmcat'

    for feats in features:
        print "DOING:", len(feats), feats
        proxy.feature_spaces = feats
        e = Experiment(proxy, learner)
        proxy.store.new_TaskSetResult(e)
feats_position = ['positionabs', 'positionrel', 'positionrelbyte']

core = (
    'nltkword_unigram',
    'treetaggerpos_bigram',
    'treetaggerlemmapos_bigram',
)
core += ('headingprev', 'headingvec', 'positionrel')

struct_best = core + ('bowpost1', 'bowprev3', 'headingpost', 'isstructured', 'sentlenrel',)
unstruct_best = core + ('bowprev', 'treetaggerpos_trigram', 'ttbprev')

# All feature groups merged into a single sorted tuple of unique feature names.
feats_all = tuple(sorted(set(sum(map(tuple, [
    struct_best, unstruct_best, feats_bow, feats_pos, feats_struct,
    feats_heading, feats_position]), tuple()))))

datasets = [
    ALTA2012Full(),
]

if __name__ == "__main__":
    store = Store.from_caller()

    for ds in datasets:
        proxy = DataProxy(ds, store=store)
        proxy.inducer.process(
            proxy.dataset,
            fms=feats_all,
            cms=['ebmcat'],
            sqs=['abstract'],
        )

        proxy.tokenstream_name = 'treetaggerlemmapos'
        proxy.tokenize(ext.bigram)

        proxy.tokenstream_name = 'treetaggerpos'
        proxy.tokenize(ext.bigram)
        proxy.tokenize(ext.trigram)
feature_sets = {
    'all': feats_all,
    'core': core,
    'dev': ('headingprev', 'headingvec', 'positionrel'),
}

datasets = [
    ALTA2012Full(),
]

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('output', help='write output to PATH', metavar='PATH')
    args = parser.parse_args()

    store = Store(args.output, 'a')

    for ds in datasets:
        proxy = DataProxy(ds, store=store)
        proxy.inducer.process(
            proxy.dataset,
            fms=feats_all,
            cms=['ebmcat'],
            sqs=['abstract'],
            sps=['crossvalidation'],
        )

        proxy.tokenstream_name = 'treetaggerlemmapos'
        proxy.tokenize(ext.bigram)

        proxy.tokenstream_name = 'treetaggerpos'
        proxy.tokenize(ext.bigram)
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--feats', default='all', help="feature group to process")
    parser.add_argument('feat_store', help='existing hydrat Store generated by features.py')
    parser.add_argument("output", help="produce output in PATH", metavar="PATH")
    args = parser.parse_args()

    class_space = 'ebmcat'

    try:
        features = features.feature_sets[args.feats]
    except KeyError:
        parser.error("unknown feature group: {0}".format(args.feats))

    l = repeat.RepeatLearner(liblinear.liblinearL(svm_type=0, output_probability=True))

    store = Store(args.feat_store, 'r')  # TODO: Do we want this read-only?

    # Collect the spaces that the model will need at classification time.
    spaces = {}
    for feature in features:
        spaces[feature] = store.get_Space(feature)
    spaces['ebmcat'] = store.get_Space('ebmcat')

    proxy = DataProxy(ALTA2012Full(), store=store)
    proxy.class_space = class_space

    L0_cl = []
    L1_fv = []
    L1_gs = None
    for feat in features:
        proxy.feature_spaces = feat
        proxy.split_name = 'crossvalidation'