def BinyangUse(sample_path, model_path='model_nb', tokenize=False):  # just for test
    """ segment the file at sample_path into sentences using a pre-trained
        model and return them as a list """
    sents = []
    if not os.path.isfile(sample_path):
        sbd_util.die('test path [%s] does not exist' % sample_path)

    ## labeled data
    data_root = '/u/dgillick/workspace/sbd/'
    brown_data = data_root + 'whiskey/brown.1'
    wsj_data = data_root + 'whiskey/satz.1'
    poe_data = data_root + 'whiskey/poe.1'
    new_wsj_data = data_root + 'whiskey/wsj.1'

    ## install root
    install_root = 'C:\\Python27\\Lib\\splitta\\'

    ## set parameters
    out = None
    if not model_path.endswith('/'):
        model_path += '/'
    if not os.path.isdir(model_path):
        model_path = install_root + model_path
        if not os.path.isdir(model_path):
            sbd_util.die('model path [%s] does not exist' % model_path)
    svm = False
    if 'svm' in model_path:
        svm = True
    model = load_sbd_model(model_path, svm)

    ## featurize, classify, and segment the test data
    test = get_data(sample_path, tokenize=True)
    test.featurize(model, verbose=True)
    model.classify(test, verbose=True)
    sents = test.segment(use_preds=True, tokenize=tokenize, output=out)
    return sents
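## A minimal usage sketch for BinyangUse (the input file name is hypothetical;
## assumes a trained naive Bayes model directory 'model_nb' is available, either
## locally or under install_root):
#
#   sents = BinyangUse('sample.txt', model_path='model_nb', tokenize=False)
#   for sent in sents:
#       print sent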
def classify(self, doc, verbose=False):
    model_file = '%ssvm_model' % self.path
    if not self.feats:
        sbd_util.die('Incomplete model')
    if not os.path.isfile(model_file):
        sbd_util.die('no model [%s]' % model_file)

    ## testing data file
    if verbose:
        sys.stderr.write('SVM classifying... ')
    lines = []
    frag = doc.frag
    while frag:
        if frag.label is None:
            svm_label = '0'
        elif frag.label:
            svm_label = '+1'
        else:
            svm_label = '-1'
        line = '%s ' % svm_label
        feats = [f + '_' + v for f, v in frag.features.items()]
        svm_feats = [self.feats[f] for f in feats if f in self.feats]
        svm_feats.sort()
        line += ' '.join(['%d:1' % x for x in svm_feats])
        lines.append(line)
        frag = frag.next

    ## write test data to a fixed-name temp file
    #print "!----!", get_open_fds()
    #unused, test_file = tempfile.mkstemp()
    test_file = "tmp1"
    fh = open(test_file, 'w')
    fh.write('\n'.join(lines) + '\n')
    fh.close()

    #print "!----!", get_open_fds()
    #unused, pred_file = tempfile.mkstemp()
    pred_file = "tmp2"
    options = '-v 0'
    cmd = '%s %s %s %s %s' % (SVM_CLASSIFY, options, test_file, model_file, pred_file)
    os.system(cmd)

    ## get predictions
    total = 0
    pf = open(pred_file, 'r')
    preds = map(float, pf.read().splitlines())
    frag = doc.frag
    while frag:
        frag.pred = sbd_util.logit(preds[total])
        frag = frag.next
        total += 1

    ## clean up
    pf.close()
    os.remove(test_file)
    os.remove(pred_file)

    if verbose:
        sys.stderr.write('done!\n')
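## For reference, each line handed to SVM-light above is a label followed by
## sorted feature_id:value pairs (all features here are binary, so the value
## is always 1); e.g. a positive fragment with three active features might
## serialize as (hypothetical ids):
#
#   +1 4:1 19:1 73:1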
def train(self, doc):
    """ takes training data and a path and creates an svm model """
    model_file = '%ssvm_model' % self.path

    ## need integer dictionary for features
    sys.stderr.write('training. making feat dict... ')
    feat_list = set()
    frag = doc.frag
    while frag:
        feats = [f + '_' + v for f, v in frag.features.items()]
        for feat in feats:
            feat_list.add(feat)
        frag = frag.next
    self.feats = dict(zip(feat_list, range(1, len(feat_list) + 1)))

    ## training data file
    sys.stderr.write('writing... ')
    lines = []
    frag = doc.frag
    while frag:
        if frag.label is None:
            sbd_util.die('expecting labeled data [%s]' % frag)
        elif frag.label > 0.5:
            svm_label = '+1'
        elif frag.label < 0.5:
            svm_label = '-1'
        else:
            ## skip fragments with label exactly 0.5; advance first so the
            ## loop cannot spin forever on the same fragment
            frag = frag.next
            continue
        line = '%s ' % svm_label
        feats = [f + '_' + v for f, v in frag.features.items()]
        svm_feats = [self.feats[f] for f in feats]
        svm_feats.sort()
        line += ' '.join(['%d:1' % x for x in svm_feats])
        lines.append(line)
        frag = frag.next

    unused, train_file = tempfile.mkstemp()
    fh = open(train_file, 'w')
    fh.write('\n'.join(lines) + '\n')
    fh.close()

    ## train an svm model
    sys.stderr.write('running svm... ')
    options = '-c 1 -v 0'
    cmd = '%s %s %s %s' % (SVM_LEARN, options, train_file, model_file)
    os.system(cmd)
    sys.stderr.write('done!\n')

    ## clean up
    os.remove(train_file)
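## For reference, self.feats maps each string feature to a 1-based integer id,
## since SVM-light only accepts numeric feature indices; e.g. (hypothetical
## feature names and ids):
#
#   {'word1_mr.': 1, 'word2_smith': 2, 'length_3': 3}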
def poo():
    ## labeled data
    data_root = '/u/dgillick/workspace/sbd/'
    brown_data = data_root + 'whiskey/brown.1'
    wsj_data = data_root + 'whiskey/satz.1'
    poe_data = data_root + 'whiskey/poe.1'
    new_wsj_data = data_root + 'whiskey/wsj.1'

    ## install root
    install_root = '/home/chonger/Downloads/splitta/'

    ## options
    from optparse import OptionParser
    usage = 'usage: %prog [options] <text_file>'
    parser = OptionParser(usage=usage)
    parser.add_option('-v', '--verbose', dest='verbose', default=False,
                      action='store_true', help='verbose output')
    parser.add_option('-t', '--tokenize', dest='tokenize', default=False,
                      action='store_true', help='write tokenized output')
    parser.add_option('-m', '--model', dest='model_path', type='str',
                      default='model_nb', help='model path')
    parser.add_option('-o', '--output', dest='output', type='str',
                      default=None, help='write sentences to this file')
    parser.add_option('-x', '--train', dest='train', type='str', default=None,
                      help='train a new model using this labeled data file')
    parser.add_option('-c', '--svm', dest='svm', default=False,
                      action='store_true',
                      help='use SVM instead of Naive Bayes for training')
    (options, args) = parser.parse_args()

    ## get test file
    if len(args) > 0:
        options.test = args[0]
        if not os.path.isfile(options.test):
            sbd_util.die('test path [%s] does not exist' % options.test)
    else:
        options.test = None
        if not options.train:
            sbd_util.die('you did not specify either train or test!')

    ## create model path
    if not options.model_path.endswith('/'):
        options.model_path += '/'
    if options.train:
        if not os.path.isfile(options.train):
            sbd_util.die('training data [%s] does not exist' % options.train)
        if os.path.isdir(options.model_path):
            sbd_util.die('model path [%s] already exists' % options.model_path)
        else:
            os.mkdir(options.model_path)
    else:
        if not os.path.isdir(options.model_path):
            options.model_path = install_root + options.model_path
            if not os.path.isdir(options.model_path):
                sbd_util.die('model path [%s] does not exist' % options.model_path)

    ## create a model
    if options.train:
        model = build_model(options.train, options)
        if not options.test:
            sys.exit()

    ## debug output
    print options.svm
    print options.test

    ## test
    if not options.train:
        if 'svm' in options.model_path:
            options.svm = True
        model = load_sbd_model(options.model_path, options.svm)
    if options.output:
        options.output = open(options.output, 'w')
    test = get_data(options.test, tokenize=True)
    test.featurize(model, verbose=True)
    model.classify(test, verbose=True)

    ## debug output
    print options.tokenize
    print options.output
    test.segment(use_preds=True, tokenize=options.tokenize, output=options.output)
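## Example command lines for poo(), matching the options defined above
## (assuming this module is the splitta entry-point script, here called
## sbd.py; file names are hypothetical):
#
#   python sbd.py -m model_nb -o sentences.txt input.txt    # split with a Naive Bayes model
#   python sbd.py -x labeled_data.txt -c -m model_svm       # train a new SVM model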