Beispiel #1
0
def BinyangUse(sample_path,
               model_path='model_nb',
               tokenize='False'):  #just for test
    """Segment the text in sample_path into sentences with a splitta model.

    sample_path -- path to a plain-text file to segment
    model_path  -- model directory; resolved against install_root if it is
                   not an existing directory
    tokenize    -- whether returned sentences are tokenized; accepts a bool
                   or the legacy strings 'True'/'False' (coerced below)

    Returns the sentence list produced by test.segment().
    """
    if not os.path.isfile(sample_path):
        sbd_util.die('test path [%s] does not exist' % sample_path)

    ## BUG FIX: the old default 'False' was a truthy *string*, so the
    ## default call silently enabled tokenization -- interpret legacy
    ## string flags explicitly instead of passing them through
    if isinstance(tokenize, str):
        tokenize = (tokenize.lower() == 'true')

    ## install root
    install_root = 'C:\\Python27\\Lib\\splitta\\'

    #-----set parameters
    out = None
    if not model_path.endswith('/'): model_path += '/'
    if not os.path.isdir(model_path):
        model_path = install_root + model_path
        if not os.path.isdir(model_path):
            sbd_util.die('model path [%s] does not exist' % model_path)
    ## an svm model directory is recognized by its name
    svm = 'svm' in model_path
    model = load_sbd_model(model_path, svm)
    #-------------------
    test = get_data(sample_path, tokenize=True)
    test.featurize(model, verbose=True)
    model.classify(test, verbose=True)
    sents = test.segment(use_preds=True, tokenize=tokenize, output=out)

    return sents
Beispiel #2
0
def BinyangUse(sample_path, model_path='model_nb', tokenize='False'):  #just for test
    """Run sentence segmentation on sample_path using a splitta model."""
    ## bail out early if the input file is missing
    if not os.path.isfile(sample_path):
        sbd_util.die('test path [%s] does not exist' % sample_path)

    ## labeled data (paths kept from the original script; unused here)
    data_root = '/u/dgillick/workspace/sbd/'
    brown_data = data_root + 'whiskey/brown.1'
    wsj_data = data_root + 'whiskey/satz.1'
    poe_data = data_root + 'whiskey/poe.1'
    new_wsj_data = data_root + 'whiskey/wsj.1'

    ## install root
    install_root = 'C:\\Python27\\Lib\\splitta\\'

    ## resolve the model directory, falling back to the install root
    if not model_path.endswith('/'):
        model_path = model_path + '/'
    if not os.path.isdir(model_path):
        model_path = install_root + model_path
        if not os.path.isdir(model_path):
            sbd_util.die('model path [%s] does not exist' % model_path)

    ## an svm model directory is recognized by its name
    use_svm = 'svm' in model_path
    model = load_sbd_model(model_path, use_svm)

    ## featurize, classify and segment the test document
    test = get_data(sample_path, tokenize=True)
    test.featurize(model, verbose=True)
    model.classify(test, verbose=True)
    return test.segment(use_preds=True, tokenize=tokenize, output=None)
Beispiel #3
0
    def classify(self, doc, verbose=False):
        """Score every fragment of doc with the trained svm_light model.

        Writes the fragments in svm_light feature format to a temporary
        file, shells out to SVM_CLASSIFY, and stores the logit of each
        prediction on the corresponding frag.pred.  Dies (sbd_util.die)
        when the model file or feature dictionary is missing.
        """
        model_file = '%ssvm_model' % self.path
        if not self.feats: sbd_util.die('Incomplete model')
        if not os.path.isfile(model_file):
            sbd_util.die('no model [%s]' % model_file)

        ## testing data file: one svm_light line per fragment
        if verbose: sys.stderr.write('SVM classifying... ')
        lines = []
        frag = doc.frag
        while frag:
            ## svm_light labels: '0' unknown, '+1' boundary, '-1' not
            if frag.label is None: svm_label = '0'
            elif frag.label: svm_label = '+1'
            else: svm_label = '-1'
            line = '%s ' % svm_label
            feats = [f + '_' + v for f, v in frag.features.items()]
            ## map feature strings to ids, dropping features unseen in training
            svm_feats = [self.feats[f] for f in feats if f in self.feats]
            ## ascending feature ids, as svm_light requires
            svm_feats.sort()
            line += ' '.join(['%d:1' % x for x in svm_feats])
            lines.append(line)
            frag = frag.next

        ## BUG FIX: unique temp files instead of fixed "tmp1"/"tmp2",
        ## which raced when two processes classified in the same directory
        test_fd, test_file = tempfile.mkstemp()
        fh = os.fdopen(test_fd, 'w')
        fh.write('\n'.join(lines) + '\n')
        fh.close()

        pred_fd, pred_file = tempfile.mkstemp()
        os.close(pred_fd)
        try:
            options = '-v 0'
            cmd = '%s %s %s %s %s' % (SVM_CLASSIFY, options, test_file,
                                      model_file, pred_file)
            os.system(cmd)

            ## get predictions: one float per input line, in order
            pf = open(pred_file, 'r')
            try:
                preds = [float(p) for p in pf.read().splitlines()]
            finally:
                pf.close()
            total = 0
            frag = doc.frag
            while frag:
                frag.pred = sbd_util.logit(preds[total])
                frag = frag.next
                total += 1
        finally:
            ## clean up even if the prediction file was malformed
            os.remove(test_file)
            os.remove(pred_file)

        if verbose: sys.stderr.write('done!\n')
Beispiel #4
0
    def classify(self, doc, verbose=False):
        """Score every fragment of doc with the trained svm_light model.

        Writes the fragments out in svm_light feature format, shells out
        to SVM_CLASSIFY, and stores the logit of each prediction on
        frag.pred.  Dies (sbd_util.die) if the model file or the feature
        dictionary is missing.
        """

        model_file = '%ssvm_model' %self.path
        if not self.feats: sbd_util.die('Incomplete model')
        if not os.path.isfile(model_file): sbd_util.die('no model [%s]' %model_file)

        ## testing data file: one svm_light line per fragment
        if verbose: sys.stderr.write('SVM classifying... ')
        lines = []
        frag = doc.frag
        while frag:
            ## svm_light labels: '0' unknown, '+1' boundary, '-1' not
            if frag.label == None: svm_label = '0'
            elif frag.label: svm_label = '+1'
            else: svm_label = '-1'
            line = '%s ' %svm_label
            feats = [f+'_'+v for f,v in frag.features.items()]
            ## map feature strings to ids, dropping features unseen in training
            svm_feats = [self.feats[f] for f in feats if f in self.feats]
            ## ascending feature ids (Python 2 cmp-style sort), as svm_light requires
            svm_feats.sort(lambda x,y: x-y)
            line += ' '.join(['%d:1' %x for x in svm_feats])
            lines.append(line)
            frag = frag.next

        #print "!----!",get_open_fds()
        #unused, test_file = tempfile.mkstemp()
        ## NOTE(review): fixed file names race when two processes classify
        ## in the same working directory -- the mkstemp version commented
        ## out above was safer; confirm single-process use before relying
        ## on this
        test_file = "tmp1"
        fh = open(test_file, 'w')
        fh.write('\n'.join(lines) + '\n')
        fh.close()
        #print "!----!",get_open_fds()

        #unused, pred_file = tempfile.mkstemp()
        pred_file = "tmp2"
        options = '-v 0'
        cmd = '%s %s %s %s %s' %(SVM_CLASSIFY, options, test_file, model_file, pred_file)
        os.system(cmd)

        ## get predictions: one float per input line, in document order
        ## NOTE(review): preds is indexed below, so this relies on Python 2
        ## map() returning a list
        total = 0
        pf = open(pred_file,'r')
        #print pf
        preds = map(float, pf.read().splitlines())
        frag = doc.frag
        while frag:
            frag.pred = sbd_util.logit(preds[total])
            frag = frag.next
            total += 1

        ## clean up the temporary files
        pf.close()
        os.remove(test_file)
        os.remove(pred_file)

        if verbose: sys.stderr.write('done!\n')
Beispiel #5
0
    def train(self, doc):
        """
        takes training data and a path and creates an svm model

        Builds an integer feature dictionary (self.feats) from every
        fragment in doc, writes the fragments in svm_light format to a
        temporary file, runs SVM_LEARN, and leaves the trained model at
        '<path>svm_model'.
        """

        model_file = '%ssvm_model' % self.path

        ## need integer dictionary for features
        sys.stderr.write('training. making feat dict... ')
        feat_list = set()
        frag = doc.frag
        while frag:
            feats = [f + '_' + v for f, v in frag.features.items()]
            for feat in feats:
                feat_list.add(feat)
            frag = frag.next
        ## svm_light feature ids must start at 1
        self.feats = dict(zip(feat_list, range(1, len(feat_list) + 1)))

        ## training data file
        sys.stderr.write('writing... ')
        lines = []
        frag = doc.frag
        while frag:
            if frag.label is None:
                sbd_util.die('expecting labeled data [%s]' % frag)
            svm_label = None
            if frag.label > 0.5:
                svm_label = '+1'
            elif frag.label < 0.5:
                svm_label = '-1'
            ## BUG FIX: the original used 'else: continue' for a label of
            ## exactly 0.5, which skipped 'frag = frag.next' and looped
            ## forever; such fragments are now skipped while still advancing
            if svm_label is not None:
                line = '%s ' % svm_label
                feats = [f + '_' + v for f, v in frag.features.items()]
                svm_feats = [self.feats[f] for f in feats]
                ## ascending feature ids, as svm_light requires
                svm_feats.sort()
                line += ' '.join(['%d:1' % x for x in svm_feats])
                lines.append(line)
            frag = frag.next

        ## write through the fd returned by mkstemp so it is not leaked
        train_fd, train_file = tempfile.mkstemp()
        fh = os.fdopen(train_fd, 'w')
        fh.write('\n'.join(lines) + '\n')
        fh.close()

        ## train an svm model
        sys.stderr.write('running svm... ')
        options = '-c 1 -v 0'
        cmd = '%s %s %s %s' % (SVM_LEARN, options, train_file, model_file)
        os.system(cmd)
        sys.stderr.write('done!\n')

        ## clean up
        os.remove(train_file)
Beispiel #6
0
    def train(self, doc):
        """
        takes training data and a path and creates an svm model

        Side effects: populates self.feats (feature string -> 1-based id)
        and writes the trained model to '<path>svm_model' via SVM_LEARN.
        """

        model_file = '%ssvm_model' %self.path

        ## need integer dictionary for features
        sys.stderr.write('training. making feat dict... ')
        feat_list = set()
        frag = doc.frag
        while frag:
            feats = [f+'_'+v for f,v in frag.features.items()]
            for feat in feats: feat_list.add(feat)
            frag = frag.next
        ## svm_light feature ids must start at 1
        self.feats = dict(zip(feat_list, range(1,len(feat_list)+1)))

        ## training data file
        sys.stderr.write('writing... ')
        lines = []
        frag = doc.frag
        while frag:
            ## NOTE(review): if frag.label == 0.5 exactly, the 'continue'
            ## below skips 'frag = frag.next' and this loop never
            ## terminates -- confirm labels are always strictly above or
            ## below 0.5 here
            if frag.label == None: sbd_util.die('expecting labeled data [%s]' %frag)
            elif frag.label > 0.5: svm_label = '+1'
            elif frag.label < 0.5: svm_label = '-1'
            else: continue
            line = '%s ' %svm_label
            feats = [f+'_'+v for f,v in frag.features.items()]
            svm_feats = [self.feats[f] for f in feats]
            ## ascending feature ids (Python 2 cmp-style sort), as svm_light requires
            svm_feats.sort(lambda x,y: x-y)
            line += ' '.join(['%d:1' %x for x in svm_feats])
            lines.append(line)
            frag = frag.next

        ## write the training file to a unique temporary path
        unused, train_file = tempfile.mkstemp()
        fh = open(train_file, 'w')
        fh.write('\n'.join(lines) + '\n')
        fh.close()

        ## train an svm model
        sys.stderr.write('running svm... ')
        options = '-c 1 -v 0'
        cmd = '%s %s %s %s' %(SVM_LEARN, options, train_file, model_file)
        os.system(cmd)
        sys.stderr.write('done!\n')

        ## clean up
        os.remove(train_file)
Beispiel #7
0
                      type='str',
                      default=None,
                      help='train a new model using this labeled data file')
    parser.add_option('-c',
                      '--svm',
                      dest='svm',
                      default=False,
                      action='store_true',
                      help='use SVM instead of Naive Bayes for training')
    (options, args) = parser.parse_args()

    ## get test file
    if len(args) > 0:
        options.test = args[0]
        if not os.path.isfile(options.test):
            sbd_util.die('test path [%s] does not exist' % options.test)
    else:
        options.test = None
        if not options.train:
            sbd_util.die('you did not specify either train or test!')

    ## create model path
    if not options.model_path.endswith('/'): options.model_path += '/'
    if options.train:
        if not os.path.isfile(options.train):
            sbd_util.die('model path [%s] does not exist' % options.train)
        if os.path.isdir(options.model_path):
            sbd_util.die('model path [%s] already exists' % options.model_path)
        else:
            os.mkdir(options.model_path)
    else:
Beispiel #8
0
    parser.add_option('-t', '--tokenize', dest='tokenize', default=False,
                      action='store_true', help='write tokenized output')
    parser.add_option('-m', '--model', dest='model_path', type='str', default='model_nb',
                      help='model path')
    parser.add_option('-o', '--output', dest='output', type='str', default=None,
                      help='write sentences to this file')
    parser.add_option('-x', '--train', dest='train', type='str', default=None,
                      help='train a new model using this labeled data file')
    parser.add_option('-c', '--svm', dest='svm', default=False,
                      action='store_true', help='use SVM instead of Naive Bayes for training')
    (options, args) = parser.parse_args()

    ## get test file
    if len(args) > 0:
        options.test = args[0]
        if not os.path.isfile(options.test): sbd_util.die('test path [%s] does not exist' %options.test)
    else:
        options.test = None
        if not options.train: sbd_util.die('you did not specify either train or test!')

    ## create model path
    if not options.model_path.endswith('/'): options.model_path += '/'
    if options.train:
        if not os.path.isfile(options.train): sbd_util.die('model path [%s] does not exist' %options.train)
        if os.path.isdir(options.model_path): sbd_util.die('model path [%s] already exists' %options.model_path)
        else: os.mkdir(options.model_path)
    else:
        if not os.path.isdir(options.model_path):
            options.model_path = install_root + options.model_path
            if not os.path.isdir(options.model_path):
                sbd_util.die('model path [%s] does not exist' %options.model_path)
Beispiel #9
0
def poo():
    """Command-line driver: optionally train a splitta model, then segment.

    Parses optparse flags, validates the train/test paths, resolves the
    model directory (falling back to install_root), trains via
    build_model when -x is given, otherwise loads an existing model and
    segments the positional test file.
    """

    ## labeled data (paths are unused in this function)
    data_root = '/u/dgillick/workspace/sbd/'
    brown_data = data_root + 'whiskey/brown.1'
    wsj_data = data_root + 'whiskey/satz.1'
    poe_data = data_root + 'whiskey/poe.1'
    new_wsj_data = data_root + 'whiskey/wsj.1'

    ## install root
    install_root = '/home/chonger/Downloads/splitta/'

    ## options
    from optparse import OptionParser
    usage = 'usage: %prog [options] <text_file>'
    parser = OptionParser(usage=usage)
    parser.add_option('-v', '--verbose', dest='verbose', default=False,
                      action='store_true', help='verbose output')
    parser.add_option('-t', '--tokenize', dest='tokenize', default=False,
                      action='store_true', help='write tokenized output')
    parser.add_option('-m', '--model', dest='model_path', type='str', default='model_nb',
                      help='model path')
    parser.add_option('-o', '--output', dest='output', type='str', default=None,
                      help='write sentences to this file')
    parser.add_option('-x', '--train', dest='train', type='str', default=None,
                      help='train a new model using this labeled data file')
    parser.add_option('-c', '--svm', dest='svm', default=False,
                      action='store_true', help='use SVM instead of Naive Bayes for training')
    (options, args) = parser.parse_args()

    ## get test file (first positional argument, if any)
    if len(args) > 0:
        options.test = args[0]
        if not os.path.isfile(options.test): sbd_util.die('test path [%s] does not exist' %options.test)
    else:
        options.test = None
        if not options.train: sbd_util.die('you did not specify either train or test!')

    ## create model path
    if not options.model_path.endswith('/'): options.model_path += '/'
    if options.train:
        ## training: the model directory must not already exist
        if not os.path.isfile(options.train): sbd_util.die('model path [%s] does not exist' %options.train)
        if os.path.isdir(options.model_path): sbd_util.die('model path [%s] already exists' %options.model_path)
        else: os.mkdir(options.model_path)
    else:
        ## testing: fall back to the install root for a relative model path
        if not os.path.isdir(options.model_path):
            options.model_path = install_root + options.model_path
            if not os.path.isdir(options.model_path):
                sbd_util.die('model path [%s] does not exist' %options.model_path)

    ## create a model
    if options.train:
        model = build_model(options.train, options)

    if not options.test: sys.exit()

    print options.svm
    print options.test
    
    ## test: load a previously trained model when not training
    if not options.train:
        ## an svm model directory is recognized by its name
        if 'svm' in options.model_path: options.svm = True
        model = load_sbd_model(options.model_path, options.svm)
    ## NOTE(review): this handle is never explicitly closed -- confirm
    ## process exit is relied on to flush it
    if options.output: options.output = open(options.output, 'w')

    test = get_data(options.test, tokenize=True)
    test.featurize(model, verbose=True)
    model.classify(test, verbose=True)
    print options.tokenize
    print options.output
    test.segment(use_preds=True, tokenize=options.tokenize, output=options.output)
Beispiel #10
0
    parser.add_option('-t', '--tokenize', dest='tokenize', default=False,
                      action='store_true', help='write tokenized output')
    parser.add_option('-m', '--model', dest='model_path', type='str', default='model_nb',
                      help='model path')
    parser.add_option('-o', '--output', dest='output', type='str', default=None,
                      help='write sentences to this file')
    parser.add_option('-x', '--train', dest='train', type='str', default=None,
                      help='train a new model using this labeled data file')
    parser.add_option('-c', '--svm', dest='svm', default=False,
                      action='store_true', help='use SVM instead of Naive Bayes for training')
    (options, args) = parser.parse_args()

    ## get test file
    if len(args) > 0:
        options.test = args[0]
        if not os.path.isfile(options.test): sbd_util.die('test path [%s] does not exist' %options.test)
    else:
        options.test = None
        if not options.train: sbd_util.die('you did not specify either train or test!')

    ## create model path
    if not options.model_path.endswith('/'): options.model_path += '/'
    if options.train:
        if not os.path.isfile(options.train): sbd_util.die('model path [%s] does not exist' %options.train)
        if os.path.isdir(options.model_path): sbd_util.die('model path [%s] already exists' %options.model_path)
        else: os.mkdir(options.model_path)
    else:
        if not os.path.isdir(options.model_path):
            options.model_path = install_root + options.model_path
            if not os.path.isdir(options.model_path):
                sbd_util.die('model path [%s] does not exist' %options.model_path)
Beispiel #11
0
def poo():

    ## labeled data
    data_root = '/u/dgillick/workspace/sbd/'
    brown_data = data_root + 'whiskey/brown.1'
    wsj_data = data_root + 'whiskey/satz.1'
    poe_data = data_root + 'whiskey/poe.1'
    new_wsj_data = data_root + 'whiskey/wsj.1'

    ## install root
    install_root = '/home/chonger/Downloads/splitta/'

    ## options
    from optparse import OptionParser
    usage = 'usage: %prog [options] <text_file>'
    parser = OptionParser(usage=usage)
    parser.add_option('-v',
                      '--verbose',
                      dest='verbose',
                      default=False,
                      action='store_true',
                      help='verbose output')
    parser.add_option('-t',
                      '--tokenize',
                      dest='tokenize',
                      default=False,
                      action='store_true',
                      help='write tokenized output')
    parser.add_option('-m',
                      '--model',
                      dest='model_path',
                      type='str',
                      default='model_nb',
                      help='model path')
    parser.add_option('-o',
                      '--output',
                      dest='output',
                      type='str',
                      default=None,
                      help='write sentences to this file')
    parser.add_option('-x',
                      '--train',
                      dest='train',
                      type='str',
                      default=None,
                      help='train a new model using this labeled data file')
    parser.add_option('-c',
                      '--svm',
                      dest='svm',
                      default=False,
                      action='store_true',
                      help='use SVM instead of Naive Bayes for training')
    (options, args) = parser.parse_args()

    ## get test file
    if len(args) > 0:
        options.test = args[0]
        if not os.path.isfile(options.test):
            sbd_util.die('test path [%s] does not exist' % options.test)
    else:
        options.test = None
        if not options.train:
            sbd_util.die('you did not specify either train or test!')

    ## create model path
    if not options.model_path.endswith('/'): options.model_path += '/'
    if options.train:
        if not os.path.isfile(options.train):
            sbd_util.die('model path [%s] does not exist' % options.train)
        if os.path.isdir(options.model_path):
            sbd_util.die('model path [%s] already exists' % options.model_path)
        else:
            os.mkdir(options.model_path)
    else:
        if not os.path.isdir(options.model_path):
            options.model_path = install_root + options.model_path
            if not os.path.isdir(options.model_path):
                sbd_util.die('model path [%s] does not exist' %
                             options.model_path)

    ## create a model
    if options.train:
        model = build_model(options.train, options)

    if not options.test: sys.exit()

    print options.svm
    print options.test

    ## test
    if not options.train:
        if 'svm' in options.model_path: options.svm = True
        model = load_sbd_model(options.model_path, options.svm)
    if options.output: options.output = open(options.output, 'w')

    test = get_data(options.test, tokenize=True)
    test.featurize(model, verbose=True)
    model.classify(test, verbose=True)
    print options.tokenize
    print options.output
    test.segment(use_preds=True,
                 tokenize=options.tokenize,
                 output=options.output)