Example #1
def train_and_trial(train_file, test_file, train_parse='', test_parse='', pickled=True, use_dep=False):
    global use_dep_parse
    if use_dep:
        use_dep_parse = True
    if pickled:
        f = open(train_file, 'rb')
        traind = cPickle.load(f)
        f.close()
        f = open(test_file, 'rb')
        testd = cPickle.load(f)
        f.close()
    else:
        traind = XMLParser.create_exs(train_file)
        testd = XMLParser.create_exs(test_file)
    posi_words = semeval_util.get_liu_lexicon('positive-words.txt')
    negi_words = semeval_util.get_liu_lexicon('negative-words.txt')
    print "should really use better dictionary for sentence senti labels"
    senti_dictionary = semeval_util.get_mpqa_lexicon()
    train_sentiment = [senti_classify(sent, posi_words, negi_words) for sent in traind['orig']]

    dep_parses = []
    if use_dep_parse:
        dep_parses = semeval_util.add_dep_parse_features(traind['iob'], train_parse, dictionary=True, iobs=True)
    chunker = ConsecutiveChunkTagger(zip(traind['iob'],traind['polarity']), senti_dictionary,
                                     train_sentiment, dep_parses)
    print "done training"
    test_sentiment = [senti_classify(sent, posi_words, negi_words) for sent in testd['orig']]
    dep_parses = [[]] * len(test_sentiment)
    if use_dep_parse:
        dep_parses = semeval_util.add_dep_parse_features(testd['iob'], test_parse, dictionary=True, iobs=True)
    results = []
    for i in range(len(test_sentiment)):
        results.append(chunker.parse((testd['iob'][i], test_sentiment[i], dep_parses[i])))
    return results
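
A note on senti_classify: it is used above (and in several later examples) to turn a raw sentence plus the Liu positive/negative word lists into a sentence-level sentiment label, but its body is not shown anywhere in this listing. A minimal sketch under that assumption follows; the real tokenization and return convention may differ.

def senti_classify_sketch(sentence, posi_words, negi_words):
    """Hypothetical stand-in for senti_classify: label a sentence by
    counting hits against the Liu positive/negative word lists."""
    pos_set, neg_set = set(posi_words), set(negi_words)
    tokens = sentence.lower().split()
    pos_hits = sum(1 for tok in tokens if tok in pos_set)
    neg_hits = sum(1 for tok in tokens if tok in neg_set)
    if pos_hits > neg_hits:
        return 'positive'
    elif neg_hits > pos_hits:
        return 'negative'
    return 'neutral'
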
Example #2
def train_and_trial(trn_file,
                    test_file,
                    clf,
                    posit_lex_file='positive-words.txt',
                    nega_lex_file='negative-words.txt',
                    pickled=False):
    """ Train on the training file and test on the testing file,
    given a classifier, for the aspect extraction task.
    """
    if pickled:
        f = open(trn_file, 'rb')
        traind = cPickle.load(f)
        f.close()
        f = open(test_file, 'rb')
        testd = cPickle.load(f)
        f.close()
    else:
        traind = XMLParser.create_exs(trn_file)
        testd = XMLParser.create_exs(test_file)
    posi_words = semeval_util.get_liu_lexicon(posit_lex_file)
    negi_words = semeval_util.get_liu_lexicon(nega_lex_file)
    senti_dictionary = semeval_util.get_mpqa_lexicon()
    #chunker = ConsecutiveChunker(traind['iob'], senti_dictionary)
    chunker = clf.train(traind['iob'], senti_dictionary)
    print "done training"

    guessed_iobs = chunker.evaluate(testd['iob'])
    XMLParser.create_xml(testd['orig'], guessed_iobs, testd['id'],
                         testd['idx'], 'trial_answers.xml')
    compute_pr(testd['iob'], guessed_iobs)
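
Example #2 leaves the concrete model behind the clf argument: all it assumes is an object whose train(iob_data, senti_dictionary) call returns something with an evaluate(iob_data) method. A hypothetical adapter satisfying that interface, mirroring the commented-out ConsecutiveChunker line above (the real constructor may take different arguments):

class ChunkerTrainer(object):
    """Hypothetical adapter for the clf argument of train_and_trial."""
    def train(self, iob_data, senti_dictionary):
        # Mirrors the commented-out construction above; the actual
        # ConsecutiveChunker signature may differ across these examples.
        return ConsecutiveChunker(iob_data, senti_dictionary)

# usage sketch (file names are placeholders):
# train_and_trial('train.xml', 'test.xml', ChunkerTrainer())
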
Example #3
def train_and_trial(trn_file,
                    test_file,
                    parse_file_train,
                    parse_file_test,
                    use_dep=False,
                    posit_lex_file='positive-words.txt',
                    nega_lex_file='negative-words.txt',
                    pickled=False):
    """ Train on the training file and test on the testing file
    """
    global use_dep_parse
    if use_dep:
        use_dep_parse = True
    if pickled:
        f = open(trn_file, 'rb')
        traind = cPickle.load(f)
        f.close()
        f = open(test_file, 'rb')
        testd = cPickle.load(f)
        f.close()
    else:
        traind = XMLParser.create_exs(trn_file)
        testd = XMLParser.create_exs(test_file)
    #posi_words = semeval_util.get_liu_lexicon(posit_lex_file)
    #negi_words = semeval_util.get_liu_lexicon(nega_lex_file)
    dep_parses = []
    if use_dep_parse:
        dep_parses = semeval_util.add_dep_parse_features(traind['iob'],
                                                         parse_file_train,
                                                         dictionary=True,
                                                         iobs=True)
    senti_dictionary = semeval_util.get_mpqa_lexicon()
    chunker = ConsecutiveChunker(traind['iob'], testd['iob'], senti_dictionary,
                                 dep_parses)
    print "done training on %d examples" % len(traind['iob'])
    '''
    f = open('learned.pkl','wb')
    cPickle.dump(chunker,f)
    f.close()
    '''
    if use_dep_parse:
        #build the dependency-parse features for the test sentences
        dep_parses = semeval_util.add_dep_parse_features(testd['iob'],
                                                         parse_file_test,
                                                         dictionary=True,
                                                         iobs=True)

    guessed_iobs = chunker.evaluate([testd['iob'], dep_parses])
    ###semeval_util.compute_pr(testd['iob'], guessed_iobs)
    return guessed_iobs
Example #4
def add_dep_parse_features(original,
                           parse_file,
                           pickled=True,
                           dictionary=False,
                           iobs=False):
    """Create the dependency tree dictionaries that we need for each sentence
    in the input corpus.
    Inputs:
    original: pickled version of our dictionary, or the dictionary itself,
    or the original XML file
    """
    if pickled and not dictionary:
        f = open(original, 'rb')
        traind = cPickle.load(f)
        f.close()
    elif dictionary:
        traind = original
    else:
        traind = XMLParser.create_exs(original)
    f = open(parse_file, 'r')
    lines = f.readlines()
    f.close()
    dep_trees = transform_dep_parse(lines)
    senti_dictionary = get_mpqa_lexicon()
    if iobs:
        new_dep_trees = integrate_dep_iob(traind, dep_trees, senti_dictionary)
    else:
        new_dep_trees = integrate_dep_iob(traind['iob'], dep_trees,
                                          senti_dictionary)
    return new_dep_trees
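
The pickled, dictionary and iobs flags only control how original is interpreted. The three call patterns implied by the branches above look roughly like this (file names and traind are placeholders):

# 1) original is a pickled corpus dictionary on disk (the default flags):
trees = add_dep_parse_features('train_corpus.pkl', 'dep_parse.txt')

# 2) original is the in-memory IOB list itself, as the other examples call it:
trees = add_dep_parse_features(traind['iob'], 'dep_parse.txt',
                               dictionary=True, iobs=True)

# 3) original is the raw SemEval XML file:
trees = add_dep_parse_features('train.xml', 'dep_parse.txt', pickled=False)
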
Example #5
def train_and_test(filename, parse_file, use_deps=False,
                   posit_lex_file='positive-words.txt', nega_lex_file='negative-words.txt'):
    """Creates an 80/20 split of the examples in filename,
    trains the chunker on 80%, and evaluates the learned chunker on 20%.
    """
    global use_dep_parse
    if use_deps:
        use_dep_parse = True
    traind = XMLParser.create_exs(filename)
    dep_parses = []
    if use_dep_parse:
        dep_parses = semeval_util.add_dep_parse_features(traind['iob'], parse_file, dictionary=True, iobs=True)
    n = len(traind['iob'])
    split_size = int(n * 0.8)
    train = traind['iob'][:split_size]
    test = traind['iob'][split_size:]
    test_deps = []
    if use_dep_parse:
        test_deps = dep_parses[split_size:]
    #Liu not in use for now
    #posi_words = semeval_util.get_liu_lexicon(posit_lex_file)
    #negi_words = semeval_util.get_liu_lexicon(nega_lex_file)
    senti_dictionary = semeval_util.get_mpqa_lexicon()
    chunker = ConsecutiveChunker(train, test, senti_dictionary, dep_parses)
    guessed_iobs = chunker.evaluate([test,test_deps])
    semeval_util.compute_pr(test, guessed_iobs)
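
The 80/20 split above simply takes the first 80% of the corpus in file order. If the XML groups sentences (for example by review), a shuffled split may give a fairer estimate; the sketch below assumes only that the IOB sentences and their dependency parses are parallel lists that must be permuted together (when parses are not in use, pass a list of empty placeholders as the examples do).

import random

def shuffled_split(iobs, dep_parses, train_frac=0.8, seed=0):
    """Permute the example indices once, then slice both parallel lists
    with the same permutation so IOB sentences and parses stay aligned."""
    idx = list(range(len(iobs)))
    random.Random(seed).shuffle(idx)
    cut = int(len(idx) * train_frac)
    train_idx, test_idx = idx[:cut], idx[cut:]
    return ([iobs[i] for i in train_idx], [iobs[i] for i in test_idx],
            [dep_parses[i] for i in train_idx],
            [dep_parses[i] for i in test_idx])
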
Example #6
def K_fold_train_and_test(filename, posit_lex_file='positive-words.txt', nega_lex_file='negative-words.txt', k=2, pickled=False):
    """Does K-fold cross-validation on the given filename
    """
    if pickled:
        f = open(filename, 'rb')
        traind = cPickle.load(f)
        f.close()
    else:
        traind = XMLParser.create_exs(filename)
    n = len(traind['iob'])
    #posi_words = get_liu_lexicon(posit_lex_file)
    #negi_words = get_liu_lexicon(nega_lex_file)
    senti_dictionary = semeval_util.get_mpqa_lexicon()
    kf = cross_validation.KFold(n, n_folds=k, indices=True)
    tot_p, tot_r, tot_f1 = 0, 0, 0
    for train, test in kf:
        print "next fold, split size: %d/%d" %(len(train), len(test))
        #print train
        train_set = []
        test_set = []
        for i in train:
            train_set.append(traind['iob'][i])
        for i in test:
            test_set.append(traind['iob'][i])
        chunker = ConsecutiveChunker(train_set, senti_dictionary)
        guesses = chunker.evaluate(test_set)
        print test_set
        print guesses
        r, p, f = semeval_util.compute_pr(test_set, guesses)
        tot_p += p
        tot_r += r
        tot_f1 += f
    print "ave Prec: %.2f, Rec: %.2f, F1: %.2f" %(tot_p/float(k), tot_r/float(k), tot_f1/float(k))
Example #7
def create_parses_from_dict(input, ofile='dep_parse.txt', pickled=True):
    if pickled:
        f = open(input, 'rb')
        traind = cPickle.load(f)
        f.close()
    else:
        traind = XMLParser.create_exs(input)
    return stanford_parse(traind['orig'], ofile)
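
Because pickled defaults to True here, the argument is normally the path to an already-pickled corpus dictionary; pass pickled=False to parse the raw XML first. Usage sketches (file names are placeholders):

# corpus already pickled (default):
create_parses_from_dict('train_corpus.pkl', ofile='dep_parse.txt')

# parse the raw SemEval XML instead:
create_parses_from_dict('train.xml', ofile='dep_parse.txt', pickled=False)
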
Example #8
def K_fold_train_and_test(filename,
                          parse_file,
                          use_dep=False,
                          posit_lex_file='positive-words.txt',
                          nega_lex_file='negative-words.txt',
                          k=5,
                          pickled=False):
    """Does K-fold cross-validation on the given filename
    """
    global use_dep_parse
    if use_dep:
        print "using dependency parses"
        use_dep_parse = True
    if pickled:
        f = open(filename, 'rb')
        traind = cPickle.load(f)
        f.close()
    else:
        traind = XMLParser.create_exs(filename)
    n = len(traind['iob'])
    #placeholder so dep_parses[i] below indexes safely even without parses
    dep_parses = traind['iob']
    if use_dep_parse:
        dep_parses = semeval_util.add_dep_parse_features(traind['iob'],
                                                         parse_file,
                                                         dictionary=True,
                                                         iobs=True)
    #posi_words = semeval_util.get_liu_lexicon(posit_lex_file)
    #negi_words = semeval_util.get_liu_lexicon(nega_lex_file)
    senti_dictionary = semeval_util.get_mpqa_lexicon()
    kf = cross_validation.KFold(n, n_folds=k, indices=True)
    tot_p, tot_r, tot_f1 = 0, 0, 0
    for train, test in kf:
        print "next fold, split size: %d/%d" % (len(train), len(test))
        #print train
        train_set = []
        test_set = []
        train_parse = []
        test_parse = []
        for i in train:
            train_set.append(traind['iob'][i])
            train_parse.append(dep_parses[i])
        for i in test:
            test_set.append(traind['iob'][i])
            test_parse.append(dep_parses[i])
        chunker = ConsecutiveChunker(train_set, test_set, senti_dictionary,
                                     train_parse)
        guesses = chunker.evaluate([test_set, test_parse])
        #print test_set
        #print guesses
        r, p, f = semeval_util.compute_pr(test_set, guesses)
        tot_p += p
        tot_r += r
        tot_f1 += f
    print "ave Prec: %.2f, Rec: %.2f, F1: %.2f" % (tot_p / float(k), tot_r /
                                                   float(k), tot_f1 / float(k))
Example #9
def k_fold(filename, parse_filename, k=5, pickled=True, use_dep=False):
    global use_dep_parse
    if use_dep:
        use_dep_parse = True

    if pickled:
        f = open(filename, 'rb')
        traind = cPickle.load(f)
        f.close()
    else:
        traind = XMLParser.create_exs(filename)
    n = len(traind['iob'])
    posi_words = semeval_util.get_liu_lexicon('positive-words.txt')
    negi_words = semeval_util.get_liu_lexicon('negative-words.txt')
    senti_dictionary = semeval_util.get_mpqa_lexicon()

    full_senti_label = [senti_classify(sentence, posi_words, negi_words) for sentence in traind['orig']]
    dep_parses = [[]] * n
    if use_dep_parse:
        dep_parses = semeval_util.add_dep_parse_features(traind['iob'], parse_filename, dictionary=True, iobs=True)
    kf = cross_validation.KFold(n, n_folds=k, indices=True)
    tot_acc = 0.
    for train, test in kf:
        print "next fold, split size: %d/%d" %(len(train), len(test))
        #print train
        train_set = []
        train_sentis = []
        train_parse = []

        test_set = []
        test_sentis = []
        test_parse = []
        for i in train:
            train_set.append((traind['iob'][i], traind['polarity'][i]))
            train_sentis.append(full_senti_label[i])
            train_parse.append(dep_parses[i])
        for i in test:
            test_set.append((traind['iob'][i], traind['polarity'][i]))
            test_sentis.append(full_senti_label[i])
            test_parse.append(dep_parses[i])
        chunker = ConsecutiveChunkTagger(train_set, senti_dictionary, train_sentis, train_parse)
        acc = chunker.evaluate(zip(test_set, test_sentis, test_parse))
        print "acc:", acc
        tot_acc += acc
    print "average acc:", tot_acc/k
Example #10
def K_fold_err_analysis(filename, parse_file, k=5, p=0.15, pickled=False):
    """Does K-fold cross-validation on the given filename, but only p percent of it
    """

    if pickled:
        f = open(filename, 'rb')
        traind = cPickle.load(f)
        f.close()
    else:
        traind = XMLParser.create_exs(filename)
    #####
    n = int(len(traind['iob']) * p)
    dep_parses = traind['iob']
    #if use_dep_parse:
    #    dep_parses = add_dep_parse_features(traind['iob'], parse_file, dictionary=True, iobs=True)
    senti_dictionary = get_mpqa_lexicon()
    kf = cross_validation.KFold(n, n_folds=k, indices=True)
    tot_p, tot_r, tot_f1 = 0, 0, 0
    for train, test in kf:
        print "next fold, split size: %d/%d" % (len(train), len(test))
        #print train
        train_set = []
        test_set = []
        train_parse = []
        test_parse = []
        for i in train:
            train_set.append(traind['iob'][i])
            train_parse.append(dep_parses[i])
        for i in test:
            test_set.append(traind['iob'][i])
            test_parse.append(dep_parses[i])
        chunker = semevalTask4.ConsecutiveChunker(train_set, test_set,
                                                  senti_dictionary,
                                                  train_parse)
        guesses = chunker.evaluate([test_set, test_parse])
        r, p, f = compute_pr(test_set, guesses)
        tot_p += p
        tot_r += r
        tot_f1 += f
        #JUST ONE SPLIT FOR NOW!!!
        return
    print "ave Prec: %.2f, Rec: %.2f, F1: %.2f" % (tot_p / float(k), tot_r /
                                                   float(k), tot_f1 / float(k))
Example #11
def train_and_test(filename, parse_file,
                   posit_lex_file='positive-words.txt', nega_lex_file='negative-words.txt',
                   pickled=False, use_dep=False):
    """Creates an 80/20 split of the examples in filename,
    trains the sentiment classifier on 80%, and evaluates the learned classifier on 20%.
    """
    global use_dep_parse
    if use_dep:
        use_dep_parse = True
    if pickled:
        f = open(filename, 'rb')
        traind = cPickle.load(f)
        f.close()
    else:
        traind = XMLParser.create_exs(filename)
    n = len(traind['iob'])
    split_size = int(n * 0.8)
    train = zip(traind['iob'][:split_size], traind['polarity'][:split_size])
    test = zip(traind['iob'][split_size:], traind['polarity'][split_size:])
    posi_words = semeval_util.get_liu_lexicon(posit_lex_file)
    negi_words = semeval_util.get_liu_lexicon(nega_lex_file)
    senti_dictionary = semeval_util.get_mpqa_lexicon()

    full_senti_label = [senti_classify(sentence, posi_words, negi_words) for sentence in traind['orig']]

    dep_parses = []
    if use_dep_parse:
        dep_parses = semeval_util.add_dep_parse_features(traind['iob'], parse_file, dictionary=True, iobs=True)
        print "first dep_parse:", dep_parses[0]
        print "first train ex:", train[0]
        print "size parses all:", len(dep_parses), "vs train:", len(dep_parses[:split_size])

    chunker = ConsecutiveChunkTagger(train, senti_dictionary, full_senti_label, dep_parses[:split_size])
    print "done training"

    if use_dep_parse:
        dep_parses = dep_parses[split_size:]
        print "first test dep parse:", dep_parses[0]
        print "first test ex:", test[0]
    else:
        #even when not using parses, zip needs one (possibly empty) parse entry per test example
        dep_parses = [[]] * (n - split_size)
    print chunker.evaluate(zip(test, full_senti_label[split_size:], dep_parses))
Example #12
def train_and_test(filename, posit_lex_file='positive-words.txt', nega_lex_file='negative-words.txt', pickled=False):
    """Creates an 80/20 split of the examples in filename,
    trains the chunker on 80%, and evaluates the learned chunker on 20%.
    """
    if pickled:
        f = open(filename, 'rb')
        traind = cPickle.load(f)
        f.close()
    else:
        traind = XMLParser.create_exs(filename)
    n = len(traind['iob'])
    split_size = int(n * 0.8)
    train = traind['iob'][:split_size]
    test = traind['iob'][split_size:]
    #posi_words = get_liu_lexicon(posit_lex_file)
    #negi_words = get_liu_lexicon(nega_lex_file)
    senti_dictionary = semeval_util.get_mpqa_lexicon()
    chunker = ConsecutiveChunker()
    chunker.train(train, senti_dictionary)
    guessed_iobs = chunker.evaluate(test)
    semeval_util.compute_pr(test, guessed_iobs)
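
All of these examples are written for Python 2: print is a statement, cPickle is the C pickle module, and zip returns a list. Under Python 3 the same idioms would look roughly like the sketch below (every print "..." statement likewise becomes a print(...) call).

import pickle  # cPickle was folded into pickle in Python 3

def load_pickled_corpus(filename):
    # plays the role of the open / cPickle.load / close blocks above
    with open(filename, 'rb') as f:
        return pickle.load(f)

def split_with_polarity(traind, split_size):
    # zip() returns an iterator in Python 3, so materialize it before
    # indexing it or iterating over it more than once
    return list(zip(traind['iob'][:split_size],
                    traind['polarity'][:split_size]))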