コード例 #1
0
ファイル: task4_stask2.py プロジェクト: omarjnb/semeval
def train_and_trial(train_file, test_file, train_parse='', test_parse='', pickled=True, use_dep=False):
    """Train the polarity chunk tagger on one file and label a second file.

    Args:
        train_file: Path to the training examples.
        test_file: Path to the examples to label.
        train_parse: Dependency-parse file for the training examples; only
            read when dependency features are enabled.
        test_parse: Dependency-parse file for the test examples; only read
            when dependency features are enabled.
        pickled: If true, both files are loaded with cPickle; otherwise they
            are read with XMLParser.create_exs.
        use_dep: If true, switches the module-level ``use_dep_parse`` flag on.
            The flag is never switched off here, so it stays on afterwards.

    Returns:
        A list holding one ``chunker.parse(...)`` result per test example,
        in the order of the test data.
    """
    global use_dep_parse
    if use_dep:
        use_dep_parse = True
    if pickled:
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        # The with-blocks close the files even if loading raises.
        with open(train_file, 'rb') as f:
            traind = cPickle.load(f)
        with open(test_file, 'rb') as f:
            testd = cPickle.load(f)
    else:
        traind = XMLParser.create_exs(train_file)
        testd = XMLParser.create_exs(test_file)
    # The Liu lexicon file names are fixed, relative paths.
    posi_words = semeval_util.get_liu_lexicon('positive-words.txt')
    negi_words = semeval_util.get_liu_lexicon('negative-words.txt')
    print("should really use better dictionary for sentence senti labels")
    senti_dictionary = semeval_util.get_mpqa_lexicon()
    train_sentiment = [senti_classify(sent, posi_words, negi_words) for sent in traind['orig']]

    dep_parses = []
    if use_dep_parse:
        dep_parses = semeval_util.add_dep_parse_features(traind['iob'], train_parse, dictionary=True, iobs=True)
    chunker = ConsecutiveChunkTagger(zip(traind['iob'], traind['polarity']), senti_dictionary,
                                     train_sentiment, dep_parses)
    print("done training")

    test_sentiment = [senti_classify(sent, posi_words, negi_words) for sent in testd['orig']]
    # Placeholder parses (one empty list per sentence) keep the per-example
    # tuples the same shape when dependency features are off.
    dep_parses = [[]] * len(test_sentiment)
    if use_dep_parse:
        dep_parses = semeval_util.add_dep_parse_features(testd['iob'], test_parse, dictionary=True, iobs=True)
    # Index explicitly (rather than zip) so a length mismatch between the
    # three sequences still raises instead of being silently truncated.
    return [chunker.parse((testd['iob'][i], senti, dep_parses[i]))
            for i, senti in enumerate(test_sentiment)]
コード例 #2
0
ファイル: semevalTask4.py プロジェクト: omarjnb/semeval
def train_and_trial(trn_file,
                    test_file,
                    parse_file_train,
                    parse_file_test,
                    use_dep=False,
                    posit_lex_file='positive-words.txt',
                    nega_lex_file='negative-words.txt',
                    pickled=False):
    """Train on the training file and label the testing file.

    Args:
        trn_file: Path to the training examples.
        test_file: Path to the examples to label.
        parse_file_train: Dependency-parse file for the training examples;
            only read when dependency features are enabled.
        parse_file_test: Dependency-parse file for the test examples; only
            read when dependency features are enabled.
        use_dep: If true, switches the module-level ``use_dep_parse`` flag on.
            The flag is never switched off here, so it stays on afterwards.
        posit_lex_file: Accepted for compatibility; not read at present.
        nega_lex_file: Accepted for compatibility; not read at present.
        pickled: If true, both data files are loaded with cPickle; otherwise
            they are read with XMLParser.create_exs.

    Returns:
        Whatever ``chunker.evaluate`` returns for the test examples.
    """
    global use_dep_parse
    if use_dep:
        use_dep_parse = True
    if pickled:
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        # The with-blocks close the files even if loading raises.
        with open(trn_file, 'rb') as f:
            traind = cPickle.load(f)
        with open(test_file, 'rb') as f:
            testd = cPickle.load(f)
    else:
        traind = XMLParser.create_exs(trn_file)
        testd = XMLParser.create_exs(test_file)

    dep_parses = []
    if use_dep_parse:
        dep_parses = semeval_util.add_dep_parse_features(traind['iob'],
                                                         parse_file_train,
                                                         dictionary=True,
                                                         iobs=True)
    senti_dictionary = semeval_util.get_mpqa_lexicon()
    chunker = ConsecutiveChunker(traind['iob'], testd['iob'], senti_dictionary,
                                 dep_parses)
    print("done training on %d examples" % len(traind['iob']))

    if use_dep_parse:
        # The test parse file has to be paired with the *test* sentences;
        # pairing it with the training sentences misaligned the features.
        dep_parses = semeval_util.add_dep_parse_features(testd['iob'],
                                                         parse_file_test,
                                                         dictionary=True,
                                                         iobs=True)

    guessed_iobs = chunker.evaluate([testd['iob'], dep_parses])
    return guessed_iobs
コード例 #3
0
ファイル: semevalTask4.py プロジェクト: cindithompson/semeval
def train_and_test(filename, parse_file, use_deps=False,
                   posit_lex_file='positive-words.txt', nega_lex_file='negative-words.txt'):
    """Train the chunker on the first 80% of a file and score it on the rest.

    Args:
        filename: Path to the examples, read with XMLParser.create_exs.
        parse_file: Dependency-parse file for those examples; only read when
            dependency features are enabled.
        use_deps: If true, switches the module-level ``use_dep_parse`` flag
            on. The flag is never switched off here, so it stays on afterwards.
        posit_lex_file: Accepted for compatibility; not read at present.
        nega_lex_file: Accepted for compatibility; not read at present.

    Returns:
        None. The held-out guesses are handed to semeval_util.compute_pr.
    """
    global use_dep_parse
    if use_deps:
        use_dep_parse = True
    traind = XMLParser.create_exs(filename)
    dep_parses = []
    if use_dep_parse:
        dep_parses = semeval_util.add_dep_parse_features(traind['iob'], parse_file, dictionary=True, iobs=True)

    split_size = int(len(traind['iob']) * 0.8)
    train = traind['iob'][:split_size]
    test = traind['iob'][split_size:]
    # Slice the parses with the same boundary so each half stays aligned with
    # its own examples and training never sees held-out parses. With
    # dependency features off both slices are simply empty lists.
    train_deps = dep_parses[:split_size]
    test_deps = dep_parses[split_size:]

    senti_dictionary = semeval_util.get_mpqa_lexicon()
    chunker = ConsecutiveChunker(train, test, senti_dictionary, train_deps)
    guessed_iobs = chunker.evaluate([test, test_deps])
    semeval_util.compute_pr(test, guessed_iobs)
コード例 #4
0
ファイル: semevalTask4.py プロジェクト: omarjnb/semeval
def train_and_test(filename,
                   parse_file,
                   use_deps=False,
                   posit_lex_file='positive-words.txt',
                   nega_lex_file='negative-words.txt'):
    """Train the chunker on the first 80% of a file and score it on the rest.

    Args:
        filename: Path to the examples, read with XMLParser.create_exs.
        parse_file: Dependency-parse file for those examples; only read when
            dependency features are enabled.
        use_deps: If true, switches the module-level ``use_dep_parse`` flag
            on. The flag is never switched off here, so it stays on afterwards.
        posit_lex_file: Accepted for compatibility; not read at present.
        nega_lex_file: Accepted for compatibility; not read at present.

    Returns:
        None. The held-out guesses are handed to semeval_util.compute_pr.
    """
    global use_dep_parse
    if use_deps:
        use_dep_parse = True
    traind = XMLParser.create_exs(filename)
    dep_parses = []
    if use_dep_parse:
        dep_parses = semeval_util.add_dep_parse_features(traind['iob'],
                                                         parse_file,
                                                         dictionary=True,
                                                         iobs=True)

    split_size = int(len(traind['iob']) * 0.8)
    train = traind['iob'][:split_size]
    test = traind['iob'][split_size:]
    # Slice the parses with the same boundary so each half stays aligned with
    # its own examples and training never sees held-out parses. With
    # dependency features off both slices are simply empty lists.
    train_deps = dep_parses[:split_size]
    test_deps = dep_parses[split_size:]

    senti_dictionary = semeval_util.get_mpqa_lexicon()
    chunker = ConsecutiveChunker(train, test, senti_dictionary, train_deps)
    guessed_iobs = chunker.evaluate([test, test_deps])
    semeval_util.compute_pr(test, guessed_iobs)
コード例 #5
0
ファイル: semevalTask4.py プロジェクト: omarjnb/semeval
def K_fold_train_and_test(filename,
                          parse_file,
                          use_dep=False,
                          posit_lex_file='positive-words.txt',
                          nega_lex_file='negative-words.txt',
                          k=5,
                          pickled=False):
    """Run k-fold cross-validation of the chunker on the given file.

    For every fold a fresh chunker is trained on the training indices and
    evaluated on the test indices; the averaged scores are printed.

    Args:
        filename: Path to the examples.
        parse_file: Dependency-parse file for those examples; only read when
            dependency features are enabled.
        use_dep: If true, switches the module-level ``use_dep_parse`` flag on.
            The flag is never switched off here, so it stays on afterwards.
        posit_lex_file: Accepted for compatibility; not read at present.
        nega_lex_file: Accepted for compatibility; not read at present.
        k: Number of folds requested from cross_validation.KFold.
        pickled: If true, the file is loaded with cPickle; otherwise it is
            read with XMLParser.create_exs.

    Returns:
        None. Per-fold sizes and the average scores are printed.
    """
    global use_dep_parse
    if use_dep:
        print("using dependency parses")
        use_dep_parse = True
    if pickled:
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        # The with-block closes the file even if loading raises.
        with open(filename, 'rb') as f:
            traind = cPickle.load(f)
    else:
        traind = XMLParser.create_exs(filename)
    n = len(traind['iob'])

    # Without dependency features the IOB data doubles as a same-length
    # stand-in so the per-fold indexing below still works.
    # NOTE(review): presumably the chunker ignores it in that case - confirm.
    dep_parses = traind['iob']
    if use_dep_parse:
        dep_parses = semeval_util.add_dep_parse_features(traind['iob'],
                                                         parse_file,
                                                         dictionary=True,
                                                         iobs=True)
    senti_dictionary = semeval_util.get_mpqa_lexicon()
    kf = cross_validation.KFold(n, n_folds=k, indices=True)
    tot_p, tot_r, tot_f1 = 0, 0, 0
    for train, test in kf:
        print("next fold, split size: %d/%d" % (len(train), len(test)))
        train_set = [traind['iob'][i] for i in train]
        train_parse = [dep_parses[i] for i in train]
        test_set = [traind['iob'][i] for i in test]
        test_parse = [dep_parses[i] for i in test]
        chunker = ConsecutiveChunker(train_set, test_set, senti_dictionary,
                                     train_parse)
        guesses = chunker.evaluate([test_set, test_parse])
        # compute_pr's result is unpacked as (recall, precision, f1).
        r, p, f = semeval_util.compute_pr(test_set, guesses)
        tot_p += p
        tot_r += r
        tot_f1 += f
    folds = float(k)
    print("ave Prec: %.2f, Rec: %.2f, F1: %.2f" % (tot_p / folds,
                                                   tot_r / folds,
                                                   tot_f1 / folds))
コード例 #6
0
ファイル: semevalTask4.py プロジェクト: cindithompson/semeval
def train_and_trial(trn_file, test_file, parse_file_train, parse_file_test, use_dep=False,
                    posit_lex_file='positive-words.txt', nega_lex_file='negative-words.txt', pickled=False):
    """Train on the training file and label the testing file.

    Args:
        trn_file: Path to the training examples.
        test_file: Path to the examples to label.
        parse_file_train: Dependency-parse file for the training examples;
            only read when dependency features are enabled.
        parse_file_test: Dependency-parse file for the test examples; only
            read when dependency features are enabled.
        use_dep: If true, switches the module-level ``use_dep_parse`` flag on.
            The flag is never switched off here, so it stays on afterwards.
        posit_lex_file: Accepted for compatibility; not read at present.
        nega_lex_file: Accepted for compatibility; not read at present.
        pickled: If true, both data files are loaded with cPickle; otherwise
            they are read with XMLParser.create_exs.

    Returns:
        Whatever ``chunker.evaluate`` returns for the test examples.
    """
    global use_dep_parse
    if use_dep:
        use_dep_parse = True
    if pickled:
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        # The with-blocks close the files even if loading raises.
        with open(trn_file, 'rb') as f:
            traind = cPickle.load(f)
        with open(test_file, 'rb') as f:
            testd = cPickle.load(f)
    else:
        traind = XMLParser.create_exs(trn_file)
        testd = XMLParser.create_exs(test_file)

    dep_parses = []
    if use_dep_parse:
        dep_parses = semeval_util.add_dep_parse_features(traind['iob'], parse_file_train, dictionary=True, iobs=True)
    senti_dictionary = semeval_util.get_mpqa_lexicon()
    chunker = ConsecutiveChunker(traind['iob'], testd['iob'], senti_dictionary, dep_parses)
    print("done training on %d examples" % len(traind['iob']))

    if use_dep_parse:
        # The test parse file has to be paired with the *test* sentences;
        # pairing it with the training sentences misaligned the features.
        dep_parses = semeval_util.add_dep_parse_features(testd['iob'], parse_file_test, dictionary=True, iobs=True)

    guessed_iobs = chunker.evaluate([testd['iob'], dep_parses])
    return guessed_iobs
コード例 #7
0
ファイル: task4_stask2.py プロジェクト: omarjnb/semeval
def k_fold(filename, parse_filename, k=5, pickled=True, use_dep=False):
    """Run k-fold cross-validation of the polarity chunk tagger.

    For every fold a fresh ConsecutiveChunkTagger is trained on the training
    indices and evaluated on the test indices; each fold's accuracy and the
    average over the folds are printed.

    Args:
        filename: Path to the examples.
        parse_filename: Dependency-parse file for those examples; only read
            when dependency features are enabled.
        k: Number of folds requested from cross_validation.KFold.
        pickled: If true, the file is loaded with cPickle; otherwise it is
            read with XMLParser.create_exs.
        use_dep: If true, switches the module-level ``use_dep_parse`` flag on.
            The flag is never switched off here, so it stays on afterwards.

    Returns:
        None. Results are printed.
    """
    global use_dep_parse
    if use_dep:
        use_dep_parse = True

    if pickled:
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        # The with-block closes the file even if loading raises.
        with open(filename, 'rb') as f:
            traind = cPickle.load(f)
    else:
        traind = XMLParser.create_exs(filename)
    n = len(traind['iob'])
    # The Liu lexicon file names are fixed, relative paths.
    posi_words = semeval_util.get_liu_lexicon('positive-words.txt')
    negi_words = semeval_util.get_liu_lexicon('negative-words.txt')
    senti_dictionary = semeval_util.get_mpqa_lexicon()

    full_senti_label = [senti_classify(sentence, posi_words, negi_words) for sentence in traind['orig']]
    # One independent empty placeholder per example; `[[]] * n` would repeat
    # a single shared list object n times.
    dep_parses = [[] for _ in range(n)]
    if use_dep_parse:
        dep_parses = semeval_util.add_dep_parse_features(traind['iob'], parse_filename, dictionary=True, iobs=True)
    kf = cross_validation.KFold(n, n_folds=k, indices=True)
    tot_acc = 0.
    for train, test in kf:
        print("next fold, split size: %d/%d" % (len(train), len(test)))
        train_set = [(traind['iob'][i], traind['polarity'][i]) for i in train]
        train_sentis = [full_senti_label[i] for i in train]
        train_parse = [dep_parses[i] for i in train]

        test_set = [(traind['iob'][i], traind['polarity'][i]) for i in test]
        test_sentis = [full_senti_label[i] for i in test]
        test_parse = [dep_parses[i] for i in test]

        chunker = ConsecutiveChunkTagger(train_set, senti_dictionary, train_sentis, train_parse)
        acc = chunker.evaluate(zip(test_set, test_sentis, test_parse))
        print("acc: %s" % (acc,))
        tot_acc += acc
    print("average acc: %s" % (tot_acc / k,))
コード例 #8
0
ファイル: semevalTask4.py プロジェクト: cindithompson/semeval
def K_fold_train_and_test(filename, parse_file, use_dep=False,
                          posit_lex_file='positive-words.txt', nega_lex_file='negative-words.txt', k=5, pickled=False):
    """Run k-fold cross-validation of the chunker on the given file.

    For every fold a fresh chunker is trained on the training indices and
    evaluated on the test indices; the averaged scores are printed.

    Args:
        filename: Path to the examples.
        parse_file: Dependency-parse file for those examples; only read when
            dependency features are enabled.
        use_dep: If true, switches the module-level ``use_dep_parse`` flag on.
            The flag is never switched off here, so it stays on afterwards.
        posit_lex_file: Accepted for compatibility; not read at present.
        nega_lex_file: Accepted for compatibility; not read at present.
        k: Number of folds requested from cross_validation.KFold.
        pickled: If true, the file is loaded with cPickle; otherwise it is
            read with XMLParser.create_exs.

    Returns:
        None. Per-fold sizes and the average scores are printed.
    """
    global use_dep_parse
    if use_dep:
        print("using dependency parses")
        use_dep_parse = True
    if pickled:
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        # The with-block closes the file even if loading raises.
        with open(filename, 'rb') as f:
            traind = cPickle.load(f)
    else:
        traind = XMLParser.create_exs(filename)
    n = len(traind['iob'])

    # Without dependency features the IOB data doubles as a same-length
    # stand-in so the per-fold indexing below still works.
    # NOTE(review): presumably the chunker ignores it in that case - confirm.
    dep_parses = traind['iob']
    if use_dep_parse:
        dep_parses = semeval_util.add_dep_parse_features(traind['iob'], parse_file, dictionary=True, iobs=True)
    senti_dictionary = semeval_util.get_mpqa_lexicon()
    kf = cross_validation.KFold(n, n_folds=k, indices=True)
    tot_p, tot_r, tot_f1 = 0, 0, 0
    for train, test in kf:
        print("next fold, split size: %d/%d" % (len(train), len(test)))
        train_set = [traind['iob'][i] for i in train]
        train_parse = [dep_parses[i] for i in train]
        test_set = [traind['iob'][i] for i in test]
        test_parse = [dep_parses[i] for i in test]
        chunker = ConsecutiveChunker(train_set, test_set, senti_dictionary, train_parse)
        guesses = chunker.evaluate([test_set, test_parse])
        # compute_pr's result is unpacked as (recall, precision, f1).
        r, p, f = semeval_util.compute_pr(test_set, guesses)
        tot_p += p
        tot_r += r
        tot_f1 += f
    folds = float(k)
    print("ave Prec: %.2f, Rec: %.2f, F1: %.2f" % (tot_p / folds, tot_r / folds, tot_f1 / folds))
コード例 #9
0
ファイル: task4_stask2.py プロジェクト: omarjnb/semeval
def train_and_test(filename, parse_file,
                   posit_lex_file='positive-words.txt', nega_lex_file='negative-words.txt',
                   pickled=False, use_dep=False):
    """Train the sentiment tagger on the first 80% of a file, score the rest.

    Args:
        filename: Path to the examples.
        parse_file: Dependency-parse file for those examples; only read when
            dependency features are enabled.
        posit_lex_file: Path passed to semeval_util.get_liu_lexicon for the
            positive word list.
        nega_lex_file: Path passed to semeval_util.get_liu_lexicon for the
            negative word list.
        pickled: If true, the file is loaded with cPickle; otherwise it is
            read with XMLParser.create_exs.
        use_dep: If true, switches the module-level ``use_dep_parse`` flag on.
            The flag is never switched off here, so it stays on afterwards.

    Returns:
        None. The result of ``chunker.evaluate`` on the held-out 20% is
        printed.
    """
    global use_dep_parse
    if use_dep:
        use_dep_parse = True
    if pickled:
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        # The with-block closes the file even if loading raises.
        with open(filename, 'rb') as f:
            traind = cPickle.load(f)
    else:
        traind = XMLParser.create_exs(filename)
    n = len(traind['iob'])
    split_size = int(n * 0.8)
    train = zip(traind['iob'][:split_size], traind['polarity'][:split_size])
    test = zip(traind['iob'][split_size:], traind['polarity'][split_size:])
    posi_words = semeval_util.get_liu_lexicon(posit_lex_file)
    negi_words = semeval_util.get_liu_lexicon(nega_lex_file)
    senti_dictionary = semeval_util.get_mpqa_lexicon()

    full_senti_label = [senti_classify(sentence, posi_words, negi_words) for sentence in traind['orig']]

    dep_parses = []
    if use_dep_parse:
        dep_parses = semeval_util.add_dep_parse_features(traind['iob'], parse_file, dictionary=True, iobs=True)
        print("first dep_parse: %s" % (dep_parses[0],))
        print("first train ex: %s" % (train[0],))
        print("size parses all: %s vs train: %s" % (len(dep_parses), len(dep_parses[:split_size])))

    # Training gets only the labels and parses of the training slice, so
    # nothing belonging to the held-out examples reaches the tagger.
    chunker = ConsecutiveChunkTagger(train, senti_dictionary,
                                     full_senti_label[:split_size], dep_parses[:split_size])
    print("done training")

    if use_dep_parse:
        dep_parses = dep_parses[split_size:]
        print("first test dep parse: %s" % (dep_parses[0],))
        print("first test ex: %s" % (test[0],))
    else:
        # zip() stops at the shortest input, so even without parses there
        # must be one (empty) placeholder per held-out example.
        dep_parses = [[] for _ in range(len(test))]
    print(chunker.evaluate(zip(test, full_senti_label[split_size:], dep_parses)))