Example #1
def feature_keys(ind):
    """This function must be called if one wants to get the words used in the
       feature vectors for the training data.

       Upon supplying an index (use the instance id), it returns a list of words.
       This is the keys that are used in the cooccurrence feature vectors."""

    # check to see if the feature vector we are looking for exists
    try:
        return cooccur_vect[ind]
    except KeyError:
        # does not exist, we have to make it
        # first we must make a list of common words using this sense data
        word_list = []
        heads = []
        for instance in senseval.instances(ind):
            # first load the cases for item and get the most common words
            heads = heads + [instance.context[instance.position].lower()]
            word_list.extend([x.lower() for x in instance.context])
            
        # remove the stop words
        for x in stopwords + heads:
            word_list = filter(lambda w: w != x, word_list)
        
        word_counts = {}
        for word in word_list:
            if word in word_counts:
                word_counts[word] += 1
            else:
                word_counts[word] = 1

        # sort the list in descending order and truncate to get most common
        cooccur_vect[ind] = sorted(word_counts, key = word_counts.get, reverse = True)
        cooccur_vect[ind] = cooccur_vect[ind][:vectorSize]
    return cooccur_vect[ind]
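
A minimal usage sketch (my addition, not part of the original snippet), assuming the module-level names the function relies on are set up along these lines:

from nltk.corpus import senseval

cooccur_vect = {}   # cache: file id -> list of feature-vector keys
stopwords = []      # stop-word list assumed by the snippet (placeholder here)
vectorSize = 100    # assumed cap on the number of co-occurrence keys

# build (or fetch from the cache) the co-occurrence keys for 'hard.pos'
keys = feature_keys('hard.pos')
print(len(keys))    # at most vectorSize words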
Example #2
def senses(word):
    """
    This takes a target word from senseval-2 (find out what the possible
    are by running senseval.fileides()), and it returns the list of possible 
    senses for the word
    """
    return list(set(i.senses[0] for i in senseval.instances(word)))
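
For instance (a usage sketch added here, not in the original):

from nltk.corpus import senseval

print(senses('hard.pos'))   # e.g. ['HARD1', 'HARD2', 'HARD3'], order may vary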
Example #3
def wsd_classifier(trainer, word, features, stopwords_list = STOPWORDS, number=300, log=False, distance=3, confusion_matrix=False):
    
    print "Reading data..."
    global _inst_cache
    if word not in _inst_cache:
        _inst_cache[word] = [(i, i.senses[0]) for i in senseval.instances(word)]
    events = _inst_cache[word][:]
    senses = list(set(l for (i, l) in events))
    instances = [i for (i, l) in events]
    vocab = extract_vocab(instances, stopwords=stopwords_list, n=number)
    print ' Senses: ' + ' '.join(senses)

    # Split the instances into a training and test set,
    #if n > len(events): n = len(events)
    n = len(events)
    random.seed(5444522)
    random.shuffle(events)
    training_data = events[:int(0.8 * n)]
    test_data = events[int(0.8 * n):n]
    # Train classifier
    print 'Training classifier...'
    classifier = trainer([(features(i, vocab, distance), label) for (i, label) in training_data])
    # Test classifier
    print 'Testing classifier...'
    acc = accuracy(classifier, [(features(i, vocab, distance), label) for (i, label) in test_data] )
    print 'Accuracy: %6.4f' % acc
    if log==True:
        #write error file
        print 'Writing errors to errors.txt'
        output_error_file = open('errors.txt', 'w')
        errors = []
        for (i, label) in test_data:
            guess = classifier.classify(features(i, vocab, distance))
            if guess != label:
                con =  i.context
                position = i.position
                item_number = str(test_data.index((i, label)))
                word_list = []
                for (word, tag) in con:
                    word_list.append(word)
                hard_highlighted = word_list[position].upper()
                word_list_highlighted = word_list[0:position] + [hard_highlighted] + word_list[position+1:]
                sentence = ' '.join(word_list_highlighted)
                errors.append([item_number, sentence, guess,label])
        error_number = len(errors)
        output_error_file.write('There are ' + str(error_number) + ' errors!' + '\n' + '----------------------------' +
                                '\n' + '\n')
        for error in errors:
            output_error_file.write(str(errors.index(error)+1) +') ' + 'example number: ' + error[0] + '\n' +
                                    '    sentence: ' + error[1] + '\n' +
                                    '    guess: ' + error[2] + ';  label: ' + error[3] + '\n' + '\n')
        output_error_file.close()
    if confusion_matrix==True:
        gold = [label for (i, label) in test_data]
        derived = [classifier.classify(features(i,vocab)) for (i,label) in test_data]
        cm = nltk.ConfusionMatrix(gold,derived)
        print cm
        return cm
Example #4
def test(test_iter, folds, training_folds):
    from nltk.corpus import senseval
    results = []
    for i in range(test_iter):
        print "iteration %d ..." % (i + 1)
        ini_set = split_set(folds, senseval.instances())
        for j in range(folds):
            print"...fold %d ..." % (j + 1)
            set = partition_set(training_folds, ini_set, j)
            trndict = get_most_sensed_list(set[0])
            results.append(most_sensed_checked(trndict, set[1]))
    return results
Example #5
def sense_anal(word):
    print "word: ", word
    senses = []
    for inst in seval.instances(word):
        senses += inst.senses
    print 'senses: ', set(senses)
    print "sentences: ", len(senses)

    borders = [(0, senses[0])]
    for i in range(1, len(senses)):
        if senses[i] != senses[i-1]:
            borders.append((i, senses[i]))
    print "borders: ", borders
Example #6
def test(test_iter, folds, training_folds):
    results = []
    mem = Memory()
    for i in range(test_iter):
        print "iteration %d ..." % (i + 1)
        ini_set = split_set2(folds, senseval.instances()[0:])
        for j in range(folds):
            print"...fold %d ..." % (j + 1)
            sets = partition_set(training_folds, ini_set, j)
            print "-$$Train time$$-"
            mem.train(sets[0])
            print "-$$results time$$-"
            results.append(mem.test(sets[1]))
    return results
Example #7
def test_main():
    mem = Memory()
    print "loading data_set"
    ini_set = split_set2(5, senseval.instances()[0:10000])
    data_set = partition_set(4, ini_set, 0)
    #Serializer.save("/tmp/portioned_data", data_set)
    #data_set = Serializer.load("/tmp/portioned_data")
    print "training data"
    mem.train(data_set[0])
    #print "saving data"
    #mem.save_values("/tmp/mem_internals")
    #mem.load_values("/tmp/mem_internals")
    print "------*********testing**********------"
    results = mem.test(data_set[1])
    print "%3.1f %% accuracy" %(sum(results)/len(results) * 100)
Example #8
def wsd_demo(trainer, word, features, n=1000):
    from nltk.corpus import senseval
    import random

    # Get the instances.
    print('Reading data...')
    global _inst_cache
    if word not in _inst_cache:
        _inst_cache[word] = [(i, i.senses[0]) for i in senseval.instances(word)]
    instances = _inst_cache[word][:]
    if n > len(instances):
        n = len(instances)
    senses = list(set(l for (i, l) in instances))
    print('  Senses: ' + ' '.join(senses))

    # Randomly split the names into a test & train set.
    print('Splitting into test & train...')
    random.seed(123456)
    random.shuffle(instances)
    train = instances[:int(.8*n)]
    test = instances[int(.8*n):n]

    # Train up a classifier.
    print('Training classifier...')
    classifier = trainer([(features(i), l) for (i, l) in train])

    # Run the classifier on the test data.
    print('Testing classifier...')
    acc = accuracy(classifier, [(features(i), l) for (i, l) in test])
    print('Accuracy: %6.4f' % acc)

    # For classifiers that can find probabilities, show the log
    # likelihood and some sample probability distributions.
    try:
        test_featuresets = [features(i) for (i, n) in test]
        pdists = classifier.prob_classify_many(test_featuresets)
        ll = [pdist.logprob(gold)
              for ((name, gold), pdist) in zip(test, pdists)]
        print('Avg. log likelihood: %6.4f' % (sum(ll) / len(test)))
    except NotImplementedError:
        pass

    # Return the classifier
    return classifier
Example #9
def batch_classify(items, tests):
	senses = []
	for item in items:
		print >> sys.stderr, "classifying %s" % item
		lexitem = ".".join(item.split(".")[0:2])
		trains=\
			[dict(context=instance.context,\
				position=instance.position,\
				senses=instance.senses)\
			for instance in senseval.instances(item)]
		train=build_train(item, trains)
		test=build_test(item, tests[lexitem])

		# TODO(astory): make dynamic?
		for i in range(BOOTSTRAP_REPS):
			classified = classify(train,test)
			train = bootstrap(train, test, classified)

		senses.extend(classify(train,test))
	return senses
Example #10
    for item in b:
        key= "fol"+str(b.index(item)+1)+"-word"
        value= item
        dictionary[key]=value
        key= "fol"+str(b.index(item)+1)+"-pos"
        text = nltk.word_tokenize(item)
        value= nltk.pos_tag(text)[0][1]
        dictionary[key]=value
    return dictionary
        
if __name__=="__main__":
    for item in items:
        totalResult= []
        windowSize=4
        dictionary={}
        for instance in senseval.instances(item)[:10]:
                pos = instance.position
                context = instance.context
                senses = instance.senses
                #print context
                #print context[pos]
                d= colocation(windowSize, pos, context,dictionary)
                print d
Example #11
def wst_classifier(trainer, word, features, stopwords_list = STOPWORDS, number=300, log=False, distance=3, confusion_matrix=False):
    """
    This function takes as arguments:
        a trainer (e.g., NaiveBayesClassifier.train);
        a target word from senseval2 (you can find these out with senseval.fileids(),
            and they are 'hard.pos', 'interest.pos', 'line.pos' and 'serve.pos');
        a feature set (this can be wsd_context_features or wsd_word_features);
        a number (defaults to 300), which determines for wsd_word_features the number of
            most frequent words within the context of a given sense that you use to classify examples;
        a distance (defaults to 3) which determines the size of the window for wsd_context_features (if distance=3, then
            wsd_context_features gives 3 words and tags to the left and 3 words and tags to
            the right of the target word);
        log (defaults to false), which if set to True outputs the errors into a file errors.txt
        confusion_matrix (defaults to False), which if set to True prints a confusion matrix.

    Calling this function splits the senseval data for the word into a training set and a test set (the way it does
    this is the same for each call of this function, because the argument to random.seed is specified,
    but removing this argument would make the training and testing sets different each time you build a classifier).

    It then trains the trainer on the training set to create a classifier that performs WSD on the word,
    using features (with number or distance where relevant).

    It then tests the classifier on the test set, and prints its accuracy on that set.

    If log==True, then the errors of the classifier over the test set are written to errors.txt.
    For each error four things are recorded: (i) the example number within the test data (this is simply the index of the
    example within the list test_data); (ii) the sentence that the target word appeared in, (iii) the
    (incorrect) derived label, and (iv) the gold label.

    If confusion_matrix==True, then calling this function prints out a confusion matrix, where each cell [i,j]
    indicates how often label j was predicted when the correct label was i (so the diagonal entries indicate labels
    that were correctly predicted).
    """
    print "Reading data..."
    global _inst_cache
    if word not in _inst_cache:
        _inst_cache[word] = [(i, i.senses[0]) for i in senseval.instances(word)]
    events = _inst_cache[word][:]
    senses = list(set(l for (i, l) in events))
    instances = [i for (i, l) in events]
    vocab = extract_vocab(instances, stopwords=stopwords_list, n=number)
    print ' Senses: ' + ' '.join(senses)

    # Split the instances into a training and test set,
    #if n > len(events): n = len(events)
    n = len(events)
    random.seed(5444522)
    random.shuffle(events)
    training_data = events[:int(0.8 * n)]
    test_data = events[int(0.8 * n):n]
    # Train classifier
    print 'Training classifier...'
    classifier = trainer([(features(i, vocab, distance), label) for (i, label) in training_data])
    # Test classifier
    print 'Testing classifier...'
    acc = accuracy(classifier, [(features(i, vocab, distance), label) for (i, label) in test_data] )
    print 'Accuracy: %6.4f' % acc
    if log==True:
        #write error file
        print 'Writing errors to errors.txt'
        output_error_file = open('errors.txt', 'w')
        errors = []
        for (i, label) in test_data:
            guess = classifier.classify(features(i, vocab, distance))
            if guess != label:
                con =  i.context
                position = i.position
                item_number = str(test_data.index((i, label)))
                word_list = []
                for (word, tag) in con:
                    word_list.append(word)
                hard_highlighted = word_list[position].upper()
                word_list_highlighted = word_list[0:position] + [hard_highlighted] + word_list[position+1:]
                sentence = ' '.join(word_list_highlighted)
                errors.append([item_number, sentence, guess,label])
        error_number = len(errors)
        output_error_file.write('There are ' + str(error_number) + ' errors!' + '\n' + '----------------------------' +
                                '\n' + '\n')
        for error in errors:
            output_error_file.write(str(errors.index(error)+1) +') ' + 'example number: ' + error[0] + '\n' +
                                    '    sentence: ' + error[1] + '\n' +
                                    '    guess: ' + error[2] + ';  label: ' + error[3] + '\n' + '\n')
        output_error_file.close()
    if confusion_matrix==True:
        gold = [label for (i, label) in test_data]
        derived = [classifier.classify(features(i,vocab)) for (i,label) in test_data]
        cm = nltk.ConfusionMatrix(gold,derived)
        print cm
        return cm
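
A usage sketch, assuming the feature extractors named in the docstring (wsd_word_features / wsd_context_features) and extract_vocab are defined elsewhere in the same module:

from nltk import NaiveBayesClassifier

# train, evaluate and inspect a bag-of-words WSD classifier for 'hard.pos'
wst_classifier(NaiveBayesClassifier.train, 'hard.pos', wsd_word_features,
               number=300, log=True, confusion_matrix=True)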
Example #12
import random

import nltk
from nltk.corpus import senseval

instances = senseval.instances('hard.pos')
size = int(len(instances) * 0.1)
train_set, test_set = instances[size:], instances[:size]

"""for i in train_set:
    print(i.context)
    """


def features(instance):
    feat = dict()
    p = instance.position
    if p:
        feat['wp'] = instance.context[p - 1][0]
        feat['tp'] = instance.context[p - 1][1]
    else:  # target word is at the start of the sentence
        feat['wp'] = (p, 'BOS')
        feat['tp'] = (p, 'BOS')
        feat['wf'] = instance.context[p + 1][0]
        feat['tf'] = instance.context[p + 1][1]
    return feat


featureset = [(features(i), i.senses[0]) for i in instances if len(i.senses) == 1]

print(featureset)
def wst_classifier(trainer, word, features, stopwords_list = STOPWORDS, number=600, log=False, distance=3, confusion_matrix=False):
    """
    This function takes as arguments:
        a trainer (e.g., NaiveBayesClassifier.train);
        a target word from senseval2 (you can find these out with senseval.fileids(),
            and they are 'hard.pos', 'interest.pos', 'line.pos' and 'serve.pos');
        a feature set (this can be wsd_context_features or wsd_word_features);
        a number (defaults to 300), which determines for wsd_word_features the number of
            most frequent words within the context of a given sense that you use to classify examples;
        a distance (defaults to 3) which determines the size of the window for wsd_context_features (if distance=3, then
            wsd_context_features gives 3 words and tags to the left and 3 words and tags to
            the right of the target word);
        log (defaults to false), which if set to True outputs the errors into a file errors.txt
        confusion_matrix (defaults to False), which if set to True prints a confusion matrix."""

    print "Reading data..."
    global _inst_cache
    #print "",senseval.instances(word)[0]
    if word not in _inst_cache:
        _inst_cache[word] = [(i, i.senses[0]) for i in senseval.instances(word)]
    events = _inst_cache[word][:]
    senses = list(set(l for (i, l) in events))
    instances = [i for (i, l) in events]

    vocab = extract_vocab(instances, stopwords=stopwords_list, n=number)
    print ' Senses: ' + ' '.join(senses)

    # Split the instances into a training and test set,
    #if n > len(events): n = len(events)
    n = len(events)
    print n
    #random.seed(5444522)
    random.shuffle(events)
    training_data = events[:int(0.9 * n)]
    test_data = events[int(0.9 * n):n]
    path = "test_data_"+ word +".tsv"
    
    #creating test data for similarity algo
    with open(path,"w") as f:
        for (text,label) in test_data:
            sentences = ""
            for wordTag in text.context:
                if len(wordTag)==2:
                    word,tag = wordTag
                    sentences +=word +" "
            f.write(""+label+"\t"+sentences +"\n")
    
    startSimilarity(path)

    # Train classifier
    print 'Training classifier...'
    #print training_data[0]
    classifier = trainer([(features(i, vocab, distance), label) for (i, label) in training_data])
    # Test classifier
    print 'Testing classifier...'
    acc = accuracy(classifier, [(features(i, vocab, distance), label) for (i, label) in test_data] )
    print 'Accuracy: %6.4f' % acc
    
    if confusion_matrix==True:
        gold = [label for (i, label) in test_data]
        derived = [classifier.classify(features(i,vocab)) for (i,label) in test_data]
        #print derived
        cm = nltk.ConfusionMatrix(gold,derived)
        print "Machine Learning Confusion-Matrix" 
        print cm
        return cm
    #wst_classifier(NaiveBayesClassifier.train, 'line.pos', wsd_word_features,distance=3, confusion_matrix=True)
    #wst_classifier(NaiveBayesClassifier.train, 'serve.pos', wsd_word_features,distance=3, confusion_matrix=True)

    """wst_classifier(NaiveBayesClassifier.train, 'interest.pos', wsd_context_features,distance=3, confusion_matrix=True)
    wst_classifier(NaiveBayesClassifier.train, 'hard.pos', wsd_context_features,distance=3, confusion_matrix=True)
    wst_classifier(NaiveBayesClassifier.train, 'line.pos', wsd_context_features,distance=3, confusion_matrix=True)
    wst_classifier(NaiveBayesClassifier.train, 'serve.pos', wsd_context_features,distance=3, confusion_matrix=True)"""
    

    # logistic regression ===  max Entropy classifier
    wst_classifier(MaxentClassifier.train, 'interest.pos', wsd_word_features,distance=3, confusion_matrix=True)
    """wst_classifier(MaxentClassifier.train, 'hard.pos', wsd_word_features,distance=3, confusion_matrix=True)
    wst_classifier(MaxentClassifier.train, 'line.pos', wsd_word_features,distance=3, confusion_matrix=True)
    wst_classifier(MaxentClassifier.train, 'serve.pos', wsd_word_features,distance=3, confusion_matrix=True)"""
   
    
    """wst_classifier(MaxentClassifier.train, 'interest.pos', wsd_context_features,distance=3, confusion_matrix=True)
    wst_classifier(MaxentClassifier.train, 'hard.pos', wsd_context_features,distance=3, confusion_matrix=True)
    wst_classifier(MaxentClassifier.train, 'line.pos', wsd_context_features,distance=3, confusion_matrix=True)
    wst_classifier(MaxentClassifier.train, 'serve.pos', wsd_context_features,distance=3, confusion_matrix=True)"""
    

start()

# Frequency Baseline
sense_fd = nltk.FreqDist([i.senses[0] for i in senseval.instances('hard.pos')])
most_frequent_sense = sense_fd.keys()[0]
frequency_sense_baseline = sense_fd.freq(sense_fd.keys()[0])
print "frequency baseline:" ,frequency_sense_baseline
##0.79736902838679902
Example #15
def create_labeled_data():
    # collect all data from the corpus
    interest = senseval.instances('interest.pos')
    # create labeled data
    labeled_data = ...
    return labeled_data
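
The body above is elided in the original; one plausible completion, following the (instance, first sense) pattern used by the other examples in this listing, might be:

def create_labeled_data():
    # collect all data from the corpus
    interest = senseval.instances('interest.pos')
    # create labeled data: pair each instance with its first sense tag
    labeled_data = [(inst, inst.senses[0]) for inst in interest]
    return labeled_data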
Example #16
sense = {
    'interest_1': 0,
    'interest_2': 1,
    'interest_3': 2,
    'interest_4': 3,
    'interest_5': 4,
    'interest_6': 5
}

bayes = [[], [], [], [], [], []]

count = [0, 0, 0, 0, 0, 0]

n = 0

for instance in senseval.instances('interest.pos')[0:1599]:
    count[sense[instance.senses[0]]] += 1
    sentence = ' '.join(w for (w, p) in instance.context)
    parsed = list(parser.parse(tokenizer.tokenize(sentence)))
    for triple in parsed[0].triples():
        related = 0
        if triple[0][0] in interest:
            word = triple[2][0]
            related = 1
        if triple[2][0] in interest:
            word = triple[0][0]
            related = 1
        if related == 1:
            exist = 0
            for item in bayes[sense[instance.senses[0]]]:
                if item[0] == word:
Example #17
def get_category(pos):
    
    category = []
    for inst in senseval.instances(pos):
        category.append(inst.senses)
    return category
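
A small usage sketch (added here, not in the original), counting how often each sense combination occurs:

import nltk
from nltk.corpus import senseval

sense_counts = nltk.FreqDist(tuple(s) for s in get_category('hard.pos'))
print(sense_counts.most_common(5))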
Example #18
    'interest_4': 3,
    'interest_5': 4,
    'interest_6': 5
}

bayes = pickle.load(open('bayes_bag.txt', 'r'))

count = pickle.load(open('count.txt', 'r'))

correct = 0

base = 0

n = 0

for instance in senseval.instances('interest.pos')[1600:2000]:
    score = []
    for num in count[0:6]:
        score.append(math.log(num / 1600.0))

    p = instance.position + 2

    sentence = list(['<BOS1>', '<BOS2>'])
    for word in instance.context:
        sentence.append(word[0])
    sentence.append('<EOS1>')
    sentence.append('<EOS2>')

    bag = [sentence[p - 2], sentence[p - 1], sentence[p + 1], sentence[p + 2]]

    for word in bag:
Example #19
    for sample in samples:
        context_index = sample.context
        freq_dict = Counter(context_index)
        tuples = [tuple([x, y]) for x, y in freq_dict.items()]
        ids, counts = zip(*tuples)

        C[iter, ids] = counts
        iter += 1

    return C


if __name__ == "__main__":

    t0 = time.time()
    instances = senseval.instances(hard_f)
    # all training samples as a list
    samples = [sample(inst) for inst in instances]

    # V is size of Vocab, K is number of clusters
    word_to_id, V, K = create_vocab(samples)

    # convert contexts to indices so they can be used for indexing
    for sample in samples:
        sample.context_to_index(word_to_id)

    # initialize vj|s, priors
    EM = EM(V, K)

    # C is a sample_size * vocab_size matrix
    C = counts_matrix(samples, V)
Example #20
NO_STOPWORDS = []


print "Came here"
""" Get POS tag based features """
#POS = set()

#POS_DICT = {'': 0, 'PRP$': 1, 'VBG': 2, 'VBD': 3, 'VB': 26, "''": 5, 'VBP': 6, 'WDT': 7, 'JJ': 8, 'WP': 9, 'VBZ': 10, 'DT': 11, '"': 12, 'RP': 13, '$': 14, 'NN': 15, '(': 16, 'FW': 17, 'POS': 18, '.': 19, 'TO': 20, 'PRP': 21, 'RB': 22, ':': 23, 'NNS': 24, 'NNP': 25, '``': 4, 'WRB': 27, 'CC': 28, 'PDT': 30, 'RBS': 31, 'RBR': 32, 'VBN': 33, 'R': 34, 'EX': 35, 'IN': 36, 'WP$': 37, 'CD': 38, 'MD': 39, 'NNPS': 40, 'h': 41, 'NNP ': 45, 'JJS': 42, 'JJR': 43, 'SYM': 44, 's': 29, 'UH': 46, 'VBP ': 47}

#hard_rtnsl = ['through', u'shoe', 'skin', 'find', 'ground', u'discipline', u'ha', 'had', 'to', 'going', u'board', 'do', 'good', 'get', 'very', 'material', u'capsule', u'breast', 'day', 'people', u'seat', 'see', 'are', 'packed', 'out', 'even', 'for', 'crust', 'enough', 'between', 'red', 'be', 'wheat', 'dirt', 'imagine', 'carbide', 'come', 'on', 'stone', 'her', 'of', 'taking', 'keep', 'turn', 'place', 'cheese', 'into', u'one', 'down', 'fast', 'little', 'long', u'eye', 'would', 'been', 'plastic', 'much', 'way', 'taken', 'tell', u'shell', 'took', 'part', 'determination', u'line', 'believe', 'with', 'myself', 'look', 'this', 'science', 'up', 'making', u'feeling', 'study', 'is', 'surface', 'evidence', 'at', 'have', 'in', 'court', 'winter', 'no', 'make', 'reality', 'rubber', 'take', 'so', "'s", 'sided', 'enamel', 'coat', u'cover', u'face', 'edge', 'green', 'time', 'baked', 'having', u'fact', 'know']

#line_rtnls = ['taxi', 'walking', 'answering', 'deck', 'telecom', u'executive', 'hamlet', 'through', 'fishing', 'crowded', 'fine', 'profitable', 'cut', 'personal', 'lake', 'should', 'to', u'minute', u'joke', 'complaint', u'tourist', 'outside', 'food', 'jerking', 'five', 'drawn', u'walk', 'pier', 'bank', u'loss', 'like', 'cable', u'transmission', 'gender', 'motorboat', 'blurrier', 'specific', 'fisherman', 'crossing', 'hotline', 'river', 'side', 'clothes', 'draw', 'old', 'people', 'acquired', 'attached', 'fish', u'traveler', 'direct', 'blurred', u'computer', 'trading', 'are', u'sea', u'year', 'separating', 'laundry', 'racial', 'investment', 'network', 'for', 'waiting', 'profit', 'legal', 'access', 'written', 'blur', 'new', 'reading', 'across', 'blurry', 'be', u'telecommunication', 'business', 'exchange', 'sold', 'communicate', 'drew', 'water', 'busy', 'corp', 'snapped', 'along', 'by', 'tug', 'on', 'about', 'carried', 'jeep', 'of', 'industry', 'drag', 'against', 'bow', 'telesis', 'airport', 'tangled', 'stand', 'social', 'retail', 'first', 'co', 'bell', u'communication', 'into', 'private', 'one', 'hook', 'jammed', 'fast', '176', u'open', 'market', 'speak', 'standing', 'toy', 'from', 'tread', 'service', 'two', 'long', 'subscriber', 'pc', 'vax', 'call', u'vehicle', u'wait', 'checkout', u'store', 'more', 'flat', 'dialogue', 'selling', 'door', 'forming', 'company', 'formed', 'phone', 'understand', u'switchboard', 'catch', 'fastened', 'with', 'than', u'customer', 'novel', u'word', u'hour', 'these', u'car', 'non-art', u'caller', 'gasoline', 'up', u'rope', 'cast', 'crossed', 'thin', 'editorial', 'were', 'called', 'acquisition', 'toll-free', u'ad', 'toss', 'share', 'hauling', 'heard', u'say', 'pulling', 'at', 'have', 'in', 'ship', u'dealer', 'film', 'inc', 'sell', 'end', u'conversation', 'secured', 'get', 'brand', 'cross', u'actor', 'uttered', u'book', u'speech', 'catfish', 'switching', 'long-distance', u'product', 'exactly', "'s", u'price', 'ideological', u'hang', 'tied', 'tow', '000', 'pulled', 'delicate', 'such', 'blurring', 'single', 'off', 'third', 'largely', 'consumer', 'clear', u'sale', 'drawing', 'green', 'enter', 'apparel', 'buoy', 'corporate', 'divided', 'reserve']

print "Came here again"

instances1 = sense_instances(senseval.instances('hard.pos'), 'HARD1')
instances2 = sense_instances(senseval.instances('hard.pos'), 'HARD2')
instances3 = sense_instances(senseval.instances('hard.pos'), 'HARD3')

instances4 = sense_instances(senseval.instances('line.pos'), 'cord')
instances5 = sense_instances(senseval.instances('line.pos'), 'division')
instances6 = sense_instances(senseval.instances('line.pos'), 'formation')
instances7 = sense_instances(senseval.instances('line.pos'), 'product')
instances8 = sense_instances(senseval.instances('line.pos'), 'text')
instances9 = sense_instances(senseval.instances('line.pos'), 'phone')

lmtzr = WordNetLemmatizer()
print "Came here"

def modify_instance_with_CRFtag(index,filename,instances):
	j = index
#We built the model on train and test on dev_test_set and reworked the model until we had good
#results on both train and dev_test_set. However, that means we probably still overfit a bit to those two
#datasets. We would expect to perform slightly worse on the test_set and that is what happened.


#3 The Senseval 2 Corpus contains data intended to train word-sense disambiguation classifiers. It contains data for 
#four words: hard, interest, line, and serve. Choose one of these four words, and load the corresponding data:

#Using this dataset, build a classifier that predicts the correct sense tag for a given instance. See the corpus HOWTO 
#at http://nltk.org/howto for information on using the instance objects returned by the Senseval 2 Corpus.

import nltk
from nltk.corpus import senseval
import random
#I chose to use the word 'serve'
instances = senseval.instances('serve.pos')
size = int(len(instances) * 0.1)

for inst in instances[:5]:
    p = inst.position
    left = ' '.join(w for (w,t) in inst.context[p-2:p])
    word = ' '.join(w for (w,t) in inst.context[p:p+1])
    right = ' '.join(w for (w,t) in inst.context[p+1:p+3])
    senses = ' '.join(inst.senses)
    
def features(instance):
    feat = dict()
    p = instance.position
       ## previous word and tag
    if p: ## > 0
        feat['wp'] = instance.context[p-1][0]
Example #22
def split_corpus(train_p=0.8, sample_num=1100, sample_range=1100):

    logging.info("start corpus")
    logging.info("  restriction starts")

    # restrict corpora to 2 most common senses
    hard = seval.instances("hard.pos")[0:631]
    hard += seval.instances("hard.pos")[3455:3957]
    line = seval.instances("line.pos")[1096:2200]
    serve = seval.instances("serve.pos")[0:600]
    serve += seval.instances("serve.pos")[2486:3086]

    logging.info("  value setting starts")

    train = []
    test = {'hard': [], 'line': [], 'serve': []}
    labels = {'hard': [], 'line': [], 'serve': []}
    offsets = {'hard': [], 'line': [], 'serve': []}
    corpora = [hard, line, serve]
    samples = sample(range(sample_range), sample_num)  # random order for sentences
    border = int(sample_num * train_p)

    logging.info("  training samples start")

    # ambiguous words alterning to prevent skew
    for i in samples[:border]:
        for corp in corpora:
            inst = corp[i]
            train += [w[0] for w in inst.context if isinstance(w, tuple)]

    logging.info("  test samples start")

    lengths = {'hard': 0, 'line': 0, 'serve': 0}
    for i in samples[border:]:
        for corp in corpora:
            inst = corp[i]
            word = inst.word.split('-')[0]

            # corpus.context somtimes contains non-tuple entries:
            new_sentence_dirty = []
            for w in inst.context:
                if isinstance(w, tuple):
                    new_sentence_dirty.append(w[0])
                else:
                    new_sentence_dirty.append(w)

            # need to perform cleansing here and not in worsed for offsets to
            # be aligned
            new_sentence, new_pos = cleanse_corpus_pos_aware(new_sentence_dirty,
                                                             inst.position)
            test[word] += new_sentence
            labels[word] += inst.senses
            offsets[word].append(new_pos + lengths[word])
            lengths[word] += len(new_sentence)

    logging.info("end corpus")
    logging.info("length train: {}, length test: {}".
                 format(len(train), len(test['hard']) + len(test['line'])
                 + len(test['serve'])))
    logging.info("labels(h/l/s): {}/{}/{}".
                 format(
                     len(labels['hard']),
                     len(labels['line']),
                     len(labels['serve'])))

    return train, test, labels, offsets
Example #23
#!/usr/bin/python
# -*- coding: utf-8 -*-

from nltk.corpus import senseval
print(senseval.fileids())
print(senseval.instances('hard.pos'))

for inst in senseval.instances('interest.pos')[:10]:
    p = inst.position
    left = ' '.join(w for (w, t) in inst.context[p - 2:p])
    word = ' '.join(w for (w, t) in inst.context[p:p + 1])
    right = ' '.join(w for (w, t) in inst.context[p + 1:p + 3])
    senses = ' '.join(inst.senses)
    print('%20s |%10s | %-15s -> %s' % (left, word, right, senses))
Example #24
train_set = [(gender_features2(n), gender) for (n, gender) in train_names]
devtest_set = [(gender_features2(n), gender) for (n, gender) in devtest_names]
test_set = [(gender_features2(n), gender) for (n, gender) in test_names]
#training naivebayes classifier on the train set
classifier = nltk.NaiveBayesClassifier.train(train_set)
#print accuracy on dev test
print(nltk.classify.accuracy(classifier, devtest_set))
#Increase in classification accuracy
print(nltk.classify.accuracy(classifier, test_set))
#####################################

#3
#importing senseval package
from nltk.corpus import senseval
#getting the instances of interest.pos; senseval has four different files
instances = senseval.instances('interest.pos')
#getting 10% of the instances in size variable
size = int(len(instances) * 0.1)
#Using the size variable: the first 10% (236 instances) go into test_set and the remaining 90% into train_set
train_set, test_set = instances[size:], instances[:size]
#train naivebayes on train_set
classifier = nltk.NaiveBayesClassifier.train(train_set)


#Defining a function to return sense feature
def sense_features(left, word, right):
    return {'prefix': left[-1:]}


#Since senseval instance objects are not directly usable as training examples
#we will use the method below to iterate over them and create the training and testing sets (a sketch follows)
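
A minimal sketch of what that iteration might look like (the original continuation is not shown in this listing, so the details below are my assumption, modelled on the other examples here):

featuresets = []
for inst in instances:
    p = inst.position
    # context entries are usually (word, tag) tuples, but not always
    words = [w[0] if isinstance(w, tuple) else w for w in inst.context]
    left = ' '.join(words[max(p - 2, 0):p])
    word = ' '.join(words[p:p + 1])
    right = ' '.join(words[p + 1:p + 3])
    featuresets.append((sense_features(left, word, right), inst.senses[0]))

train_set, test_set = featuresets[size:], featuresets[:size]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))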
Example #25
def WSDClasifier(trainer, 
                 word,
				 features,
				 stopwords=STOPWORDS, 
				 number=300,
				 distance=3,
				 log=False,
				 confusion_matrix=False):
	"""
	Build a classifier instance for the senseval2 senses of a word and applies it

	:param word: from senseval2 (we have 'hard.pos', 'interest.pos', 'line.pos' and 'serve.pos')
	:type word: str
	:param features: selector for which feature set to use
	:type features: str (word, context)
	:param number: passed to extract_vocab when constructing the second argument to the feature set constructor
	:type number: int
	:param distance: passed to the feature set constructor as 3rd argument
	:type distance: int
	:param log: if set to True outputs any errors into a file errors.txt
	:type log: bool
	:param confusion_matrix: if set to True prints a confusion matrix
	:type confusion_matrix: bool

	Calling this function splits the senseval data for the word into a training set and a test set (the way it does
	this is the same for each call of this function, because the argument to random.seed is specified,
	but removing this argument would make the training and testing sets different each time you build a classifier).

	It then trains the trainer on the training set to create a classifier that performs WSD on the word,
	using features (with number or distance where relevant).

	It then tests the classifier on the test set, and prints its accuracy on that set.

	If log==True, then the errors of the classifier over the test set are written to errors.txt.
	For each error four things are recorded: (i) the example number within the test data (this is simply the index of the
	example within the list test_data); (ii) the sentence that the target word appeared in, (iii) the
	(incorrect) derived label, and (iv) the gold label.

	If confusion_matrix==True, then calling this function prints out a confusion matrix, where each cell [i,j]
	indicates how often label j was predicted when the correct label was i (so the diagonal entries indicate labels
	that were correctly predicted).
	"""
	global inst_cache

	if word not in inst_cache:
		inst_cache[word] = [(i, i.senses[0]) for i in senseval.instances(word)]
		
	events = inst_cache[word][:]
	senses = list(set(l for (i, l) in events))
	instances = [i for (i, l) in events]
	vocab = extract_vocab(instances, number)
	print(' Senses: ' + ' '.join(senses))
	# Split the instances into a training and test set,
	#if N > len(events): N = len(events)
	N = len(events)
	random.seed(123456789) 
	random.shuffle(events)
	train_data = events[:int(0.8 * N)]
	test_data = events[int(0.8 * N):N]

	# Train classifier
	print('Training classifier...')
	classifier = trainer([(features(i, vocab, distance), label) for (i, label) in train_data])
	# Test classifier
	print('Testing classifier...')
	acc = accuracy(classifier, [(features(i, vocab, distance), label) for (i, label) in test_data] )
	print('Accuracy: {:6.4f}'.format(acc))

	if log:
		#write error file
		print('Writing errors to errors.txt')
		with open('errors.txt', 'w') as file:
			errors = []
			for (i, label) in test_data:
				guess = classifier.classify(features(i, vocab, distance))
				if guess != label:
					con =  i.context
					position = i.position
					item_number = str(test_data.index((i, label)))
					word_list=[cv[0] if isinstance(cv,tuple) else cv for cv in con]
					hard_highlighted = word_list[position].upper()
					word_list_highlighted = word_list[0:position] + [hard_highlighted] + word_list[position+1:]
					sentence = ' '.join(word_list_highlighted)
					errors.append([item_number, sentence, guess,label])
			file.write('There are {} errors!\n'.format(len(errors)))
			file.write('----------------------------\n')
			for error in errors:
				idx = errors.index(error)+1
				num, snt, guess, label = error
				file.write('{}) example #: {} \n sentence: {}\n guess: {}\n label: {}\n'.format(idx, num, snt, guess, label))
					
	if confusion_matrix:
		gold = [label for (i, label) in test_data]
		derived = [classifier.classify(features(i,vocab)) for (i,label) in test_data]
		cm = nltk.ConfusionMatrix(gold,derived)
		print(cm)
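
The extract_vocab and feature-extractor helpers this classifier expects are not included in the listing; a rough sketch of what they might look like, based only on how the docstrings above describe them (signatures and feature names are my assumptions):

from collections import Counter

def extract_vocab(instances, n=300):
    """Return the n most frequent context words across the given instances."""
    counts = Counter()
    for inst in instances:
        counts.update(w[0].lower() if isinstance(w, tuple) else w.lower()
                      for w in inst.context)
    return [w for (w, _) in counts.most_common(n)]

def wsd_word_features(instance, vocab, dist=3):
    """Bag-of-words features: which vocabulary words occur in the context."""
    words = set(w[0].lower() if isinstance(w, tuple) else w.lower()
                for w in instance.context)
    return {'contains(%s)' % v: (v in words) for v in vocab}

def wsd_context_features(instance, vocab, dist=3):
    """Words (and tags) within dist positions of the target word."""
    feats = {}
    p = instance.position
    for offset in range(-dist, dist + 1):
        if offset == 0 or not (0 <= p + offset < len(instance.context)):
            continue
        entry = instance.context[p + offset]
        if isinstance(entry, tuple):
            feats['word(%+d)' % offset] = entry[0]
            feats['tag(%+d)' % offset] = entry[1]
        else:
            feats['word(%+d)' % offset] = entry
    return feats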
Example #26
def wst_classifier(trainer,
                   word,
                   features,
                   stopwords_list=STOPWORDS,
                   number=300,
                   log=False,
                   distance=3,
                   confusion_matrix=False):
    """
    This function takes as arguments:
        a trainer (e.g., NaiveBayesClassifier.train);
        a target word from senseval2 (you can find these out with senseval.fileids(),
            and they are 'hard.pos', 'interest.pos', 'line.pos' and 'serve.pos');
        a feature set (this can be wsd_context_features or wsd_word_features);
        a number (defaults to 300), which determines for wsd_word_features the number of
            most frequent words within the context of a given sense that you use to classify examples;
        a distance (defaults to 3) which determines the size of the window for wsd_context_features (if distance=3, then
            wsd_context_features gives 3 words and tags to the left and 3 words and tags to
            the right of the target word);
        log (defaults to false), which if set to True outputs the errors into a file errors.txt
        confusion_matrix (defaults to False), which if set to True prints a confusion matrix.

    Calling this function splits the senseval data for the word into a training set and a test set (the way it does
    this is the same for each call of this function, because the argument to random.seed is specified,
    but removing this argument would make the training and testing sets different each time you build a classifier).

    It then trains the trainer on the training set to create a classifier that performs WSD on the word,
    using features (with number or distance where relevant).

    It then tests the classifier on the test set, and prints its accuracy on that set.

    If log==True, then the errors of the classifier over the test set are written to errors.txt.
    For each error four things are recorded: (i) the example number within the test data (this is simply the index of the
    example within the list test_data); (ii) the sentence that the target word appeared in, (iii) the
    (incorrect) derived label, and (iv) the gold label.

    If confusion_matrix==True, then calling this function prints out a confusion matrix, where each cell [i,j]
    indicates how often label j was predicted when the correct label was i (so the diagonal entries indicate labels
    that were correctly predicted).
    """
    print "Reading data..."
    global _inst_cache
    if word not in _inst_cache:
        _inst_cache[word] = [(i, i.senses[0])
                             for i in senseval.instances(word)]
    events = _inst_cache[word][:]
    senses = list(set(l for (i, l) in events))
    instances = [i for (i, l) in events]
    vocab = extract_vocab(instances, stopwords=stopwords_list, n=number)
    print ' Senses: ' + ' '.join(senses)

    # Split the instances into a training and test set,
    #if n > len(events): n = len(events)
    n = len(events)
    random.seed(5444522)
    random.shuffle(events)
    training_data = events[:int(0.8 * n)]
    test_data = events[int(0.8 * n):n]
    # Train classifier
    print 'Training classifier...'
    classifier = trainer([(features(i, vocab, distance), label)
                          for (i, label) in training_data])
    # Test classifier
    print 'Testing classifier...'
    acc = accuracy(classifier, [(features(i, vocab, distance), label)
                                for (i, label) in test_data])
    print 'Accuracy: %6.4f' % acc
    if log == True:
        #write error file
        print 'Writing errors to errors.txt'
        output_error_file = open('errors.txt', 'w')
        errors = []
        for (i, label) in test_data:
            guess = classifier.classify(features(i, vocab, distance))
            if guess != label:
                con = i.context
                position = i.position
                item_number = str(test_data.index((i, label)))
                word_list = []
                for (word, tag) in con:
                    word_list.append(word)
                hard_highlighted = word_list[position].upper()
                word_list_highlighted = word_list[0:position] + [
                    hard_highlighted
                ] + word_list[position + 1:]
                sentence = ' '.join(word_list_highlighted)
                errors.append([item_number, sentence, guess, label])
        error_number = len(errors)
        output_error_file.write('There are ' + str(error_number) + ' errors!' +
                                '\n' + '----------------------------' + '\n' +
                                '\n')
        for error in errors:
            output_error_file.write(
                str(errors.index(error) + 1) + ') ' + 'example number: ' +
                error[0] + '\n' + '    sentence: ' + error[1] + '\n' +
                '    guess: ' + error[2] + ';  label: ' + error[3] + '\n' +
                '\n')
        output_error_file.close()
    if confusion_matrix == True:
        gold = [label for (i, label) in test_data]
        derived = [
            classifier.classify(features(i, vocab)) for (i, label) in test_data
        ]
        cm = nltk.ConfusionMatrix(gold, derived)
        print cm
        return cm
Example #27
def senses(word):
    return list(set(i.senses[0] for i in senseval.instances(word)))
Example #28
# Protsay Solomia, Chapter 6, Exercise 3
import nltk
from nltk.corpus import senseval
instances = senseval.instances('serve.pos')  # open corpus data
features = []  # list of (feature dict, sense label) pairs
for inst in instances:
	context = [c if isinstance(c, tuple) else (c, "None") for c in inst.context]
        # Converting strings in "context" to (string, "None") tuples in order to create a dictionary
	f = dict(context)
	#Creating a dictionary
	f.update({"word": inst.word, "position": inst.position})
        # Updating features "word" and "position"
	features.append((f, ' '.join(inst.senses)))

	

size = int(len(features) * 0.1)# Set an amount of testing data (10%)
train_set, test_set = features[size:], features[:size]# Making two data sets (for training and testing)
classifier1 = nltk.NaiveBayesClassifier.train(train_set)# Training the classifier
print nltk.classify.accuracy(classifier1, test_set)#Evaluating the accuracy of the classifier

Example #29
# ### You need to describe what data you plan to use and how it will be partitioned into training, development/validation and test sets.
# 
# I am using the Senseval corpus. After randomization, I will split the data into training and testing sets. This is done by ... 
# 
# Validation/development?
# 
# As for extracting features, I am planning on using a) context words (as in, words that appear around the focus word) and b) the 'senses' category, which represents the exact meaning of the focus word.
# 

# In[279]:


print("All fileids:", senseval.fileids())
print()
for fileid in senseval.fileids():
    print(senseval.instances(fileid)[0])
    print()


# In[280]:


def get_category(pos):
    
    category = []
    for inst in senseval.instances(pos):
        category.append(inst.senses)
    return category
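
A minimal sketch of the randomised train/test split described in the plan above (the actual notebook cell is not shown here, so this is an assumption):

import random
from nltk.corpus import senseval

instances = list(senseval.instances('hard.pos'))
random.shuffle(instances)
cut = int(len(instances) * 0.8)
train_insts, test_insts = instances[:cut], instances[cut:]
print(len(train_insts), len(test_insts))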


# In[303]: