def feature_keys(ind):
    """Return the co-occurrence vocabulary used in the feature vectors for `ind`.

    Call this to find out which words back the co-occurrence feature vectors
    of the training data.  The result is computed once per index and then
    memoised in the module-level `cooccur_vect` dict.

    ind -- senseval item identifier (use the instance id); it is passed
           unchanged to `senseval.instances`.

    Returns a list of at most `vectorSize` lower-cased context entries,
    ordered from most to least frequent, with every entry of `stopwords`
    and every head (target) entry removed.
    """
    try:
        return cooccur_vect[ind]
    except KeyError:
        pass  # not cached yet: build the vector below

    heads = []
    word_list = []
    for instance in senseval.instances(ind):
        heads.append(instance.context[instance.position].lower())
        word_list.extend(x.lower() for x in instance.context)

    # One set lookup per word instead of one full filter pass per stop word;
    # this also avoids stacking lazy filters, which misbehave on Python 3.
    excluded = set(stopwords)
    excluded.update(heads)

    word_counts = {}
    for word in word_list:
        if word not in excluded:
            word_counts[word] = word_counts.get(word, 0) + 1

    # Most frequent first, then keep only the top `vectorSize` entries.
    ranked = sorted(word_counts, key=word_counts.get, reverse=True)
    cooccur_vect[ind] = ranked[:vectorSize]
    return cooccur_vect[ind]
def senses(word):
    """List the distinct first-listed senses found for a senseval target word.

    word -- a senseval-2 target (senseval.fileids() shows the available
            ones); it is passed unchanged to `senseval.instances`.

    Returns a list holding the first sense of every instance, without
    duplicates.  The list comes from a set, so its order is unspecified.
    """
    first_senses = {inst.senses[0] for inst in senseval.instances(word)}
    return list(first_senses)
def wsd_classifier(trainer, word, features, stopwords_list=STOPWORDS,
                   number=300, log=False, distance=3, confusion_matrix=False):
    """Train and evaluate a word-sense classifier for one senseval word.

    trainer -- callable taking a list of (featureset, label) pairs and
        returning a classifier that offers `classify`.
    word -- senseval file id, handed to `senseval.instances`.
    features -- feature extractor, called as features(instance, vocab, distance).
    stopwords_list -- forwarded to `extract_vocab` as `stopwords`.
    number -- forwarded to `extract_vocab` as `n`.
    log -- when true, misclassified test items are written to errors.txt.
    distance -- third argument of every `features` call.
    confusion_matrix -- when true, print and return an nltk.ConfusionMatrix.

    The instances are shuffled with a fixed seed, so the 80/20
    training/test split is identical on every call.

    Returns the confusion matrix when `confusion_matrix` is true, else None.
    """
    print("Reading data...")
    global _inst_cache
    if word not in _inst_cache:
        _inst_cache[word] = [(i, i.senses[0]) for i in senseval.instances(word)]
    events = _inst_cache[word][:]
    senses = list(set(l for (i, l) in events))
    instances = [i for (i, l) in events]
    vocab = extract_vocab(instances, stopwords=stopwords_list, n=number)
    print(' Senses: ' + ' '.join(senses))

    # Split the instances into a training and a test set.
    n = len(events)
    random.seed(5444522)
    random.shuffle(events)
    split = int(0.8 * n)
    training_data = events[:split]
    test_data = events[split:]

    print('Training classifier...')
    classifier = trainer([(features(i, vocab, distance), label)
                          for (i, label) in training_data])

    print('Testing classifier...')
    # Extract the test features once; the same feature sets (including
    # `distance`) feed the accuracy, the error log and the confusion matrix.
    test_featuresets = [features(i, vocab, distance) for (i, label) in test_data]
    gold = [label for (i, label) in test_data]
    acc = accuracy(classifier, list(zip(test_featuresets, gold)))
    print('Accuracy: %6.4f' % acc)

    guesses = None
    if log or confusion_matrix:
        guesses = [classifier.classify(fs) for fs in test_featuresets]

    if log:
        print('Writing errors to errors.txt')
        errors = []
        for idx, ((inst, label), guess) in enumerate(zip(test_data, guesses)):
            if guess == label:
                continue
            # Context entries are normally (word, tag) pairs; keep any
            # non-tuple entry as-is so `position` still lines up.
            tokens = [tok[0] if isinstance(tok, tuple) else tok
                      for tok in inst.context]
            tokens[inst.position] = tokens[inst.position].upper()
            errors.append([str(idx), ' '.join(tokens), guess, label])
        with open('errors.txt', 'w') as output_error_file:
            output_error_file.write('There are ' + str(len(errors)) + ' errors!'
                                    + '\n' + '----------------------------'
                                    + '\n' + '\n')
            for rank, error in enumerate(errors, 1):
                output_error_file.write(str(rank) + ') ' + 'example number: '
                                        + error[0] + '\n' + ' sentence: '
                                        + error[1] + '\n' + ' guess: '
                                        + error[2] + '; label: ' + error[3]
                                        + '\n' + '\n')

    if confusion_matrix:
        cm = nltk.ConfusionMatrix(gold, guesses)
        print(cm)
        return cm
def test(test_iter, folds, training_folds):
    """Run repeated fold-wise evaluation of the most-frequent-sense lookup.

    test_iter -- number of outer iterations; each one re-splits the corpus.
    folds -- number of folds passed to `split_set` and looped over.
    training_folds -- passed through to `partition_set`.

    Returns a flat list with one `most_sensed_checked` result per
    (iteration, fold) pair, in execution order.
    """
    from nltk.corpus import senseval

    results = []
    for iteration in range(test_iter):
        print("iteration %d ..." % (iteration + 1))
        ini_set = split_set(folds, senseval.instances())
        for fold in range(folds):
            print("...fold %d ..." % (fold + 1))
            # parts[0] is used for training, parts[1] for checking.
            parts = partition_set(training_folds, ini_set, fold)
            sense_table = get_most_sensed_list(parts[0])
            outcome = most_sensed_checked(sense_table, parts[1])
            results.append(outcome)
    return results
def sense_anal(word):
    """Print a short sense report for one senseval word.

    word -- identifier passed unchanged to `seval.instances`.

    Prints the word, the set of sense labels, the number of collected
    labels, and the "borders": (index, label) pairs marking each position
    where the label differs from the previous one (index 0 always counts).
    A word without any labels yields an empty border list instead of an
    IndexError.
    """
    print("word:  %s" % (word,))
    senses = []
    for inst in seval.instances(word):
        senses.extend(inst.senses)
    print("senses:  %s" % (set(senses),))
    print("sentences:  %s" % (len(senses),))
    borders = [(i, sense) for i, sense in enumerate(senses)
               if i == 0 or sense != senses[i - 1]]
    print("borders:  %s" % (borders,))
def test(test_iter, folds, training_folds):
    """Train and test a single `Memory` object over repeated folds.

    test_iter -- number of outer iterations; each one re-splits the corpus.
    folds -- number of folds passed to `split_set2` and looped over.
    training_folds -- passed through to `partition_set`.

    Returns a list with one `mem.test(...)` result per (iteration, fold).
    """
    # NOTE(review): one Memory instance is reused for every fold; whether
    # `train` resets earlier state is not visible here -- confirm.
    mem = Memory()
    results = []
    for iteration in range(test_iter):
        print("iteration %d ..." % (iteration + 1))
        ini_set = split_set2(folds, senseval.instances()[0:])
        for fold in range(folds):
            print("...fold %d ..." % (fold + 1))
            sets = partition_set(training_folds, ini_set, fold)
            print("-$$Train time$$-")
            mem.train(sets[0])
            print("-$$results time$$-")
            fold_result = mem.test(sets[1])
            results.append(fold_result)
    return results
def test_main():
    """Train a `Memory` model on one partition and print its accuracy.

    Uses the first 10000 senseval instances, split with `split_set2(5, ...)`
    and partitioned with `partition_set(4, ..., 0)`; element 0 of the
    partition is used for training and element 1 for testing.
    """
    mem = Memory()
    print("loading data_set")
    ini_set = split_set2(5, senseval.instances()[0:10000])
    data_set = partition_set(4, ini_set, 0)
    #Serializer.save("/tmp/portioned_data", data_set)
    #data_set = Serializer.load("/tmp/portioned_data")
    print("training data")
    mem.train(data_set[0])
    #print "saving data"
    #mem.save_values("/tmp/mem_internals")
    #mem.load_values("/tmp/mem_internals")
    print("------*********testing**********------")
    results = mem.test(data_set[1])
    if not results:
        # Nothing to average; avoid dividing by zero.
        print("no test results to score")
        return
    # Multiply by a float first so Python 2 does not truncate the ratio
    # to 0 when the results are integers.
    print("%3.1f %% accuracy" % (100.0 * sum(results) / len(results)))
def wsd_demo(trainer, word, features, n=1000):
    """Train and score a sense classifier for `word`, then return it.

    trainer -- callable taking (featureset, label) pairs, returning a classifier.
    word -- senseval file id passed to `senseval.instances`.
    features -- feature extractor called with a single instance.
    n -- upper bound on how many shuffled instances are used; the first 80%
         of them train the classifier and the remainder test it.

    Prints the senses, the accuracy and, for classifiers that implement
    `prob_classify_many`, the average log likelihood on the test items.
    """
    import random
    from nltk.corpus import senseval

    global _inst_cache

    # Load (or reuse) the labelled instances for this word.
    print('Reading data...')
    if word not in _inst_cache:
        _inst_cache[word] = [(i, i.senses[0]) for i in senseval.instances(word)]
    labelled = _inst_cache[word][:]
    if n > len(labelled):
        n = len(labelled)
    sense_labels = list(set(label for (inst, label) in labelled))
    print(' Senses: ' + ' '.join(sense_labels))

    # Fixed seed: the shuffle, and therefore the split, is reproducible.
    print('Splitting into test & train...')
    random.seed(123456)
    random.shuffle(labelled)
    cut = int(.8 * n)
    train_pairs = labelled[:cut]
    test_pairs = labelled[cut:n]

    print('Training classifier...')
    classifier = trainer([(features(inst), label) for (inst, label) in train_pairs])

    print('Testing classifier...')
    acc = accuracy(classifier,
                   [(features(inst), label) for (inst, label) in test_pairs])
    print('Accuracy: %6.4f' % acc)

    # Probability-capable classifiers also report the log likelihood;
    # the others signal NotImplementedError and are skipped silently.
    try:
        test_featuresets = [features(inst) for (inst, label) in test_pairs]
        pdists = classifier.prob_classify_many(test_featuresets)
        log_likelihoods = [pdist.logprob(gold)
                           for ((inst, gold), pdist) in zip(test_pairs, pdists)]
        print('Avg. log likelihood: %6.4f'
              % (sum(log_likelihoods) / len(test_pairs)))
    except NotImplementedError:
        pass

    return classifier
def batch_classify(items, tests):
    """Classify the test data of every item and concatenate the results.

    items -- iterable of dotted senseval item names.
    tests -- mapping keyed by the first two dot-separated parts of an item
             name; the value is handed to `build_test`.

    For each item the training set is refined `BOOTSTRAP_REPS` times via
    `bootstrap` before the final `classify` call.  Progress is reported on
    stderr.  Returns one flat list of all final classifications.
    """
    senses = []
    for item in items:
        sys.stderr.write("classifying %s\n" % item)
        lexitem = ".".join(item.split(".")[0:2])

        trains = []
        for instance in senseval.instances(item):
            trains.append(dict(context=instance.context,
                               position=instance.position,
                               senses=instance.senses))

        train = build_train(item, trains)
        test = build_test(item, tests[lexitem])

        # TODO(astory): make dynamic?
        for _ in range(BOOTSTRAP_REPS):
            classified = classify(train, test)
            train = bootstrap(train, test, classified)

        senses.extend(classify(train, test))
    return senses
for item in b: key= "fol"+str(b.index(item)+1)+"-word" value= item dictionary[key]=value key= "fol"+str(b.index(item)+1)+"-pos" text = nltk.word_tokenize(item) value= nltk.pos_tag(text)[0][1] dictionary[key]=value return dictionary if __name__=="__main__": for item in items: totalResult= [] windowSize=4 dictionary={} for instance in senseval.instances(item)[:10]: pos = instance.position context = instance.context senses = instance.senses #print context #print context[pos] d= colocation(windowSize, pos, context,dictionary) print d
def wst_classifier(trainer, word, features, stopwords_list=STOPWORDS,
                   number=300, log=False, distance=3, confusion_matrix=False):
    """Train and evaluate a word-sense classifier for one senseval word.

    Arguments:
      trainer -- a trainer (e.g., NaiveBayesClassifier.train).
      word -- a target word from senseval2 (senseval.fileids() lists them:
          'hard.pos', 'interest.pos', 'line.pos' and 'serve.pos').
      features -- a feature extractor (wsd_context_features or
          wsd_word_features), called as features(instance, vocab, distance).
      stopwords_list -- forwarded to `extract_vocab` as `stopwords`.
      number -- (default 300) for wsd_word_features, the number of most
          frequent context words used to classify examples.
      log -- (default False) if true, the errors are written to errors.txt.
      distance -- (default 3) window size for wsd_context_features
          (distance=3 gives 3 words and tags on each side of the target).
      confusion_matrix -- (default False) if true, a confusion matrix is
          printed and returned.

    The senseval data for the word is split into a training set and a test
    set.  The split is the same on every call because random.seed gets a
    fixed argument; removing it would make the split differ per call.  The
    trainer is run on the training set, the resulting classifier is tested
    on the test set, and its accuracy is printed.

    With log true, each error is recorded with (i) the example number (the
    index of the example within test_data), (ii) the sentence the target
    word appeared in, with the target upper-cased, (iii) the (incorrect)
    derived label and (iv) the gold label.

    With confusion_matrix true, each cell [i,j] of the printed matrix says
    how often label j was predicted when the correct label was i (so the
    diagonal holds the correct predictions).

    Returns the confusion matrix when `confusion_matrix` is true, else None.
    """
    print("Reading data...")
    global _inst_cache
    if word not in _inst_cache:
        _inst_cache[word] = [(i, i.senses[0]) for i in senseval.instances(word)]
    events = _inst_cache[word][:]
    senses = list(set(l for (i, l) in events))
    instances = [i for (i, l) in events]
    vocab = extract_vocab(instances, stopwords=stopwords_list, n=number)
    print(' Senses: ' + ' '.join(senses))

    # Split the instances into a training and a test set.
    n = len(events)
    random.seed(5444522)
    random.shuffle(events)
    split = int(0.8 * n)
    training_data = events[:split]
    test_data = events[split:]

    print('Training classifier...')
    classifier = trainer([(features(i, vocab, distance), label)
                          for (i, label) in training_data])

    print('Testing classifier...')
    # Extract the test features once; the same feature sets (including
    # `distance`) feed the accuracy, the error log and the confusion matrix.
    test_featuresets = [features(i, vocab, distance) for (i, label) in test_data]
    gold = [label for (i, label) in test_data]
    acc = accuracy(classifier, list(zip(test_featuresets, gold)))
    print('Accuracy: %6.4f' % acc)

    guesses = None
    if log or confusion_matrix:
        guesses = [classifier.classify(fs) for fs in test_featuresets]

    if log:
        print('Writing errors to errors.txt')
        errors = []
        for idx, ((inst, label), guess) in enumerate(zip(test_data, guesses)):
            if guess == label:
                continue
            # Context entries are normally (word, tag) pairs; keep any
            # non-tuple entry as-is so `position` still lines up.
            tokens = [tok[0] if isinstance(tok, tuple) else tok
                      for tok in inst.context]
            tokens[inst.position] = tokens[inst.position].upper()
            errors.append([str(idx), ' '.join(tokens), guess, label])
        with open('errors.txt', 'w') as output_error_file:
            output_error_file.write('There are ' + str(len(errors)) + ' errors!'
                                    + '\n' + '----------------------------'
                                    + '\n' + '\n')
            for rank, error in enumerate(errors, 1):
                output_error_file.write(str(rank) + ') ' + 'example number: '
                                        + error[0] + '\n' + ' sentence: '
                                        + error[1] + '\n' + ' guess: '
                                        + error[2] + '; label: ' + error[3]
                                        + '\n' + '\n')

    if confusion_matrix:
        cm = nltk.ConfusionMatrix(gold, guesses)
        print(cm)
        return cm
import random

import nltk
from nltk.corpus import senseval

instances = senseval.instances('hard.pos')
size = int(len(instances) * 0.1)
# First 10% of the instances is the test set, the remaining 90% the training set.
train_set, test_set = instances[size:], instances[:size]

# Debug aid:
# for i in train_set: print(i.context)


def features(instance):
    """Build a feature dict from the neighbours of the target word.

    Keys:
      'wp' / 'tp' -- word and tag of the entry before the target; when the
                     target is the first entry only 'tp' is set, to
                     (position, 'BOS').
      'wf' / 'tf' -- word and tag of the entry after the target; when the
                     target is the last entry only 'tf' is set, to
                     (position, 'EOS'), instead of raising IndexError.
    """
    feat = dict()
    p = instance.position
    context = instance.context
    if p:
        feat['wp'] = context[p - 1][0]
        feat['tp'] = context[p - 1][1]
    else:
        # feat['wp'] = (p, 'BOS')
        feat['tp'] = (p, 'BOS')
    if p + 1 < len(context):
        feat['wf'] = context[p + 1][0]
        feat['tf'] = context[p + 1][1]
    else:
        # Mirror the begin-of-sentence branch for a sentence-final target.
        feat['tf'] = (p, 'EOS')
    return feat


# Only instances carrying exactly one sense label are used.
featureset = [(features(i), i.senses[0]) for i in instances if len(i.senses) == 1]
print(featureset)
def wst_classifier(trainer, word, features, stopwords_list=STOPWORDS,
                   number=600, log=False, distance=3, confusion_matrix=False):
    """Train and evaluate a word-sense classifier and export the test data.

    Arguments:
      trainer -- a trainer (e.g., NaiveBayesClassifier.train).
      word -- a target word from senseval2 (senseval.fileids() lists them:
          'hard.pos', 'interest.pos', 'line.pos' and 'serve.pos').
      features -- a feature extractor (wsd_context_features or
          wsd_word_features), called as features(instance, vocab, distance).
      stopwords_list -- forwarded to `extract_vocab` as `stopwords`.
      number -- (default 600) forwarded to `extract_vocab` as `n`.
      log -- accepted for compatibility; this variant writes no error log.
      distance -- (default 3) third argument of every `features` call.
      confusion_matrix -- (default False) if true, a confusion matrix is
          printed and returned.

    The shuffle is unseeded, so the 90/10 training/test split differs on
    every call.  The test items are written to "test_data_<word>.tsv" as
    "<label><TAB><words>" lines and that path is handed to
    `startSimilarity` before the classifier is trained.

    Returns the confusion matrix when `confusion_matrix` is true, else None.
    """
    print("Reading data...")
    global _inst_cache
    if word not in _inst_cache:
        _inst_cache[word] = [(i, i.senses[0]) for i in senseval.instances(word)]
    events = _inst_cache[word][:]
    senses = list(set(l for (i, l) in events))
    instances = [i for (i, l) in events]
    vocab = extract_vocab(instances, stopwords=stopwords_list, n=number)
    print(' Senses: ' + ' '.join(senses))

    # Split the instances into disjoint training and test sets.  The test
    # slice used to start at 10% and therefore contained most of the
    # training items, which inflated the reported accuracy.
    n = len(events)
    print(n)
    random.shuffle(events)
    split = int(0.9 * n)
    training_data = events[:split]
    test_data = events[split:]

    # Export the test data for the similarity algorithm.
    path = "test_data_" + word + ".tsv"
    with open(path, "w") as f:
        for (inst, label) in test_data:
            # Only genuine (word, tag) pairs contribute; a bare two-character
            # string must not be unpacked as if it were a pair.
            sentence = "".join(
                entry[0] + " " for entry in inst.context
                if isinstance(entry, tuple) and len(entry) == 2)
            f.write("" + label + "\t" + sentence + "\n")
    startSimilarity(path)

    print('Training classifier...')
    classifier = trainer([(features(i, vocab, distance), label)
                          for (i, label) in training_data])

    print('Testing classifier...')
    test_featuresets = [features(i, vocab, distance) for (i, label) in test_data]
    gold = [label for (i, label) in test_data]
    acc = accuracy(classifier, list(zip(test_featuresets, gold)))
    print('Accuracy: %6.4f' % acc)

    if confusion_matrix:
        # Same feature sets as the accuracy above (including `distance`).
        derived = [classifier.classify(fs) for fs in test_featuresets]
        cm = nltk.ConfusionMatrix(gold, derived)
        print("Machine Learning Confusion-Matrix")
        print(cm)
        return cm
# Disabled experiments, kept for reference.
#wst_classifier(NaiveBayesClassifier.train, 'line.pos', wsd_word_features,distance=3, confusion_matrix=True)
#wst_classifier(NaiveBayesClassifier.train, 'serve.pos', wsd_word_features,distance=3, confusion_matrix=True)
#wst_classifier(NaiveBayesClassifier.train, 'interest.pos', wsd_context_features,distance=3, confusion_matrix=True)
#wst_classifier(NaiveBayesClassifier.train, 'hard.pos', wsd_context_features,distance=3, confusion_matrix=True)
#wst_classifier(NaiveBayesClassifier.train, 'line.pos', wsd_context_features,distance=3, confusion_matrix=True)
#wst_classifier(NaiveBayesClassifier.train, 'serve.pos', wsd_context_features,distance=3, confusion_matrix=True)

# logistic regression === max Entropy classifier
wst_classifier(MaxentClassifier.train, 'interest.pos', wsd_word_features,distance=3, confusion_matrix=True)
#wst_classifier(MaxentClassifier.train, 'hard.pos', wsd_word_features,distance=3, confusion_matrix=True)
#wst_classifier(MaxentClassifier.train, 'line.pos', wsd_word_features,distance=3, confusion_matrix=True)
#wst_classifier(MaxentClassifier.train, 'serve.pos', wsd_word_features,distance=3, confusion_matrix=True)
#wst_classifier(MaxentClassifier.train, 'interest.pos', wsd_context_features,distance=3, confusion_matrix=True)
#wst_classifier(MaxentClassifier.train, 'hard.pos', wsd_context_features,distance=3, confusion_matrix=True)
#wst_classifier(MaxentClassifier.train, 'line.pos', wsd_context_features,distance=3, confusion_matrix=True)
#wst_classifier(MaxentClassifier.train, 'serve.pos', wsd_context_features,distance=3, confusion_matrix=True)

start()

# Frequency Baseline: relative frequency of the most common first sense.
sense_fd = nltk.FreqDist([i.senses[0] for i in senseval.instances('hard.pos')])
# FreqDist.max() names the most frequent sample directly; indexing keys()
# only worked while FreqDist kept its keys sorted by frequency.
most_frequent_sense = sense_fd.max()
frequency_sense_baseline = sense_fd.freq(most_frequent_sense)
print("frequency baseline: %s" % frequency_sense_baseline)
##0.79736902838679902
def create_labeled_data():
    """Skeleton for building labeled data from the 'interest.pos' instances.

    The construction step is still a placeholder: as written, the function
    returns the `...` (Ellipsis) object.
    """
    # collect all data from the corpus
    interest = senseval.instances('interest.pos')
    # create labeled data
    # NOTE(review): placeholder -- presumably meant to be derived from
    # `interest`, which is otherwise unused; confirm the intended format.
    labeled_data = ...
    return labeled_data
sense = { 'interest_1': 0, 'interest_2': 1, 'interest_3': 2, 'interest_4': 3, 'interest_5': 4, 'interest_6': 5 } bayes = [[], [], [], [], [], []] count = [0, 0, 0, 0, 0, 0] n = 0 for instance in senseval.instances('interest.pos')[0:1599]: count[sense[instance.senses[0]]] += 1 sentence = ' '.join(w for (w, p) in instance.context) parsed = list(parser.parse(tokenizer.tokenize(sentence))) for triple in parsed[0].triples(): related = 0 if triple[0][0] in interest: word = triple[2][0] related = 1 if triple[2][0] in interest: word = triple[0][0] related = 1 if related == 1: exist = 0 for item in bayes[sense[instance.senses[0]]]: if item[0] == word:
def get_category(pos):
    """Collect the `senses` attribute of every instance of a senseval item.

    pos -- identifier passed unchanged to `senseval.instances`.

    Returns a list with one entry per instance, in corpus order.
    """
    return [inst.senses for inst in senseval.instances(pos)]
'interest_4': 3, 'interest_5': 4, 'interest_6': 5 } bayes = pickle.load(open('bayes_bag.txt', 'r')) count = pickle.load(open('count.txt', 'r')) correct = 0 base = 0 n = 0 for instance in senseval.instances('interest.pos')[1600:2000]: score = [] for num in count[0:6]: score.append(math.log(num / 1600.0)) p = instance.position + 2 sentence = list(['<BOS1>', '<BOS2>']) for word in instance.context: sentence.append(word[0]) sentence.append('<EOS1>') sentence.append('<EOS2>') bag = [sentence[p - 2], sentence[p - 1], sentence[p + 1], sentence[p + 2]] for word in bag:
for sample in samples: context_index = sample.context freq_dict = Counter(context_index) tuples = [tuple([x, y]) for x, y in freq_dict.items()] ids, counts = zip(*tuples) C[iter, ids] = counts iter += 1 return C if __name__ == "__main__": t0 = time.time() instances = senseval.instances(hard_f) # all training samples as a list samples = [sample(inst) for inst in instances] # V is size of Vocab, K is number of clusters word_to_id, V, K = create_vocab(samples) # convert contexts to indices so they can be used for indexing for sample in samples: sample.context_to_index(word_to_id) # initialize vj|s, priors EM = EM(V, K) # C is a sample_size * vocab_size matrix C = counts_matrix(samples, V)
NO_STOPWORDS = [] print "Came here" """ Get POS tag based features """ #POS = set() #POS_DICT = {'': 0, 'PRP$': 1, 'VBG': 2, 'VBD': 3, 'VB': 26, "''": 5, 'VBP': 6, 'WDT': 7, 'JJ': 8, 'WP': 9, 'VBZ': 10, 'DT': 11, '"': 12, 'RP': 13, '$': 14, 'NN': 15, '(': 16, 'FW': 17, 'POS': 18, '.': 19, 'TO': 20, 'PRP': 21, 'RB': 22, ':': 23, 'NNS': 24, 'NNP': 25, '``': 4, 'WRB': 27, 'CC': 28, 'PDT': 30, 'RBS': 31, 'RBR': 32, 'VBN': 33, 'R': 34, 'EX': 35, 'IN': 36, 'WP$': 37, 'CD': 38, 'MD': 39, 'NNPS': 40, 'h': 41, 'NNP ': 45, 'JJS': 42, 'JJR': 43, 'SYM': 44, 's': 29, 'UH': 46, 'VBP ': 47} #hard_rtnsl = ['through', u'shoe', 'skin', 'find', 'ground', u'discipline', u'ha', 'had', 'to', 'going', u'board', 'do', 'good', 'get', 'very', 'material', u'capsule', u'breast', 'day', 'people', u'seat', 'see', 'are', 'packed', 'out', 'even', 'for', 'crust', 'enough', 'between', 'red', 'be', 'wheat', 'dirt', 'imagine', 'carbide', 'come', 'on', 'stone', 'her', 'of', 'taking', 'keep', 'turn', 'place', 'cheese', 'into', u'one', 'down', 'fast', 'little', 'long', u'eye', 'would', 'been', 'plastic', 'much', 'way', 'taken', 'tell', u'shell', 'took', 'part', 'determination', u'line', 'believe', 'with', 'myself', 'look', 'this', 'science', 'up', 'making', u'feeling', 'study', 'is', 'surface', 'evidence', 'at', 'have', 'in', 'court', 'winter', 'no', 'make', 'reality', 'rubber', 'take', 'so', "'s", 'sided', 'enamel', 'coat', u'cover', u'face', 'edge', 'green', 'time', 'baked', 'having', u'fact', 'know'] #line_rtnls = ['taxi', 'walking', 'answering', 'deck', 'telecom', u'executive', 'hamlet', 'through', 'fishing', 'crowded', 'fine', 'profitable', 'cut', 'personal', 'lake', 'should', 'to', u'minute', u'joke', 'complaint', u'tourist', 'outside', 'food', 'jerking', 'five', 'drawn', u'walk', 'pier', 'bank', u'loss', 'like', 'cable', u'transmission', 'gender', 'motorboat', 'blurrier', 'specific', 'fisherman', 'crossing', 'hotline', 'river', 'side', 'clothes', 'draw', 'old', 'people', 'acquired', 'attached', 
'fish', u'traveler', 'direct', 'blurred', u'computer', 'trading', 'are', u'sea', u'year', 'separating', 'laundry', 'racial', 'investment', 'network', 'for', 'waiting', 'profit', 'legal', 'access', 'written', 'blur', 'new', 'reading', 'across', 'blurry', 'be', u'telecommunication', 'business', 'exchange', 'sold', 'communicate', 'drew', 'water', 'busy', 'corp', 'snapped', 'along', 'by', 'tug', 'on', 'about', 'carried', 'jeep', 'of', 'industry', 'drag', 'against', 'bow', 'telesis', 'airport', 'tangled', 'stand', 'social', 'retail', 'first', 'co', 'bell', u'communication', 'into', 'private', 'one', 'hook', 'jammed', 'fast', '176', u'open', 'market', 'speak', 'standing', 'toy', 'from', 'tread', 'service', 'two', 'long', 'subscriber', 'pc', 'vax', 'call', u'vehicle', u'wait', 'checkout', u'store', 'more', 'flat', 'dialogue', 'selling', 'door', 'forming', 'company', 'formed', 'phone', 'understand', u'switchboard', 'catch', 'fastened', 'with', 'than', u'customer', 'novel', u'word', u'hour', 'these', u'car', 'non-art', u'caller', 'gasoline', 'up', u'rope', 'cast', 'crossed', 'thin', 'editorial', 'were', 'called', 'acquisition', 'toll-free', u'ad', 'toss', 'share', 'hauling', 'heard', u'say', 'pulling', 'at', 'have', 'in', 'ship', u'dealer', 'film', 'inc', 'sell', 'end', u'conversation', 'secured', 'get', 'brand', 'cross', u'actor', 'uttered', u'book', u'speech', 'catfish', 'switching', 'long-distance', u'product', 'exactly', "'s", u'price', 'ideological', u'hang', 'tied', 'tow', '000', 'pulled', 'delicate', 'such', 'blurring', 'single', 'off', 'third', 'largely', 'consumer', 'clear', u'sale', 'drawing', 'green', 'enter', 'apparel', 'buoy', 'corporate', 'divided', 'reserve'] print "Came here again" instances1 = sense_instances(senseval.instances('hard.pos'), 'HARD1') instances2 = sense_instances(senseval.instances('hard.pos'), 'HARD2') instances3 = sense_instances(senseval.instances('hard.pos'), 'HARD3') instances4 = sense_instances(senseval.instances('line.pos'), 'cord') 
instances5 = sense_instances(senseval.instances('line.pos'), 'division') instances6 = sense_instances(senseval.instances('line.pos'), 'formation') instances7 = sense_instances(senseval.instances('line.pos'), 'product') instances8 = sense_instances(senseval.instances('line.pos'), 'text') instances9 = sense_instances(senseval.instances('line.pos'), 'phone') lmtzr = WordNetLemmatizer() print "Came here" def modify_instance_with_CRFtag(index,filename,instances): j = index
#We built the model on train and test on dev_test_set and reworked the model until we had good #results on both train and dev_test_set. However, that means we probably still overfit a bit to those two #datasets. We would expect to perform slightly worse on the test_set and that is what happened. #3 The Senseval 2 Corpus contains data intended to train word-sense disambiguation classifiers. It contains data for #four words: hard, interest, line, and serve. Choose one of these four words, and load the corresponding data: #Using this dataset, build a classifier that predicts the correct sense tag for a given instance. See the corpus HOWTO #at http://nltk.org/howto for information on using the instance objects returned by the Senseval 2 Corpus. import nltk from nltk.corpus import senseval import random #I chose to use the word 'serve' instances = senseval.instances('serve.pos') size = int(len(instances) * 0.1) for inst in instances[:5]: p = inst.position left = ' '.join(w for (w,t) in inst.context[p-2:p]) word = ' '.join(w for (w,t) in inst.context[p:p+1]) right = ' '.join(w for (w,t) in inst.context[p+1:p+3]) senses = ' '.join(inst.senses) def features(instance): feat = dict() p = instance.position ## previous word and tag if p: ## > 0 feat['wp'] = instance.context[p-1][0]
def split_corpus(train_p=0.8, sample_num=1100, sample_range=1100):
    """Build an interleaved training text and per-word test data.

    train_p -- fraction of the sampled indices used for training.
    sample_num -- how many indices are drawn via `sample`.
    sample_range -- indices are drawn from range(sample_range).

    Returns (train, test, labels, offsets):
      train   -- flat list of words from the training sentences.
      test    -- dict word -> flat list of cleansed test-sentence tokens.
      labels  -- dict word -> list of sense labels of the test instances.
      offsets -- dict word -> positions of the target within test[word].
    """
    names = ('hard', 'line', 'serve')

    logging.info("start corpus")
    logging.info(" restriction starts")
    # Restrict the corpora to their 2 most common senses.
    hard = seval.instances("hard.pos")[0:631]
    hard += seval.instances("hard.pos")[3455:3957]
    line = seval.instances("line.pos")[1096:2200]
    serve = seval.instances("serve.pos")[0:600]
    serve += seval.instances("serve.pos")[2486:3086]

    logging.info(" value setting starts")
    train = []
    test = {name: [] for name in names}
    labels = {name: [] for name in names}
    offsets = {name: [] for name in names}
    corpora = [hard, line, serve]
    # Random order for the sentences.
    samples = sample(range(sample_range), sample_num)
    border = int(sample_num * train_p)

    logging.info(" training samples start")
    # Alternate between the ambiguous words to prevent skew.
    for idx in samples[:border]:
        for corpus in corpora:
            train.extend(entry[0] for entry in corpus[idx].context
                         if isinstance(entry, tuple))

    logging.info(" test samples start")
    lengths = {name: 0 for name in names}
    for idx in samples[border:]:
        for corpus in corpora:
            inst = corpus[idx]
            word = inst.word.split('-')[0]
            # The context sometimes contains non-tuple entries; keep those
            # as they are and take the word out of the tuples.
            raw_sentence = [entry[0] if isinstance(entry, tuple) else entry
                            for entry in inst.context]
            # Cleansing has to happen here (and not later) so that the
            # offsets stay aligned with the cleansed text.
            sentence, position = cleanse_corpus_pos_aware(raw_sentence,
                                                          inst.position)
            test[word].extend(sentence)
            labels[word].extend(inst.senses)
            offsets[word].append(position + lengths[word])
            lengths[word] += len(sentence)

    logging.info("end corpus")
    test_length = sum(len(test[name]) for name in names)
    logging.info("length train: {}, length test: {}".format(len(train),
                                                            test_length))
    logging.info("labels(h/l/s): {}/{}/{}".format(
        len(labels['hard']), len(labels['line']), len(labels['serve'])))
    return train, test, labels, offsets
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Print the senseval file ids and a small concordance for 'interest.pos'."""
from nltk.corpus import senseval


def _join_words(tagged):
    """Join the words of a sequence of (word, tag) pairs with spaces."""
    return ' '.join(w for (w, t) in tagged)


print(senseval.fileids())
print(senseval.instances('hard.pos'))

for inst in senseval.instances('interest.pos')[:10]:
    p = inst.position
    # Clamp the window start: a negative start would wrap around to the end
    # of the sentence and drop the left context of an early target word.
    left = _join_words(inst.context[max(p - 2, 0):p])
    word = _join_words(inst.context[p:p + 1])
    right = _join_words(inst.context[p + 1:p + 3])
    senses = ' '.join(inst.senses)
    print('%20s |%10s | %-15s -> %s' % (left, word, right, senses))
# Build (featureset, label) pairs for the three name splits.
train_set = [(gender_features2(n), gender) for (n, gender) in train_names]
devtest_set = [(gender_features2(n), gender) for (n, gender) in devtest_names]
test_set = [(gender_features2(n), gender) for (n, gender) in test_names]
# Train a naive Bayes classifier on the training set.
classifier = nltk.NaiveBayesClassifier.train(train_set)
# Print the accuracy on the dev-test set.
print(nltk.classify.accuracy(classifier, devtest_set))
# Print the accuracy on the held-out test set.
print(nltk.classify.accuracy(classifier, test_set))
#####################################
#3
# Import the senseval corpus reader.
from nltk.corpus import senseval
# Load the instances of 'interest.pos'.
instances = senseval.instances('interest.pos')
# size is 10% of the number of instances.
size = int(len(instances) * 0.1)
# The first `size` instances (10%) form test_set, the remaining 90% train_set.
train_set, test_set = instances[size:], instances[:size]
# Train naive Bayes on train_set.
# NOTE(review): train_set holds raw instances here, not (featureset, label)
# pairs like the name classifier above -- presumably features must be
# extracted first; confirm against the code that follows.
classifier = nltk.NaiveBayesClassifier.train(train_set)
# Return the sense feature dict: the last item of `left` under 'prefix'
# (`word` and `right` are currently unused).
def sense_features(left, word, right):
    return {'prefix': left[-1:]}
# Senseval objects are not directly iterable, so the method below is used
# to iterate over them and create the training and then the testing set.
def WSDClasifier(trainer, word, features, stopwords=STOPWORDS, number=300,
                 distance=3, log=False, confusion_matrix=False):
    """Build a classifier for the senseval2 senses of a word and apply it.

    :param trainer: callable taking (featureset, label) pairs and returning
        a classifier (e.g. NaiveBayesClassifier.train)
    :param word: from senseval2 (we have 'hard.pos', 'interest.pos',
        'line.pos' and 'serve.pos')
    :type word: str
    :param features: feature set constructor, called as
        features(instance, vocab, distance)
    :param stopwords: currently not used by this function
    :param number: passed to extract_vocab when constructing the second
        argument to the feature set constructor
    :type number: int
    :param distance: passed to the feature set constructor as 3rd argument
    :type distance: int
    :param log: if set to True writes the errors into a file errors.txt
    :type log: bool
    :param confusion_matrix: if set to True prints a confusion matrix
    :type confusion_matrix: bool

    Calling this function splits the senseval data for the word into a
    training set and a test set (the split is the same for each call,
    because the argument to random.seed is fixed; removing it would make
    the sets differ each time).  It then trains the trainer on the training
    set, tests the resulting classifier on the test set, and prints its
    accuracy.

    If log is true, the errors over the test set are written to errors.txt.
    For each error four things are recorded: (i) the example number within
    the test data (the index of the example within test_data); (ii) the
    sentence that the target word appeared in, (iii) the (incorrect)
    derived label, and (iv) the gold label.

    If confusion_matrix is true, a confusion matrix is printed, where each
    cell [i,j] indicates how often label j was predicted when the correct
    label was i (so the diagonal holds the correct predictions).
    """
    global inst_cache
    if word not in inst_cache:
        inst_cache[word] = [(i, i.senses[0]) for i in senseval.instances(word)]
    events = inst_cache[word][:]
    senses = list(set(l for (i, l) in events))
    instances = [i for (i, l) in events]
    # NOTE(review): `stopwords` is never forwarded; confirm whether
    # extract_vocab should receive it.
    vocab = extract_vocab(instances, number)
    print(' Senses: ' + ' '.join(senses))

    # Split the instances into a training and a test set.
    N = len(events)
    random.seed(123456789)
    random.shuffle(events)
    split = int(0.8 * N)
    train_data = events[:split]
    test_data = events[split:]

    print('Training classifier...')
    classifier = trainer([(features(i, vocab, distance), label)
                          for (i, label) in train_data])

    print('Testing classifier...')
    # Extract the test features once; the same feature sets (including
    # `distance`) feed the accuracy, the error log and the confusion matrix.
    test_featuresets = [features(i, vocab, distance) for (i, label) in test_data]
    gold = [label for (i, label) in test_data]
    acc = accuracy(classifier, list(zip(test_featuresets, gold)))
    print('Accuracy: {:6.4f}'.format(acc))

    guesses = None
    if log or confusion_matrix:
        guesses = [classifier.classify(fs) for fs in test_featuresets]

    if log:
        print('Writing errors to errors.txt')
        errors = []
        for idx, ((inst, label), guess) in enumerate(zip(test_data, guesses)):
            if guess == label:
                continue
            # Non-tuple context entries are kept as-is so that `position`
            # still indexes the right word.
            word_list = [cv[0] if isinstance(cv, tuple) else cv
                         for cv in inst.context]
            word_list[inst.position] = word_list[inst.position].upper()
            errors.append([str(idx), ' '.join(word_list), guess, label])
        with open('errors.txt', 'w') as error_file:
            # The header line needs its own newline before the separator.
            error_file.write('There are {} errors\n'.format(len(errors)))
            error_file.write('----------------------------\n')
            for rank, (num, snt, guess, label) in enumerate(errors, 1):
                error_file.write(
                    '{}) example #: {} \n sentence: {}\n guess: {}\n label: {}\n'
                    .format(rank, num, snt, guess, label))

    if confusion_matrix:
        cm = nltk.ConfusionMatrix(gold, guesses)
        print(cm)
def wst_classifier(trainer, word, features, stopwords_list=STOPWORDS,
                   number=300, log=False, distance=3, confusion_matrix=False):
    """
    Train a word-sense classifier for a senseval2 word and evaluate it.

    Arguments:
      trainer -- a trainer (e.g., NaiveBayesClassifier.train); it is called
          with a list of (featureset, label) pairs and must return an object
          with a ``classify`` method.
      word -- a target word from senseval2 (see senseval.fileids(); they are
          'hard.pos', 'interest.pos', 'line.pos' and 'serve.pos').
      features -- a feature set function (this can be wsd_context_features or
          wsd_word_features), called as features(instance, vocab, distance).
      stopwords_list -- forwarded to extract_vocab as ``stopwords``.
      number -- (defaults to 300) forwarded to extract_vocab as ``n``; for
          wsd_word_features this is the number of most frequent context
          words used to classify examples.
      log -- (defaults to False) if true, the errors are written to
          errors.txt.
      distance -- (defaults to 3) the window size for wsd_context_features
          (distance=3 gives 3 words and tags to the left and 3 to the right
          of the target word).
      confusion_matrix -- (defaults to False) if true, a confusion matrix is
          printed and returned.

    The senseval data for the word is split 80/20 into a training set and a
    test set. The split is the same on every call because random.seed is
    given a fixed argument (this reseeds the module-level generator). The
    trainer is run on the training set and the accuracy of the resulting
    classifier on the test set is printed.

    When ``log`` is set, four things are recorded for each error: (i) the
    index of the example within test_data, (ii) the sentence the target word
    appeared in, with the target upper-cased, (iii) the (incorrect) derived
    label, and (iv) the gold label.

    When ``confusion_matrix`` is set, each cell [i,j] of the matrix indicates
    how often label j was predicted when the correct label was i (so the
    diagonal entries are correct predictions).

    Returns the nltk.ConfusionMatrix when ``confusion_matrix`` is set,
    otherwise None.
    """
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("Reading data...")
    global _inst_cache
    if word not in _inst_cache:
        _inst_cache[word] = [(i, i.senses[0]) for i in senseval.instances(word)]
    # Copy the cached list so that shuffling below leaves the cache untouched.
    events = _inst_cache[word][:]
    senses = list(set(label for (_, label) in events))
    instances = [inst for (inst, _) in events]
    vocab = extract_vocab(instances, stopwords=stopwords_list, n=number)
    print(' Senses: ' + ' '.join(senses))

    # Split the instances into a training and test set (fixed seed, 80/20).
    n = len(events)
    random.seed(5444522)
    random.shuffle(events)
    cut = int(0.8 * n)
    training_data = events[:cut]
    test_data = events[cut:]

    # Train classifier
    print('Training classifier...')
    classifier = trainer([(features(inst, vocab, distance), label)
                          for (inst, label) in training_data])

    # Test classifier
    print('Testing classifier...')
    acc = accuracy(classifier, [(features(inst, vocab, distance), label)
                                for (inst, label) in test_data])
    print('Accuracy: %6.4f' % acc)

    if log:
        # Write error file
        print('Writing errors to errors.txt')
        errors = []
        for item_index, (inst, label) in enumerate(test_data):
            guess = classifier.classify(features(inst, vocab, distance))
            if guess == label:
                continue
            # Context entries are (word, tag) tuples, but some may be bare
            # strings; unconditional tuple unpacking would fail on those.
            words = [tok[0] if isinstance(tok, tuple) else tok
                     for tok in inst.context]
            # Upper-case the target word so it stands out in the sentence.
            words[inst.position] = words[inst.position].upper()
            errors.append((str(item_index), ' '.join(words), guess, label))

        # The context manager closes the file even if a write fails.
        with open('errors.txt', 'w') as output_error_file:
            output_error_file.write(
                'There are ' + str(len(errors)) + ' errors!' + '\n' +
                '----------------------------' + '\n' + '\n')
            for idx, (num, sentence, guess, label) in enumerate(errors, 1):
                output_error_file.write(
                    str(idx) + ') ' + 'example number: ' + num + '\n' +
                    ' sentence: ' + sentence + '\n' +
                    ' guess: ' + guess + '; label: ' + label + '\n' + '\n')

    if confusion_matrix:
        gold = [label for (_, label) in test_data]
        # Use the same `distance` as for training and accuracy, so the matrix
        # describes the classifier that was actually evaluated.
        derived = [classifier.classify(features(inst, vocab, distance))
                   for (inst, _) in test_data]
        cm = nltk.ConfusionMatrix(gold, derived)
        print(cm)
        return cm
def senses(word):
    """Return the distinct first-listed senses among a word's instances.

    ``word`` is handed to ``senseval.instances``; for every instance the
    first entry of its ``senses`` is collected and duplicates are dropped.
    The result is a list whose order is whatever set iteration yields.
    """
    distinct = set()
    for inst in senseval.instances(word):
        distinct.add(inst.senses[0])
    return list(distinct)
# Protsay Solomia, Chapter 6, Exercise 3
import nltk
from nltk.corpus import senseval

# Open corpus data
instances = senseval.instances('serve.pos')
features = []
for inst in instances:
    # Convert bare strings in the context to (string, "None") tuples so that
    # the whole context can be turned into a dictionary.
    context = [c if isinstance(c, tuple) else (c, "None") for c in inst.context]
    f = dict(context)  # Creating a dictionary from the context pairs
    # Adding the features "word" and "position"
    f.update({"word": inst.word, "position": inst.position})
    # The label is the instance's senses joined into one string.
    features.append((f, ' '.join(inst.senses)))

# Set the amount of testing data (10%)
size = int(len(features) * 0.1)
# Make two data sets: the first `size` items for testing, the rest for training
train_set, test_set = features[size:], features[:size]
# Train the classifier
classifier1 = nltk.NaiveBayesClassifier.train(train_set)
# Evaluate the accuracy of the classifier; the single-argument print(...)
# call works on both Python 2 and Python 3.
print(nltk.classify.accuracy(classifier1, test_set))
# ### You need to describe what data you plan to use and how it will be partitioned into training, development/validation and test sets.
#
# I am using the Senseval corpus. After randomization, I will split the data into training and testing sets. This is done by ...
#
# Validation/development ?
#
# As for extracting features, I am planning on using a) context words (as in, words that appear around the focus word) and b) the 'senses' category, which represents the exact meaning of the focus word.
#

# In[279]:

# Show the available file ids, then the first instance of each file.
print("All fileids:", senseval.fileids())
print()
for fileid in senseval.fileids():
    print(senseval.instances(fileid)[0])
    print()

# In[280]:

def get_category(pos):
    """Collect the ``senses`` attribute of every instance in a senseval file.

    ``pos`` is passed unchanged to ``senseval.instances`` (presumably a file
    id such as those listed by ``senseval.fileids()``). The returned list has
    one entry per instance, in the order the instances are yielded.
    """
    return [inst.senses for inst in senseval.instances(pos)]

# In[303]: