def clean_lemmatization(column, type):
    """Tokenize, POS-tag and lemmatize a text column, skipping noun-tagged tokens.

    Parameters
    ----------
    column : pandas.Series
        Series of raw text strings.
    type : str
        Column-name prefix; results are written into the module-level ``df``
        under ``type + '_tokenized_lem'``.
        NOTE(review): shadows the ``type`` builtin — kept for caller
        compatibility.

    Returns
    -------
    pandas.Series
        The lemmatized token lists (also stored in ``df`` as a side effect).
    """
    lem = WordNetLemmatizer()
    # Pair each token with its WordNet POS tag and drop noun-tagged tokens.
    # (wordnet_tags maps a Penn Treebank tag to a WordNet tag, or None —
    # presumably for tags WordNet has no category for; confirm with its def.)
    df[type + '_tokenized_lem'] = column.apply(
        lambda x: [(x, wordnet_tags(tag))
                   for x, tag in pos_tag(word_tokenize(x))
                   if wordnet_tags(tag) != wordnet.NOUN])
    # Lemmatize with the tag when one is available; otherwise keep the raw token.
    df[type + '_tokenized_lem'] = df[type + '_tokenized_lem'].apply(
        lambda x: [lem.lemmatize(z[0], z[1]) if z[1] is not None else z[0]
                   for z in x])
    return df[type + '_tokenized_lem']
def ptb2universal(tagged_text: list) -> list:
    """Translate POS tags from the Penn Treebank tagset to the universal tagset.

    Converts the extended Penn Treebank tag set (36 tags) into the
    universal tag set (12 tags).

    Parameters
    ----------
    tagged_text: list
        (word, POS tag) pairs as produced by a tagger using the extended
        form, e.g. VBD, VBG, VBN, VBP, VBZ.

    Return
    ------
    list
        The same (word, POS tag) pairs with each tag replaced by its
        universal equivalent (all the examples above become VERB).
    """
    ptb_to_universal = tagset_mapping('en-ptb', 'universal')
    return [(word, ptb_to_universal[pos]) for word, pos in tagged_text]
def __init__(self):
    """Initialize the tagset translation table and the WordNet lemmatizer."""
    # Penn Treebank -> universal tagset translation table.
    self.mapping = tagset_mapping('en-ptb', 'universal')
    # WordNet-backed lemmatizer shared by this instance.
    self.wordnet_lemmatizer = WordNetLemmatizer()
import numpy as np  # was missing: np is used below to parse the vectors
import pandas as pd
from nltk import word_tokenize
from nltk import pos_tag
from nltk.tag.mapping import tagset_mapping

'''
Location of file(s) required to run the program
'''
law2Vec_doc = "../data/Law2Vec/Law2Vec.200d.txt"

'''
Define & initialize global constants
'''
word_dimension = 200
max_premise_length, max_hypothesis_length = 200, 80
PTB_UNIVERSAL_MAP = tagset_mapping('en-ptb', 'universal')
POS_categories = {
    '.', 'ADJ', 'ADP', 'ADV', 'CONJ', 'DET', 'NOUN', 'NUM',
    'PRON', 'PRT', 'VERB', 'X'
}
tag_dimension = len(POS_categories)

# Read law2vec vectors from law2Vec_doc and store in a dictionary as [word:vector]
law2vec_wordmap = {}
with open(law2Vec_doc, "r", errors='ignore') as law2vec:
    for line in law2vec:
        # Each line is "<word> <v1> <v2> ... <v200>"; split off the word once.
        name, vector = tuple(line.split(" ", 1))
        # np.fromstring(..., sep=" ") is deprecated; parse the floats explicitly.
        law2vec_wordmap[name] = np.array(vector.split(), dtype=np.float64)

# Delete variables no longer required to free the RAM.  The loop variables
# are only bound when the file had at least one line, so guard the del.
del law2Vec_doc
if law2vec_wordmap:
    del line, name, vector
from nltk.tag.perceptron import PerceptronTagger # In[39]: PerceptronAv = nltk.PerceptronTagger(stratified_split_train) # 5.3. Report on accuracy, and per tag Precision, Recall, F and confusion matrix. # # The PerceptronTagger uses a different set of tags (Penn TreeBank) and all our previous work uses the Universal tagset, # so we want to map the Penn TreeBank tagset to the Universal tagset. This mapping will help us re-use previous code without change. # In[40]: from nltk.tag import mapping tag_dict = mapping.tagset_mapping('en-ptb', 'universal') # In[41]: def PerceptronmicroEvaluate(self, corpus_test): # True positive count (TP): number of words tagged as T both in the test set and by the tagger. # True negative count (TN): words tagged as non-T both in the test set and by the tagger. # False positive count (FP): words tagged as non-T in the test set and as T by the tagger. # False negative (FN): words tagged as T in the test set and as non-T by the tagger. # flatten test list testList = [item for sublist in corpus_test for item in sublist] # get words without tags testWords = [word for word, tag in testList] # get tagger's prediction