Code Example #1
from nltk import pos_tag, word_tokenize
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tag.mapping import tagset_mapping

def clean_lemmatization(column, type):
  lem = WordNetLemmatizer()
  penn_treebank = tagset_mapping('en-ptb', 'universal')  # PTB -> Universal map
  # Tag each token, pair it with its WordNet POS, and drop plain nouns
  df[type + '_tokenized_lem'] = column.apply(lambda x: [(w, wordnet_tags(tag)) for w, tag in pos_tag(word_tokenize(x)) if wordnet_tags(tag) != wordnet.NOUN])
  # Lemmatize each token with its WordNet POS; leave tokens without one unchanged
  df[type + '_tokenized_lem'] = df[type + '_tokenized_lem'].apply(lambda x: [lem.lemmatize(w, pos) if pos is not None else w for w, pos in x])
  return df[type + '_tokenized_lem']
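The helper wordnet_tags and the DataFrame df are assumed to be defined elsewhere in the module. A minimal sketch of such a helper, following the common PTB-to-WordNet convention (an assumption, not the original author's code):

from nltk.corpus import wordnet

def wordnet_tags(ptb_tag):
    # Map the first letter of a PTB tag to a WordNet POS constant;
    # return None when the lemmatizer has no matching POS
    prefix_map = {'J': wordnet.ADJ, 'V': wordnet.VERB,
                  'N': wordnet.NOUN, 'R': wordnet.ADV}
    return prefix_map.get(ptb_tag[0])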
Code Example #2
from nltk.tag.mapping import tagset_mapping

def ptb2universal(tagged_text: list) -> list:
    """Convert the Penn Treebank extended POS tag set (36 tags) into
    the universal tag set (12 tags).

    Parameters
    ----------

    tagged_text: list
                 A list of (word, POS tag) pairs returned by a POS tagger
                 in the extended form, e.g. VBD, VBG, VBN, VBP, VBZ.

    Returns
    -------

    new_text: list
              The same list of (word, POS tag) pairs in the universal
              form; the tags above are all changed to VERB.

    """
    new_text = []
    mapa = tagset_mapping('en-ptb', 'universal')
    for word, pos in tagged_text:
        new_text.append((word, mapa[pos]))
    return new_text
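A quick usage sketch (the sentence and printed tags are illustrative):

from nltk import pos_tag, word_tokenize

tagged = pos_tag(word_tokenize("The cat sat quietly."))
print(ptb2universal(tagged))
# e.g. [('The', 'DET'), ('cat', 'NOUN'), ('sat', 'VERB'), ('quietly', 'ADV'), ('.', '.')]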
Code Example #3
File: ptb2ud.py  Project: zhanglang1860/NegBio
def __init__(self):
    # Cache the WordNet lemmatizer and the PTB -> Universal tag mapping
    self.wordnet_lemmatizer = WordNetLemmatizer()
    self.mapping = tagset_mapping('en-ptb', 'universal')
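For context, a hypothetical sketch (not NegBio's actual code) of how the cached mapping and lemmatizer might be combined to normalize a single PTB-tagged token:

def normalize(self, word, ptb_tag):
    # Hypothetical illustration only: map to a universal tag, then lemmatize
    upos = self.mapping[ptb_tag]
    wn_pos = {'NOUN': 'n', 'VERB': 'v', 'ADJ': 'a', 'ADV': 'r'}.get(upos)
    lemma = self.wordnet_lemmatizer.lemmatize(word, wn_pos) if wn_pos else word
    return lemma, upos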
Code Example #4
import numpy as np  # needed below to parse the word vectors
import pandas as pd
from nltk import word_tokenize
from nltk import pos_tag
from nltk.tag.mapping import tagset_mapping
'''
Location of file(s) required to run the program
'''

law2Vec_doc = "../data/Law2Vec/Law2Vec.200d.txt"
'''
Define & initialize global constants
'''

word_dimension = 200
max_premise_length, max_hypothesis_length = 200, 80
PTB_UNIVERSAL_MAP = tagset_mapping('en-ptb', 'universal')
POS_categories = {
    '.', 'ADJ', 'ADP', 'ADV', 'CONJ', 'DET', 'NOUN', 'NUM', 'PRON', 'PRT',
    'VERB', 'X'
}
tag_dimension = len(POS_categories)

# Read law2vec vectors from law2Vec_doc and store them in a dictionary as {word: vector}
law2vec_wordmap = {}
with open(law2Vec_doc, "r", errors='ignore') as law2vec:
    for line in law2vec:
        name, vector = tuple(line.split(" ", 1))
        # Parse the space-separated vector; np.fromstring(text, sep=...) is deprecated
        law2vec_wordmap[name] = np.array(vector.split(), dtype=float)
    del law2Vec_doc, line, name, vector  # drop temporaries that are no longer needed
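Given PTB_UNIVERSAL_MAP and the 12 POS_categories above, a tag can be encoded as a tag_dimension-length one-hot feature. The helper below is an illustrative assumption, not part of the original file:

POS_INDEX = {tag: i for i, tag in enumerate(sorted(POS_categories))}

def pos_one_hot(ptb_tag):
    # One-hot encode a token's universal POS tag (hypothetical helper)
    vec = np.zeros(tag_dimension)
    vec[POS_INDEX[PTB_UNIVERSAL_MAP[ptb_tag]]] = 1.0
    return vec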

Code Example #5
File: hw1.py  Project: ibrahemi1994/NLP171
from nltk.tag.perceptron import PerceptronTagger

# In[39]:

# PerceptronTagger's constructor does not accept training data; load an empty
# model and train it on the stratified training split explicitly.
PerceptronAv = PerceptronTagger(load=False)
PerceptronAv.train(stratified_split_train)

# 5.3. Report on accuracy, and per tag Precision, Recall, F and confusion matrix.
#

# The PerceptronTagger uses a different tag set (Penn Treebank), while all our
# previous work uses the Universal tagset, so we map the Penn Treebank tags to
# Universal tags. This mapping lets us reuse the previous code unchanged.

# In[40]:

from nltk.tag import mapping
tag_dict = mapping.tagset_mapping('en-ptb', 'universal')
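With tag_dict in hand, the tagger's PTB output can be remapped in one pass (the sentence and resulting tags below are illustrative):

tagged = PerceptronAv.tag(['The', 'court', 'ruled', 'quickly'])
universal_tagged = [(word, tag_dict[tag]) for word, tag in tagged]
# e.g. [('The', 'DET'), ('court', 'NOUN'), ('ruled', 'VERB'), ('quickly', 'ADV')]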

# In[41]:


def PerceptronmicroEvaluate(self, corpus_test):
    # True positive count (TP): number of words tagged as T both in the test set and by the tagger.
    # True negative count (TN): words tagged as non-T both in the test set and by the tagger.
    # False positive count (FP): words tagged as non-T in the test set and as T by the tagger.
    # False negative (FN): words tagged as T in the test set and as non-T by the tagger.

    # flatten test list
    testList = [item for sublist in corpus_test for item in sublist]
    # get words without tags
    testWords = [word for word, tag in testList]
    # get tagger's prediction