from nltk.corpus import brown, treebank, nps_chat, conll2000


def getData(corpus="brown", categories=""):
    if corpus == "brown":
        if categories != "":
            return brown.tagged_sents(tagset='universal',
                                      categories=categories)

        return brown.tagged_sents(tagset='universal')
    elif corpus == "treebank":
        return treebank.tagged_sents(tagset='universal')
    elif corpus == "nps_chat":
        # Dialogue corpus: rebuild tagged posts by slicing the flat stream of
        # tagged words into chunks matching the length of each post
        data = []
        posts = nps_chat.posts()
        words = nps_chat.tagged_words(tagset='universal')

        index = 0
        for sent in posts:
            data.append(words[index:index + len(sent)])
            index += len(sent)
        return data

    elif corpus == "conll2000":
        return conll2000.tagged_sents(tagset='universal')

    return brown.tagged_sents(tagset='universal')
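

# Usage sketch (assumes the NLTK corpora above have already been downloaded
# via nltk.download): fetch universally tagged sentences from two sources.
brown_news = getData("brown", categories="news")
chat_posts = getData("nps_chat")
print(len(brown_news), len(chat_posts))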


import nltk
import pandas as pd
from nltk.stem import WordNetLemmatizer


def part_of_speech(sentence):
    # (word, tag) pairs from the NPS Chat corpus, universal tagset
    chat_tagged = nps_chat.tagged_words(tagset="universal")
    df = pd.DataFrame(list(chat_tagged), columns=['word', 'category'])
    # Keep only the first category observed for each word
    df_unique = df.drop_duplicates(subset=['word'])

    # Lemmatize the input sentence and keep one row per distinct lemma
    lemmatizer = WordNetLemmatizer()
    aux = [lemmatizer.lemmatize(w, get_wordnet_pos(w))
           for w in nltk.word_tokenize(sentence.lower())]
    aux_1 = pd.DataFrame(aux, columns=['word'])
    aux_2 = pd.DataFrame(aux_1.word.value_counts().index, columns=['word'])

    # Return the tagged rows for the lemmas that occur in the sentence
    return pd.merge(df_unique, aux_2, on=['word'], how='inner', validate='1:1')
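

# get_wordnet_pos() is used above but not defined in this excerpt; a minimal
# sketch of the commonly used helper that maps the first letter of a word's
# Penn Treebank tag to the matching WordNet POS constant (noun by default):
from nltk.corpus import wordnet


def get_wordnet_pos(word):
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ, "N": wordnet.NOUN,
                "V": wordnet.VERB, "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)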
Example #3
from nltk.corpus import brown
from nltk.corpus import nps_chat as chat
from nltk import RegexpTagger
from nltk import UnigramTagger
from nltk import BigramTagger
from nltk import TrigramTagger

sizeB = len(brown.tagged_sents())  # number of tagged sentences in the Brown corpus
sizeC = len(chat.tagged_posts())  # number of tagged posts in the NPS Chat corpus

brownTS = brown.tagged_sents()
brownTW = brown.tagged_words()  # flat list of (word, tag) pairs from Brown

chatTP = chat.tagged_posts()  # list of posts, each a list of (word, tag) pairs
chatTW = chat.tagged_words()  # flat list of (word, tag) pairs from NPS Chat


def splitSen(c, p):  # split the chosen corpus into train/test portions at fraction p
    if c == "brown":
        t1 = brownTS[:int(sizeB * p)]
        t2 = brownTS[int(sizeB * p):]
        return t1, t2
    if c == "chat":
        t1 = chatTP[:int(sizeC * p)]
        t2 = chatTP[int(sizeC * p):]
        return t1, t2


brownTrain50, brownTest50 = splitSen("brown", 0.5)
brownTrain90, brownTest90 = splitSen("brown", 0.9)
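
# The tagger classes imported above are never used in this excerpt; a minimal
# sketch of the usual regexp -> unigram -> bigram -> trigram backoff chain,
# trained on the 90% Brown split defined above (the regexp patterns are
# illustrative, not taken from the original):
patterns = [
    (r'.*ing$', 'VBG'),                # gerunds
    (r'.*ed$', 'VBD'),                 # simple past
    (r'.*s$', 'NNS'),                  # plural nouns
    (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
    (r'.*', 'NN'),                     # everything else defaults to noun
]
regexpTagger = RegexpTagger(patterns)
unigramTagger = UnigramTagger(brownTrain90, backoff=regexpTagger)
bigramTagger = BigramTagger(brownTrain90, backoff=unigramTagger)
trigramTagger = TrigramTagger(brownTrain90, backoff=bigramTagger)
print(trigramTagger.evaluate(brownTest90))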
Example #4
# Return the POS of a rule (used for list sorting)
def get_key(rule):
    return rule.split()[1]


if __name__ == '__main__':
    # Get allowed words
    with open('../../allowed_words.txt', 'r') as allowed_words_file:
        allowed_words = allowed_words_file.read().splitlines()

    # Tagged words from corpora
    treebank_tagged_words = list(set(treebank.tagged_words()))
    conll2000_tagged_words = list(set(conll2000.tagged_words()))
    brown_tagged_words = list(set(brown.tagged_words()))
    nps_tagged_words = list(set(nps_chat.tagged_words()))

    vocab_rules = []
    unvocabbed_words = []

    # Find tags that occur with allowed words in the corpora
    for word in allowed_words:
        curr_tags = get_tags_linear(word, treebank_tagged_words)

        if not curr_tags:
            curr_tags = get_tags_linear(word, conll2000_tagged_words)

        if not curr_tags:
            curr_tags = get_tags_linear(word, brown_tagged_words)

        if not curr_tags:
            curr_tags = get_tags_linear(word, nps_tagged_words)

        # Words that are not tagged in any of the corpora are set aside
        if not curr_tags:
            unvocabbed_words.append(word)

            # Insert the pronunciation info string directly after the matched
            # spelling, then advance the search position past the insertion
            loc = sent.find(spelling, current_loc)
            sent = sent[:loc + len(spelling)] + info + sent[loc + len(spelling):]
            current_loc = loc + len(spelling) + len(info)
        format_output['sentence'][row_idx] = sent
        print(sent, '--- Source: ', row['citation'])

    format_output.to_csv(filename, index=False, header=0)
    return


## Set up basic corpora
from nltk.corpus import cmudict, stopwords

pron_dict = cmudict.dict()
brown_words = brown.tagged_words(tagset='universal')
treebank_words = treebank.tagged_words(tagset='universal')
nps_words = nps_chat.tagged_words(tagset='universal')
corpus = brown_words + treebank_words + nps_words
corpus = [(word.lower(), tag) for (word, tag) in corpus]
stopset = set(stopwords.words('english'))
## Set up spaCy's pretrained word vectors
import spacy

nlp = spacy.load('en_core_web_lg')
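
## get_het_from_corpus() is defined elsewhere in the original project; a
## minimal sketch of one plausible implementation, assuming a potential
## heteronym is a non-stopword with a CMUdict entry that occurs with more
## than one universal POS tag (the original may use different criteria and
## return a different structure):
def get_het_from_corpus(tagged_corpus):
    tags_per_word = {}
    for word, tag in tagged_corpus:
        if word.isalpha() and word not in stopset and word in pron_dict:
            tags_per_word.setdefault(word, set()).add(tag)
    # Keep words observed with at least two distinct tags
    return {w: tags for w, tags in tags_per_word.items() if len(tags) > 1}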

## Collect potential heteronyms
data = get_het_from_corpus(corpus)

## Assign Wiktionary data to the potential heteronyms
parser = init_wikparser()
data = get_pronunciation(parser, data)
fine_data = fine_graining(data)

## Create reference dictionary for heteronyms

# Build 90/10 and 50/50 train/test splits of Brown sentences and NPS Chat posts
tagged_sents_brown = brown.tagged_sents()
tagged_posts_nps_chat = nps_chat.tagged_posts()
size_brown_09 = int(len(tagged_sents_brown) * 0.9)
size_brown_05 = int(len(tagged_sents_brown) * 0.5)
size_nps_chat_09 = int(len(tagged_posts_nps_chat) * 0.9)
size_nps_chat_05 = int(len(tagged_posts_nps_chat) * 0.5)
train_sents_brown_09 = tagged_sents_brown[:size_brown_09]
test_sents_brown_09 = tagged_sents_brown[size_brown_09:]
train_sents_brown_05 = tagged_sents_brown[:size_brown_05]
test_sents_brown_05 = tagged_sents_brown[size_brown_05:]

train_posts_nps_chat_09 = tagged_posts_nps_chat[:size_nps_chat_09]
test_posts_nps_chat_09 = tagged_posts_nps_chat[size_nps_chat_09:]
train_posts_nps_chat_05 = tagged_posts_nps_chat[:size_nps_chat_05]
test_posts_nps_chat_05 = tagged_posts_nps_chat[size_nps_chat_05:]

# Task a)
print("Task a)")
tags_brown = [tag for word, tag in brown.tagged_words()]
tags_nps_chat = [tag for word, tag in nps_chat.tagged_words()]

# Find most common tags
max_brown = nltk.FreqDist(tags_brown).max()  # NN
max_nps_chat = nltk.FreqDist(tags_nps_chat).max()  # UH

# Create default taggers
default_tagger_brown = nltk.DefaultTagger(max_brown)
default_tagger_nps_chat = nltk.DefaultTagger(max_nps_chat)

# Evaluate the default taggers on the Brown test splits
print("Accuracy Brown default tagger 90/10: ",
      default_tagger_brown.evaluate(test_sents_brown_09))
print("Accuracy Brown default tagger 50/50: ",
      default_tagger_brown.evaluate(test_sents_brown_05))
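
# The NPS Chat default tagger built above is not evaluated in this excerpt;
# a minimal sketch of the parallel evaluation on the chat test splits:
print("Accuracy NPS Chat default tagger 90/10: ",
      default_tagger_nps_chat.evaluate(test_posts_nps_chat_09))
print("Accuracy NPS Chat default tagger 50/50: ",
      default_tagger_nps_chat.evaluate(test_posts_nps_chat_05))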
# init colorama
import colorama

colorama.init()

# global constants
CONST_tagset = 'universal'

# global list of gold corpora
# C:\Users\admin\AppData\Roaming\nltk_data\corpora\
corp_names = [
    "brown", "nps_chat", "conll2000", "treebank", "twitter", "nhtsa_0",
    "nhtsa_1", "nhtsa_2", "nhtsa_3", "nhtsa_4", "nhtsa_5", "nhtsa_6"
]
corp_words_tagged = [
    brown.tagged_words(tagset=CONST_tagset),
    nps_chat.tagged_words(tagset=CONST_tagset),
    conll2000.tagged_words(tagset=CONST_tagset),
    treebank.tagged_words(tagset=CONST_tagset)
]
corp_words_untagged = [
    brown.words(),
    nps_chat.words(),
    conll2000.words(),
    treebank.words()
]
corp_sents_tagged = [
    brown.tagged_sents(tagset=CONST_tagset),
    nps_chat.tagged_posts(tagset=CONST_tagset),
    conll2000.tagged_sents(tagset=CONST_tagset),
    treebank.tagged_sents(tagset=CONST_tagset)
]
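
# A sketch (not part of the original excerpt) that reports the size and most
# common universal tag of each gold corpus loaded above. Only the first four
# entries of corp_names have parallel data here; the Twitter and NHTSA corpora
# are assumed to be loaded elsewhere in the project.
import nltk
from colorama import Fore, Style

for name, tagged in zip(corp_names, corp_words_tagged):
    freq = nltk.FreqDist(tag for word, tag in tagged)
    print(Fore.GREEN + name + Style.RESET_ALL, len(tagged),
          "tagged words; most common tag:", freq.max())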