def load_data(self, percentage):
        print("Started Loading the Data")
        # Get the complete data
        data_set = treebank.fileids()
        # Partition the data into train and test data sets
        training_data_fileIds = [file for file in data_set if "wsj_00" in str(file)]
        testing_data_fileIds = [file for file in data_set if "wsj_01" in str(file)]

        # What fraction of the files should be used for training?
        index = int(percentage*len(training_data_fileIds))
        training_data_fileIds = training_data_fileIds[:index]

        tagged_training_data = treebank.tagged_sents(fileids=training_data_fileIds)
        tagged_testing_data = treebank.tagged_sents(fileids=testing_data_fileIds)

        tagged_training_words = treebank.tagged_words(fileids=training_data_fileIds)
        tagged_testing_words = treebank.tagged_words(fileids=testing_data_fileIds)

        # print(len(tagged_training_data1), len(tagged_testing_data1))

        # UnTag the data for other uses
        untagged_training_data = [untag(item) for item in tagged_training_data]
        untagged_testing_data = [untag(item) for item in tagged_testing_data]

        print("Data Loaded Successfully. Stats are")
        print("Training Data Sentences: ", len(tagged_training_data))
        print("Testing Data  Sentences: ", len(tagged_testing_data))

        return tagged_training_data, tagged_testing_data, tagged_training_words, tagged_testing_words, untagged_training_data, untagged_testing_data
Example #2
def chunker(parsedData):
    """
    Extract the grammar rules from the input parsed text and assign
    each rule with the probability of it occuring in the parsed text.
    """
    tags_words = treebank.tagged_words()

    # This is the list where all the rules will be stored, for
    # construction of the PCFG
    rules = []
    NP = Nonterminal('NP')
    rhs_rules = []

    # Extract the rules from the training-data
    for sent in parsedData:
        for production in sent.productions():
            rules.append(production)

    # Add the lexical rules
    for word, tag in tags_words:

        # For each tagged word, create a tree containing that
        # lexical rule
        # This is to be able to add it to the list rules
        t = Tree.fromstring("(" + tag + " " + word + ")")
        for production in t.productions():
            rules.append(production)

    # All the syntactic rules and all of the lexical rules
    # are extracted from the training-data
    # Here the PCFG is extracted
    rules_prob = nltk.grammar.induce_pcfg(Nonterminal('S'), rules)
    return rules_prob
Example #3
def demo():
    """
    A demonstration of the porter stemmer on a sample from
    the Penn Treebank corpus.
    """

    import re

    from nltk.corpus import treebank
    from nltk import stem

    stemmer = stem.PorterStemmer()

    orig = []
    stemmed = []
    for item in treebank.fileids()[:3]:
        for (word, tag) in treebank.tagged_words(item):
            orig.append(word)
            stemmed.append(stemmer.stem(word))

    # Convert the results to a string, and word-wrap them.
    results = ' '.join(stemmed)
    results = re.sub(r"(.{,70})\s", r'\1\n', results + ' ').rstrip()

    # Convert the original to a string, and word wrap it.
    original = ' '.join(orig)
    original = re.sub(r"(.{,70})\s", r'\1\n', original + ' ').rstrip()

    # Print the results.
    print('-Original-'.center(70).replace(' ', '*').replace('-', ' '))
    print(original)
    print('-Results-'.center(70).replace(' ', '*').replace('-', ' '))
    print(results)
    print('*' * 70)
def demo():
    """
    A demonstration of the porter stemmer on a sample from
    the Penn Treebank corpus.
    """

    import re

    from nltk.corpus import treebank
    from nltk import stem

    stemmer = stem.PorterStemmer()

    orig = []
    stemmed = []
    for item in treebank.fileids()[:3]:
        for (word, tag) in treebank.tagged_words(item):
            orig.append(word)
            stemmed.append(stemmer.stem(word))

    # Convert the results to a string, and word-wrap them.
    results = ' '.join(stemmed)
    results = re.sub(r"(.{,70})\s", r'\1\n', results+' ').rstrip()

    # Convert the original to a string, and word wrap it.
    original = ' '.join(orig)
    original = re.sub(r"(.{,70})\s", r'\1\n', original+' ').rstrip()

    # Print the results.
    print('-Original-'.center(70).replace(' ', '*').replace('-', ' '))
    print(original)
    print('-Results-'.center(70).replace(' ', '*').replace('-', ' '))
    print(results)
    print('*'*70)
Example #5
    def process_basic(self):
        word_list = list(treebank.tagged_words())
        transaction_dict = {}
        emmision_dict = {}
        pos_tag_dict = {}
        for i in range(1, len(word_list), 2):
            word_tuples = (word_list[i - 1], word_list[i])
            tag1, tag2 = word_list[i - 1][1], word_list[i][1]

            if not pos_tag_dict.get(word_list[i - 1][1], 0):
                pos_tag_dict[word_list[i - 1][1]] = 0
            if not pos_tag_dict.get(word_list[i][1], 0):
                pos_tag_dict[word_list[i][1]] = 0
            pos_tag_dict[word_list[i - 1][1]] += 1
            pos_tag_dict[word_list[i][1]] += 1

            if not transaction_dict.get(
                (word_tuples[0][1], word_tuples[1][1]), 0):
                transaction_dict[(word_tuples[0][1], word_tuples[1][1])] = 0
            transaction_dict[(word_tuples[0][1], word_tuples[1][1])] += 1

            if not emmision_dict.get(word_tuples[0], 0):
                emmision_dict[word_tuples[0]] = 0
            emmision_dict[word_tuples[0]] += 1

            if not emmision_dict.get(word_tuples[1], 0):
                emmision_dict[word_tuples[1]] = 0
            emmision_dict[word_tuples[1]] += 1

        transaction_state_arr = self.get_transition_probabilty(
            pos_tag_dict, transaction_dict)
        emmision_state_arr = self.get_emmision_probability(
            pos_tag_dict, emmision_dict)
        for each in emmision_state_arr[::-1]:
            print("\t\t each", each)
Example #6
def demo():
    """
    A demonstration of the porter stemmer on a sample from
    the Penn Treebank corpus.
    """

    import re

    from nltk.corpus import treebank
    from nltk import stem

    stemmer = stem.PorterStemmer()

    orig = []
    stemmed = []
    for item in treebank.fileids()[:3]:
        for (word, tag) in treebank.tagged_words(item):
            orig.append(word)
            stemmed.append(stemmer.stem(word))

    # Convert the results to a string, and word-wrap them.
    results = " ".join(stemmed)
    results = re.sub(r"(.{,70})\s", r"\1\n", results + " ").rstrip()

    # Convert the original to a string, and word wrap it.
    original = " ".join(orig)
    original = re.sub(r"(.{,70})\s", r"\1\n", original + " ").rstrip()

    # Print the results.
    print("-Original-".center(70).replace(" ", "*").replace("-", " "))
    print(original)
    print("-Results-".center(70).replace(" ", "*").replace("-", " "))
    print(results)
    print("*" * 70)
Example #7
def test_sentences(grammar):

    for t in test:
        print("Processing: " + str(t))
        reference = list(treebank.tagged_words(t))

        tokens = list(treebank.words(t))

        print("fixing grammar.....")
        # Checks if grammar covers all words in the sentence and adds them to the grammar if necessary
        fixed_grammar = get_fixed_grammer(grammar, tokens)

        print("fixed grammar")
        print("Building Parser....")
        parser = ViterbiParser(fixed_grammar)

        print("Parsing...")
        #Gets list of all possible trees, the most likely tree is at index 0
        start = time.time()
        parses = parser.parse_all(tokens)
        print("Time")
        print(start - time.time())

        #Getting POS tags from parser tree
        leafs = parses[0].pos()

        #Calculating accuracy of Parser results
        correct_tags = 0.0
        for i in range(len(leafs)):
            if leafs[i] == reference[i]:
                correct_tags += 1.0


        print(str(correct_tags/len(leafs)))
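`get_fixed_grammer` is not defined in this snippet. Per the comment above, it extends the grammar so that every token of the sentence is covered. A minimal sketch under that assumption (the `NN` fallback tag is illustrative, and re-inducing the PCFG from one copy of each production discards the original probability estimates):

def get_fixed_grammer(grammar, tokens):
    # Hypothetical reconstruction, not the original helper: add a lexical
    # production NN -> 'token' for every token the grammar cannot generate,
    # then re-induce a PCFG over the augmented production list.
    from nltk import Nonterminal, Production, induce_pcfg
    covered = {p.rhs()[0] for p in grammar.productions() if p.is_lexical()}
    productions = list(grammar.productions())
    for token in tokens:
        if token not in covered:
            productions.append(Production(Nonterminal('NN'), [token]))
    return induce_pcfg(grammar.start(), productions)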
Example #8
def createPOSList():
    pos_map = dict()
    for a in treebank.tagged_words():
        word = a[0]
        if word not in pos_map:
            pos_map[word] = [posmap[a[1]]]
        else:
            pos_map[word] = list(set(pos_map[word] + [posmap[a[1]]]))
    return pos_map
Example #10
def preprocess_corpora():
	brown_words = brown.tagged_words(tagset='universal')
	treebank_words = treebank.tagged_words(tagset='universal')
	'''
	#this takes forever.
	bwog_corpus = nltk.corpus.PlaintextCorpusReader('../bwog-corpus-txt', '.*\.txt')
	bwog_sents = bwog_corpus.sents(bwog_corpus.fileids())
	bwog_words = []
	for s_i in xrange(0, len(bwog_sents)/100000):
		#TODO: skip punctuation
		simp_tagged_sent = [(word,simp_tag(tag)) for word,tag in nltk.pos_tag(bwog_sents[s_i])]
		bwog_words.extend(simp_tagged_sent)
	'''
	all_tagged_words = brown_words + treebank_words #+ bwog_words
	all_sents = brown.sents() + treebank.sents() #+ bwog_sents
	compute_concordance(all_tagged_words)
Example #11
    def convert_format(self):
        sentences = []
        words = pkl.load(open("LM_corpura//%s//%s" % (cfg['lm_corpus'], cfg['corpus__dict_file']), 'rb'))
        word_dict = dict([(word, key) for key, word in enumerate(words, 1)])
        #word_dict = common.get_word_dict(self.conf['index2word_path'])

        if self.name == "pos_tagging":
            tags = set([tag for word, tag in treebank.tagged_words()])
            tag_index = {tag: idx for idx, tag in enumerate(tags, 1)}
            for sentence in treebank.tagged_sents():
                sent_words = [(word_dict[common.tokenize(w)], tag_index[t])
                              if common.tokenize(w) in word_dict else (0, tag_index[t])
                              for w, t in sentence]
                sentences.append(sent_words)

        return sentences
Example #12
def get_words():
    """
        Returns list of words from nltk treebank
    """
    import nltk

    nltk.download("treebank")
    from nltk.corpus import treebank

    word_ls = []
    for item in treebank.fileids():
        for (word, tag) in treebank.tagged_words(item):
            # lowercase each word before adding it
            word = word.lower()
            word_ls.append(word)

    word_ls = list(set(word_ls))
    return word_ls
Example #13
def pos_tag(sentence, verbose=False):
    from nltk.corpus import treebank
    treebankDict = {}
    for (word, tag) in treebank.tagged_words():
        treebankDict[word] = tag

    words = tokenize.WhitespaceTokenizer().tokenize(sentence)
    tagged_words = []
    for word in words:
        try:
            tag = {
                'a': 'ex_quant',
                'an': 'ex_quant',
                'every': 'univ_quant'
            }[word.lower()]
        except KeyError:
            try:
                tag = treebankDict[word]
            except KeyError:
                raise KeyError('\'%s\' is not in the Part-of-Speech lookup' %
                               word)
        tagged_words.append((word, tag))
    return tagged_words
Example #14
def demo():
    from nltk.corpus import treebank 
    #from nltk.probability import LidstoneProbDist
    #from nltk.probability import WittenBellProbDist
    from nltk.probability import SimpleGoodTuringProbDist
    # NgramModel is only available in older NLTK releases (the nltk.model package was removed in NLTK 3)
    from nltk.model import NgramModel
    estimator = lambda fdist, bins: SimpleGoodTuringProbDist(fdist, len(fdist)+1) 
    #estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2) 
    #estimator = lambda fdist, bins: WittenBellProbDist(fdist, 0.2) 
    
    tag_corpus = []
    for (word,tag) in treebank.tagged_words():
        tag_corpus.append(tag)
    lm = NgramModel(2, tag_corpus, estimator)
    print(lm)
    lm1 = NgramModel(1, tag_corpus, estimator)
    print(lm1)
    print(tag_corpus[:20])

    sent = "NN"
    print(lm1.entropy(sent))

    sent = "DT "
    print(lm1.entropy(sent))

    sent = "VBZ"
    print(lm1.entropy(sent))

    sent = "JJ"
    print(lm1.entropy(sent))

    sent = "RB"
    print(lm1.entropy(sent))

    sent = "DT NN"
    print(lm.entropy(sent))
def main():
    tr=treebank.raw()
    tg=treebank.tagged_words()
    
##print([tag[0] for tag in treebank.tagged_words(tagset='universal')])
    
    tags=[tag[1] for tag in treebank.tagged_words(tagset='universal')]
  
    df = pd.read_csv('imdb_master.csv', encoding = 'ISO-8859-1')
    df=pd.DataFrame(df)
##    print (df.head())

    df2=df.loc[1:,['type','review','label','file']]
##    print (df2.head())
##    rev=df.loc[1:,['review']]
##    lab=df.loc[1:,['label']]
##    print(rev.head())
##    print(lab.head())
    rev_lab=df2[['review','label']]


    rev_lab=low(rev_lab)
    rev_lab=tok(rev_lab)
    rev_lab=call_clean(rev_lab)
##    print(rev_lab.head())
##
    rev_lab_non_stop=restop(rev_lab)

    rev_lab_non_stop_c=call(rev_lab_non_stop)
##    print(rev_lab_non_stop_c.head())


    lem_rev_lab=lemtize(rev_lab_non_stop_c)
    

    stem_rev_lab=stemma(lem_rev_lab)
##
##    print(lem_rev_lab.head())
##
##    print(stem_rev_lab.head())
    fd=nltk.FreqDist(tags)
    Comm=fd.most_common()
    tagged_revs=pos_t(stem_rev_lab)
    print(tagged_revs.head())
    nwf_pos=[]
    nwt_pos=[]
    nw_pos=[]
    for rw in tagged_revs['POS']:
        
        for (k,v) in rw:
##            print(k)
##
            nw_pos.append(str(k)+'_'+str(v))
##        nwt_pos.append(nw_pos)
##        nw_pos=[]
##    nwf_pos.append(nwt_pos)
##    nwt_pos=[]
##    print(nwf_pos[1])
##    nw_pos=' '.join(nw_pos)

##    tagged_revs['newpos']=nwf_pos
##    print(tagged_revs.head())
            
    fit_vec(stem_rev_lab)
Example #16
from nltk.probability import FreqDist
from nltk.corpus import treebank

fd = FreqDist()
for word, tag in treebank.tagged_words():
    fd[tag] += 1
tags = list(fd.items())
tags.sort(key=lambda tag_freq: tag_freq[0])
for tag, freq in tags:
    print('{0}\t\t\t{1}'.format(tag, freq))
Example #17
from nltk.corpus import treebank

file = input()
print(treebank.tagged_words(file)[0])
Example #18
# 	if i in token:
# 		x.append('digit_yes')
# 	else:
# 		x.append('digit_no')
# x_train.append(x)

#case11 Word contains a digit, current word, word contains a capital letter and previous word
x_train = []
y_train = []
list1 = [
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O',
    'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z'
]
list2 = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
pre_token = 'first'
for token, pos in treebank.tagged_words():
    # for token, pos in treebank.tagged_words()[:80000]:
    x = []
    y_train.append(pos)
    x.append(pre_token + '-' + token)
    x.append(token)
    pre_token = token
    for item in list1:
        if item in token:
            x.append('capital_yes')
        else:
            x.append('capital_no')
    for i in list2:
        if i in token:
            x.append('digit_yes')
        else:
Example #19
        f.write('%s\n' % word)

with open('treebank_words.txt', 'w') as f:
    for word in treebank.words():
        f.write('%s\n' % word)

tc_tags = []
for t in tc['tagged_words']:
    tc_tags.append(t[1])

with open('tc_tags.txt', 'w') as f:
    for tag in tc_tags:
        f.write('%s\n' % tag)

treebank_tags = []
for t in treebank.tagged_words():
    treebank_tags.append(t[1])

with open('treebank_tags.txt', 'w') as f:
    for tag in treebank_tags:
        f.write('%s\n' % tag)

with open('tc_sent_lengths.txt', 'w') as f:
    for sent in tc['sents']:
        f.write('%s\n' % len(sent))

with open('treebank_sent_lengths.txt', 'w') as f:
    for sent in treebank.sents():
        f.write('%s\n' % len(sent))

#tc_tags_series = pd.Series(tc_tags)
    def __init__(self):
        self.wordToTags = defaultdict(set)
        convertedTaggedWords = [(w,nltk.tag.mapping.map_tag('en-ptb', 'universal', t)) for w,t in treebank.tagged_words()]
        for word, tag in convertedTaggedWords:
            self.wordToTags[word].add(tag)

        productions = list()
        S = nltk.Nonterminal('S')
        for tree in treebank.parsed_sents():
            productions += tree.productions()
        # create the grammar
        pcfg = nltk.induce_pcfg(S, productions)
        # print(pcfg)
        self.viterb = ViterbiParser(pcfg)
        self.mostRecentTree = None
        self.validPosTags = set()
        self.validChunkTags = set()
        self.validIOBTags = set()
        self.relationTags = set()
        self.anchorTags = set()

        # POS tags (Penn Treebank tagset)
        self.validPosTags.update({
            "CC", "CD", "DT", "EX", "FW", "IN", "JJ", "JJR", "JJS", "LS",
            "MD", "NN", "NNS", "NNP", "NNPS", "PDT", "POS", "PRP", "PRP$",
            "RB", "RBR", "RBS", "RP", "SYM", "TO", "UH", "VB", "VBZ", "VBP",
            "VBD", "VBG", "WDT", "WP", "WP$", "WRB", ".", ",", ":", "(", ")",
        })

        # chunk tags
        self.validChunkTags.update({
            "NP", "PP", "VP", "ADVP", "ADJP", "SBAR", "PRT", "INTJ", "PNP",
        })

        # IOB prefixes
        self.validIOBTags.update({"I-", "O-", "B-"})

        # relation tags
        self.relationTags.update({
            "SBJ", "OBJ", "PRD", "TMP", "CLR", "LOC", "DIR", "EXT", "PRP",
        })

        # anchor tags
        self.anchorTags.update({"A1", "P1"})
Example #21
from nltk.corpus import brown as brown
from nltk.corpus import treebank as treebank
import json
import re

brown_tagged_words = brown.tagged_words(tagset='universal')
print("brown tags retrieved")

treebank_tagged_words = treebank.tagged_words(tagset='universal')
print("treebank tags retrieved")

all_tagged_words = brown_tagged_words + treebank_tagged_words
all_tagged_words = [(word.lower(), tag) for word, tag in all_tagged_words]
print("all_tags retrieved")

vocab = {}

for char in ".abcdefghijklmnopqrstuvwxyz":
    vocab[char] = {}

for i, (current_word, current_tag) in enumerate(sorted(all_tagged_words)):
    
    if current_word[0] in "abcdefghijklmnopqrstuvwxyz":
        char = current_word[0]
    else:
        char = '.'

    if current_word in vocab[char]:
        try:
            g = [pos[0] for pos in vocab[char][current_word]].index(current_tag)
            vocab[char][current_word][g] = (current_tag, vocab[char][current_word][g][1]+1)
print(reader.paras())
print(reader.tagged_paras())


#TaggedCorpus uses default tokenizer but we can change it by customizing it
from nltk.tokenize import SpaceTokenizer
reader=TaggedCorpusReader(root,r'.*\.pos',word_tokenizer=SpaceTokenizer())
print(reader.words())

#Customing TaggedCorpus's sentence tokenizer
from nltk.tokenize import LineTokenizer
reader=TaggedCorpusReader(root,r'.*\.pos',sent_tokenizer=LineTokenizer())
print(reader.words())

#Customizing TaggedCorpus's paragraph Block reader
#Customizing TaggedCorpus's tag separator - Pg 57

###To map a corpus's tags to the universal tagset, the corpus reader must be initialized
#with a known tagset name. Then you can pass tagset="universal" to the method.
reader=TaggedCorpusReader(root,r'.*\.pos',tagset='en-brown')
reader.tagged_words(tagset="universal")

#Ex:
from nltk.corpus import treebank
treebank.tagged_words()
treebank.tagged_words(tagset="universal")
#If we try to map from an unknown tagset or mapping, every word will be tagged with "UNK"
treebank.tagged_words(tagset="brown")


Example #23
# tokenize words
word_tokenizer = TreebankWordTokenizer()
word_list = [word_tokenizer.tokenize(sent) for sent in article_sent]

# train pos tagger
# evaluate accuracy
test_sents = treebank.tagged_sents()[3000:]
test_chunks = treebank_chunk.chunked_sents()[3000:]
conll_test = conll2000.chunked_sents('test.txt')

train_new_tagger = False 
if train_new_tagger:
  train_sents = treebank.tagged_sents()[:3000]
  #create a dictionary of the most frequent words from the treebank
  print("creating dictionary from treebank")
  model = word_tag_model(treebank.words(), treebank.tagged_words())
  
  #keeping tagger default for chaining purposes
  print("Training tagger")
  
  backoff= DefaultTagger('NN')
  nt = NamesTagger(backoff=backoff)
  #taggers = [UnigramTagger, BigramTagger, TrigramTagger]
  #trained_taggers = backoff_tagger(train_sents,taggers,backoff=nt)
  #Regexp - best to treat numbers? 
  regexp_tagger = RegexpTagger(patterns, backoff=nt)
  treebank_tagger = UnigramTagger(model=model,backoff=regexp_tagger)

  #skipping affix
  
  #skipping brill
Example #24
    return tag


# Return the POS of a rule (used for list sorting)
def get_key(rule):
    return rule.split()[1]


if __name__ == '__main__':
    # Get allowed words
    allowed_words_file = open('../../allowed_words.txt', 'r')
    allowed_words = allowed_words_file.read().split('\n')

    # Tagged words from corpora
    treebank_tagged_words = list(set(treebank.tagged_words()))
    conll2000_tagged_words = list(set(conll2000.tagged_words()))
    brown_tagged_words = list(set(brown.tagged_words()))
    nps_tagged_words = list(set(nps_chat.tagged_words()))

    vocab_rules = []
    unvocabbed_words = []

    # Find tags that occur with allowed words in the corpora
    for word in allowed_words:
        curr_tags = get_tags_linear(word, treebank_tagged_words)

        if not curr_tags:
            curr_tags = get_tags_linear(word, conll2000_tagged_words)

        if not curr_tags:
Example #25
lm.entropy(test_words)

# <markdowncell>

# ### Counting
# 
# For example, how many words in a corpus are not in WordNet?

# <codecell>

from nltk.corpus import wordnet
from nltk.probability import ConditionalFreqDist

cfd = ConditionalFreqDist(
      (pos, len(wordnet.synsets(word)) > 0) for word,pos in treebank.tagged_words()
)

cfd.tabulate()

# <markdowncell>

# ### Missing functionality
# 
# #### Head word identification
# 
# NLTK has no functionality to identify the head words of phrases. In this noun phrase, 'man' is the head word,
# but it is not straightforward to identify it.

# <codecell>
Example #26
from nltk.corpus import treebank
tb_tagged_sents = treebank.tagged_sents()
tb_sents = treebank.sents()

patterns = [
    (r'.*ing$', 'VBG'),  # gerunds
    (r'.*ed$', 'VBD'),  # simple past
    (r'.*es$', 'VBZ'),  # 3rd singular present
    (r'.*ould$', 'MD'),  # modals
    (r'.*\'s$', 'NN$'),  # possessive nouns
    (r'.*s$', 'NNS'),  # plural nouns
    (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
    (r'.*', 'NN')  # nouns (default)
]

tb_tags = [tag for (word, tag) in treebank.tagged_words()]
tb_tag = nltk.FreqDist(tb_tags)
#b) i)
print(tb_tag.most_common())
print(len(tb_sents))

#uni
n = int(len(tb_tagged_sents) * 0.1)
uni_train_sents = tb_tagged_sents[n:]
uni_test_sents = tb_tagged_sents[:n]

unigram_tagger = nltk.UnigramTagger(uni_train_sents)
uni_accuracy = unigram_tagger.evaluate(uni_test_sents)
print("Unigram Accuracy: ", uni_accuracy)

#bi
"""句法分析-形式语言与自动机"""
import nltk
from nltk import FreqDist, Nonterminal, nonterminals, Production
from nltk.corpus import treebank, sinica_treebank
from nltk.grammar import toy_pcfg2

print(str(nltk.corpus.treebank).replace('\\\\', '/'))
out = treebank.fileids()
print(out)
print(treebank.words('wsj_0007.mrg'))
print(treebank.tagged_words('wsj_0007.mrg'))
print(treebank.parsed_sents('wsj_0007.mrg')[2])
# parse tree
# treebank_chunk.chunked_sents()[1].draw()
# out = treebank_chunk.chunked_sents()[1].leaves()
# out = treebank_chunk.chunked_sents()[1].pos()
# out = treebank_chunk.chunked_sents()[1].productions()
# print(out)
fd = FreqDist()
fd.items()
print(sinica_treebank.sents())
print(sinica_treebank.parsed_sents()[27])
"""上下文无关文法(Context-free Grammar, CFG)
参考wiki 
自动机理论 https://zh.wikipedia.org/zh-cn/%E8%87%AA%E5%8B%95%E6%A9%9F%E7%90%86%E8%AB%96
在计算机科学中,若一个形式文法 G = (V, Σ, P, S) 的产生式规则都取如下的形式:A -> α,则谓之。其中 A∈V ,α∈(V∪Σ)* 。
上下文无关文法取名为“上下文无关”的原因就是因为字符 A 总可以被字符串 α 自由替换,而无需考虑字符 A 出现的上下文。
一个CFG由以下部分组成:
    非终结符的有限集合(N)
    终结符的有限集合(T)
    开始符号(S)
Example #28
import nltk
import nltk.corpus
print(str(nltk.corpus.treebank).replace('\\\\','/'))
print(nltk.corpus.treebank.fileids())
from nltk.corpus import treebank
print(treebank.words('wsj_0007.mrg'))
print(treebank.tagged_words('wsj_0007.mrg'))

Example #29
def _get_data():
    return _split_tagged_words(treebank.tagged_words())
tags = []
distinct_tags = []
# dictionary mapping pos tag name to numeric id
tag_index = {}
# id of corresponding tag
tag_id = []
# represent pos tags in y_train/y_test with integers
y_train_in_integers = []
y_test_in_integers = []
# define window size
window_size = 3

print('Phase 1: Dividing data into training set and testing set...')
train = []
test = []
train = treebank.tagged_words()[:90677]
test = treebank.tagged_words()[90677:]
# print(len(treebank.tagged_words()))
# print(treebank.tagged_words())
# print(train)
# print(test)
print('Phase 1: END\n')

X_train, y_train = chop_words_into_windows(window_size, train)
X_test, y_test = chop_words_into_windows(window_size, test)
# test if chop_words_into_windows is correctly implemented
# for i in range(0, 50):
#     print(X_train[i])
#     print(y_train[i])
#     print('*****')
Example #31
from nltk.grammar import Nonterminal
from nltk.corpus import treebank

training_set = treebank.parsed_sents()

print(training_set[1])

# extract the productions for all annotated training sentences
treebank_productions = list(
    set(production for sent in training_set for production in sent.productions())
)

treebank_productions[0:10]

# add productions for each word, POS tag
for word, tag in treebank.tagged_words():
    t = nltk.Tree.fromstring("(" + tag + " " + word + ")")
    for production in t.productions():
        treebank_productions.append(production)

# build the PCFG based grammar
treebank_grammar = nltk.grammar.induce_pcfg(
    Nonterminal('S'),
    treebank_productions
)

# build the parser
viterbi_parser = nltk.ViterbiParser(treebank_grammar)

# get sample sentence tokens
tokens = nltk.word_tokenize(sentence)
Example #32
# The most frequent nouns usually provide information about the subject of a text. Below, the most frequent nouns of an already tagged text from the *Treebank* corpus are determined. Let's see whether we can infer the text's subject.

# In[20]:


from nltk.corpus import treebank
from nltk import FreqDist
from nltk import bigrams

print("\nTreebank sentences: ", treebank.sents(fileids="wsj_0003.mrg"))


# In[21]:


tagged0003=treebank.tagged_words(tagset="universal",fileids="wsj_0003.mrg")
print("File tagged0003: ",tagged0003)


# In[22]:


fdist=FreqDist(a[0].lower() for a in tagged0003 if a[1]=="NOUN")
#fdist.tabulate(20)
print(fdist.most_common(20))
freqNouns = [w[0] for w in fdist.most_common(20)]
fdist.plot(20)


# Next, the adjectives immediately before the most frequent nouns are determined. What can be concluded from them? 
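# A possible continuation (sketch, not from the original notebook): use the
# bigrams over the tagged words to collect the adjectives that immediately
# precede the frequent nouns found above.

adj_noun_pairs = [(a[0].lower(), b[0].lower()) for (a, b) in bigrams(tagged0003)
                  if a[1] == "ADJ" and b[1] == "NOUN" and b[0].lower() in freqNouns]
print(adj_noun_pairs)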
Example #33
# print brown.sents(categories=['news', 'editorial', 'reviews'])
print('POS TAGS:')
print('WORDS:')
print(brown.words()[:5])
print(brown.tagged_words()[:5])
print('SENTS:')
print([s[:5] for s in brown.sents()[:5]])
print([s[:5] for s in brown.tagged_sents()[:5]])
print()

# CHUNKED
# The CoNLL 2000 Corpus includes phrasal chunks
# The CoNLL 2002 Corpus includes named entity chunks
print('CHUNKING & NER:')
print(conll2000.fileids())
print(conll2000.sents()[0])
print(conll2000.chunked_sents()[0])
print(conll2002.sents()[0])
print(conll2002.chunked_sents()[0])
print()

# PARSED
# 10% sample of the Penn Treebank
print('PENN TREEBANK:')
print(treebank.fileids()[:5])
print(treebank.words('wsj_0001.mrg')[:10])
print(treebank.tagged_words('wsj_0001.mrg')[:10])
print(treebank.sents('wsj_0001.mrg')[0])
print(treebank.parsed_sents('wsj_0001.mrg')[0])
print()
Example #34
import nltk.data
from nltk.corpus.reader import WordListCorpusReader
from nltk.corpus import names
from nltk.corpus.reader import TaggedCorpusReader
from nltk.tokenize import SpaceTokenizer
from nltk.corpus import treebank

wordlist = WordListCorpusReader("C:/nltk_data/corpora/cookbook", ['wordlist'])
print(wordlist.words())
print(wordlist.fileids())

print(names.fileids())
print(len(names.words('male.txt')))

reader = TaggedCorpusReader("C:/nltk_data/corpora/treebank/tagged",
                            r'.*\.pos',
                            word_tokenizer=SpaceTokenizer(),
                            tagset='en-brown')
print(reader.words('wsj_0001.pos'))
print(reader.tagged_words('wsj_0001.pos'))
print(reader.tagged_sents('wsj_0001.pos'))
print(reader.tagged_paras('wsj_0001.pos'))
print(reader.fileids())

print("\n")
print(reader.tagged_words('wsj_0001.pos', tagset='universal'))

print(treebank.tagged_words())
Example #35
from nltk import UnigramTagger
from nltk.corpus import treebank

from tag_util import word_tag_model

model = word_tag_model(treebank.words(), treebank.tagged_words())
tagger = UnigramTagger(model=model)

test_sents = treebank.tagged_sents()[3000:]
print(tagger.evaluate(test_sents))
Example #36
    output_list = list()
    list_of_one_match = list()
    i = 0
    for pair in tagged_tokens:
        if i == len(tag_seq):
            output_list.append(list_of_one_match)
            list_of_one_match = []
            i = 0
        elif pair[1] == tag_seq[i]:
            list_of_one_match.append(pair)
            i += 1
        else:
            list_of_one_match = []
            i = 0
    print(output_list[:10])


word_tags = dict()
list_of_tuples = brown.tagged_words(tagset='universal')
for el in list_of_tuples:
    if el[0] not in word_tags:
        word_tags[el[0]] = [el[1]]
    else:
        if el[1] not in word_tags[el[0]]:
            word_tags[el[0]].append(el[1])
sorted_x = sorted(word_tags.items(), key=lambda kv: len(kv[1]), reverse=True)
for i in range(10):
    print(sorted_x[i])

search_by_tag_sequence(["NNP", "NNP", "NNP"], treebank.tagged_words())
Example #37
import nltk
#from nltk.book import *
from nltk.corpus import treebank
from nltk.corpus import brown
from nltk import word_tokenize
from nltk.tag import hmm
#nltk.help.upenn_tagset("NN*")
files = treebank.fileids()
#print(files)
t = treebank.tagged_words("wsj_0003.mrg")
#for p in t:
#print(p)

#race1 = nltk.tag.str2tuple('race/NN')
#race2 = nltk.tag.str2tuple('race/VB')
#print(race1)

#print(brown.tagged_words().count(race1))
#print(brown.tagged_words().count(race2))

unitag = nltk.tag.UnigramTagger(brown.tagged_sents(categories='news')[:5000])
print(unitag)
s = "The secretariat is expected to race tomorrow."
s_tok = word_tokenize(s)
tt = unitag.tag(s_tok)
print(tt)

hmmTagger = hmm.HiddenMarkovModelTrainer().train_supervised(
    brown.tagged_sents(categories="news")[:5000])
tt2 = hmmTagger.tag(s_tok)
print(tt2)
Example #38
    lam1 = lam1 / total
    lam2 = lam2 / total
    lam3 = lam3 / total

    return lam1, lam2, lam3


# OOV

#Use letter information to predict tags

from nltk.corpus import treebank
from collections import Counter
import random

tagged_words = treebank.tagged_words()
tagged_sentences = treebank.tagged_sents()

cut_off = int(len(tagged_sentences) * 0.8)
train = tagged_sentences[:cut_off]
test = tagged_sentences[cut_off:]
train_words = tagged_words[:80000]
test_words = tagged_words[80000:]


def unigram_tagger(train, test):
    word_to_all_pos = {}

    for word, pos in train:
        if word not in word_to_all_pos:
            word_to_all_pos[word] = [pos]
Example #39
#Other tagged corpora also come with the tagged_words method.
#Note that the chat corpus is tagged with Penn Treebank POS tags.

nltk.corpus.nps_chat.tagged_words()[:50]

#In this class, we will mostly use the Penn Treebank tag set,
#as it is the most widely used.  The Treebank has the tagged_words
#and tagged_sents methods, as well as the words method that we used
#before to get the tokens.

from nltk.corpus import treebank

treebank_tokens = treebank.words()
treebank_tokens[:50]

treebank_tagged_words = treebank.tagged_words()[:50]
len(treebank.tagged_words())
treebank_tagged_words[:50]  #maybe an error here?

treebank_tagged = treebank.tagged_sents()[:2]
len(treebank.tagged_sents())
treebank_tagged[:2]

#The NLTK has almost 4,000 sentences of tagged data from
#Penn Treebank, while the actual Treebank has much more.
#This will limit the accuracy of the parsers that we can define in
#lab, but also make the running times short enough for labs.

############Let's look at the frequencies of the tags in this portion of
#Penn Treebank.  To do that, we use the NLTK Frequency
#Distribution for all the tags from the (word, tag) pairs in the Treebank.
import nltk
from nltk.corpus import treebank	
treebank_tagged = treebank.tagged_words(tagset='universal')
tagpairs = nltk.bigrams(treebank_tagged)
preceders_noun = [x[1] for (x, y) in tagpairs if y[1] == 'NOUN']
freqdist = nltk.FreqDist(preceders_noun)
print([tag for (tag, _) in freqdist.most_common()])
Example #41
import nltk
from nltk.corpus import brown
from nltk.corpus import treebank
print(brown.tagged_sents()[:2])
print(brown.tagged_words()[:50])
wordtag = brown.tagged_words()[0]
brown_humor_tagged = brown.tagged_words(categories='humor', tagset='universal')
print(brown_humor_tagged[:50])
a = nltk.corpus.nps_chat.tagged_words()[:50]
print(a)

treebank_tokens = treebank.words()
print("treebank_tokens ", treebank_tokens)
treebank_tagged_words = treebank.tagged_words()[:50]

print("tree tagged", treebank_tagged_words[:50])

treebank_tagged = treebank.tagged_sents()[:2]
print(treebank_tagged[:2])

tag_fd = nltk.FreqDist(tag for (word, tag) in treebank_tagged_words)
for tag, freq in tag_fd.most_common():
    print(tag, freq)
Example #42
#!/usr/bin/env python3

from nltk import FreqDist
# import treebank
from nltk.corpus import treebank
# import ne chunker
from nltk import chunk, tag

tempList = [
    "DATE", "TIME", "GPE", "FACILITY", "LOCATION", "MONEY", "PERSON",
    "ORGANIZATION", "PERCENT"
]

data = treebank.tagged_words()  # load treebank data
chunkd_data = chunk.ne_chunk(data)  # chunk the data
chunkd_trees = chunkd_data.subtrees(
    filter=lambda t: t.label() in tempList)  # select subtrees which are NE

word_fd = FreqDist(
    [' '.join(word for word, pos in tree.leaves()) for tree in chunkd_trees])
print("Three most common named entities are: ")

print(', '.join(word for word, freq in word_fd.most_common(3)))
# Three most common named entities are:
# U.S., New York, Japanese
__author__ = 'rumesh'

import nltk
from nltk.corpus import brown
# print brown.tagged_sents()[:2]

# print brown.tagged_words()[:50]

brown_news_tagged = brown.tagged_words(categories='news', tagset='universal')
# print brown_news_tagged[:50]

# print nltk.corpus.nps_chat.tagged_words()[:50]
from nltk.corpus import treebank
# print treebank.tagged_words()[:50]
# print len(treebank.tagged_words())
# print treebank.tagged_sents()[:2]
# print len(treebank.tagged_sents())

def findtags(tag_prefix, tagged_text):
    cfd = nltk.ConditionalFreqDist((tag, word) for (word, tag) in tagged_text if tag.startswith(tag_prefix))
    return dict((tag, list(cfd[tag])[:20]) for tag in cfd.conditions())

tagdict = findtags('NN', treebank.tagged_words())
for tag in sorted(tagdict):
    print(tag, tagdict[tag])