class NGram(object):
    '''
    Helpers that build word n-grams with NLTK and pick out runs of
    capitalised words from tagged sentences.
    '''

    def __init__(self):
        '''
        Constructor; creates the Filter that NGramUn uses on each sentence.
        '''
        self.F = Filter()

    def Grams(self, pos, n=3, boundy=1):
        '''
        Collect the frequent n-grams of every length from 2 up to n - 1.

        NOTE(review): the length n itself is not included (range(2, n)),
        so the default n=3 yields bigrams only - confirm this is intended.

        @param pos: List of sentences, each a sequence of tokens
                    (presumably POS-tagged words - verify against callers)
        @param n: Exclusive upper bound on the n-gram length
        @param boundy: Number of occurrences an n-gram has to exceed
        @return: One flat list of the kept n-grams, shortest length first
        '''
        grams = []
        for size in range(2, n):
            # boundy is forwarded so the caller's threshold is honoured.
            grams.extend(self.Gram(pos, n=size, boundy=boundy))
        return grams

    def Gram(self, text, n=3, boundy=1):
        '''
        Build the n-grams of every sentence and keep the frequent ones.

        @param text: Text to be turned into n-grams, as a list of sentences
        @param n: Length of the n-grams
        @param boundy: An n-gram is kept only if it occurs more than this
                       many times
        @return: List of n-grams of text
        '''
        grams = []
        for sent in text:
            # extend() takes any iterable, so this works whether
            # nltk.ngrams hands back a list or a lazy iterator.
            grams.extend(nltk.ngrams(sent, n))
        freq = nltk.FreqDist(grams)
        return [gram for gram in freq.keys() if int(freq[gram]) > boundy]

    def NGramUn(self, text, n=3):
        '''
        Split raw text into sentences, tokenise each one and build its
        n-grams. No frequency filtering is applied.

        @param text: Raw text
        @param n: Length of the n-grams
        @return: List holding the nltk.ngrams result of each sentence
        '''
        tokenised = [
            nltk.word_tokenize(self.F.strip(sent))
            for sent in nltk.sent_tokenize(text)
        ]
        return [nltk.ngrams(words, n) for words in tokenised]

    def capitalList(self, text):
        '''
        Find runs of adjacent words that all start with a capital letter.

        @param text: List of sentences; each sentence is a list of tagged
                     words whose first element is the token string
        @return: List of runs. Each run is a list of two or more adjacent
                 tagged words with capitalised tokens; every maximal run
                 is reported exactly once, in order of appearance
        '''
        runs = []
        for sent in text:
            run = []
            for word in sent:
                # Slicing instead of indexing keeps an empty token from
                # raising IndexError; it simply ends the current run.
                if word[0][:1].isupper():
                    run.append(word)
                    continue
                if len(run) >= 2:
                    runs.append(run)
                run = []
            # A run may reach the end of the sentence without being
            # closed by a non-capitalised word.
            if len(run) >= 2:
                runs.append(run)
        return runs
class POS(object):
    '''
    Class for POS tagging; uses the POS taggers from NLTK.

    NOTE(review): POS is defined a second time further down this file;
    at import time the later definition replaces this one.
    '''

    def __init__(self):
        '''
        Constructor. Initiates the filter and the tagger that will be used.

        The tagger is loaded from the pickle cache when the file can be
        read; otherwise it is trained on the Brown corpus and the cache
        file is written.
        '''
        self.FF = Filter()
        cache_path = "./Corpus/Brown-Uni.pkl"
        try:
            # Attempt to open the .pkl file and load it. The with-block
            # closes the handle even when load() raises.
            # NOTE(review): load/dump are presumably the pickle functions
            # imported at the top of the file; unpickling runs code from
            # the file, so the cache has to come from a trusted location.
            with open(cache_path, 'rb') as cache:
                self.unigram_tagger = load(cache)
        except IOError:
            # NOTE(review): simplify_tags looks like an older NLTK keyword;
            # confirm it against the NLTK version this project pins.
            self.brown_tagged_sents = nltk.corpus.brown.tagged_sents(
                simplify_tags=True)
            # Backoff chain: default 'NN' <- unigram <- bigram <- unigram.
            t0 = nltk.DefaultTagger('NN')
            t1 = nltk.UnigramTagger(self.brown_tagged_sents, backoff=t0)
            t2 = nltk.BigramTagger(self.brown_tagged_sents, backoff=t1)
            self.unigram_tagger = nltk.UnigramTagger(self.brown_tagged_sents,
                                                     backoff=t2)
            with open(cache_path, 'wb') as cache:
                dump(self.unigram_tagger, cache, -1)

    def POSTag(self, text, s='false'):
        '''
        Method to POS tag text.

        @param text: Text which is going to be POS tagged
        @param s: Form of the input. 'false': raw text, which is split into
                  sentences, filtered and tokenised first; 'tok': a list of
                  already tokenised sentences; any other value: a single
                  tokenised sentence
        @return: POS tagged version of the input: a list of tagged
                 sentences for 'false' and 'tok', one tagged sentence
                 otherwise
        '''
        tagger = self.unigram_tagger
        if s == 'false':
            tokenised = [
                nltk.word_tokenize(self.FF.strip(sent))
                for sent in nltk.sent_tokenize(text)
            ]
            return [tagger.tag(words) for words in tokenised]
        if s == 'tok':
            return [tagger.tag(sent) for sent in text]
        return tagger.tag(text)

    def POSNgram(self, text, s='false', n=3):
        '''
        Method to build n-grams over POS tagged sentences.

        @param text: Raw text when s is 'false'; otherwise a list of
                     sentences that is used as it is
        @param s: 'false' to run POSTag on text first; any other value
                  skips the tagging step
        @param n: Length of the n-grams
        @return: List holding the nltk.ngrams result of each sentence
        '''
        if s == 'false':
            text = self.POSTag(text)
        return [nltk.ngrams(sent, n) for sent in text]
class POS(object):
    '''
    Class for POS tagging; uses the POS taggers from NLTK.

    NOTE(review): an earlier definition of POS appears above in this
    file; this later one is what the name POS refers to after import.
    '''

    def __init__(self):
        '''
        Constructor. Initiates the filter and the tagger that will be used.

        The tagger is loaded from the pickle cache when the file can be
        read; otherwise it is trained on the Brown corpus and the cache
        file is written.
        '''
        self.FF = Filter()
        cache_path = "./Corpus/Brown-Uni.pkl"
        try:
            # Attempt to open the .pkl file and load it. The with-block
            # closes the handle even when load() raises.
            # NOTE(review): load/dump are presumably the pickle functions
            # imported at the top of the file; unpickling runs code from
            # the file, so the cache has to come from a trusted location.
            with open(cache_path, 'rb') as cache:
                self.unigram_tagger = load(cache)
        except IOError:
            # NOTE(review): simplify_tags looks like an older NLTK keyword;
            # confirm it against the NLTK version this project pins.
            self.brown_tagged_sents = nltk.corpus.brown.tagged_sents(
                simplify_tags=True)
            # Backoff chain: default 'NN' <- unigram <- bigram <- unigram.
            t0 = nltk.DefaultTagger('NN')
            t1 = nltk.UnigramTagger(self.brown_tagged_sents, backoff=t0)
            t2 = nltk.BigramTagger(self.brown_tagged_sents, backoff=t1)
            self.unigram_tagger = nltk.UnigramTagger(self.brown_tagged_sents,
                                                     backoff=t2)
            with open(cache_path, 'wb') as cache:
                dump(self.unigram_tagger, cache, -1)

    def POSTag(self, text, s='false'):
        '''
        Method to POS tag text.

        @param text: Text which is going to be POS tagged
        @param s: Form of the input. 'false': raw text, which is split into
                  sentences, filtered and tokenised first; 'tok': a list of
                  already tokenised sentences; any other value: a single
                  tokenised sentence
        @return: POS tagged version of the input: a list of tagged
                 sentences for 'false' and 'tok', one tagged sentence
                 otherwise
        '''
        tagger = self.unigram_tagger
        if s == 'false':
            tokenised = [
                nltk.word_tokenize(self.FF.strip(sent))
                for sent in nltk.sent_tokenize(text)
            ]
            return [tagger.tag(words) for words in tokenised]
        if s == 'tok':
            return [tagger.tag(sent) for sent in text]
        return tagger.tag(text)

    def POSNgram(self, text, s='false', n=3):
        '''
        Method to build n-grams over POS tagged sentences.

        @param text: Raw text when s is 'false'; otherwise a list of
                     sentences that is used as it is
        @param s: 'false' to run POSTag on text first; any other value
                  skips the tagging step
        @param n: Length of the n-grams
        @return: List holding the nltk.ngrams result of each sentence
        '''
        if s == 'false':
            text = self.POSTag(text)
        return [nltk.ngrams(sent, n) for sent in text]