コード例 #1
ファイル: NGram.py プロジェクト: danjamker/N-Fly
class NGram(object):
    """
    Builds word n-grams from text with NLTK.

    Expects ``nltk`` and the project's ``Filter`` class to be available at
    module level.
    """

    def __init__(self):
        """
        Create the Filter whose ``strip`` method cleans each sentence in NGramUn.
        """
        self.F = Filter()

    def Grams(self, pos, n=3, boundy=1):
        """
        Collect the frequent n-grams of every length from 2 up to n - 1.

        @param pos: Iterable of token sequences, passed unchanged to Gram
                    (presumably POS-tagged sentences - confirm with callers)
        @param n: Exclusive upper bound on the gram length
        @param boundy: Frequency threshold handed on to Gram
        @return: One flat list holding the grams of each length, shortest first
        """
        grams = []
        for size in range(2, n):
            # boundy used to be accepted here but never passed on, so Gram
            # always ran with its own default threshold.
            grams.extend(self.Gram(pos, n=size, boundy=boundy))
        return grams

    def Gram(self, text, n=3, boundy=1):
        """
        Return the n-grams of text that occur more than boundy times.

        @param text: Iterable of token sequences to be turned into n-grams
        @param n: Length of the n-grams
        @param boundy: Count a gram has to exceed in order to be kept
        @return: List of the n-grams of text whose count is above boundy
        """
        grams = []
        for sent in text:
            # extend() works whether nltk.ngrams hands back a list or a lazy
            # iterator, and avoids re-copying the accumulated list each pass.
            grams.extend(nltk.ngrams(sent, n))
        freq = nltk.FreqDist(grams)
        # NOTE(review): the statement guarded by the frequency test was missing
        # from the source; it is reconstructed here as "keep the gram".
        return [gram for gram in freq.keys() if int(freq[gram]) > boundy]

    def NGramUn(self, text, n=3):
        """
        Split raw text into sentences, tokenize each one and build its n-grams.

        @param text: Raw text
        @param n: Length of the n-grams
        @return: List with one nltk.ngrams result per sentence
        """
        sentences = nltk.sent_tokenize(text)
        tokenized = [nltk.word_tokenize(self.F.strip(sent)) for sent in sentences]
        return [nltk.ngrams(tokens, n) for tokens in tokenized]

    def capitalList(self, text):
        """
        Find runs of two or more consecutive capitalized words.

        A scan of the rest of the sentence starts at a capitalized first word
        and at every lowercase word after the first. Each scan reports the
        runs it passes, so the same run can appear more than once in the
        result. A run that reaches the end of the sentence is not reported.

        @param text: Iterable of sentences; each item of a sentence must hold
                     the word string at index 0 (presumably (word, tag) pairs)
        @return: List of runs, each run being a list of the sentence items
                 whose word starts with a capital letter
        """
        found = []
        for sent in text:
            for index, word in enumerate(sent):
                first = word[0][0]
                # The source combined these tests with & and |, which bind
                # tighter than == and >, so the comparisons were applied to
                # the wrong operands. Boolean operators follow the grouping
                # the parentheses spell out.
                starts_scan = ((first.isupper() and index == 0)
                               or (first.islower() and index > 0))
                if not starts_scan:
                    continue
                run = []
                for x in range(index, len(sent)):
                    # NOTE(review): the append and the else branch were missing
                    # from the source and are reconstructed from the
                    # indentation of the lines that survived.
                    if sent[x][0][0].isupper():
                        run.append(sent[x])
                    else:
                        if len(run) >= 2:
                            found.append(run)
                        run = []
        return found
コード例 #2
ファイル: POS.py プロジェクト: danjamker/N-Fly
class POS(object):
    """
    Class for POS tagging, uses the POS taggers from NLTK.

    Expects ``nltk``, ``load``, ``dump`` and the project's ``Filter`` class to
    be available at module level.
    """

    def __init__(self):
        """
        Constructor initiates the filter, along with the tagger which will be used.

        The tagger is read from ./Corpus/Brown-Uni.pkl when that file can be
        opened; otherwise it is trained on the Brown corpus and written to
        that path.
        """
        self.FF = Filter()
        try:
            # Attempt to open the .pkl file and load the stored tagger.
            # NOTE(review): load is presumably pickle's load - only ever point
            # this at a file the application wrote itself.
            with open("./Corpus/Brown-Uni.pkl", 'rb') as stored:
                self.unigram_tagger = load(stored)
        except IOError:
            self.brown_tagged_sents = nltk.corpus.brown.tagged_sents(simplify_tags=True)
            t0 = nltk.DefaultTagger('NN')
            t1 = nltk.UnigramTagger(self.brown_tagged_sents, backoff=t0)
            t2 = nltk.BigramTagger(self.brown_tagged_sents, backoff=t1)
            self.unigram_tagger = nltk.UnigramTagger(self.brown_tagged_sents, backoff=t2)
            # The with block closes the file, so the dump is flushed to disk.
            with open("./Corpus/Brown-Uni.pkl", 'wb') as output:
                dump(self.unigram_tagger, output, -1)

    def POSTag(self, text, s='false'):
        """
        Method to POS tag text.

        @param text: Raw text when s is 'false', a list of token lists when s
                     is 'tok', a single token list for any other value of s
        @param s: Mode selector, see text
        @return: POS-tagged version of the input: a list of tagged sentences
                 for 'false' and 'tok', one tagged sentence otherwise
        """
        if s == 'false':
            sentences = nltk.sent_tokenize(text)
            tokenized = [nltk.word_tokenize(self.FF.strip(sent)) for sent in sentences]
            return [self.unigram_tagger.tag(tokens) for tokens in tokenized]
        if s == 'tok':
            return [self.unigram_tagger.tag(sent) for sent in text]
        return self.unigram_tagger.tag(text)

    def POSNgram(self, text, s='false', n=3):
        """
        Method to build n-grams, POS tagging the text first when s is 'false'.

        @param text: Raw text when s is 'false'; otherwise an iterable of
                     token sequences that is turned into n-grams as it is
        @param s: 'false' to tag text with POSTag before building the n-grams
        @param n: Length of the n-grams
        @return: List with one nltk.ngrams result per sentence
        """
        if s == 'false':
            sentences = self.POSTag(text)
        else:
            sentences = text
        return [nltk.ngrams(sent, n) for sent in sentences]
コード例 #3
ファイル: POS.py プロジェクト: danjamker/N-Fly
class POS(object):
    """
    Class for POS tagging, uses the POS taggers from NLTK.

    Expects ``nltk``, ``load``, ``dump`` and the project's ``Filter`` class to
    be available at module level.
    """

    def __init__(self):
        """
        Constructor initiates the filter, along with the tagger which will be used.

        The tagger is read from ./Corpus/Brown-Uni.pkl when that file can be
        opened; otherwise it is trained on the Brown corpus and written to
        that path.
        """
        self.FF = Filter()

        try:
            # Attempt to open the .pkl file and load the stored tagger.
            # NOTE(review): load is presumably pickle's load - only ever point
            # this at a file the application wrote itself.
            with open("./Corpus/Brown-Uni.pkl", 'rb') as stored:
                self.unigram_tagger = load(stored)
        except IOError:
            self.brown_tagged_sents = nltk.corpus.brown.tagged_sents(
                simplify_tags=True)
            t0 = nltk.DefaultTagger('NN')
            t1 = nltk.UnigramTagger(self.brown_tagged_sents, backoff=t0)
            t2 = nltk.BigramTagger(self.brown_tagged_sents, backoff=t1)
            self.unigram_tagger = nltk.UnigramTagger(self.brown_tagged_sents,
                                                     backoff=t2)

            # The with block closes the file, so the dump is flushed to disk.
            with open("./Corpus/Brown-Uni.pkl", 'wb') as output:
                dump(self.unigram_tagger, output, -1)

    def POSTag(self, text, s='false'):
        """
        Method to POS tag text.

        @param text: Raw text when s is 'false', a list of token lists when s
                     is 'tok', a single token list for any other value of s
        @param s: Mode selector, see text
        @return: POS-tagged version of the input: a list of tagged sentences
                 for 'false' and 'tok', one tagged sentence otherwise
        """
        if s == 'false':
            sentences = nltk.sent_tokenize(text)
            tokenized = [
                nltk.word_tokenize(self.FF.strip(sent)) for sent in sentences
            ]
            return [self.unigram_tagger.tag(tokens) for tokens in tokenized]
        if s == 'tok':
            return [self.unigram_tagger.tag(sent) for sent in text]

        return self.unigram_tagger.tag(text)

    def POSNgram(self, text, s='false', n=3):
        """
        Method to build n-grams, POS tagging the text first when s is 'false'.

        @param text: Raw text when s is 'false'; otherwise an iterable of
                     token sequences that is turned into n-grams as it is
        @param s: 'false' to tag text with POSTag before building the n-grams
        @param n: Length of the n-grams
        @return: List with one nltk.ngrams result per sentence
        """
        if s == 'false':
            sentences = self.POSTag(text)
        else:
            sentences = text

        return [nltk.ngrams(sent, n) for sent in sentences]