Example #1
0
def main():
    """Import the words of ``document.txt`` into the ``vocabulary`` table.

    The file is split into sentences with the Punkt English model.  Each
    sentence is word-tokenized and lower-cased, and every distinct token
    for which ``check_existed_word`` returns a falsy value is inserted
    together with the current UTC date and the sentence it was found in.
    All inserts are committed once, after the last sentence.
    """
    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

    # Read through a context manager so the file handle is released
    # deterministically instead of waiting for garbage collection.
    with open('document.txt') as document:
        text = document.read()
    sents = sent_tokenizer.tokenize(text)

    # One date stamp for the whole run: avoids re-formatting the time for
    # every inserted word and keeps the rows of one import consistent.
    today = strftime("%Y-%m-%d", gmtime())

    sqlVocab = SqliteVocabulary("studyenglish.db", "vocabulary")
    try:
        for sent in sents:
            tokens = nltk.word_tokenize(sent)
            # De-duplicate within the sentence; sorting keeps the insertion
            # order deterministic.
            vocab = sorted({w.lower() for w in tokens})

            for v in vocab:
                if not sqlVocab.check_existed_word(v):
                    sqlVocab.insert_vocabulary(v, 1, "", "", today, sent)

        # Commit only when every sentence was processed successfully.
        sqlVocab.commit()
    finally:
        # Release the database handle even if tokenizing or inserting fails.
        sqlVocab.close()
Example #2
0
        def nature_language_processing(self):
            """Tokenize and POS-tag the text held in ``st`` and record every token.

            The text is split into sentences with the Punkt English model;
            each sentence is word-tokenized and tagged with ``nltk.pos_tag``.
            For every tagged token the lower-cased form is looked up in the
            ``vocabulary`` table: unknown words are inserted along with their
            tag, the containing sentence and the current UTC date, while known
            words get ``update_word_count(word, 1, 1)``.  After committing,
            the word list is refreshed via ``self.show_all_words()`` and
            ``self.master`` is destroyed.

            NOTE(review): ``st`` and ``END`` come from an enclosing scope that
            is not visible here -- presumably a Tk text widget and the Tk
            ``END`` index; confirm against the caller.
            """
            sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
            text = st.get(1.0, END)
            sents = sent_tokenizer.tokenize(text)

            # One date stamp for the whole run instead of one per insert.
            today = strftime("%Y-%m-%d", gmtime())

            sqlVocab = SqliteVocabulary("studyenglish.db", "vocabulary")
            try:
                sqlVocab.clear_local_count()
                for sent in sents:
                    tagged = nltk.pos_tag(nltk.word_tokenize(sent))

                    for v, t in tagged:
                        # Lower-case once; the same key is used for the
                        # lookup and for the insert/update that follows.
                        # NOTE: every token is stored, including digits and
                        # punctuation -- no isalpha()/isdigit() filter.
                        word = v.lower()
                        if sqlVocab.check_existed_word(word):
                            sqlVocab.update_word_count(word, 1, 1)
                        else:
                            sqlVocab.insert_vocabulary(word, "", "", t, "", "", sent, -2, today, 1, 1)

                # Commit only when every sentence was processed successfully.
                sqlVocab.commit()
            finally:
                # Release the database handle even if tagging or a query fails.
                sqlVocab.close()

            self.show_all_words()
            self.master.destroy()