import nltk
from time import strftime, gmtime

# SqliteVocabulary is the project's own SQLite helper class (not shown here;
# a sketch of its assumed interface follows the second function below).


def main():
    # Load the pre-trained Punkt sentence tokenizer for English.
    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    text = open('document.txt').read()  # or nltk.corpus.gutenberg.raw('document.txt')
    sents = sent_tokenizer.tokenize(text)

    sqlVocab = SqliteVocabulary("studyenglish.db", "vocabulary")
    # sqlVocab.delete_vocabulary()
    for sent in sents:
        tokens = nltk.word_tokenize(sent)
        words = [w.lower() for w in tokens]
        vocab = sorted(set(words))
        for v in vocab:
            # Store each word only once, together with the sentence it first appeared in
            # and the date it was added.
            existed_word = sqlVocab.check_existed_word(v)
            if not existed_word:
                sqlVocab.insert_vocabulary(v, 1, "", "", strftime("%Y-%m-%d", gmtime()), sent)
    sqlVocab.commit()
    sqlVocab.close()
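# ---------------------------------------------------------------------------
# Run stub (not part of the original snippet). main() above assumes that
# SqliteVocabulary is importable/defined, that document.txt is in the working
# directory, and that the Punkt model (and, for the pos_tag variant below,
# the default English tagger) is installed. The nltk.download() resource
# names here are NLTK's standard model names, assumed rather than taken from
# the original code.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    nltk.download('punkt')
    nltk.download('averaged_perceptron_tagger')
    main()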
def nature_language_processing(self):
    # Tokenize the text currently shown in the Tkinter Text widget (st),
    # POS-tag every token, and record each word in the vocabulary database.
    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    text = st.get(1.0, END)  # previously: open('document.txt').read() or nltk.corpus.gutenberg.raw('document.txt')
    sents = sent_tokenizer.tokenize(text)
    words = nltk.word_tokenize(text)  # only used by the commented-out FreqDist code below
    # fdist = FreqDist(words)

    sqlVocab = SqliteVocabulary("studyenglish.db", "vocabulary")
    sqlVocab.clear_local_count()
    for sent in sents:
        tokens = nltk.word_tokenize(sent)
        # words = [w.lower() for w in tokens]
        # vocab = sorted(set(words))
        tagged = nltk.pos_tag(tokens)
        for v, t in tagged:
            existed_word = sqlVocab.check_existed_word(v.lower())
            # if (not v.isdigit()) and v.isalpha():
            if not existed_word:
                # New word: store it with its POS tag, the sentence it occurred in,
                # today's date, and initial counts of 1.
                sqlVocab.insert_vocabulary(v.lower(), "", "", t, "", "", sent, -2,
                                           strftime("%Y-%m-%d", gmtime()), 1, 1)
            else:
                # Known word: just bump its counters.
                sqlVocab.update_word_count(v.lower(), 1, 1)
    '''
    for v in fdist.keys():
        existed_word = sqlVocab.check_existed_word(v.lower())
        if existed_word:
            sqlVocab.update_word_freq(v.lower(), fdist.freq(v), fdist[v])
    '''
    sqlVocab.commit()
    sqlVocab.close()
    self.show_all_words()
    self.master.destroy()
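# ---------------------------------------------------------------------------
# Minimal sketch of the SqliteVocabulary helper assumed by both functions
# above (the real class is not shown in this section). The table layout and
# column names (word, total_count, local_count) are guesses inferred from the
# calls made above; insert_vocabulary is omitted because its argument list
# differs between the two snippets and must match your actual schema.
# ---------------------------------------------------------------------------
import sqlite3


class SqliteVocabulary:
    def __init__(self, db_file, table):
        # The table name cannot be a bound SQL parameter, so it is interpolated
        # directly; fine for a trusted, project-internal name like "vocabulary".
        self.table = table
        self.conn = sqlite3.connect(db_file)
        self.cur = self.conn.cursor()

    def check_existed_word(self, word):
        # Returns a truthy row if the word is already stored, otherwise None.
        self.cur.execute("SELECT 1 FROM %s WHERE word = ?" % self.table, (word,))
        return self.cur.fetchone()

    def update_word_count(self, word, total_inc, local_inc):
        # Hypothetical counter columns: total_count (all time) and local_count
        # (for the text currently being processed).
        self.cur.execute(
            "UPDATE %s SET total_count = total_count + ?, local_count = local_count + ? "
            "WHERE word = ?" % self.table,
            (total_inc, local_inc, word))

    def clear_local_count(self):
        # Reset the per-document counter before processing a new text.
        self.cur.execute("UPDATE %s SET local_count = 0" % self.table)

    def commit(self):
        self.conn.commit()

    def close(self):
        self.conn.close()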