def ne_concat(node, result):
    # Flatten an nltk.ne_chunk tree into "word/tag" strings, collapsing each
    # named-entity subtree into a single "entity text/LABEL" token.
    if isinstance(node, nltk.Tree):
        if node.label() != 'S':
            node = (' '.join(word for word, tag in node), node.label())
            result.append(nltk.tuple2str(node))
        else:
            for child in node:
                ne_concat(child, result)
    else:
        #node = simplify_tag(node)
        result.append(nltk.tuple2str(node))
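# Hedged usage sketch for ne_concat: the sentence and the exact entity labels
# below are illustrative only, and the NLTK POS tagger / NE chunker models are
# assumed to be installed.
import nltk

tree = nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize("John works in New York")))
tokens = []
ne_concat(tree, tokens)
# tokens would look something like:
# ['John/PERSON', 'works/VBZ', 'in/IN', 'New York/GPE']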
def word_tokenize(sent):
    # Nested helper: relies on time_pos, time_chunk, analyse_pos, tagger,
    # preserve_entities and ner_binary from the enclosing scope.
    nonlocal time_pos
    nonlocal time_chunk
    # replace typographic marks with simple marks
    sent = sent.replace('…', '...')
    sent = sent.replace('”', "''")
    sent = sent.replace('“', ',,')
    sent = sent.replace(',', ',')  # no-op as written; likely meant to normalize a typographic comma
    sent = sent.replace('’', "'")
    words = nltk.word_tokenize(sent)
    # strip punctuation from words
    words = [word.strip(string.punctuation) for word in words]
    words = [word for word in words if len(word) > 0]
    if not analyse_pos:
        return words
    else:
        start = time.time()
        tagged = tagger.tag(words)
        time_pos += (time.time() - start)
        if preserve_entities:
            start = time.time()
            chunks = nltk.ne_chunk(tagged, binary=ner_binary)
            time_chunk += (time.time() - start)
            word_list = []
            ne_concat(chunks, word_list)
            return word_list
        else:
            return [nltk.tuple2str(t) for t in tagged]
def import_reuters_flat_pos(ds, silent=False, log=sys.stdout):
    """
    Import the Reuters corpus into `ds` with part-of-speech tags. E.g.

    >>> from nathan.core import Dataspace
    >>> ds = Dataspace()
    >>> %time brown.import_brown(ds, silent=True)
    CPU times: user 12min 28s, sys: 536 ms, total: 12min 29s
    Wall time: 12min 29s
    """
    tagger = nltk.data.load(
        "./models/treebank_brill_aubt/treebank_brill_aubt.pickle")

    if not silent:
        total = len(reuters.sents())
        counter = 0

    root_handle = ds.insert("#reuters")

    for sent in reuters.sents():
        sent = tagger.tag(sent)
        norm = [nltk.tuple2str(t) for t in sent]
        sen_handle = ds.insert(norm)
        ds.link(root_handle, sen_handle)
        if not silent:
            counter += 1
            if (counter % 100 == 0):
                print("importing %s of %s sentences..." % (counter, total),
                      file=log)
def import_brown_pos(ds, simplify_tags=False, silent=False, log=sys.stdout):
    """
    Import the Brown corpus into `ds`. E.g.

    >>> from nathan.core import Dataspace
    >>> ds = Dataspace()
    >>> %time brown.import_brown(ds, silent=True)
    CPU times: user 12min 28s, sys: 536 ms, total: 12min 29s
    Wall time: 12min 29s
    """
    if not silent:
        total = len(brown.sents())
        counter = 0

    for category in brown.categories():
        cat_handle = ds.insert("#%s" % category)
        for sent in brown.tagged_sents(categories=category):
            # keep the original (word, tag) tuples unless simplification is requested
            norm = sent
            if simplify_tags:
                norm = (simplify_tag(t) for t in norm)
            norm = [nltk.tuple2str(t) for t in norm]
            sen_handle = ds.insert(norm)
            ds.link(cat_handle, sen_handle)
            if not silent:
                counter += 1
                if (counter % 100 == 0):
                    print("importing %s of %s sentences..." % (counter, total),
                          file=log)
def transfrom_data(data):
    # Convert tagged sentences (lists of (word, tag) tuples) into
    # space-separated "word/tag" strings, one string per sentence.
    sentences = []
    word = ''
    for sent in data:
        for tuple_word_tag in sent:
            word += tuple2str(tuple_word_tag)
            word += ' '
        sentences.append(word)
        word = ''  # reset the buffer for the next sentence
    return sentences
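# Hedged usage sketch for transfrom_data: `tagged_sents` is made-up data in
# the shape the function expects (one list of (word, tag) tuples per sentence,
# e.g. the output of nltk.pos_tag on each tokenized sentence).
tagged_sents = [[('the', 'DT'), ('cat', 'NN')], [('it', 'PRP'), ('slept', 'VBD')]]
print(transfrom_data(tagged_sents))
# expected: ['the/DT cat/NN ', 'it/PRP slept/VBD ']  (each token trails a space)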
def default_tag(reviews):
    """ Return the reviews tagged with the default (nltk.pos_tag) tagger """
    for review in reviews:
        text = nltk.word_tokenize(review.content)
        tagged_tokens = nltk.pos_tag(text)
        tagged_content = ''
        for token in tagged_tokens:
            str_token = nltk.tuple2str(token, '/')
            tagged_content += str_token + ' '
        review.content = tagged_content.strip()
    return reviews
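# Hedged usage sketch for default_tag: `Review` is a hypothetical stand-in for
# whatever review object the surrounding code uses; only a mutable `.content`
# attribute is assumed. The tag output is illustrative.
class Review:
    def __init__(self, content):
        self.content = content

reviews = [Review("The plot was great")]
default_tag(reviews)
print(reviews[0].content)
# e.g. 'The/DT plot/NN was/VBD great/JJ'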
def tag_by_training(trained_reviews, test_reviews):
    """
    Train a UnigramTagger on the tagged sentences from `trained_reviews`,
    then tag `test_reviews` and return them.
    """
    train_sent = review_to_sent(trained_reviews)
    unigram_tagger = nltk.UnigramTagger(train_sent)
    for test_review in test_reviews:
        text = nltk.word_tokenize(test_review.content)
        tagged_tokens = unigram_tagger.tag(text)
        tagged_content = ''
        for token in tagged_tokens:
            str_token = nltk.tuple2str(token, '/')
            tagged_content += str_token + ' '
        test_review.content = tagged_content.strip()
    return test_reviews
def process_corpus(corpus_name):
    input_file = corpus_name + ".zip"
    corpus_contents = unzip_corpus(input_file)
    # testing
    #corpus_contents = input_file.read().decode('utf-8')

    # 1. Tokenizing
    # (a) Write the name of the corpus to stdout
    print("author:", corpus_name)

    # (b) Delimit the sentences for each document in the corpus.
    totalwords = []
    totalsent = []
    totaltags = []
    sentcount = 0

    # POS Tag Output File
    pos_file = open(corpus_name + "-pos.txt", 'w')
    for doc in corpus_contents:
        sentences = nltk.sent_tokenize(doc)
        for sentence in sentences:
            totalsent.append(sentence)
            sentcount = sentcount + 1
            # Tokenize the words in each sentence of each document
            words = nltk.word_tokenize(sentence)
            for word in words:
                totalwords.append(word)
            # Part-of-Speech
            tagged = nltk.pos_tag(words)
            for tag in tagged:
                totaltags.append(tag)
                string = nltk.tuple2str(tag)
                print(string, file=pos_file, end=" ")
            print("\n", file=pos_file, end="")
        print("\n", file=pos_file, end="")

    print("word count:", len(totalwords))
    waverage = sum(len(word) for word in totalwords) / len(totalwords)
    print("word avg:", waverage)
    saverage = len(totalwords) / sentcount
    print("sent avg:", saverage)
    pos_file.close()

    # average sentence length
    #for fileid in inaugural.fileids():
    #    avg = sum(len(sent) for sent in inaugural.sents(fileids=[fileid])) / len(inaugural.sents(fileids=[fileid]))
    #    print(fileid, avg)

    # number of total words in the corpus
    wordCount = len(totalwords)
    #print("2. Total Words in the Corpus:", wordCount)

    # Frequency
    flat_words = [word.lower() for word in totalwords]
    vocabCount = len(set(flat_words))
    #print("3. Vocabulary Size of the Corpus:", vocabCount)

    tagged_fd = nltk.FreqDist(tag for (word, tag) in totaltags)
    #print("4. The most frequent part-of-speech tag is", tagged_fd.most_common(1))

    # Frequency Output File
    freq_file = open(corpus_name + "-word-freq.txt", 'w')
    fdist = nltk.FreqDist(flat_words)
    print([pair[0] for pair in
           sorted(fdist.items(), key=lambda item: item[1], reverse=True)],
          file=freq_file)
    freq_file.close()

    # Conditional Frequency Distribution
    sys.stdout = open(corpus_name + "-pos-word-freq.txt", 'w')
    # Reverse
    pos_reversed = [(b, a.lower()) for a, b in totaltags]
    cdf1 = nltk.ConditionalFreqDist(pos_reversed)
    cdf1.tabulate()
    sys.stdout = sys.__stdout__

    # Similar Words
    NNtags = []
    VBDtags = []
    JJtags = []
    RBtags = []
    punctags = []
    for x in totaltags:
        if x[1] == 'NN':
            NNtags.append(x)
        elif x[1] == 'VBD':
            VBDtags.append(x)
        elif x[1] == 'JJ':
            JJtags.append(x)
        elif x[1] == 'RB':
            RBtags.append(x)
        elif x[1] in (".", ",", ";", "-"):  # fixed: `== "." or "," or ...` was always true
            punctags.append(x)

    punctratio = len(punctags) / len(totalwords)
    NNratio = len(NNtags) / len(totalwords)
    VBDratio = len(VBDtags) / len(totalwords)
    JJratio = len(JJtags) / len(totalwords)
    RBratio = len(RBtags) / len(totalwords)
    print("punctratio:", punctratio)
    print("NNratio:", NNratio)
    print("VBDratio:", VBDratio)
    print("JJratio:", JJratio)
    print("RBratio:", RBratio)

    download_dir = "training.csv"
    csv = open(download_dir, "a")
    columnTitleRow = "author,avgword,avgsent,punctratio,nnratio,vbdratio,rbratio,jjratio\n"
    #csv.write(columnTitleRow)
    row = corpus_name + "," + str(waverage) + "," + str(saverage) + "," + str(
        punctratio) + "," + str(NNratio) + "," + str(VBDratio) + "," + str(
            RBratio) + "," + str(JJratio) + "\n"
    csv.write(row)

    #print("5. The most frequent word in the POS (NN/VBD/JJ/RB) and respective similar words:")
    text = nltk.Text(flat_words)

    NN_fd = nltk.FreqDist(NNtags)
    #print("Most frequent NN =", NN_fd.most_common(1))
    commonNN = NN_fd.most_common(1)[0][0][0]
    #print("Words similar to", commonNN, ":")
    #text.similar(commonNN)
    #print()

    VBD_fd = nltk.FreqDist(VBDtags)
    #print("Most frequent VBD =", VBD_fd.most_common(1))
    commonVBD = VBD_fd.most_common(1)[0][0][0]
    #print("Words similar to", commonVBD, ":")
    #text.similar(commonVBD)
    #print()

    JJ_fd = nltk.FreqDist(JJtags)
    #print("Most frequent JJ =", JJ_fd.most_common(1))
    commonJJ = JJ_fd.most_common(1)[0][0][0]
    #print("Words similar to", commonJJ, ":")
    #text.similar(commonJJ)
    #print()

    RB_fd = nltk.FreqDist(RBtags)
    #print("Most frequent RB =", RB_fd.most_common(1))
    commonRB = RB_fd.most_common(1)[0][0][0]
    #print("Words similar to", commonRB, ":")
    #text.similar(commonRB)
    #print()

    # 5. Collocations
    co_text = nltk.Text(flat_words)
def __str__(self): return " ".join(tuple2str(tt) for tt in \ zip(self.tokens, self.tags))
def result():
    # get text from textbox
    data = request.form.get('text')

    totalwords = []
    totalsent = []
    totaltags = []
    sentcount = 0

    # tokenize into sentences
    sentences = nltk.sent_tokenize(data)

    # tokenize into words and create part of speech tags
    for sentence in sentences:
        totalsent.append(sentence)
        sentcount = sentcount + 1
        words = nltk.word_tokenize(sentence)
        for word in words:
            totalwords.append(word)
        tagged = nltk.pos_tag(words)
        for tag in tagged:
            totaltags.append(tag)
            string = nltk.tuple2str(tag)

    # calculate word and sentence average
    waverage = sum(len(word) for word in totalwords) / len(totalwords)
    wtrunc = '%.3f' % (waverage)
    saverage = len(totalwords) / sentcount
    strunc = '%.3f' % (saverage)

    # add up total number of pos tags
    NNtags = []
    VBDtags = []
    JJtags = []
    RBtags = []
    punctags = []
    for x in totaltags:
        if x[1] == 'NN':
            NNtags.append(x)
        elif x[1] == 'VBD':
            VBDtags.append(x)
        elif x[1] == 'JJ':
            JJtags.append(x)
        elif x[1] == 'RB':
            RBtags.append(x)
        elif x[1] in (".", ",", ";", "-"):  # fixed: `== "." or "," or ...` was always true
            punctags.append(x)

    # calculate part of speech ratios
    punctratio = len(punctags) / len(totalwords)
    NNratio = len(NNtags) / len(totalwords)
    VBDratio = len(VBDtags) / len(totalwords)
    JJratio = len(JJtags) / len(totalwords)
    RBratio = len(RBtags) / len(totalwords)

    # create csv for machine learning model
    open('user.csv', 'w').close()  # erase file
    download_dir = "user.csv"
    csv = open(download_dir, "a")
    columnTitleRow = "author,avgword,avgsent,punctratio,nnratio,vbdratio,rbratio,jjratio\n"
    csv.write(columnTitleRow)
    row = "user," + str(waverage) + "," + str(saverage) + "," + str(
        punctratio) + "," + str(NNratio) + "," + str(VBDratio) + "," + str(
            RBratio) + "," + str(JJratio) + "\n"
    csv.write(row)
    csv.close()

    # use already generated pickle file to predict author
    test_file = "user.csv"
    df1 = pd.read_csv(test_file, header=0)
    test_data = df1.iloc[:, 1:]
    model2 = joblib.load("file.pkl")
    preds2 = model2.predict(test_data)

    # truncate to 3 decimal places and add %
    NNratio = NNratio * 100
    ntrunc = '%.3f' % (NNratio)
    VBDratio = VBDratio * 100
    vtrunc = '%.3f' % (VBDratio)
    JJratio = JJratio * 100
    jtrunc = '%.3f' % (JJratio)
    RBratio = RBratio * 100
    rtrunc = '%.3f' % (RBratio)

    # put author guess and stats into an array
    response = []
    response.append(preds2[0])
    response.append(str(wtrunc))
    response.append(str(strunc))
    response.append(str(ntrunc) + "%")
    response.append(str(vtrunc) + "%")
    response.append(str(rtrunc) + "%")
    response.append(str(jtrunc) + "%")

    # redirect to the view for the predicted author, passing the user stats
    author_routes = {
        "John Steinbeck": "steinbeck",
        "Mark Twain": "twain",
        "Mary Shelley": "shelley",
        "Jane Austen": "austen",
        "Jacob Grimm and Wilhelm Grimm": "grimm",
        "Isaac Asimov": "asimov",
        "H.P Lovecraft": "lovecraft",
        "F Scott Fitzgerald": "fitzgerald",
        "Ernest Hemingway": "hemingway",
        "Edgar Allan Poe": "poe",
        "CS Lewis": "lewis",
        "Arthur C Clark": "clarke",
        "Agatha Christie": "christie",
    }
    endpoint = author_routes.get(response[0])
    if endpoint is not None:
        return redirect(url_for(endpoint,
                                awl=response[1], asl=response[2],
                                nr=response[3], vr=response[4],
                                avr=response[5], ajr=response[6]))
    return response
def __str__(self):
    return " ".join(tuple2str(tt) for tt in zip(self.tokens, self.tags))
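# Hedged illustration of the __str__ above: `TaggedTokens` is a hypothetical
# minimal class with the same parallel `tokens`/`tags` attributes; only the
# join over tuple2str is taken from the snippet.
from nltk.tag.util import tuple2str

class TaggedTokens:
    def __init__(self, tokens, tags):
        self.tokens = tokens
        self.tags = tags

    def __str__(self):
        return " ".join(tuple2str(tt) for tt in zip(self.tokens, self.tags))

print(TaggedTokens(['the', 'cat'], ['DT', 'NN']))  # -> the/DT cat/NN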
def process_corpus():
    # relies on a module-level `corpus_name`
    #corpus_contents = ' '.join(sys.argv[1:])
    inputfile = corpus_name + ".txt"
    corpus_contents = open(inputfile, 'r').read()

    totalwords = []
    totalsent = []
    totaltags = []
    sentcount = 0

    # tokenize into sentences
    sentences = nltk.sent_tokenize(corpus_contents)

    # tokenize into words and create part of speech tags
    for sentence in sentences:
        totalsent.append(sentence)
        sentcount = sentcount + 1
        words = nltk.word_tokenize(sentence)
        for word in words:
            totalwords.append(word)
        tagged = nltk.pos_tag(words)
        for tag in tagged:
            totaltags.append(tag)
            string = nltk.tuple2str(tag)

    # calculate word and sentence average
    waverage = sum(len(word) for word in totalwords) / len(totalwords)
    wtrunc = '%.3f' % (waverage)
    saverage = len(totalwords) / sentcount
    strunc = '%.3f' % (saverage)

    # add up total number of pos tags
    NNtags = []
    VBDtags = []
    JJtags = []
    RBtags = []
    punctags = []
    for x in totaltags:
        if x[1] == 'NN':
            NNtags.append(x)
        elif x[1] == 'VBD':
            VBDtags.append(x)
        elif x[1] == 'JJ':
            JJtags.append(x)
        elif x[1] == 'RB':
            RBtags.append(x)
        elif x[1] in (".", ",", ";", "-"):  # fixed: `== "." or "," or ...` was always true
            punctags.append(x)

    # calculate part of speech ratios
    punctratio = len(punctags) / len(totalwords)
    NNratio = len(NNtags) / len(totalwords)
    VBDratio = len(VBDtags) / len(totalwords)
    JJratio = len(JJtags) / len(totalwords)
    RBratio = len(RBtags) / len(totalwords)

    # create csv for machine learning model
    open('user.csv', 'w').close()  # erase file
    download_dir = "user.csv"
    csv = open(download_dir, "a")
    columnTitleRow = "author,avgword,avgsent,punctratio,nnratio,vbdratio,rbratio,jjratio\n"
    csv.write(columnTitleRow)
    row = "user," + str(waverage) + "," + str(saverage) + "," + str(
        punctratio) + "," + str(NNratio) + "," + str(VBDratio) + "," + str(
            RBratio) + "," + str(JJratio) + "\n"
    csv.write(row)
    csv.close()

    # use already generated pickle file to predict author
    test_file = "user.csv"
    df1 = pd.read_csv(test_file, header=0)
    test_data = df1.iloc[:, 1:]
    model2 = joblib.load("file.pkl")
    preds2 = model2.predict(test_data)

    # truncate to 3 decimal places and add %
    NNratio = NNratio * 100
    ntrunc = '%.3f' % (NNratio)
    VBDratio = VBDratio * 100
    vtrunc = '%.3f' % (VBDratio)
    JJratio = JJratio * 100
    jtrunc = '%.3f' % (JJratio)
    RBratio = RBratio * 100
    rtrunc = '%.3f' % (RBratio)

    # put author guess and stats into an array
    response = []
    response.append(preds2[0])
    response.append(str(wtrunc))
    response.append(str(strunc))
    response.append(str(ntrunc) + "%")
    response.append(str(vtrunc) + "%")
    response.append(str(jtrunc) + "%")
    response.append(str(rtrunc) + "%")
    print(response)
    return response
def process_corpus(corpus_name):
    input_file = corpus_name + ".zip"
    corpus_contents = unzip_corpus(input_file)

    # Your code goes here
    file_name = corpus_name + "-pos.txt"
    freq_file = corpus_name + "-word-freq.txt"
    cond_file = corpus_name + "-pos-word-freq.txt"

    # write name of corpus to stdout
    totalcount = 0
    vocabsize = 0
    poslist = []
    taglist = []
    wordslist = []
    unqiuelist = []
    wordstr = []
    tuplearr = []
    normaltext = []
    beginarr = []
    corpusarr = []

    with open(file_name, 'a') as f:
        for doc in corpus_contents:
            # delimit the sentences for each document in the corpus
            sent = nltk.sent_tokenize(doc)
            # apply the part-of-speech tagger to each tokenized sentence
            # tokenize the words of each sentence of each doc
            words = [nltk.word_tokenize(item) for item in sent]
            for word in words:
                corpusarr.append(word)
                # lowercase words
                flat_words = [term.lower() for term in word]
                lowerfreq = nltk.FreqDist(flat_words)
                vocabsize += lowerfreq.B()
                # make array of tokenized words of corpus
                for i in word:
                    normaltext.append(i)
                # make an array of lowercase tokenized words of corpus
                for i in flat_words:
                    wordstr.append(i)
                # count total words in corpus
                freq = nltk.FreqDist(word)
                totalcount += freq.N()
                # pos tagging
                poslist = nltk.pos_tag(word)
                # reverse tuple of pos tagging
                for item in poslist:
                    tlist = tuple(reversed(item))
                    tuplearr.append(tlist)
                    beginarr.append(item)
                # make second value of tuple lowercase
                newtuple = [(pos, word.lower()) for pos, word in tuplearr]
                # print(newtuple)
                # most freq part of speech
                pos_counter = nltk.FreqDist(pos for (word, pos) in poslist)
                for word, tag in poslist:
                    wordslist.append(word)
                    taglist.append(tag)
                for val in poslist:
                    combined = nltk.tuple2str(val)
                    f.write(combined)
                    f.write(" ")
                f.write("\n")
            f.write("\n")

    # get frequency of unique words
    uniquefreq = nltk.FreqDist(wordstr)
    uniquedict = uniquefreq.most_common(15000)
    with open(freq_file, 'a') as r:
        for k, j in uniquedict:
            r.write(k + ", " + str(j))
            r.write("\n")

    tagfreq = nltk.FreqDist(taglist)
    winner = tagfreq.most_common(50)
    print("1. Corpus name:", corpus_name)
    print("2. Total words in corpus:", totalcount)
    print("3. Vocabulary size of the corpus:", uniquefreq.B())
    print("4. The most frequent part-of-speech tag is", winner[0][0],
          "with frequency", winner[0][1])

    condfileoutput = open(cond_file, 'a')
    sys.stdout = condfileoutput
    cflist = nltk.ConditionalFreqDist(newtuple)
    cflist.tabulate()
    sys.stdout = sys.__stdout__
    print(cflist)

    noun = cflist['NN'].most_common(1)
    noun1 = noun[0][0]
    print(normaltext)
    text = nltk.Text(normaltext)
    print("5. The most frequent word in the POS(NN) is:", noun1,
          "and its similar words are:")
    text.similar(noun1)

    vbd = cflist['VBD'].most_common(1)
    vbd1 = vbd[0][0]
    print("5. The most frequent word in the POS(VBD) is:", vbd1,
          "and its similar words are:")
    text.similar(vbd1)

    jj = cflist['JJ'].most_common(1)
    jj1 = jj[0][0]
    print("5. The most frequent word in the POS(JJ) is:", jj1,
          "and its similar words are:")
    text.similar(jj1)

    rb = cflist['RB'].most_common(1)
    rb1 = rb[0][0]
    print("5. The most frequent word in the POS(RB) is:", rb1,
          "and its similar words are:")
    text.similar(rb1)

    print("6. Collocations:")
    text.collocations()