def read_and_train_doc2vec(root_dir, fileids, output_file='', options=None):
    """Read plaintext files and train a doc2vec model on them.

    root_dir: corpus root directory.
    fileids: a file id (byte string, utf-8) or a list of them.
    output_file: model name stem; defaults to the joined file ids.
    options: training options dict, forwarded to the trainer.
    """
    # Fix: avoid the shared mutable default argument `options={}`.
    options = {} if options is None else options
    fileids = fileids if isinstance(fileids, list) else [fileids]
    fileids = [unicode(f, 'utf8') for f in fileids]
    output_file = output_file or '-'.join(fileids)
    output_file = u"{0}{1}-{2}".format(MODELS_DIR, output_file,
                                       options_to_string(options))
    reader = PlaintextCorpusReader(root=root_dir, fileids=fileids)
    try:
        docs = [TaggedDocument(reader.words(fileid), [fileid])
                for fileid in fileids]
        train_and_save_doc2vec(docs, output_file, options)
    except UnicodeDecodeError:
        # Fall back to per-file encoding detection when default decoding fails.
        file_encodings = {}
        for fileid in fileids:
            # Fix: read raw bytes (chardet needs undecoded content) and
            # close the file handle; the original leaked it.
            with open(root_dir + fileid, 'rb') as fh:
                file_content = fh.read()
            file_encodings[fileid] = chardet.detect(file_content)['encoding']
        reader._encoding = file_encodings
        # Fix: removed leftover debugging call pdb.set_trace().
        docs = [TaggedDocument(reader.words(fileid), [fileid])
                for fileid in fileids]
        train_and_save_doc2vec(docs, output_file, options)
def compare(request):
    """Django view: compute per-canto text statistics for cantos 1 and 2
    and render them with the compare.html template."""
    errors = []
    statistics = []
    stats = []
    # Process canto1.txt and canto2.txt.
    for x in range(1, 3):
        cantoname = "canto" + str(x) + ".txt"
        w = PlaintextCorpusReader("./", cantoname)
        w.words()
        t = nltk.text.Text(w.words())
        l_lines = len(line_tokenize(w.raw()))
        l_uwords = len(set(w.words()))
        l_words = len(w.words())
        l_sents = len(w.sents())
        l_paras = len(w.paras())
        # NOTE(review): under Python 2 these divisions are integer
        # divisions — confirm that truncation is intended.
        l_linperpara = l_lines / l_paras
        statistics.append(x)
        statistics.append("Number of Words - " + str(l_words))
        statistics.append("Number of Unique Words - " + str(l_uwords))
        statistics.append("Number of Setences - " + str(l_sents))
        statistics.append("Number of Lines - " + str(l_lines))
        statistics.append("Number of Paras - " + str(l_paras))
        statistics.append("Number of Lines/Paras - " + str(l_linperpara))
        lexical_density = l_words / l_uwords
        l_wordpersent = l_words / l_sents
        statistics.append("Lexical Density (Total/Uniq) words- " + str(lexical_density))
        statistics.append("Words per sentence - " + str(l_wordpersent))
        # NOTE(review): `statistics` is never reset between cantos, so
        # `stats` ends up holding two references to one combined list, and
        # the template is handed `statistics`, not `stats` — confirm which
        # structure compare.html expects.
        stats.append(statistics)
    return render_to_response('compare.html', {'stats': statistics})
def my_bar(self, corpus, patt, n):
    """Plot count and relative-frequency bar charts for each file of the
    corpus and save them as PDFs.

    corpus: corpus root directory; patt: fileid regex; n: number of words
    to chart per file.
    """
    wordlists = PlaintextCorpusReader(corpus, patt)
    fileids = wordlists.fileids()
    k = len(fileids)
    figA = pylab.figure(1)
    figB = pylab.figure(2)
    li = ['Brown corpus']
    for id in fileids:
        if k > 1:
            # Multi-file corpus: give each file its own subplot slot.
            i = fileids.index(id) + 1
            words = wordlists.words(id)
            # Case-folded distribution over alphabetic tokens only.
            fre = FreqDist(word.lower() for word in words if word.isalpha())
            self.bar_count(fre, n, figA, 2 * k, 2 * i, id, li)
            self.bar_freq(fre, n, figB, 2 * k, 2 * i, id, li)
            # NOTE(review): saving inside the loop rewrites the same PDFs
            # once per file; only the final save is visible.
            figA.savefig('/home/camilo/Desktop/complex-freq.pdf')
            figB.savefig('/home/camilo/Desktop/complex-relfreq.pdf')
        else:
            # Single-file corpus: one full-size plot per figure.
            words = wordlists.words(id)
            fre = FreqDist(word.lower() for word in words if word.isalpha())
            self.bar_count(fre, n, figA, k, 1, id, li)
            self.bar_freq(fre, n, figB, k, 1, id, li)
            figA.savefig('/home/camilo/Desktop/simple-freq.pdf')
            figB.savefig('/home/camilo/Desktop/simple-relfreq.pdf')
    pylab.show()
def get_stopwords(wdir, inpath, outfile, mfw):
    """Derive a stop word list from the corpus' most frequent words.

    Arguments:
    wdir (str): path to the working directory
    inpath (str): relative path to the input directory
    outfile (str): relative path to the output file
    mfw (int): number of most frequent words to include in the stop word list
    """
    print("starting: get_stopwords...")
    corpus = PlaintextCorpusReader(os.path.join(wdir, inpath), ".*")
    all_words = corpus.words()
    print("words in the corpus: " + str(len(all_words)))
    freq_dist = FreqDist(all_words)
    # Keep only the word of each (word, count) pair.
    top_words = [word for word, _ in freq_dist.most_common(mfw)]
    # One stop word per line.
    with open(os.path.join(wdir, outfile), "w", encoding="utf-8") as out:
        out.write("\n".join(top_words))
    print("Done!")
def stats(request):
    """Django view: compute text statistics for the canto number passed in
    the 'q' GET parameter and render them with stats.html."""
    errors = []
    statistics = []
    if 'q' in request.GET:
        q = request.GET['q']
        if not q:
            errors.append('Enter a Canto Number')
        else:
            cantoname = "canto" + q + ".txt"
            w = PlaintextCorpusReader("./", cantoname)
            w.words()
            t = nltk.text.Text(w.words())
            l_lines = len(line_tokenize(w.raw()))
            l_uwords = len(set(w.words()))
            l_words = len(w.words())
            l_sents = len(w.sents())
            l_paras = len(w.paras())
            # NOTE(review): under Python 2 these divisions are integer
            # divisions — confirm that truncation is intended.
            l_linperpara = l_lines / l_paras
            statistics.append("Number of Words - " + str(l_words))
            statistics.append("Number of Unique Words - " + str(l_uwords))
            statistics.append("Number of Setences - " + str(l_sents))
            statistics.append("Number of Lines - " + str(l_lines))
            statistics.append("Number of Paras - " + str(l_paras))
            statistics.append("Number of Lines/Paras - " + str(l_linperpara))
            lexical_density = l_words / l_uwords
            l_wordpersent = l_words / l_sents
            statistics.append("Lexical Density (Total/Uniq) words- " + str(lexical_density))
            statistics.append("Words per sentence - " + str(l_wordpersent))
            return render_to_response('stats.html', {'statistics': statistics})
    # No (or empty) query: render the error list instead.
    return render_to_response('stats.html', {'errors': errors})
def my_count(self, corpus, patt, n, filename):
    """Write corpus statistics and per-word frequency lines to ~/filename.

    corpus: corpus root; patt: fileid regex; n: number of result lines to
    write; filename: output file name under /home/camilo/.
    """
    wordlists = PlaintextCorpusReader(corpus, patt)
    fileids = wordlists.fileids()
    res = []
    for id in fileids:
        # Fix: hoist the token view — the original re-opened the corpus
        # stream on every wordlists.words(id) call.
        words = wordlists.words(id)
        leng = len(words)
        wordc = len(set(words))
        # Fix: repr() replaces the deprecated backtick syntax (same output).
        wor = "=> num corpus words: " + repr(leng)
        dis = "=> num distinct words: " + repr(wordc)
        ric = "=> ind lex richness: " + repr(leng / wordc)
        res.append(dis)
        res.append(ric)
        res.append(wor)
        for word in sorted(set(words)):
            freq = words.count(word)
            # NOTE(review): freq / leng is integer division under Python 2,
            # so the percentage is always 0.0 — confirm intent.
            f = "(" + word.lower() + "," + repr(round(100 * (freq / leng), 1)) + ")"
            t = "(" + word.lower() + "," + repr(freq) + "/" + repr(leng) + ")"
            res.append(f)
            res.append(t)
    out = open("/home/camilo/" + filename, "w")
    try:
        for t in res[:n]:
            out.write(t + "\n")
    finally:
        out.close()
def save_my_count(self, corpus, patt, n, filename):
    """Write corpus statistics and per-word frequency lines to ../data/filename.

    corpus: corpus root; patt: fileid regex; n: number of result lines to
    write; filename: output file name under ../data/.
    """
    wordlists = PlaintextCorpusReader(corpus, patt)
    fileids = wordlists.fileids()
    res = []
    for id in fileids:
        # Fix: hoist the token view — the original re-opened the corpus
        # stream on every wordlists.words(id) call.
        words = wordlists.words(id)
        leng = len(words)
        wordc = len(set(words))
        # Fix: repr() replaces the deprecated backtick syntax (same output).
        wor = "=> corpus tokens: " + repr(leng) + "\n"
        dis = "=> corpus token types: " + repr(wordc) + "\n"
        ric = "=> ind lex richness: " + repr(leng / wordc) + "\n"
        res.append(dis)
        res.append(ric)
        res.append(wor)
        for word in sorted(set(words)):
            freq = words.count(word)
            # NOTE(review): freq / leng is integer division under Python 2,
            # so the percentage is always 0.0 — confirm intent.
            f = "(" + word.lower() + "," + repr(round(100 * (freq / leng), 1)) + ")\n"
            t = "(" + word.lower() + "," + repr(freq) + "/" + repr(leng) + ")"
            res.append(f)
            res.append(t)
    out = open("../data/" + filename, "w")
    try:
        for t in res[:n]:
            out.write(t + "\n")
    finally:
        out.close()
def hybrid_cfdist():
    """Build one conditional frequency distribution over the bigrams of
    the Sherlock and Pokemon corpora combined."""
    sherlock_words = PlaintextCorpusReader(CORPUS_ROOT_SHERLOCK, '.*', encoding='utf-8').words()
    pokemon_words = PlaintextCorpusReader(CORPUS_ROOT_POKEMON, '.*', encoding='utf-8').words()
    combined_bigrams = nltk.bigrams(sherlock_words) + nltk.bigrams(pokemon_words)
    return nltk.ConditionalFreqDist(combined_bigrams)
def main():
    """Print corpus statistics (stopword fraction, word-length
    distributions, keyword occurrence percentages, Automated Readability
    Index) for the Islip13Rain and Texas_Wild_Fire corpora."""
    current_directory = os.path.dirname(__file__)
    corpus_root = os.path.abspath(current_directory)
    wordlists = PlaintextCorpusReader(corpus_root, 'Islip13Rain/.*\.txt')
    wordlists.fileids()
    ClassEvent = nltk.Text(wordlists.words())
    CEWords = [
        "Long Island", "Weather Service", "flooding", "August",
        "heavy rains", "Wednesday", "Suffolk County", "New York",
        "rainfall", "record"
    ]

    # ClassEvent Statistics
    print "--------- CLASS EVENT STATISTICS -------------"
    print "ClassEvent non stopwords", non_stopword_fraction(ClassEvent)
    print "ClassEvent WORD LENGTH DISTRIBUTIONS:"
    print_word_length_distributions(ClassEvent)
    print "ClassEvent PERCENTAGE OF WORD OCCURRENCES:"
    print_percentage_of_word_in_collection(ClassEvent, CEWords)

    ClassEventLettersPerWord = average_letters_per_word(ClassEvent)
    # NOTE(review): integer division under Python 2 — confirm intent.
    ClassEventWordsPerSent = len(wordlists.words()) / len(wordlists.sents())
    # ARI = 4.71 * (letters/word) + 0.5 * (words/sentence) - 21.43
    ClassEventARI = (4.71 * ClassEventLettersPerWord) + (0.5 * \
        ClassEventWordsPerSent) - 21.43
    print "Average number of letters per word", ClassEventLettersPerWord
    print "Average number of words per sentence:", ClassEventWordsPerSent
    print "Automated Readability Index:", ClassEventARI
    print

    wordlists_event = PlaintextCorpusReader(corpus_root, "Texas_Wild_Fire/.*\.txt")
    wordlists_event.fileids()
    YourSmall = nltk.Text(wordlists_event.words())
    SmallEventWords = [
        "Fire", "Wildfire", "Water", "Damage", "Ground", "Burn", "Town",
        "Heat", "Wind", "Speed", "Size", "City", "People", "Home",
        "Weather", "Debris", "Death", "Smoke", "State", "Ash"
    ]

    # YourSmall statistics
    print "--------- YOUR SMALL STATISTICS --------------"
    print "Texas_Wild_Fire", non_stopword_fraction(YourSmall)
    print "YourSmall WORD LENGTH DISTRIBUTIONS:"
    print_word_length_distributions(YourSmall)
    print "YourSmall PERCENTAGE OF WORD OCCURRENCES:"
    print_percentage_of_word_in_collection(YourSmall, SmallEventWords)
    YourSmallLettersPerWord = average_letters_per_word(YourSmall)
    # NOTE(review): integer division under Python 2 — confirm intent.
    YourSmallWordsPerSent = len(wordlists_event.words()) / \
        len(wordlists_event.sents())
    YourSmallARI = (4.71 * YourSmallLettersPerWord) + (0.5 * \
        YourSmallWordsPerSent) - 21.43
    print "Average number of letters per word", YourSmallLettersPerWord
    print "Average number of words per sentence:", YourSmallWordsPerSent
    print "Automated Readability Index", YourSmallARI
def get_coarse_level_features(dataset, output_file):
    """Write one line of surface statistics per corpus file: doc id, topic,
    token count, type count, content-word count, sentence count, average
    sentence length, and capitalized-word count."""
    # Accessing the corpus.
    corpus_root = '/home1/c/cis530/data-hw2/'
    dataset_path = corpus_root + dataset
    # Reading the files from the directories.
    files = PlaintextCorpusReader(dataset_path, '.*')
    ids = files.fileids()
    stopFile = PlaintextCorpusReader(corpus_root, 'stopwlist.txt')
    # Set for O(1) membership tests (the list scan was O(n) per token).
    stops = set(stopFile.words())
    # Opening a file that has to be written to.
    out = open(output_file, 'w')
    # Fix: the original iterated range(0, len(ids) - 1), silently skipping
    # the last file of the dataset.
    for i in range(len(ids)):
        tokens = files.words(ids[i])
        # Number of tokens and of distinct types.
        tokens_count = len(tokens)
        types = len(set(tokens))
        # Content words = tokens that are not stop words.
        non_stops = [t for t in tokens if t not in stops]
        non_stops_count = len(non_stops)
        # Average sentence length in tokens.
        sent = files.sents(ids[i])
        sents_count = len(sent)
        sent_len = 0
        for s in sent:
            sent_len = sent_len + len(s)
        avg_sent_len = sent_len / float(sents_count)
        # Number of capitalized content words.
        cap_count = 0
        for c in non_stops:
            if c.istitle():
                cap_count = cap_count + 1
        current_file = dataset + '/' + ids[i]
        e = current_file.split('/')
        out.write(current_file + ' ' + e[-2] + ' tok:' + str(tokens_count) +
                  ' typ:' + str(types) + ' con:' + str(non_stops_count) +
                  ' sen:' + str(sents_count) + ' len:' + str(avg_sent_len) +
                  ' cap:' + str(cap_count) + '\n')
        out.flush()
    # Fix: close the output file (the original leaked the handle).
    out.close()
def corpus_metrics(self, corpus_path):
    """Print descriptive statistics for the news corpus: document counts
    per split, vocabulary size, frequent n-grams, and build a conditional
    frequency distribution over attack vs. nonattack document words."""
    corpus_news = PlaintextCorpusReader(corpus_path, '.*\.txt')
    print('Corpus documents', len(corpus_news.fileids()))
    # Split sizes are inferred from the fileid prefix.
    print('Train documents', len([c for c in corpus_news.fileids() if c.startswith('train')]))
    print('Dev documents', len([c for c in corpus_news.fileids() if c.startswith('dev')]))
    print('Test documents', len([c for c in corpus_news.fileids() if c.startswith('test')]))
    words = set(corpus_news.words())
    words = sorted(words)
    print('Corpus different words', len(words))
    # Only words longer than 2 characters feed the n-gram statistics.
    longwords = [w for w in corpus_news.words() if len(w) > 2]
    fdist = nltk.FreqDist(longwords)
    bigramController = BigramController()
    bigrams = bigramController.BuildBrigramFeatures(longwords)
    bigramController.BigramStatistics(bigrams)
    trigramdist = nltk.FreqDist(nltk.trigrams(longwords))
    #fdist.plot(50, cumulative=False)
    print(fdist.most_common(20))
    print("Trigram distribution")
    print(trigramdist.most_common(20))
    # Documents are labeled by file name prefix: "attack--"/"nonattack--".
    words_attack = []
    files_attack = [f for f in corpus_news.fileids()
                    if os.path.basename(os.path.normpath(f)).startswith('attack--')]
    for file in files_attack:
        for w in corpus_news.words(file):
            words_attack.append(w)
    words_nonattack = []
    files_nonattack = [f for f in corpus_news.fileids()
                       if os.path.basename(os.path.normpath(f)).startswith('nonattack--')]
    for file in files_nonattack:
        for w in corpus_news.words(file):
            words_nonattack.append(w)
    words_bag = {}
    words_bag['attack'] = words_attack
    words_bag['nonattack'] = words_nonattack
    #print(words_bag['attack'])
    # Conditional frequency of each word given its document category.
    cfd = nltk.ConditionalFreqDist(
        (category, word)
        for category in ['attack', 'nonattack']
        for word in words_bag[category]
    )
def main():
    """Print corpus statistics (stopword fraction, word-length
    distributions, keyword occurrence percentages, Automated Readability
    Index) for the Islip13Rain and Texas_Wild_Fire corpora."""
    current_directory = os.path.dirname(__file__)
    corpus_root = os.path.abspath(current_directory)
    wordlists = PlaintextCorpusReader(corpus_root, 'Islip13Rain/.*\.txt')
    wordlists.fileids()
    ClassEvent = nltk.Text(wordlists.words())
    CEWords = ["Long Island", "Weather Service", "flooding", "August",
               "heavy rains", "Wednesday", "Suffolk County", "New York",
               "rainfall", "record"]

    # ClassEvent Statistics
    print "--------- CLASS EVENT STATISTICS -------------"
    print "ClassEvent non stopwords", non_stopword_fraction(ClassEvent)
    print "ClassEvent WORD LENGTH DISTRIBUTIONS:"
    print_word_length_distributions(ClassEvent)
    print "ClassEvent PERCENTAGE OF WORD OCCURRENCES:"
    print_percentage_of_word_in_collection(ClassEvent, CEWords)

    ClassEventLettersPerWord = average_letters_per_word(ClassEvent)
    # NOTE(review): integer division under Python 2 — confirm intent.
    ClassEventWordsPerSent = len(wordlists.words()) / len(wordlists.sents())
    # ARI = 4.71 * (letters/word) + 0.5 * (words/sentence) - 21.43
    ClassEventARI = (4.71 * ClassEventLettersPerWord) + (0.5 * \
        ClassEventWordsPerSent) - 21.43
    print "Average number of letters per word", ClassEventLettersPerWord
    print "Average number of words per sentence:", ClassEventWordsPerSent
    print "Automated Readability Index:", ClassEventARI
    print

    wordlists_event = PlaintextCorpusReader(corpus_root, "Texas_Wild_Fire/.*\.txt")
    wordlists_event.fileids()
    YourSmall = nltk.Text(wordlists_event.words())
    SmallEventWords = ["Fire", "Wildfire", "Water", "Damage", "Ground",
                       "Burn", "Town", "Heat", "Wind", "Speed", "Size",
                       "City", "People", "Home", "Weather", "Debris",
                       "Death", "Smoke", "State", "Ash"]

    # YourSmall statistics
    print "--------- YOUR SMALL STATISTICS --------------"
    print "Texas_Wild_Fire", non_stopword_fraction(YourSmall)
    print "YourSmall WORD LENGTH DISTRIBUTIONS:"
    print_word_length_distributions(YourSmall)
    print "YourSmall PERCENTAGE OF WORD OCCURRENCES:"
    print_percentage_of_word_in_collection(YourSmall, SmallEventWords)
    YourSmallLettersPerWord = average_letters_per_word(YourSmall)
    # NOTE(review): integer division under Python 2 — confirm intent.
    YourSmallWordsPerSent = len(wordlists_event.words()) / \
        len(wordlists_event.sents())
    YourSmallARI = (4.71 * YourSmallLettersPerWord) + (0.5 * \
        YourSmallWordsPerSent) - 21.43
    print "Average number of letters per word", YourSmallLettersPerWord
    print "Average number of words per sentence:", YourSmallWordsPerSent
    print "Automated Readability Index", YourSmallARI
def generate_scoring_dictionary():
    """Build a word -> score dictionary from the Hu & Liu (2004) opinion
    lexicons: +1 for each positive word, -1 for each negative word."""
    positive_list = PlaintextCorpusReader(directory, 'Hu_Liu_positive_word_list.txt', encoding='latin-1')
    negative_list = PlaintextCorpusReader(directory, 'Hu_Liu_negative_word_list.txt', encoding='latin-1')
    # dict.fromkeys assigns the same score to every word of a list.
    positive_scoring = dict.fromkeys(positive_list.words(), 1)
    negative_scoring = dict.fromkeys(negative_list.words(), -1)
    # Merge both bag-of-words dictionaries into one.
    scoring_dictionary = dict(positive_scoring.items() + negative_scoring.items())
    return scoring_dictionary
def loadCorpora():
    """Demo of NLTK corpus readers: loads the system word lists and the
    Penn Treebank WSJ sample. All results are computed and discarded."""
    corpus_root = '/usr/share/dict'
    wordlists = PlaintextCorpusReader(corpus_root, '.*')
    wordlists.fileids()
    wordlists.words('connectives')
    # NOTE(review): the Windows path below contradicts the Unix path above;
    # this function only runs where both corpora exist — confirm intent.
    corpus_root = r"C:\corpora\penntreebank\parsed\mrg\wsj"
    file_pattern = r".*/wsj_.*\.mrg"
    ptb = BracketParseCorpusReader(corpus_root, file_pattern)
    ptb.fileids()
    len(ptb.sents())
    ptb.sents(fileids='20/wsj_2013.mrg')[19]
def get_lm_features(dataset, output_file):
    """Write per-document bigram language-model features: total log
    probability and per-token normalized values under four topic models
    (Finance, Health, Research, Computers_and_the_Internet)."""
    corpus_root = '/home1/c/cis530/data-hw2/'
    bigram_root = corpus_root + 'Language_model_set/'
    # One bigram model per topic, trained on that topic's directory.
    fin_files = PlaintextCorpusReader(bigram_root + 'Finance/', '.*')
    fin_words = list(fin_files.words())
    fin_model = NGramModel(fin_words, 2)
    health_files = PlaintextCorpusReader(bigram_root + 'Health/', '.*')
    health_words = list(health_files.words())
    health_model = NGramModel(health_words, 2)
    res_files = PlaintextCorpusReader(bigram_root + 'Research/', '.*')
    res_words = list(res_files.words())
    res_model = NGramModel(res_words, 2)
    com_files = PlaintextCorpusReader(bigram_root + 'Computers_and_the_Internet/', '.*')
    com_words = list(com_files.words())
    com_model = NGramModel(com_words, 2)
    test_files = PlaintextCorpusReader(corpus_root + dataset, '.*')
    ids = test_files.fileids()
    out_file = open(output_file, 'w')
    for j in range(0, len(ids)):
        file_words = test_files.words(ids[j])
        out_str = ''
        current_file = dataset + '/' + ids[j]
        e = current_file.split('/')
        out_str = out_str + current_file + ' ' + e[-2]
        sum_fin = 0
        sum_health = 0
        sum_res = 0
        sum_com = 0
        text_len = len(file_words)
        # Accumulate bigram log probabilities over the whole document.
        # NOTE(review): the comp_* per-token normalizations are recomputed
        # on every iteration, but only their final (whole-document) values
        # are used below — the inner recomputation is redundant work.
        for i in range(1, len(file_words)):
            sum_fin = sum_fin + math.log(fin_model.prob((file_words[i - 1],), file_words[i]))
            comp_fin = float((-sum_fin) * (1 / float(text_len)))
            sum_health = sum_health + math.log(health_model.prob((file_words[i - 1],), file_words[i]))
            comp_health = (float(-sum_health)) * (1 / float(text_len))
            sum_res = sum_res + math.log(res_model.prob((file_words[i - 1],), file_words[i]))
            comp_res = (float(-sum_res)) * (1 / float(text_len))
            sum_com = sum_com + math.log(com_model.prob((file_words[i - 1],), file_words[i]))
            comp_com = (float(-sum_com)) * (1 / float(text_len))
        out_str = out_str + ' finprob:' + str(round(sum_fin, 2)) + ' hlprob:' + str(round(sum_health, 2)) + ' resprob:' \
            + str(round(sum_res, 2)) + ' coprob:' + str(round(sum_com, 2)) + ' finper:' + str(round(comp_fin, 2)) + ' hlper:' + \
            str(round(comp_health, 2)) + ' resper:' + str(round(comp_res, 2)) + ' coper:' + str(round(comp_com, 2))
        out_file.write(out_str + '\n')
    out_file.flush()
def get_sent():
    """Load 'test3.txt' from the local 'dict' corpus directory and return
    the reader together with its word tokens."""
    corpus_root = 'dict'
    # corpus_root = '/Users/abirqasem/nlp/dict'
    reader = PlaintextCorpusReader(corpus_root, 'test3.txt')
    return {"sentence": reader, "words": reader.words()}
def prepare_pos_features(Language_model_set, output_file):
    """Write the most frequent nouns/verbs/adverbs/prepositions/adjectives
    of the language-model corpus to output_file, one prefixed word per line
    (NN/VV/ADV/PREP/ADJ)."""
    corpus_root = '/home1/c/cis530/data-hw2/' + Language_model_set
    texts = PlaintextCorpusReader(corpus_root, '.*')
    text = texts.words()
    tagged_text = nltk.pos_tag(text)
    merged_tag_text = mergeTags(tagged_text)
    lists = seperate_pos(merged_tag_text)
    # Fix: FreqDist.most_common() is frequency-ordered on every NLTK
    # release; the original sliced FreqDist.keys(), which is only
    # frequency-ordered on very old NLTK versions and is not sliceable on
    # Python 3 at all.
    top_nouns = [w for w, _ in FreqDist(lists[0]).most_common(200)]
    top_verbs = [w for w, _ in FreqDist(lists[1]).most_common(200)]
    top_advs = [w for w, _ in FreqDist(lists[2]).most_common(100)]
    top_preps = [w for w, _ in FreqDist(lists[3]).most_common(100)]
    top_adjs = [w for w, _ in FreqDist(lists[4]).most_common(200)]
    out = open(output_file, 'w')
    try:
        for n in top_nouns:
            out.write('NN' + n + '\n')
        for v in top_verbs:
            out.write('VV' + v + '\n')
        for av in top_advs:
            out.write('ADV' + av + '\n')
        for p in top_preps:
            out.write('PREP' + p + '\n')
        for aj in top_adjs:
            out.write('ADJ' + aj + '\n')
    finally:
        # Fix: the original never closed the output file.
        out.close()
def extract_keys(deal_dirs, save_path, numbers, save_file, flag):
    """Extract the most frequent words of every text file in a directory.

    deal_dirs: parent directory of the texts to process
    save_path: directory where results are saved
    numbers: how many top words to extract per file
    save_file: file name for the extraction results
    flag: when truthy, also include each word's count in the output
    """
    news_corpus = PlaintextCorpusReader(deal_dirs, '.*')
    files = news_corpus.fileids()
    print(files)
    # Create the output directory if it does not exist yet.
    if not os.path.exists(save_path):
        os.makedirs(save_path)
        print(save_path, ' 创建成功!')
    savepath = os.path.join(save_path, save_file)
    for name in files:
        reader = PlaintextCorpusReader(deal_dirs, ['{}'.format(name)])
        fdist = nltk.FreqDist(reader.words())
        top_words = fdist.most_common(numbers)
        print(top_words)
        line = ''
        for word, num in top_words:
            if flag:
                line += ("%s: %s " % (word, num))
            line += word + " "
        save_analy_result(savepath, line + '\n')
def corpus_from_directory(path, filetype='.*'):
    '''
    Make a corpus of all files in a given directory. Can limit type by
    passing the desired extension, proper format is, e.g., '.*\.txt'
    '''
    return nltk.Text(PlaintextCorpusReader(path, filetype).words())
def setData(domain):
    """Build a WordNet hyponym map for the content words of the selected
    corpus.

    domain selects the corpus and can be one of: "chicago_crime_data",
    "economics", "software_vulnerability", "cyber_threat", "articles",
    "msds".

    Returns (result, content): result maps each content word to a list of
    hyponym names; content is the filtered word list itself.
    """
    corpus_root = getRoot(domain)  # relative corpus address for the domain
    wordlists = PlaintextCorpusReader(corpus_root, '.*')  # load all text files under the root
    words = wordlists.words()  # every word of every file
    my_stopwords = nltk.corpus.stopwords.words('english')  # English stop word list
    content = [w for w in words if w.lower() not in my_stopwords]  # drop stop words
    content = [w for w in content if len(w) > 2]  # drop words shorter than three characters
    content = [w for w in content if not w.isdigit()]  # drop digit-only words (e.g. "10", "450")
    result = {}
    # Collect hyponym names for each remaining word. Only each synset's
    # hyponyms are walked; expanding to all synonyms would be more accurate
    # but slower.
    for word in content:
        result[word] = []
        for sset in wn.synsets(word):
            for synset in sset.hyponyms():
                # NOTE(review): `synset.name` without parentheses is only an
                # attribute on old NLTK; on NLTK 3+ it is a method and this
                # slicing would operate on the bound method — confirm the
                # NLTK version in use.
                result[word].append(synset.name[0:synset.name.find('.')])
    return result, content
def plot_cfreq(self, corpus, patt, n):
    """Cumulative frequency plot of the n most common alphabetic words
    (lower-cased) of the corpus' first file."""
    reader = PlaintextCorpusReader(corpus, patt)
    for fileid in reader.fileids():
        dist = FreqDist(token.lower() for token in reader.words(fileid) if token.isalpha())
        # Returns during the first iteration, as the original did.
        return dist.plot(n, cumulative=True)
def get_coarse_level_features(dataset, output_file):
    """Write one line of surface statistics per corpus file: doc id, topic
    name, token/type/content-word/sentence counts, average sentence length,
    and capitalized-token count."""
    # Corpus reader rooted at the dataset directory.
    corpus_root = '/home1/c/cis530/data-hw2/' + dataset
    files_dataset = PlaintextCorpusReader(corpus_root, '.*')
    output = open('/home1/c/cis530/data-hw2/' + output_file, 'w')
    # Whitespace-separated stop word list.
    stop_list = open('/home1/c/cis530/data-hw2/' + 'stopwlist.txt').read()
    types_stop_list = stop_list.split()
    for fileid in files_dataset.fileids():
        # Document id and topic (the first path component).
        output.write(dataset + '/' + fileid + ' ')
        topic_name = fileid.split('/')[0]
        output.write(topic_name + ' ')
        tokens = files_dataset.words(fileid)
        output.write('tok:' + str(len(tokens)) + ' ')
        distinct = set(tokens)
        output.write('typ:' + str(len(distinct)) + ' ')
        # Content words = tokens outside the stop list.
        content_words = [w for w in tokens if w not in types_stop_list]
        output.write('con:' + str(len(content_words)) + ' ')
        sents = files_dataset.sents(fileid)
        output.write('sen:' + str(len(sents)) + ' ')
        avg_slen = round(float(len(tokens)) / float(len(sents)), 2)
        output.write('len:' + str(avg_slen) + ' ')
        # Tokens whose first character is an uppercase ASCII letter.
        capitalized = [w for w in tokens if w[0] >= 'A' and w[0] <= 'Z']
        output.write('cap:' + str(len(capitalized)))
        output.write('\n')
    output.close()
def plot_cfreq(self, corpus, patt, n):
    """Plot the cumulative frequency of the n most common alphabetic,
    lower-cased words of the first corpus file."""
    wordlists = PlaintextCorpusReader(corpus, patt)
    ids = wordlists.fileids()
    for current in ids:
        tokens = wordlists.words(current)
        alpha_lower = (w.lower() for w in tokens if w.isalpha())
        # The original returned inside the loop, so only the first file
        # is ever plotted; that behavior is preserved.
        return FreqDist(alpha_lower).plot(n, cumulative=True)
def get_all_words():
    """Collect the case-folded vocabulary of all .txt files in the filtered
    letters directory and return it as a sorted list (duplicates removed
    via the set)."""
    #direc = "../nouns_steps/whole/"
    direc = "../filtered_letters/"
    corpus = PCR(direc, '.*')
    # First, restrict the file list to .txt files only.
    txt_files = [f for f in corpus.fileids() if ".txt" in f]
    # Then add every word of each file to the set (duplicates are skipped).
    vocabulary = set()
    for name in txt_files:
        for word in corpus.words(name):
            vocabulary.add(word.casefold())
    return sorted(vocabulary)
class BigramModel:
    """Laplace-smoothed bigram language model trained on every file of one
    category directory under corpus_root."""

    # Fix: the original also declared category_root/files_dataset_category/
    # word_list/bigram/fd/cfd as class-level mutable attributes; they were
    # always shadowed by the instance attributes set below, and class-level
    # mutable state is a shared-state pitfall, so they are removed.

    def __init__(self, category, corpus_root):
        """Build the model from corpus_root/category/*."""
        self.category_root = corpus_root + '/' + category
        self.files_dataset_category = PlaintextCorpusReader(self.category_root, '.*')
        self.word_list = self.files_dataset_category.words()
        self.bigram = nltk.bigrams(self.word_list)
        # Unigram and conditional (bigram) frequency distributions.
        self.fd = FreqDist(self.word_list)
        self.cfd = nltk.ConditionalFreqDist(self.bigram)

    def get_prob_and_per(self, word_list):
        """Return (log probability, log perplexity) of word_list under the
        model, using add-one (Laplace) smoothing."""
        n_types = len(set(word_list))
        n_tokens = len(word_list)
        # Smoothed unigram probability of the first word.
        log_prob = math.log(self.fd[word_list[0]] + 1) - math.log(n_tokens + n_types)
        for (w1, w2) in nltk.bigrams(word_list):
            log_prob = log_prob + math.log(self.cfd[w1][w2] + 1) - math.log(len(self.cfd[w1].keys()) + n_types)
        # Log perplexity = negative per-token average log probability.
        log_per = float(1) / float(-n_tokens) * log_prob
        return log_prob, log_per
def get_lm_features(dataset, output_file):
    """Write per-document bigram language-model features (log probability
    and log perplexity under four topic models) for every file of the
    dataset."""
    # Corpus reader rooted at the dataset directory.
    corpus_root = '/home1/c/cis530/data-hw2/' + dataset
    files_dataset = PlaintextCorpusReader(corpus_root, '.*')
    fin_model = BigramModel('Finance', corpus_root)
    hel_model = BigramModel('Health', corpus_root)
    # Fix: the original swapped the corpora — res_model was built from
    # 'Computers_and_the_Internet' and co_model from 'Research', so the
    # resprob/resper and coprob/coper columns came from each other's data.
    res_model = BigramModel('Research', corpus_root)
    co_model = BigramModel('Computers_and_the_Internet', corpus_root)
    output = open('/home1/c/cis530/data-hw2/' + output_file, 'w')
    for fileid in files_dataset.fileids():
        # Output the docid and the topic name (first path component).
        output.write(dataset + '/' + fileid + ' ')
        topic_name = fileid.split('/')[0]
        output.write(topic_name + ' ')
        word_list = files_dataset.words(fileid)
        finprob, finper = fin_model.get_prob_and_per(word_list)
        hlprob, hlper = hel_model.get_prob_and_per(word_list)
        resprob, resper = res_model.get_prob_and_per(word_list)
        coprob, coper = co_model.get_prob_and_per(word_list)
        output.write('finprob:' + str(round(finprob, 1)) + ' ')
        output.write('hlprob:' + str(round(hlprob, 1)) + ' ')
        output.write('resprob:' + str(round(resprob, 1)) + ' ')
        output.write('coprob:' + str(round(coprob, 1)) + ' ')
        output.write('finper:' + str(round(finper, 1)) + ' ')
        output.write('hlper:' + str(round(hlper, 1)) + ' ')
        output.write('resper:' + str(round(resper, 1)) + ' ')
        output.write('coper:' + str(round(coper, 1)) + ' ')
        output.write('\n')
    output.close()
def w_find(path_c, fname_c):
    """Print a concordance for every word listed in words_list.txt against
    the corpus file fname_c under path_c."""
    # Fix: removed os.chdir(os.curdir) — changing directory to '.' is a
    # no-op — along with the dead commented-out match-counting code.
    corp = PlaintextCorpusReader(path_c, fname_c, encoding="utf")
    text = nltk.Text(corp.words())
    # One search word per line of words_list.txt.
    with open("words_list.txt", "r") as f:
        words = f.read()
    word = words.split('\n')
    for x in word:
        print("Fetching match for word :", str(x), "in file : ", fname_c)
        text.concordance(str(x))
        print(x)
        print()
        print("----------------------")
def getHighFreqWords():
    """Translate the most frequent words of temp.txt and write them, padded
    with their counts, to highFreqWords.txt."""
    maxlen = 15    # padded width of the word column
    maxlen1 = 5    # padded width of the count column
    corpath = ''
    wordlist = PlaintextCorpusReader(corpath, '.*')
    allwords = nltk.Text(wordlist.words('temp.txt'))
    # NOTE(review): the stop list is empty, so this filter is a no-op —
    # confirm whether a real stop list was intended.
    stop = []
    swords = [i for i in allwords if i not in stop]
    fdist = nltk.FreqDist(swords)
    with open('highFreqWords.txt', 'w', encoding='utf-8') as file:
        # NOTE(review): nWords must be defined at module level.
        for item in fdist.most_common(nWords):
            word0 = item[0]
            # Fix: reset q so a failed translation cannot reuse the
            # previous word's result (or raise NameError on the first).
            q = None
            try:
                q = getTranslation(item[0])
            except Exception as e:
                print(e)
            if not q:
                continue
            # str.ljust/rjust replace the original manual padding loops.
            word0 = word0.ljust(maxlen)
            num = str(item[1]).rjust(maxlen1)
            file.write(word0 + ' ' + num + ' ')
            for translate in q:
                file.write(translate + ' ')
            file.write("\n")
def extractWordsOnly(self, article):
    """Return the word list of <article>.txt with external URLs removed and
    each token stripped down to its ASCII letters.

    article: article name without the .txt extension.
    """
    templist = []
    listtextstring = []
    articlename = article + '.txt'
    #corpus_root = '/home/jesal/onedump/'
    # NOTE(review): corpus_root is not defined in this function (the local
    # assignment above is commented out) — presumably a module-level
    # constant; confirm.
    wl = PlaintextCorpusReader(corpus_root, '.*')
    allwords = wl.words(fileids=articlename)
    # Blank out every external URL before tokenizing.
    exturllist = self.extractexternalURL(article)
    textstring = wl.raw(articlename)
    for item in exturllist:
        textstring = textstring.replace(item, ' ')
    #templist = re.sub(r'[.!,;?]', ' ', textstring).split()
    templist = nltk.word_tokenize(textstring)
    listtemp = []
    # Keep only the ASCII letters of each token.
    for i in templist:
        j = re.sub('[^A-Za-z]+', '', i)
        listtemp.append(str(j))
    # Drop tokens that became empty after stripping.
    templistfinal = []
    templistfinal = self.removeEmpty(listtemp)
    return templistfinal
def similar(text, word):
    """Find words that appear in the same contexts as `word` within the
    given corpus file and return the matches as a JSON string.

    text: corpus file name without extension (validated by regex);
    word: the target word (alphanumeric/underscore only).
    """
    if re.match("^[a-zA-Z0-9_\(\),\.]+$", text) and re.match("^[a-zA-Z0-9_]+$", word):
        text = '%s.txt' % text
        f = open(os.path.join(CORPUS_ROOT, text), 'r')
        source = f.read()
        f.close()
        corpus = PlaintextCorpusReader(CORPUS_ROOT, [text])
        n_text = nltk.text.Text(corpus.words(text))
        # Word -> contexts index over case-folded alphabetic tokens.
        context_index = nltk.text.ContextIndex(n_text.tokens, filter=lambda x: x.isalpha(), key=lambda s: s.lower())
        word = word.lower()
        wci = context_index._word_to_contexts
        result = []
        if word in wci.conditions():
            contexts = set(wci[word])
            # Other words scored by how many of `word`'s contexts they share.
            fd = nltk.probability.FreqDist(w for w in wci.conditions() for c in wci[w]
                                           if c in contexts and not w == word)
            words = nltk.util.tokenwrap(fd.keys()[:20])
            for middle_word in words.split(' '):
                for context in contexts:
                    # NOTE(review): re.search never returns the string
                    # 'none', so this condition is always true; the
                    # /.../i delimiters are also matched literally rather
                    # than as flags — confirm the intended regex.
                    if re.search("/" + context[0] + "(\W|\s)+" + middle_word + "(\W|\s)+" + context[1] + "/i", source) != 'none':
                        print (context[0], middle_word, context[1])
                        result.append({'word': word, 'context_left': context[0], 'context_right': context[1]})
        return dumps({'name': text, 'word': word, 'result': result})
def noun_filter():
    '''Filter letters so only the nouns come through: POS-tag each letter,
    cache the tagging, and write out the non-stopword nouns.'''
    direc = "../Text/missing_letters/"
    #direc = "../../Letters/"
    stopWords = set(stopwords.words('german'))
    # Load the pre-trained German POS tagger.
    with open('../POS_tagger/nltk_german_classifier_data.pickle', 'rb') as t:
        tagger = pickle.load(t)
    wordlist = PlaintextCorpusReader(direc, ".*")
    for file in os.listdir(direc):
        # Tag the letter's tokens with the German tagger.
        to_tag = wordlist.words(file)
        tagged = tagger.tag(to_tag)
        # Cache the tagged letter for later reuse.
        path = "../Text/tagged_letters/"
        with open('%stagged_%s.pickle' % (path, file.replace(".txt", "")), 'wb') as f:
            pickle.dump(tagged, f)
        # Keep only nouns ('NN') that are not German stop words.
        # Fix: removed the unused locals text/filename/name of the original.
        nouns = []
        for word in tagged:
            if word[1] == 'NN':
                nouns.append(word[0])
        filtered = ""
        for w in nouns:
            word = w.casefold().strip()
            if word not in stopWords:
                filtered += w + " "
        intoTxt(filtered, file.replace("html", "txt"))
def create_stopword_list(mfw, corpus_dir, stopwords_out):
    """ Creates a stop word list for a collection of text files.

    The most frequent words of the collection are used as stop words; the
    mfw parameter controls how many of them are taken.

    author: uhk

    Arguments:
    mfw (int): number of MFW to consider as stop words
    corpus_dir (str): path to the corpus directory
    stopwords_out (str): path to the output stopwords file
    """
    print("\nLaunched create_stopword_list.")
    from nltk.corpus import PlaintextCorpusReader
    from nltk.probability import FreqDist

    corpus_dist = FreqDist(PlaintextCorpusReader(corpus_dir, '.*').words())
    # Most frequent words, one per line.
    stop_words = [word for word, count in corpus_dist.most_common(mfw)]
    with open(stopwords_out, "w", encoding="utf-8") as handle:
        handle.write("\n".join(stop_words))
    print("Done.")
def trigramModel(corpus):
    """Build bigram/trigram frequency distributions over the project corpus
    and assemble a text from trigrams whose heuristic 'probability' passes
    fixed thresholds, printing the result."""
    newcorpus = PlaintextCorpusReader(corpus, "nlp_project2_corpus.txt")
    newcorpus.raw("nlp_project2_corpus.txt")
    newcorpus.sents("nlp_project2_corpus.txt")
    enwords = newcorpus.words("nlp_project2_corpus.txt")
    entext = newcorpus.raw("nlp_project2_corpus.txt")
    entokens = nltk.word_tokenize(entext)
    # Applying trigram to sentence
    trigram = nltk.trigrams(entokens)
    trigrams_freq = nltk.FreqDist(trigram)
    ourTextArr2 = []
    counter = 0
    prob = 0
    trigramCounter = 0
    probBiGram = 0
    bigrams = nltk.bigrams(entokens)
    bigrams_freq = nltk.FreqDist(bigrams)
    ourTextArr = []
    bigramCounter = 0
    # Count the number of distinct bigrams and trigrams.
    for i in bigrams_freq.most_common():
        bigramCounter += 1
    for i in trigrams_freq.most_common():
        trigramCounter += 1
    # NOTE(review): prob/probBiGram accumulate across iterations and are
    # rescaled each time, so the 0.50/0.45 thresholds implement an ad-hoc
    # heuristic rather than a normalized probability — confirm intent.
    for i, j in trigrams_freq.most_common():
        if prob > 0.50:
            print("********PROBB****: ", prob)
        if (j > 0):
            for k, l in bigrams_freq.most_common():
                if (j > 2):
                    probBiGram += l / (bigramCounter / 10)
            prob += j / (trigramCounter / 10)
            prob = ((prob + probBiGram) - (prob * probBiGram)) / trigramCounter
        if prob > 0.45:
            # Strip non-alphanumeric characters from the trigram words.
            str1 = re.sub(r'[^a-zA-Z0-9_\s]+', '', i[0])
            str2 = re.sub(r'[^a-zA-Z0-9_\s]+', '', i[1])
            str3 = re.sub(r'[^a-zA-Z0-9_\s]+', '', i[2])
            ourTextArr2.append(str1 + " " + str2 + " " + str3)
            if (len(ourTextArr2) > 200):
                break
    # De-duplicate, reverse, and join the collected trigrams.
    ourTextArr2 = list(set(ourTextArr2))
    finalText2 = ""
    counter3 = 0
    ourTextArr2.reverse()
    for i in range(len(ourTextArr2)):
        counter3 += 1
        finalText2 += " " + ourTextArr2[i]
    print(finalText2)
class Document(object):
    """
    A container object for a set of chapters.

    This allows us to keep track of document frequencies when computing
    them the first time so we don't repeat computations for common words.
    It also handles the PlaintextCorpusReader functions for us.
    """

    def __init__(self, chapter_paths):
        """
        Create a new Document.

        chapter_paths - A list of the paths for chapters in the document.
        """
        self.corpus = PlaintextCorpusReader("", chapter_paths)
        self.chapter_lists = self._sanitize_chapters()
        self.chapter_dists = [(FreqDist(chapter), chapter)
                              for chapter in self.chapter_lists]
        # Cache of previously computed average chapter frequencies.
        self.words = {}

    def get_chapters(self):
        """Return the sanitized per-chapter word lists."""
        return self.chapter_lists

    def average_chapter_frequency(self, word):
        """Return the mean per-chapter relative frequency of `word`,
        caching the result for repeated queries."""
        freqs = []
        if word in self.words:
            return self.words[word]
        else:
            for (dist, wordlist) in self.chapter_dists:
                freqs.append(dist[word] / float(len(wordlist)))
            # Store and return the average frequency
            avg_frq = mean(freqs)
            self.words[word] = avg_frq
            return avg_frq

    def _sanitize_chapters(self):
        # Sanitize the wordlists and return them
        lists = [self.corpus.words(file_id) for file_id in self.corpus.fileids()]
        new_lists = []
        for word_list in lists:
            # Convert everything to lowercase (e.g. so "the" and "The" match)
            word_list = [word.lower() for word in word_list]
            # Fix: the original used re.sub('\p{P}', '', word), but Python's
            # re module has no \p{P} Unicode-category syntax — punctuation
            # was never stripped (and Python 3.7+ raises "bad escape \p").
            # Strip all non-word, non-space characters instead.
            word_list = [re.sub(r'[^\w\s]', '', word) for word in word_list]
            # Remove stopwords, punctuation remnants, and any empty word
            stops = stopwords.words('english')
            stops.append('')
            stops.append('said')
            word_list = [word for word in word_list
                         if (word not in stops and word.isalpha())]
            new_lists.append(word_list)
        return new_lists
def main():
    """Read the posts corpus and print a French text analysis of it."""
    reader = PlaintextCorpusReader('../posts/', '.*',
                                   para_block_reader=read_block_no_metadata)
    lowered_words = [w.lower() for w in reader.words() if w.isalpha()]
    analyst = TextAnalyst(lowered_words, reader.sents(), 'french')
    analyst.print_analyze()
def loading_corpus():
    """Load the wiki corpus and print its file ids, words, and raw tokens."""
    from nltk.corpus import PlaintextCorpusReader

    reader = PlaintextCorpusReader("wiki_corpus", '.*')
    print(reader.fileids())
    print(reader.words())
    print(nltk.word_tokenize(reader.raw()))
def read_BNC_baby_stem(root_local):
    """Stem every word of the BNC-baby corpus under `root_local` and store a
    lowercase frequency distribution in the module-level global `fdist`."""
    global fdist
    reader = PlaintextCorpusReader(root_local, '.*', encoding='latin-1')
    stemmer = SnowballStemmer("english")
    stems = [stemmer.stem(word) for word in reader.words()]
    fdist = FreqDist(stem.lower() for stem in stems)
    return fdist
def get_corpus(corpusdir):
    """Return a flat list of the tokens in every file under `corpusdir`
    whose first character is an ASCII letter."""
    newcorpus = PlaintextCorpusReader(corpusdir, '.*')
    titles = newcorpus.fileids()  # returns all the .txt files in the dir
    words = []
    for title in titles:
        # BUG FIX: the original pattern [aA-zZ] contains the malformed range
        # A-z, which also matches the symbols [ \ ] ^ _ ` lying between the
        # uppercase and lowercase ASCII letters; [a-zA-Z] matches letters only.
        words.extend(e for e in newcorpus.words(title)
                     if re.match(r"[a-zA-Z]", e))
    return words
def extractingFromFolders():
    """Open the character-corpus inputs used by the scripts.

    Returns a 4-tuple: (reference words, results file handle opened for
    appending, complete word list, stopword list).  The caller owns the
    returned file handle.
    """
    reference_dir = os.path.expanduser(
        '~\\My Documents\\Tara\\Ongoing\\CharacterCorpus\\Reference')
    results_path = os.path.expanduser(
        '~\\My Documents\\Tara\\Ongoing\\CharacterCorpus\\results.txt')
    refer = PlaintextCorpusReader(reference_dir, 'harrygrepster.txt')
    grepster = refer.words()
    results = open(results_path, 'a')
    completeWords = wordlist.words()
    stoppers = stopwords.words()
    return grepster, results, completeWords, stoppers
def plot_freq(self, corpus, patt, n):
    """Tabulate and plot the `n` most common alphabetic words across every
    file matched by `patt` under the `corpus` directory.

    Returns whatever FreqDist.plot returns.
    """
    wordlists = PlaintextCorpusReader(corpus, patt)
    # Accumulate all files' tokens into one flat list.  The original used the
    # star-imported numpy/pylab append(), rebuilding a whole array per file;
    # list.extend is the idiomatic, linear-time equivalent and yields the
    # same token sequence for the FreqDist below.
    words = []
    for file_id in wordlists.fileids():
        words.extend(wordlists.words(file_id))
    fre = FreqDist(word.lower() for word in words if word.isalpha())
    fre.tabulate(n)
    return fre.plot(n)
def read_BNC_baby_stem(root_local):
    """Build (and return) a lowercase frequency distribution of stemmed
    BNC-baby words, also binding it to the module-level global `fdist`."""
    global fdist
    stem = SnowballStemmer("english").stem
    corpus = PlaintextCorpusReader(root_local, '.*', encoding='latin-1')
    stemmed_tokens = map(stem, corpus.words())
    fdist = FreqDist(token.lower() for token in stemmed_tokens)
    return fdist
def get_corpus(corpusdir):
    """Collect the tokens from every file in `corpusdir` that start with an
    ASCII letter and return them as one list."""
    newcorpus = PlaintextCorpusReader(corpusdir, '.*')
    titles = newcorpus.fileids()  # returns all the .txt files in the dir
    words = []
    for title in titles:
        newcorpus_txt = newcorpus.words(title)
        # BUG FIX: [aA-zZ] is a malformed class whose A-z range also matches
        # the symbols [ \ ] ^ _ `; [a-zA-Z] matches only letters, as intended.
        words.extend([e for e in newcorpus_txt if re.match(r"[a-zA-Z]", e)])
    return words
def train():
    """Read the IMDB training data, preprocess it, train a Naive Bayes
    classifier, and return (classifier, dictionary).

    Preprocessing: lowercase, keep purely alphabetic tokens, drop English
    stopwords, Porter-stem, then convert each document to a binary
    bag-of-words feature dict via a gensim Dictionary (saved to
    "dictionary.txt" for reuse with the pickled classifier).
    """
    import re
    from nltk.corpus import stopwords

    neg = PlaintextCorpusReader('C:\\Users\\Darren\\Downloads\\aclImdb\\train\\neg', '.+\.txt')
    pos = PlaintextCorpusReader('C:\\Users\\Darren\\Downloads\\aclImdb\\train\\pos', '.+\.txt')
    neg_docs1 = [neg.words(fid) for fid in neg.fileids()]
    pos_docs1 = [pos.words(fid) for fid in pos.fileids()]
    # Combine the categories; negatives come first so num_neg_docs splits them.
    all_docs1 = neg_docs1 + pos_docs1
    num_neg_docs = len(neg_docs1)

    # Processing: lowercase, alphabetic-only, stopword removal, stemming.
    all_docs2 = [[w.lower() for w in doc] for doc in all_docs1]
    print("lowering done")
    all_docs3 = [[w for w in doc if re.search('^[a-z]+$', w)] for doc in all_docs2]
    print("regex done")
    # A set gives O(1) membership tests (the original scanned a list per word).
    stop_list = set(stopwords.words('english'))
    all_docs4 = [[w for w in doc if w not in stop_list] for doc in all_docs3]
    print("stopword done")
    stemmer = PorterStemmer()
    all_docs5 = [[stemmer.stem(w) for w in doc] for doc in all_docs4]

    # Create the dictionary and export it for use with the pickled classifier.
    dictionary = corpora.Dictionary(all_docs5)
    dictionary.save_as_text("dictionary.txt")

    # Convert all documents to TF vectors, then to binary feature dicts.
    all_tf_vectors = [dictionary.doc2bow(doc) for doc in all_docs5]
    all_data_as_dict = [{id: 1 for (id, tf_value) in vec} for vec in all_tf_vectors]

    # Label the training data; the folder the documents came from is the label.
    # BUG FIX: the original sliced [0:num_neg_docs-1], silently dropping the
    # last negative document -- Python slices already exclude the end index.
    neg_data = [(d, 'negative') for d in all_data_as_dict[0:num_neg_docs]]
    pos_data = [(d, 'positive') for d in all_data_as_dict[num_neg_docs:]]
    all_labeled_data = neg_data + pos_data

    # Train the classifier (max entropy would also work here).
    classifier = nltk.NaiveBayesClassifier.train(all_labeled_data)
    return classifier, dictionary
def load_vocab(self, root='.', files='.*'):
    """
    Load new vocabulary.

    :param root: the root directory for the corpus.
    :param files: A list or regexp specifying the files in this corpus.
    """
    reader = PlaintextCorpusReader(root, files)
    for token in reader.words():
        self.vocab[token.lower()] += 1
def read_text(path):
    """Return an nltk.Text of lowercased tokens read from `path`.

    `path` may be a single file (tokenized with nltk.word_tokenize) or a
    directory (read as a plaintext corpus).

    Raises ValueError when `path` is neither; the original fell through and
    crashed with a NameError on the undefined `text` variable.
    """
    if os.path.isfile(path):
        # Close the handle deterministically; the original leaked it.
        with open(path, 'r') as f:
            tokens = nltk.word_tokenize(f.read())
    elif os.path.isdir(path):
        tokens = PlaintextCorpusReader(path, '.*').words()
    else:
        raise ValueError("read_text: no such file or directory: %r" % path)
    return nltk.Text([token.lower() for token in tokens])
def spimi_corpus_process(path_corpus, file_names, block_size):
    """Feed (word, docid) pairs from the corpus to spimi_invert in batches
    of at most `block_size` pairs.

    The docid is the last character of each file name's stem (the text
    before the final dot).
    """
    from nltk.corpus import PlaintextCorpusReader
    wordlists = PlaintextCorpusReader(path_corpus, file_names, encoding='latin-1')
    block = []
    for fileid in wordlists.fileids():
        docid = fileid[:fileid.rfind(".")][-1:]
        block += [(word, docid) for word in wordlists.words(fileid)]
    # BUG FIX: the original always popped block_size items and swallowed the
    # IndexError raised on the final short batch, so the last
    # len(block) % block_size pairs were never passed to spimi_invert.
    # Pop only as many items as actually remain.
    while block:
        batch = [block.pop() for _ in xrange(min(block_size, len(block)))]
        spimi_invert(batch)
def create_LM_on_dataset(dataset):
    """Build a bigram NGramModel2 from the files of `dataset` under the
    Language_model_set corpus root and return it.

    NOTE(review): the model is rebuilt for every file id, so only the model
    trained on the *last* file survives -- confirm whether the intent was to
    train on all files combined.
    """
    corpus_root = '/home1/c/cis530/data-hw2/Language_model_set/'
    files = PlaintextCorpusReader(corpus_root + dataset, '.*')
    for file_id in files.fileids():
        lang_model = NGramModel2(files.words(file_id), 2)
    return lang_model
def concordance(text):
    """Returns an alphabetical list of words for the given text.

    Renders the 'templates/split' template with the sorted, lowercased
    vocabulary of the corpus file named by `text`.
    """
    corpus = PlaintextCorpusReader(CORPUS_ROOT, [text])
    # TODO: use NLTK built-in functions for this!
    # Lowercase the de-duplicated vocabulary and sort it alphabetically.
    # (The original also built an unused `interesting` word list and an
    # unused nltk Text, and lowered via the unbound str.lower, which breaks
    # on unicode tokens; w.lower() works for both str and unicode.)
    word_list = sorted(w.lower() for w in set(corpus.words()))
    return template('templates/split', word_list=word_list, text=text)
def stemming_files(self, source_folder, destination_folder):
    """Stem every .txt file under `source_folder` and write one stemmed
    output file per input into `destination_folder` (created if missing)."""
    if not os.path.exists(destination_folder):
        os.makedirs(destination_folder)
    corpus_news = PlaintextCorpusReader(source_folder, '.*\.txt')
    for fileid in corpus_news.fileids():
        out_name = os.path.basename(os.path.normpath(fileid))
        stemmed_content = self.stemming_text(corpus_news.words(fileid))
        with open(destination_folder + "/" + out_name, 'w',
                  encoding='utf8') as modified:
            modified.write(' '.join(stemmed_content))
def textinfo(path): """ Takes a file path and returns figures about the text file contained therein. """ from nltk.corpus import PlaintextCorpusReader from nltk import FreqDist corpusReader = PlaintextCorpusReader(text, '.*') print "Total word count:", len([word for sentence in corpusReader.sents() for word in sentence]) print "Unique words:", len(set(corpusReader.words())) print "Sentences:", len(corpusReader.sents()) print "Average sentence length in words:", (len([word for sentence in corpusReader.sents() for word in sentence]) / len(corpusReader.sents()))
def main(): # Corpus Location #for training data posTrainCorpus = 'C:/Users/Abhinav/Desktop/Course work/NLP/txt_sentoken/pos_train' negTrainCorpus = 'C:/Users/Abhinav/Desktop/Course work/NLP/txt_sentoken/neg_train' #for test data posTestCorpus = 'C:/Users/Abhinav/Desktop/Course work/NLP/txt_sentoken/pos_test' negTestCorpus = 'C:/Users/Abhinav/Desktop/Course work/NLP/txt_sentoken/neg_test' # Create Plain Text Corpus for training data posCorpus = PlaintextCorpusReader(posTrainCorpus, '.*') negCorpus = PlaintextCorpusReader(negTrainCorpus, '.*') # Create Plain Text Corpus for test data posTstCorpus = PlaintextCorpusReader(posTestCorpus, '.*') negTstCorpus = PlaintextCorpusReader(negTestCorpus, '.*') #GetBigrams posBigrams = nltk.bigrams(posCorpus.words()) negBigrams = nltk.bigrams(negCorpus.words()) #Get no. of words per corpus posWordLen = len(posCorpus.words()) negWordLen = len(negCorpus.words()) # Creating object of Lang_Model_classifier obj1 = Lang_Model_Classifier() obj1.freq_dst(posCorpus, negCorpus) #For negative test data for filename in os.listdir(negTestCorpus): wordSet = negTstCorpus.words(filename) print '**Unigram**' unigr = obj1.perp(wordSet) print unigr print '**Bigram**' bigr = obj1.perpBi(nltk.bigrams(wordSet)) print bigr #For positive test data for filename in os.listdir(posTestCorpus): wordSet2 = posTstCorpus.words(filename) print '**Unigram**' posunigr = obj1.perp(wordSet2) print posunigr print '**Bigram**' posbigr = obj1.perpBi(nltk.bigrams(wordSet2)) print posbigr
def repl(): ''' This is the Read-Eval-Print loop for the dialogue. ''' # Setup the dictionary, preprocessing print "You'll have to pardon me, at my age, it takes several moments to memorize all of Shakespeare..." #shake = gutenberg.words('shakespeare-caesar.txt') #shake = gutenberg.words('shakespeare-complete.txt') #print "Done with getwords" pcr = PlaintextCorpusReader(".", 'shakespeare.*') shake = pcr.words("shakespeare-complete.txt") imps = getNimportant(shake,500) print imps #print "Done with get imps" # divide the text into blocks of 3000 words (split on periods?) # store blocks? Hmm. or just read from shake by line, based on block number # can actually just index each word. # need a way to index the text kps = [] for word in imps: #kps.append(KeyPhrase(word, getPhrases(word, shake))) kps.append(KeyPhrase(word)) #print "Done with kps stuff" #print imps # Define words that will exit the program goodbyeWords = ["quit", "bye", "goodbye", "q", "exit", "leave"] # Greetings print "Ah, finally someone who will speak Shakespeare with me! How do you do, sir?" print # Main loop while True: # Prompt text = raw_input('> ').lower() print # Exit strategy if text in goodbyeWords: print "Goodbye!" break # Answer provideAnswer(text, kps, shake) print
def carga_mongodb():
    """Import every file of the corpus under `corpus_root` into the MongoDB
    DOCS collection, one document per file, with word counts, vocabulary
    tags, and a word cloud.

    Failures on individual files are reported and skipped so the rest of
    the import continues.
    """
    client = pymongo.MongoClient(MONGODB_URI)
    db = client.docs
    docs = db.DOCS
    newcorpus = PlaintextCorpusReader(corpus_root, '.*')
    for fileid in newcorpus.fileids():
        try:
            words = newcorpus.words(fileid)
            num_words = len(words)
            # Raw text is used twice below; the original re-read it each time.
            raw_text = newcorpus.raw(fileid)
            tags_array = vocab_words(raw_text)
            tags = tags_array[0]
            tags_vocab = tags_array[1]
            cloud = tags_array[2]
            total_cloud = [{'word': entry[0], 'total': entry[1]}
                           for entry in cloud]
            # Insert the document record.
            post = {"nombre": fileid,
                    "fecha": datetime.datetime.utcnow(),
                    "texto": preparar_texto(raw_text),
                    "tags_vocab": tags_vocab,
                    "tags": tags,
                    "enc": random.randint(1, 50),
                    "pos": random.randint(1, 10),
                    "neg": random.randint(1, 5),
                    "num_words": num_words,
                    "cloud": total_cloud}
            docs.insert_one(post)
        except Exception:
            # Narrowed from a bare `except:`, which also swallowed
            # KeyboardInterrupt and SystemExit.
            print("Importacion Fallida:" + fileid)