def extract_related_terms(self):
    re = ReportEnviroments()
    new_corpus_clusters_fileids_list = PlaintextCorpusReader(re.cluster_corpus_path, '.*')
    raw_text_list = []
    for i in range(len(new_corpus_clusters_fileids_list.fileids())):
        raw_text_list.extend([[new_corpus_clusters_fileids_list.raw(fileids=new_corpus_clusters_fileids_list.fileids()[i])]])
    return raw_text_list
def fileids(self, years='*'):
    """
    Returns a list of all files, or only the files in the given folder(s).

    >>> len(hr.fileids())
    3206
    >>> len(hr.fileids(years=1996))
    157
    >>> len(hr.fileids(years=[1996,2007]))
    246
    >>> hr.fileids()[0]
    '1996/HAM2-960622.xml'
    """
    if type(years) is int:
        years = [str(years)]
    if years == '*':
        wordlists = PlaintextCorpusReader(self.hamshahri_root, '.*\.xml')
        fids = wordlists.fileids()
        return fids
    else:
        fids = []
        for year in years:
            wordlists = PlaintextCorpusReader(self.hamshahri_root, str(year) + '/.*\.xml')
            fids = fids + wordlists.fileids()
        return fids
def main():
    current_directory = os.path.dirname(__file__)
    corpus_root = os.path.abspath(current_directory)
    wordlists = PlaintextCorpusReader(corpus_root, 'Islip13Rain/.*\.txt')
    wordlists.fileids()
    ClassEvent = nltk.Text(wordlists.words())
    CEWords = ["Long Island", "Weather Service", "flooding", "August",
               "heavy rains", "Wednesday", "Suffolk County", "New York",
               "rainfall", "record"]

    # ClassEvent statistics
    print "--------- CLASS EVENT STATISTICS -------------"
    print "ClassEvent non stopwords", non_stopword_fraction(ClassEvent)
    print "ClassEvent WORD LENGTH DISTRIBUTIONS:"
    print_word_length_distributions(ClassEvent)
    print "ClassEvent PERCENTAGE OF WORD OCCURRENCES:"
    print_percentage_of_word_in_collection(ClassEvent, CEWords)

    ClassEventLettersPerWord = average_letters_per_word(ClassEvent)
    ClassEventWordsPerSent = len(wordlists.words()) / len(wordlists.sents())
    ClassEventARI = (4.71 * ClassEventLettersPerWord) + \
        (0.5 * ClassEventWordsPerSent) - 21.43

    print "Average number of letters per word", ClassEventLettersPerWord
    print "Average number of words per sentence:", ClassEventWordsPerSent
    print "Automated Readability Index:", ClassEventARI
    print

    wordlists_event = PlaintextCorpusReader(corpus_root, "Texas_Wild_Fire/.*\.txt")
    wordlists_event.fileids()
    YourSmall = nltk.Text(wordlists_event.words())
    SmallEventWords = ["Fire", "Wildfire", "Water", "Damage", "Ground", "Burn",
                       "Town", "Heat", "Wind", "Speed", "Size", "City", "People",
                       "Home", "Weather", "Debris", "Death", "Smoke", "State", "Ash"]

    # YourSmall statistics
    print "--------- YOUR SMALL STATISTICS --------------"
    print "Texas_Wild_Fire", non_stopword_fraction(YourSmall)
    print "YourSmall WORD LENGTH DISTRIBUTIONS:"
    print_word_length_distributions(YourSmall)
    print "YourSmall PERCENTAGE OF WORD OCCURRENCES:"
    print_percentage_of_word_in_collection(YourSmall, SmallEventWords)

    YourSmallLettersPerWord = average_letters_per_word(YourSmall)
    YourSmallWordsPerSent = len(wordlists_event.words()) / \
        len(wordlists_event.sents())
    YourSmallARI = (4.71 * YourSmallLettersPerWord) + \
        (0.5 * YourSmallWordsPerSent) - 21.43

    print "Average number of letters per word", YourSmallLettersPerWord
    print "Average number of words per sentence:", YourSmallWordsPerSent
    print "Automated Readability Index", YourSmallARI
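# The helpers used above (non_stopword_fraction, average_letters_per_word and the
# print_* routines) are defined elsewhere in the original script and are not shown
# here. As a hedged illustration only, a minimal sketch of what
# average_letters_per_word presumably computes (not the original implementation):
def average_letters_per_word(text):
    # mean length of the alphabetic tokens in an nltk.Text
    words = [w for w in text if w.isalpha()]
    return sum(len(w) for w in words) / float(len(words))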
def carga():
    client = pymongo.MongoClient(MONGODB_URI)
    db = client.docs
    docs = db.SIMILITUD
    completo = []
    newcorpus = PlaintextCorpusReader(corpus_root, '.*')
    result = {}
    for fileid in newcorpus.fileids():
        for file2 in newcorpus.fileids():
            result = {"f1": fileid, "f2": file2,
                      "value": compare_texts(newcorpus.raw(fileid), newcorpus.raw(file2))}
            docs.insert_one(result).inserted_id
def loadCorpora():
    corpus_root = '/usr/share/dict'
    wordlists = PlaintextCorpusReader(corpus_root, '.*')
    wordlists.fileids()
    wordlists.words('connectives')

    corpus_root = r"C:\corpora\penntreebank\parsed\mrg\wsj"
    file_pattern = r".*/wsj_.*\.mrg"
    ptb = BracketParseCorpusReader(corpus_root, file_pattern)
    ptb.fileids()
    len(ptb.sents())
    ptb.sents(fileids='20/wsj_2013.mrg')[19]
class Documents:
    def __init__(self, root):
        self.files = PlaintextCorpusReader(root, '.*\.txt')
        self.posting = {}
        self.idf = {}
        self.file_length = {}
        self.file_id_names = {}
        self.N = len(self.files.fileids())

    def process(self):
        for idx, file in enumerate(self.files.fileids()):
            print idx
            filename = file.strip('.txt')
            self.file_id_names[idx] = filename
            text = self.files.raw(file)
            words = process(text)
            if settings['phrase_query']:
                raw_words = raw_process(text)
            if words.values():
                self.file_length[idx] = normalization(words.values())
            for word, freq in words.iteritems():
                if self.idf.has_key(word):
                    self.idf[word] += 1
                else:
                    self.idf[word] = 1
                if not self.posting.has_key(word):
                    self.posting[word] = {}
                if settings['phrase_query']:
                    self.posting[word][idx] = indices(raw_words, word)
                else:
                    self.posting[word][idx] = freq
        for word, idf in self.idf.iteritems():
            self.posting[word]['idf'] = idf

    def dump(self):
        posting_pickle = open('posting.pkl', 'wb')
        for term, value in self.posting.iteritems():
            self.posting[term] = str(value)
        pickle.dump(self.posting, posting_pickle, 2)
        posting_pickle.close()

        length_pickle = open('file_length.pkl', 'wb')
        pickle.dump(self.file_length, length_pickle, 2)
        length_pickle.close()

        file_ids_pickle = open('file_ids.pkl', 'wb')
        pickle.dump(self.file_id_names, file_ids_pickle, 2)
        file_ids_pickle.close()
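# The process, raw_process, normalization and indices helpers belong to the original
# project and are not shown. As an assumption only, normalization is commonly the
# Euclidean norm of a document's term frequencies (the usual length factor in
# cosine-similarity retrieval); a minimal sketch under that assumption:
import math

def normalization(freqs):
    # document length as the Euclidean norm of its term-frequency values
    return math.sqrt(sum(f * f for f in freqs))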
def represent_docs(corpus, cat, dictio_classes, categories):
    docs_train = []
    for dirs in os.walk(corpus):
        corpus_root = dirs[0]  # walk the directory tree under the path
        if corpus_root != corpus:
            if os.path.basename(corpus_root) == cat:
                dictio = dictio_classes[cat]
                textlist = PlaintextCorpusReader(corpus_root, '.*')
                for files in textlist.fileids():
                    test = corpus_root + '/' + files
                    x = open(test, 'r')
                    l = dictio.items()
                    l.sort(key=itemgetter(1), reverse=True)
                    l = l[:2000]
                    l = dict(l)
                    for mot, fval in l.items():
                        val = fval
                        for ligne in x:
                            if ligne.find(mot) > 0:
                                l[mot] = val
                            else:
                                l[mot] = 0.0
                    x.close()
                    docs_train.append((l, 'Yes'))
            else:
                if os.path.basename(corpus_root) in categories:
                    cat_else = os.path.basename(corpus_root)
                    dictio = dictio_classes[cat_else]
                    textlist = PlaintextCorpusReader(corpus_root, '.*')
                    for files in textlist.fileids():
                        test = corpus_root + '/' + files
                        x = open(test, 'r')
                        l = dictio.items()
                        l.sort(key=itemgetter(1), reverse=True)
                        l = l[:2000]
                        l = dict(l)
                        for mot, fval in l.items():
                            val_1 = fval
                            for ligne in x:
                                if ligne.find(mot) > 0:
                                    l[mot] = val_1
                                else:
                                    l[mot] = 0.0
                        x.close()
                        docs_train.append((l, 'No'))
    return docs_train
def preprocTrain(corpus, tf_file, vocab_file):
    global MIN_FREQ
    stopwds = stopwords.words('english')
    TF = {}             # freq for each token
    filter_TF = {}      # freq for each token having freq > minFreq
    feature_train = {}  # final features for training class; passed on to write ARFF files
    vocabulary = []
    ctDocs = {}
    totalDocs = 0
    minFreq = MIN_FREQ
    TrainingFiles = {}

    # loading our corpus
    corpus_root = corpus
    wordlists = PlaintextCorpusReader(corpus_root, '.*')
    ctDocs = len(wordlists.fileids())     # total no of files in each class
    totalDocs = ctDocs + totalDocs        # total no of files
    TrainingFiles = wordlists.fileids()   # contains files for each class

    sys.stderr.write("Reading corpus")
    for fileid in wordlists.fileids():
        sys.stderr.write(".")
        raw = wordlists.raw(fileid)
        tokens = nltk.word_tokenize(raw)
        text = nltk.Text(tokens)
        words = [w.lower() for w in text
                 if w.isalnum() and w.lower() not in stopwds and len(w) > 3]
        vocab = set(words)
        words = nltk.Text(words)

        # calculate TF
        TF[fileid] = {fileid: fileid}
        filter_TF[fileid] = {fileid: fileid}
        for token in vocab:
            TF[fileid][token] = freq(token, words)
            if TF[fileid][token] > minFreq:  # min feature freq.
                vocabulary.append(token)
                filter_TF[fileid][token] = tf(TF[fileid][token], words)

    pickle.dump(filter_TF, open(tf_file, "wb"))
    sys.stderr.write("done\nCalculating TF*IDF scores")
    all_vocabulary = list(set(vocabulary))
    pickle.dump(all_vocabulary, open(vocab_file, "wb"))
    # featureIDF = idf(totalDocs, filter_TF, all_vocabulary)
    pprint(TF, stream=sys.stderr)
def plot_cfreq(self, corpus, patt, n):
    wordlists = PlaintextCorpusReader(corpus, patt)
    fileids = wordlists.fileids()
    for id in fileids:
        words = wordlists.words(id)
        fre = FreqDist(word.lower() for word in words if word.isalpha())
    return fre.plot(n, cumulative=True)
def tokenisation(path):
    tokens = []
    min_length = 3
    for dirs in os.walk(path):
        corpus_root = dirs[0]  # walk the directory tree under the path
        if corpus_root != path:
            textlist = PlaintextCorpusReader(corpus_root, '.*')
            for files in textlist.fileids():
                test = corpus_root + '/' + files
                fs = open(test, 'r')
                texte = fs.readlines()
                texte = str(texte)
                words = map(lambda word: word.lower(), wordpunct_tokenize(texte))
                j = 0
                while j < len(words):
                    if words[j] not in cachedStopWords:
                        tokens.append(words[j])
                    j += 1
                fs.close()
    p = re.compile('[a-zA-Z]+')
    tokens_filtered = filter(lambda token: p.match(token) and len(token) >= min_length, tokens)
    # vocab = []
    # for words in tokens_filtered:
    #     vocab.append(SnowballStemmer("english").stem(words))
    # tokens_filtered_sans = set(vocab)
    tokens_filtered_sans = set(tokens_filtered)
    tokens_filtered_sans = list(tokens_filtered_sans)
    return tokens_filtered_sans
def get_coarse_level_features(dataset, output_file):
    # Import the corpus reader
    corpus_root = '/home1/c/cis530/data-hw2/' + dataset
    # Define the folder where the files are situated
    files_dataset = PlaintextCorpusReader(corpus_root, '.*')
    # Open the output_file
    output = open('/home1/c/cis530/data-hw2/' + output_file, 'w')
    # Read the stopword list
    stop_list = open('/home1/c/cis530/data-hw2/' + 'stopwlist.txt').read()
    types_stop_list = stop_list.split()
    for fileid in files_dataset.fileids():
        # Output the docid
        output.write(dataset + '/' + fileid + ' ')
        # Output the topic_name
        topic_name = fileid.split('/')[0]
        output.write(topic_name + ' ')
        # Output the num_tokens
        tokens = files_dataset.words(fileid)
        output.write('tok:' + str(len(tokens)) + ' ')
        # Output the num_types
        types = set(tokens)
        output.write('typ:' + str(len(types)) + ' ')
        # Output the num_contents
        output.write('con:' + str(len([w for w in tokens if w not in types_stop_list])) + ' ')
        # Output the num_sents
        sents = files_dataset.sents(fileid)
        output.write('sen:' + str(len(sents)) + ' ')
        # Output the avg_slen
        avg_slen = round(float(len(tokens)) / float(len(sents)), 2)
        output.write('len:' + str(avg_slen) + ' ')
        # Output the num_caps
        output.write('cap:' + str(len([w for w in tokens if w[0] >= 'A' and w[0] <= 'Z'])))
        output.write('\n')
    output.close()
def get_lm_features(dataset, output_file):
    # Import the corpus reader
    corpus_root = '/home1/c/cis530/data-hw2/' + dataset
    # Define the folder where the files are situated
    files_dataset = PlaintextCorpusReader(corpus_root, '.*')
    fin_model = BigramModel('Finance', corpus_root)
    hel_model = BigramModel('Health', corpus_root)
    res_model = BigramModel('Research', corpus_root)
    co_model = BigramModel('Computers_and_the_Internet', corpus_root)
    output = open('/home1/c/cis530/data-hw2/' + output_file, 'w')
    for fileid in files_dataset.fileids():
        # Output the docid
        output.write(dataset + '/' + fileid + ' ')
        # Output the topic_name
        topic_name = fileid.split('/')[0]
        output.write(topic_name + ' ')
        word_list = files_dataset.words(fileid)
        finprob, finper = fin_model.get_prob_and_per(word_list)
        hlprob, hlper = hel_model.get_prob_and_per(word_list)
        resprob, resper = res_model.get_prob_and_per(word_list)
        coprob, coper = co_model.get_prob_and_per(word_list)
        output.write('finprob:' + str(round(finprob, 1)) + ' ')
        output.write('hlprob:' + str(round(hlprob, 1)) + ' ')
        output.write('resprob:' + str(round(resprob, 1)) + ' ')
        output.write('coprob:' + str(round(coprob, 1)) + ' ')
        output.write('finper:' + str(round(finper, 1)) + ' ')
        output.write('hlper:' + str(round(hlper, 1)) + ' ')
        output.write('resper:' + str(round(resper, 1)) + ' ')
        output.write('coper:' + str(round(coper, 1)) + ' ')
        output.write('\n')
    output.close()
def save_my_count(self, corpus, patt, n, filename):
    wordlists = PlaintextCorpusReader(corpus, patt)
    fileids = wordlists.fileids()
    res = []
    for id in fileids:
        leng = len(wordlists.words(id))
        wordc = len(set(wordlists.words(id)))
        wor = "=> corpus tokens: " + `leng` + "\n"
        dis = "=> corpus token types: " + `wordc` + "\n"
        ric = "=> ind lex richness: " + `leng / wordc` + "\n"
        res.append(dis)
        res.append(ric)
        res.append(wor)
        for word in sorted(set(wordlists.words(id))):
            freq = (wordlists.words(id)).count(word)
            f = "(" + word.lower() + "," + `round(100 * (freq / leng), 1)` + ")\n"
            t = "(" + word.lower() + "," + `freq` + "/" + `leng` + ")"
            res.append(f)
            res.append(t)
    out = open("../data/" + filename, "w")
    try:
        for t in res[:n]:
            out.write(t + "\n")
    finally:
        out.close()
class Document(object):
    """
    A container object for a set of chapters. This allows us to keep track of
    document frequencies when computing them the first time so we don't repeat
    computations for common words. It also handles the PlaintextCorpusReader
    functions for us.
    """

    def __init__(self, chapter_paths):
        """
        Create a new Document.

        chapter_paths - A list of the paths for chapters in the document.
        """
        self.corpus = PlaintextCorpusReader("", chapter_paths)
        self.chapter_lists = self._sanitize_chapters()
        self.chapter_dists = [(FreqDist(chapter), chapter)
                              for chapter in self.chapter_lists]
        self.words = {}

    def get_chapters(self):
        return self.chapter_lists

    def average_chapter_frequency(self, word):
        freqs = []
        if word in self.words:
            return self.words[word]
        else:
            for (dist, wordlist) in self.chapter_dists:
                freqs.append(dist[word] / float(len(wordlist)))
            # Store and return the average frequency
            avg_frq = mean(freqs)
            self.words[word] = avg_frq
            return avg_frq

    def _sanitize_chapters(self):
        # Sanitize the wordlists and return them
        lists = [self.corpus.words(file_id) for file_id in self.corpus.fileids()]
        new_lists = []
        for word_list in lists:
            # Convert everything to lowercase (e.g. so "the" and "The" match)
            word_list = [word.lower() for word in word_list]
            # Remove any punctuation
            # (note: \p{P} is only supported by the third-party 'regex' module, not the stdlib re)
            word_list = [re.sub('\p{P}', '', word) for word in word_list]
            # Remove stopwords, punctuation, and any empty word
            stops = stopwords.words('english')
            stops.append('')
            stops.append('said')
            word_list = [word for word in word_list
                         if (word not in stops and word.isalpha())]
            new_lists.append(word_list)
        return new_lists
def get_sub_directories(directory):
    files = PlaintextCorpusReader(directory, ".*")
    dirs = list()
    for f in files.fileids():
        if "/" in f:
            if f[:f.index("/")] not in dirs:
                dirs.append(f[:f.index("/")])
    return dirs
def _compute_unigram_frequency(self):
    wordlists = PlaintextCorpusReader(self.prepared_training_data_root, '.*')
    tokenizer = TreebankWordTokenizer()
    total = len(wordlists.fileids())
    count = 0
    fdist = nltk.FreqDist()
    for fl in wordlists.fileids():
        count += 1
        fl_abs_path = os.path.join(self.prepared_training_data_root, fl)
        with open(fl_abs_path, 'r') as f:
            words = tokenizer.tokenize(f.read())
            fdist.update(words)
        print 'freqdist: %s of %s' % (count, total)
    with open(os.path.join(self.corpus_root, 'unigram_frequency.txt'), 'w') as f:
        f.writelines(['%s %s\n' % (word, freq) for (word, freq) in fdist.items()])
    return None
def _compute_biagram_frequency(self):
    if not os.path.exists(self.bigram_frequency_dir):
        os.mkdir(self.bigram_frequency_dir)
    wordlists = PlaintextCorpusReader(self.prepared_training_data_root, '.*')
    tokenizer = TreebankWordTokenizer()
    total = len(wordlists.fileids())
    count = 0
    for fl in wordlists.fileids():
        count += 1
        print 'freqdist: %s of %s' % (count, total)
        fl_abs_path = os.path.join(self.prepared_training_data_root, fl)
        with open(fl_abs_path, 'r') as f:
            words = tokenizer.tokenize(f.read())
            bi_words = nltk.bigrams(words)
            fdist = nltk.FreqDist(bi_words)
        with open(os.path.join(self.bigram_frequency_dir, fl), 'w') as f:
            f.writelines(['%s %s %s\n' % (word[0], word[1], freq)
                          for (word, freq) in fdist.items()])
    return None
def get_corpus(corpusdir):
    newcorpus = PlaintextCorpusReader(corpusdir, '.*')
    titles = newcorpus.fileids()  # returns all the .txt files in the dir
    words = []
    for title in titles:
        newcorpus_txt = newcorpus.words(title)
        words.extend([e for e in newcorpus_txt if re.match(r"[a-zA-Z]", e)])
    return words
def get_coarse_level_features(dataset, output_file):
    # accessing the corpus
    corpus_root = '/home1/c/cis530/data-hw2/'
    dataset_path = corpus_root + dataset
    # Reading the files from the directories
    files = PlaintextCorpusReader(dataset_path, '.*')
    ids = files.fileids()
    stopFile = PlaintextCorpusReader(corpus_root, 'stopwlist.txt')
    stops = stopFile.words()
    # Opening a file that has to be written to
    out = open(output_file, 'w')
    for i in range(len(ids)):
        # Initializing certain variables
        tokens_count = 0
        types = 0
        non_stops_count = 0
        sents_count = 0
        avg_sent_len = 0
        cap_count = 0
        tokens = files.words(ids[i])
        # Computing number of tokens
        tokens_count = len(tokens)
        # Computing number of types
        types = len(set(tokens))
        non_stops = []
        # Computing number of content words
        for t in tokens:
            if t not in stops:
                non_stops.append(t)
        non_stops_count = len(non_stops)
        # Finding average sentence length
        sent = []
        sent = files.sents(ids[i])
        sents_count = len(sent)
        sent_len = 0
        for s in sent:
            sent_len = sent_len + len(s)
        avg_sent_len = sent_len / float(sents_count)
        # Computing number of capitalized words
        for c in non_stops:
            if c.istitle():
                cap_count = cap_count + 1
        current_file = dataset + '/' + ids[i]
        e = current_file.split('/')
        out.write(current_file + ' ' + e[-2] + ' tok:' + str(tokens_count) + ' typ:' +
                  str(types) + ' con:' + str(non_stops_count) + ' sen:' + str(sents_count) +
                  ' len:' + str(avg_sent_len) + ' cap:' + str(cap_count) + '\n')
        out.flush()
def calculate_errors(self, classifier, word_features, test_path, log=0):
    # load the pickle file with the classifier progress
    corpus_news = PlaintextCorpusReader(test_path, '.*\.txt')
    Gold = []
    Test = []
    TP = 0
    TN = 0
    FP = 0
    FN = 0
    Errors = []
    for file in corpus_news.fileids():
        category = file.split(self.DELIMITER)[-1].split('--')[0]
        Gold.append(category)
        words = corpus_news.words(file)
        testing_set = get_features(set(words), word_features)
        result = classifier.classify(testing_set)
        Test.append(result)
        if category == result == 'attack':
            TP += 1
        elif category == result == 'nonattack':
            TN += 1
        elif category != 'attack' and result == 'attack':
            Errors.append('{} : false positive {}'.format(file, result))
            FP += 1
        elif category == 'attack' and result == 'nonattack':
            Errors.append('{} : false negative {}'.format(file, result))
            FN += 1
    Accuracy = (TP + TN) / len(corpus_news.fileids())
    Precision = TP / (TP + FP)
    Recall = TP / (TP + FN)
    F1 = (2 * Precision * Recall) / (Precision + Recall)
    if log == 1:
        self.PrintResult(Accuracy, Precision, Recall, F1, Gold, Test)
    return Errors
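# get_features is defined elsewhere in that project. As an assumption only, a
# plausible stand-in is the usual NLTK bag-of-words feature dictionary (this is
# not the original code):
def get_features(document_words, word_features):
    # mark, for every word in word_features, whether it occurs in the document
    return {'contains({})'.format(w): (w in document_words) for w in word_features}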
def get_coarse_level_features(dataset, output_file):
    output = open(output_file, 'w')
    root = ROOT + dataset
    files = PlaintextCorpusReader(root, '.*')
    for fileid in files.fileids():
        __output_header(output, dataset, fileid)
        __write_coarse(output, files, fileid)
        output.write('\n')
    output.close()
def create_LM_on_dataset(dataset):
    corpus_root = '/home1/c/cis530/data-hw2/Language_model_set/'
    dataset_path = corpus_root + dataset
    files = PlaintextCorpusReader(dataset_path, '.*')
    ids = files.fileids()
    for i in range(len(ids)):
        words = files.words(ids[i])
        lang_model = NGramModel2(words, 2)
    return lang_model
def get_pos_features(dataset, feature_set_file, output_file):
    root = ROOT + dataset
    files = PlaintextCorpusReader(root, '.*')
    feature_list = open(feature_set_file).read().split()
    output = open(output_file, 'w')
    for fileid in files.fileids():
        __output_header(output, dataset, fileid)
        __write_pos(output, files, fileid, feature_list)
        output.write('\n')
    output.close()
def taille_corpus(corpus):
    taille = 0
    for dirs in os.walk(corpus):
        corpus_root = dirs[0]  # walk the directory tree under the path
        if corpus_root != corpus:
            textlist = PlaintextCorpusReader(corpus_root, '.*')
            for files in textlist.fileids():
                test = corpus_root + '/' + files
                taille += os.path.getsize(test)
    return taille
def stemming_files(self, source_folder, destination_folder):
    if not os.path.exists(destination_folder):
        os.makedirs(destination_folder)
    corpus_news = PlaintextCorpusReader(source_folder, '.*\.txt')
    for file in corpus_news.fileids():
        file_name = os.path.basename(os.path.normpath(file))
        words = corpus_news.words(file)
        stemmed_content = self.stemming_text(words)
        with open(destination_folder + "/" + file_name, 'w', encoding='utf8') as modified:
            modified.write(' '.join(stemmed_content))
def doc_test(corpus, dictio_feature, categories):
    docs_test = []
    for dirs in os.walk(corpus):
        corpus_root = dirs[0]  # walk the directory tree under the path
        if corpus_root != corpus and os.path.basename(corpus_root) in categories:
            classe_doc = os.path.basename(corpus_root)
            textlist = PlaintextCorpusReader(corpus_root, '.*')
            for files in textlist.fileids():
                test = corpus_root + '/' + files
                doc_cat = represent_doc_test(test, classe_doc, all_dictio)
                docs_test.append(doc_cat)
    return docs_test
def get_lm_features(dataset, output_file):
    corpus_root = '/home1/c/cis530/data-hw2/'
    bigram_root = corpus_root + 'Language_model_set/'
    fin_files = PlaintextCorpusReader(bigram_root + 'Finance/', '.*')
    fin_words = list(fin_files.words())
    fin_model = NGramModel(fin_words, 2)
    health_files = PlaintextCorpusReader(bigram_root + 'Health/', '.*')
    health_words = list(health_files.words())
    health_model = NGramModel(health_words, 2)
    res_files = PlaintextCorpusReader(bigram_root + 'Research/', '.*')
    res_words = list(res_files.words())
    res_model = NGramModel(res_words, 2)
    com_files = PlaintextCorpusReader(bigram_root + 'Computers_and_the_Internet/', '.*')
    com_words = list(com_files.words())
    com_model = NGramModel(com_words, 2)
    test_files = PlaintextCorpusReader(corpus_root + dataset, '.*')
    ids = test_files.fileids()
    out_file = open(output_file, 'w')
    for j in range(0, len(ids)):
        file_words = test_files.words(ids[j])
        out_str = ''
        current_file = dataset + '/' + ids[j]
        e = current_file.split('/')
        out_str = out_str + current_file + ' ' + e[-2]
        sum_fin = 0
        sum_health = 0
        sum_res = 0
        sum_com = 0
        text_len = len(file_words)
        for i in range(1, len(file_words)):
            sum_fin = sum_fin + math.log(fin_model.prob((file_words[i - 1],), file_words[i]))
            comp_fin = float((-sum_fin) * (1 / float(text_len)))
            sum_health = sum_health + math.log(health_model.prob((file_words[i - 1],), file_words[i]))
            comp_health = (float(-sum_health)) * (1 / float(text_len))
            sum_res = sum_res + math.log(res_model.prob((file_words[i - 1],), file_words[i]))
            comp_res = (float(-sum_res)) * (1 / float(text_len))
            sum_com = sum_com + math.log(com_model.prob((file_words[i - 1],), file_words[i]))
            comp_com = (float(-sum_com)) * (1 / float(text_len))
        out_str = out_str + ' finprob:' + str(round(sum_fin, 2)) + ' hlprob:' + str(round(sum_health, 2)) + \
            ' resprob:' + str(round(sum_res, 2)) + ' coprob:' + str(round(sum_com, 2)) + \
            ' finper:' + str(round(comp_fin, 2)) + ' hlper:' + str(round(comp_health, 2)) + \
            ' resper:' + str(round(comp_res, 2)) + ' coper:' + str(round(comp_com, 2))
        out_file.write(out_str + '\n')
        out_file.flush()
def occStats(self, path, format, list, plotting):
    wordlists = PlaintextCorpusReader(path, format)
    fileids = wordlists.fileids()
    k = len(fileids)
    # computing frequencies
    self.fileStats(path, fileids)
    # save stats
    statsname = "Base GQs (disjoint patterns)"
    savpath = plotting + '/' + statsname.replace(' ', '-')
    # generating report
    SaveStats(self.classstats, self.stats, "", savpath, plotting)  # all
def occurrence_mot_i_corpus(mot, corpus):
    compteur = 0
    for dirs in os.walk(corpus):
        corpus_root = dirs[0]  # walk the directory tree under the path
        if corpus_root != corpus:
            textlist = PlaintextCorpusReader(corpus_root, '.*')
            for files in textlist.fileids():
                test = corpus_root + '/' + files
                x = open(test, 'r')
                for ligne in x:
                    if ligne.find(mot) > 0:
                        compteur += 1
                x.close()
    return compteur
def carga_mongodb():
    client = pymongo.MongoClient(MONGODB_URI)
    db = client.docs
    docs = db.DOCS
    spanish_stops = set(stopwords.words('spanish'))
    newcorpus = PlaintextCorpusReader(corpus_root, '.*')
    newcorpus.fileids()
    for fileid in newcorpus.fileids():
        try:
            num_words = len(newcorpus.words(fileid))
            words = newcorpus.words(fileid)
            # num_sents = len(newcorpus.sents(fileid))
            # print(newcorpus.raw(fileid))
            # bcf = BigramCollocationFinder.from_words(words)
            # filter_stops = lambda w: len(w) < 3 or w in spanish_stops
            # bcf.apply_word_filter(filter_stops)
            tags_array = vocab_words(newcorpus.raw(fileid))
            tags = tags_array[0]
            tags_vocab = tags_array[1]
            cloud = tags_array[2]
            total_cloud = []
            for c in cloud:
                reg = {}
                reg['word'] = c[0]
                reg['total'] = c[1]
                total_cloud.append(reg)
            # insert the document
            post = {"nombre": fileid,
                    "fecha": datetime.datetime.utcnow(),
                    "texto": preparar_texto(newcorpus.raw(fileid)),
                    "tags_vocab": tags_vocab,
                    "tags": tags,
                    "enc": random.randint(1, 50),
                    "pos": random.randint(1, 10),
                    "neg": random.randint(1, 5),
                    "num_words": num_words,
                    "cloud": total_cloud}
            post_id = docs.insert_one(post).inserted_id
        except:
            print("Importacion Fallida:" + fileid)
    dl_feature_types[i] = dl_line_words[1]
    dl_feature_values[i] = dl_line_words[2]  # read the feature values from the decision-list text file and keep them in a separate list for comparison
    dl_tag_values[i] = dl_line_words[3]      # read the tag values from the decision-list text file and keep them in a separate list for comparison
fp_train_sorted_decision_list.close()

# Reading the data from the test text file
#############################################
from nltk.corpus import PlaintextCorpusReader

corpus_root = '/'
corpus_root = 'WSD/shir_4 folds_texts/shir_test/'
# The WSD folder containing the subfolders mentioned above must be placed in the
# python34 directory (or that of whichever Python version is installed on the machine).
peykare = PlaintextCorpusReader(corpus_root, '.*')
f = peykare.fileids()
l = len(f)
fp = peykare.open(f[3])
peykare_lines = fp.read().split('\n')
peykare_lines_count = len(peykare_lines)  # number of lines in the corpus
fp.seek(0)
test_given_lines_tags = ['' for i in range(peykare_lines_count)]
for i in range(peykare_lines_count):
    line_str = fp.readline()
    line_words = line_str.split()
    test_given_lines_tags[i] = line_words[-1]  # store each line's tag in the test_given_lines_tags vector
from nltk.corpus import PlaintextCorpusReader

corpus_root = './twitter_data'
wordlists = PlaintextCorpusReader(corpus_root, '.*\.txt')
id = wordlists.fileids()[0]
print(wordlists.words(id))
def load_corpus():
    print(gutenberg.root)
    from nltk.corpus import PlaintextCorpusReader
    corpus_root = '/path/of/corpus'
    wordlists = PlaintextCorpusReader(corpus_root, '.*')
    wordlists.fileids()
corpus = PlaintextCorpusReader(corpus_root, exp_archivos)

if dataset == 4:  # tweets
    corpus_root = '/home/mguevara/datasets/tweet/'
    export_indices = '/home/mguevara/datasets/info/indices/tweet/'
    export_matrices = '/home/mguevara/datasets/info/matrices/tweet/'
    export_vocabularios = '/home/mguevara/datasets/info/vocabularios/tweet/'
    exp_archivos = '.*'
    termino_ejemplo = 'jaja'  # 'articulo857.txt'
    documento_ejemplo = '12Jul-3.json.txt'

print_titulo("CREAR CORPUS")
from nltk.corpus import PlaintextCorpusReader
corpus = PlaintextCorpusReader(corpus_root, exp_archivos)
if len(corpus.words()) < 100:
    print corpus.fileids()
print corpus.fileids()
print "\n\t\t\t\t...Corpus creado"

a_stopwords = 0
a_porter = 0
a_lema = 0
a_alpha = 1
a_lower = 0
verbose = 0  # to show messages in the MiraVocab function
# corpus_root = '/home/mguevara/datasets'
# corpus = PlaintextCorpusReader(corpus_root, 'reuters/.*')
# print inverted_index['SELZ']
# termino_ejemplo = 'machine'
# 1 Overview of using collocations
bigram_measures = nltk.collocations.BigramAssocMeasures()
trigram_measures = nltk.collocations.TrigramAssocMeasures()
finder = nltk.collocations.BigramCollocationFinder.from_words(pairs)
f2 = nltk.collocations.BigramCollocationFinder.from_words(pairs)
for i in range(2, 6):
    f2.apply_freq_filter(i)
scored = finder.score_ngrams(bigram_measures.raw_freq)
word_fd = nltk.FreqDist(tokens)
bigram_fd = nltk.FreqDist(nltk.bigrams(tokens))
finder = BigramCollocationFinder(word_fd, bigram_fd)
# needs to be examined which one is better
# print sorted(finder.nbest(trigram_measures.raw_freq, 2))
print sorted(finder.nbest(trigram_measures.raw_freq, 12))
return sorted(finder.nbest(trigram_measures.raw_freq, 15))

# Texas folder collocation start
corpus_root = "Islip13Rain"
wordlists = PlaintextCorpusReader(corpus_root, '.*')
wordlists.fileids()
ClassEvent = nltk.Text(wordlists.words())
artcle, nonArtcl = 0, 0
art, nonArt = [], []
extractTxt = ' '
topWords = extractTopWrds(ClassEvent)
phrases = extractPhrases(ClassEvent)
print topWords
print phrases
    table3['fecha'] = table3['fecha'].str.upper()
    table3['fecha_format'] = table3['fecha'].apply(format_date)
    table3 = table3[['key', 'clase_doc', 'Description', 'fecha', 'fecha_format']]
    if table1.empty:
        new_row = {'Description': 'Not found', 'clase_doc': 'Not Found'}
        table1 = table1.append(new_row, ignore_index=True)
        table1 = table1[['Description', 'clase_doc']]
    return table3

# Main execution

# Load the txt corpus
corpus = PlaintextCorpusReader(path_input_txt, file_ids_text)
ids = corpus.fileids()

# Load the OCR corpus
corpus_ocr = PlaintextCorpusReader(path_input_ocr, file_ids_ocr)
ids_ocr = corpus_ocr.fileids()

# Expand the black lists with more possibilities
black_list_magistrado = list_concordances_ltc(black_list_magistrado)
black_list_opositor = list_concordances_ltc(black_list_opositor)
black_list_area = list_concordances_ltc(black_list_area)

########################
###  RDS connection  ###
########################
engine = create_engine(t96.sqlConnString)
engine.execute("insert into tt_log_transaccion (operacion, comentario) values ('Extracion opositor','Inicia proceso')")
from __future__ import division
import nltk, re, pprint
from urllib import urlopen
from nltk.corpus import PlaintextCorpusReader, stopwords
from HTMLParser import HTMLParser
from nltk.tokenize import *
import shutil, os
from bs4 import BeautifulSoup

corpus_root = 'C:\Users\Brent\Documents\My Research\Supreme Court Justices\Output Documentation\\1959\\yo\\'
wordlists = PlaintextCorpusReader(corpus_root, '.*')
fileIds = wordlists.fileids()
for i in range(1959, 2014):
    for name in fileIds:
        print name
        if str(i) in name:
            os.rename(
                'C:\Users\Brent\Documents\My Research\Supreme Court Justices\Output Documentation\\1959\\yo\\' + name,
                'C:\Users\Brent\Documents\My Research\Supreme Court Justices\Output Documentation\\1959'
            )
# Reading the e960401 file for basic manipulation
import nltk
from nltk.corpus import PlaintextCorpusReader

corpus_root = '../../../Corpus'
excelsior = PlaintextCorpusReader(corpus_root, '.*\.txt')
print("Available articles ", excelsior.fileids())

article_name = 'e960401.txt'
article = excelsior.words(fileids=article_name)
article_lower = [w.lower() for w in article]
print(article_name, " has ", len(article_lower), " tokens.")

vocabulary = sorted(set(article_lower))
print(vocabulary)
print(article_name, " has a vocabulary length of ", len(vocabulary), ".")

text = nltk.Text(article_lower)
# text.concordance('empresa')
bag = []
for cl in text.concordance_list('empresa'):
    left = list(cl[0][-4:])
    right = list(cl[2][-4:])
    bag += left
    bag += right
print("The bag of words of 'empresa' is: ", bag)
from nltk.text import Text
from nltk.probability import FreqDist
from nltk.corpus import PlaintextCorpusReader
from nltk.corpus import stopwords
import string

corpus_root = "abstracts"
wordlists = PlaintextCorpusReader(corpus_root, '.*')

all_words_list = []
for fid in wordlists.fileids():
    try:
        all_words_list += list(wordlists.words(fid))
    except Exception as e:
        print e

fd = FreqDist(Text([w.lower() for w in all_words_list]))
vocabulary = fd.keys()
clean_vocabulary = [
    v for v in vocabulary
    if v not in stopwords.words("english") and v not in string.punctuation
]
print clean_vocabulary[:50]

# TODOs:
# 1. Take care of non-meaningful words, like "1", ").", etc.
# 2. In the whole vocabulary, there are some words like "\x00", why?
    'to', 'for', 'it', 'in', 'on'
]

def create_word_features(clist):
    useful_words = [word for word in clist if word not in long_stop_list]
    my_dict = dict([(word, True) for word in useful_words])
    return my_dict

corpus_root = 'C:/Users/Bindu/Desktop/samp1/neg'
wordlists = PlaintextCorpusReader(corpus_root, '.*')
neg_re = []
mylist = []
x = []
for fileids in wordlists.fileids():
    words = wordlists.words(fileids)
    neg_re.append((create_word_features(words), "Depressed"))

corpus_root = 'C:/Users/Bindu/Desktop/samp1/pos'
wordlists = PlaintextCorpusReader(corpus_root, '.*')
pos_re = []
mylist = []
x = []
for fileids in wordlists.fileids():
    words = wordlists.words(fileids)
    pos_re.append((create_word_features(words), "Not depressed"))

train_set = neg_re[:45] + pos_re[:45]
test_set = neg_re[45:] + pos_re[45:]
print(len(train_set), len(test_set))
for i in range(20):  # loop 20 times, 10 entries at a time
    fd.tabulate(counter, counter + 10)
    raw_input("Hit Enter")
    counter += 10

#############################################################
# or we might want to create an nltk corpus
#############################################################
from nltk.corpus import PlaintextCorpusReader
# http://nltk.googlecode.com/svn/trunk/doc/api/nltk.corpus.reader.plaintext.PlaintextCorpusReader-class.html
summaries_root = "/Volumes/Optibay-1TB/Python/scrapingCode/billSummaries/"  # say where your text files are
billsCorpora = PlaintextCorpusReader(summaries_root, r'h[1-9].*\.txt')      # read them into the corpora
dir(billsCorpora)                # see what methods can be used on the corpora
rawBills = billsCorpora.raw()    # all the raw text
billsCorpora.fileids()           # show the files in the corpora
billsCorpora.fileids()[0]        # show the first fileid
billsCorpora.raw()[1:30]
len(billsCorpora.fileids())      # show how many files
len(billsCorpora.sents())
# billsCorpora.fileids('h1.txt')
billsCorpora.raw()               # the whole corpora
billsCorpora.sents()             # all of the sentences in the corpora
billsCorpora.sents()[1]
billsCorpora.words()             # all of the words in all of the corpora
billsCorpora.words('h1.txt')     # all of the words in one corpus
twoBillsWords = billsCorpora.words(['h1.txt', 'h1447.txt'])
twoBillsWords[500:525]           # note that punctuation is included and bills have not been lemmatized
billWords = billsCorpora.words()
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA, TruncatedSVD
# from sklearn.pipeline import make_pipeline
from nltk.corpus import PlaintextCorpusReader
import numpy as np
import matplotlib.pyplot as plt
import re
import pandas as pd

print("Creating corpus")
corpus_root = './docs/'  # directory path for txt files
# ls -l ./docs | wc -l   # to get the number of files in the dir
newCorpus = PlaintextCorpusReader(corpus_root, '[a-zA-Z0-9_\-]+.txt')
files = newCorpus.fileids()

print('Extracting text from all docs..')
docs = []  # empty list, append from corpus
row = []   # row index to grab hand-coded labels and file names from an external csv file
for f in files:
    docs.append(newCorpus.raw(fileids=f))
    row.append(int(re.sub('(file)|(-[1-9].txt)', '', f)))  # document numbers

# vectorize the words (i.e. count and transform in one step): unigrams and bigrams
print('Creating term document matrix with TfIdf vectorizer..')
vectorizer = TfidfVectorizer(min_df=0.15, max_df=.5, stop_words='english',
                             categories_10)
# print corpus_test_OVA
fin = time.clock()
print '---Duree :', fin - deb, 'secondes'
print 'le nombre de documents a tester est le suivant:', len(corpus_test_OVA)

cmpt_correct = 0
print '---debut du processus de test---'
deb = time.clock()
for dirs in os.walk(folder_path_test):
    corpus_root = dirs[0]  # walk the directory tree under the path
    if corpus_root != folder_path_test and os.path.basename(corpus_root) in categories_10:
        classe_doc = os.path.basename(corpus_root)
        textlist = PlaintextCorpusReader(corpus_root, '.*')
        for files in textlist.fileids()[:1]:
            test = corpus_root + '/' + files
            # for (fs,l) in corpus_test_OVA:
            # print '---classification---'
            dict_result_classif = classify_one_doc(test, dict_models, categories_10)
            classe_predite = OVA(dict_result_classif, categories_10)
            print 'la classe predite pour le document est la suivante:', classe_predite
            print 'label attendu:', classe_doc
            if classe_predite == classe_doc:
                correct = 'true'
            else:
                correct = 'false'
            print 'verification du label predit, true sil correspond et false si non:', correct
            if correct == 'true':
        if a + 1 < len(sys.argv) and not sys.argv[a + 1].startswith("-"):
            corpus_root = sys.argv[a + 1]
            i = i + 1
        a = a + i

print "Corpus is", corpus_root
print 'Prob. of n-grams depend on history:', CG_REPR
print 'Good-Turing:', GT_SMOOTH
print 'Kneser-Ney:', KN_SMOOTH

start = time.time()

# Load corpus
corpus = PlaintextCorpusReader(corpus_root, '.*txt', encoding='UTF-8')
n_texts = len(corpus.fileids())

# Matrix containing features; a list for each text
feature_matrix = [[] for i in range(n_texts)]
# print feature_matrix

# List of classes, one for each text in corpus
text_classes = fextract_helper.find_classes(corpus.fileids())

if char_ngrams:
    s1 = time.time()
    a, t = fextract_helper.char_ngram_stats(corpus.fileids(), corpus, \
        char_ngram_size, CG_REPR or KN_SMOOTH)
    e1 = time.time()
    print 'Char: Finding took', e1 - s1, 'seconds'
import nltk, re, pprint, string
from gensim import corpora, models, similarities
import codecs
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords, gutenberg, PlaintextCorpusReader

gutenberg_dir = nltk.data.find('corpora/gutenberg.zip').join('gutenberg/')
root = '/usr/local/share/nltk_data/corpora/gutenberg/'
reader = PlaintextCorpusReader(gutenberg_dir, '.*emma.txt')  # doctest: +SKIP
# actual regexp should read '.*\.txt'

documents = []
for f in reader.fileids():
    for sent in reader.sents(f):
        documents.append(sent)
        # print(sent)
print(documents[1:100])

stoplist = stopwords.words('english')
punctuation = string.punctuation + "''"
print(punctuation)

texts = [[
    word.lower() for word in document
    if (word not in stoplist and not re.search("\W.*", word))
] for document in documents]
print(texts[1:100])
import nltk
from nltk.corpus import PlaintextCorpusReader

corpus_root = 'E:\\NLP\\MyFiles'
wordlists = PlaintextCorpusReader(corpus_root, '.*')
list = wordlists.fileids()
print(list)
list2 = wordlists.words('kc.txt')
print(list2)
from textrank.keyword_extraction import extract_keywords
from nltk.corpus import PlaintextCorpusReader
import os.path
import random

path = os.path.join(os.path.abspath(os.path.dirname(__file__)), "citeulike180", "documents")
print("Parsing corpus from path {0}".format(path))
corpus = PlaintextCorpusReader(path, ".*", encoding="latin-1")
files = corpus.fileids()
print("Found {0} files within corpus.".format(len(files)))

fileid = random.randrange(len(files))  # random in-range index into files
file = files[fileid]
print("Randomly selected file {0} for processing.".format(file))
print("Extracting keywords...")
print(extract_keywords(corpus.raw(file)))
""" # Criação de Corpus com Python import matplotlib.pyplot as plt import nltk # Já vem no anaconda # nltk.download() # abre uma janela de download de todos os pacotes adicionais # Criando o Corpus: from nltk.corpus import PlaintextCorpusReader corpus = PlaintextCorpusReader('Dados', '.*') # .* = todas as extensões arquivos = corpus.fileids() # Cria uma lista com todos os arquivos da pasta # Observando os arquivos: arquivos[0] arquivos[0:100] for a in arquivos: print(a) # Vendo o texto de um arquivo: texto = corpus.raw('1.txt') # Acessar o texto de todo o corpus: todo_texto = corpus.raw() # Acessar cada uma das palavras que existem no corpus: palavras = corpus.words()
import random
import os
import shutil
from nltk.corpus import PlaintextCorpusReader

corpus_root = "/Users/epb/Documents/uni/kandidat/speciale/data/dw/almedad/all_known"
# corpus_root = "/Users/epb/Documents/uni/kandidat/speciale/data/dw/ansar1/all"
output_dir = "/Users/epb/Documents/uni/kandidat/speciale/data/dw/almedad/al3"
# output_dir = "/Users/epb/Documents/uni/kandidat/speciale/data/dw/ansar1/an9"

N_TEXTS = 3000
B = 50

corpus = PlaintextCorpusReader(corpus_root, '.*txt', encoding="UTF-8")
n = len(corpus.fileids())
# start = random.randint(0,n)
# print 'start', start
count = 0
# next = start
picked = []
b = 0
next = random.randint(0, n - B)
while count < N_TEXTS:
    b = 0
    while b < B:
from nltk.corpus import PlaintextCorpusReader corpus_root = "./SH" my_corpus = PlaintextCorpusReader(corpus_root, '[^__].*txt') print(my_corpus.fileids()) #my_corpus.words('hound_of_baskerville.txt')[10:20] #sentOut = my_corpus.sents('hound_of_baskerville.txt')[10] #print(sentOut) from nltk.text import Text hound = Text(my_corpus.words('hound_of_baskerville.txt')) #hound.concordance("Watson") #hound.similar("hound") #hound.collocations() #from nltk.probability import FreqDist #my_fdist = FreqDist(hound) #top_100 = my_fdist.most_common(100) #print(top_100[50:99]) #my_fdist.hapaxes() #hound.dispersion_plot(["Holmes","Watson"]) #hound.dispersion_plot(["Stapleton", "Henry", "Barrymore"]) #ne_chunk: name chunk ; pos_tag: part of speech ; word_tokenize: chunking import nltk from nltk import ne_chunk, pos_tag, word_tokenize from nltk.tree import Tree from collections import Counter def get_characters(words_param): chunked = ne_chunk(pos_tag(words_param)) prev = None continuous_chunk = []
from nltk import word_tokenize
from nltk.tag import pos_tag

V = ['VB', 'VBZ', 'VBP', 'VBD', 'VBG']
N = ['NN', 'NNS', 'NNP', 'NNPS']
ADV = ['RB', 'RBR', 'RBS']
ADJ = ['JJ', 'JJR', 'JJS']

wLen = []    # number of words
vLen = []    # number of verbs
advLen = []  # number of adverbs
adjLen = []  # number of adjectives
vLen, nLen, advLen, adjLen, wLen = ([] for i in range(5))

for fileid in newcorpus.fileids():
    tokens = word_tokenize(newcorpus.raw(fileid))
    words = [t for t in tokens if t.isalpha()]
    taggedW = pos_tag(words)
    verbs, nouns, advs, adjs = ([] for i in range(4))
    for (w, tag) in taggedW:
        if tag in V:
            verbs.append(w)
        elif tag in N:
            nouns.append(w)
        elif tag in ADV:
            advs.append(w)
        elif tag in ADJ:
            adjs.append(w)
    wLen.append(len(words))
    vLen.append(len(verbs))
    nLen.append(len(nouns))
    advLen.append(len(advs))
    adjLen.append(len(adjs))
        return True
    else:
        return False

############# Loading, processing, and bigrams for Democrat speeches ###########
dem_root = "E:/Documents/IST 664/Final Project/Democrats"  # change this line as appropriate
dems = PlaintextCorpusReader(dem_root, [
    "Arizona.txt", "Connecticut.txt", "Illinois.txt", "Kentucky.txt",
    "Louisiana.txt", "Maine.txt", "Massachusetts.txt", "Michigan.txt",
    "New Hampshire.txt", "Vermont.txt"
])

dem_sentences = []
for i in range(10):
    temp = dems.fileids()[i]
    temptext = dems.raw(temp)
    tempsent = sent_tokenize(temptext)
    dem_sentences = dem_sentences + tempsent

Dems = []
for i in range(10):
    temp = dems.fileids()[i]
    temptext = dems.raw(temp)
    tempTokens = nltk.word_tokenize(temptext)
    Dems = Dems + tempTokens

Demwords = [w.lower() for w in Dems]
DemAlphaWords = [w for w in Demwords if not alpha_filter(w)]
DemStoppedWords = [w for w in DemAlphaWords if not w in stopwords]
print("Democrats used",
print(calculate_lines_II(volume))

# In order to calculate the number of words and sentences in each volume,
# we made a new corpus of the volumes using the nltk PlaintextCorpusReader,
# which has some easy tools that can split a text into a list of words or sentences.
corpus_root = 'data'
volumes = PlaintextCorpusReader(corpus_root, 'arabian.*')

list_of_sentences = volumes.sents()
print('The ten volumes consist of ' + str(len(list_of_sentences)) + ' sentences')
list_of_words = volumes.words()
print('The ten volumes consist of ' + str(len(list_of_words)) + ' words')

for item in volumes.fileids():  # calculate the number of words in each volume
    print(item, ':', len(volumes.words(item)), 'words')

for item in volumes.fileids():  # calculate the number of sentences in each volume
    print(item, ':', len(volumes.sents(item)), 'sentences')

##################################################################
# Visualisation of the statistics with basic plotting techniques
##################################################################

# Visualise the characters per volume
# and make a list of the total characters per volume.
characters_per_volume = (calculate_characters(corpus))[0]
x = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
def edit_nltk():
    corpus_root = r'/data'
    books = PlaintextCorpusReader(corpus_root, '.*')
    print(books.fileids())
# Get the most frequent word in the corpus file
palabra_mas_frecuente = fdist.max()
print("\n--------------------------------------------------------------------------------\n ")
print("\n\n1.9) Palabra mas frecuente del fichero del corpus: \n" + str(palabra_mas_frecuente))

# Load the PoliformaT files ("spam.txt", "quijote.txt" and "tirantloblanc.txt") as our own corpus.
from nltk.corpus import PlaintextCorpusReader
corpus_root = './ficheros'
new_corpus = PlaintextCorpusReader(corpus_root, '.*')
lista_ficheros = new_corpus.fileids()
print("\n--------------------------------------------------------------------------------\n ")
print("\n\n1.10) Ficheros que componen el corpus: \n" + str(lista_ficheros) + "\n")
print("\n-------------IMPORTANTE CAMBIAR LA RUTA DE LOS ARCHIVOS EN EL CÓDIGO------------\n ")
print("Ruta actual : " + corpus_root)

# Compute the number of words, the number of distinct words and the number of sentences of the three documents
print("\n--------------------------------------------------------------------------------\n1.11) ")
print("\n" + "Palabras".rjust(35, " ") + "Vocabulario".rjust(12, " ") +
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import matplotlib.pyplot as plt
import nltk  # natural language toolkit
# opens a window where you can download all the packages
# nltk.download()

from nltk.corpus import PlaintextCorpusReader

corpus = PlaintextCorpusReader('Arquivos', '.*')  # .*: every file extension

arquivos = corpus.fileids()
arquivos[0]
arquivos[0:100]
for a in arquivos:
    print(a)

texto = corpus.raw('1.txt')  # contents of one file
todo_texto = corpus.raw()    # all the text in all the files
palavras = corpus.words()    # too large to show in the variable explorer; the console only prints '...'
palavras[1]                  # look at the second word
len(palavras)                # word count
from nltk.corpus import PlaintextCorpusReader
# Link to download Shakespeare's plays in txt format - http://www.textfiles.com/etext/AUTHORS/SHAKESPEARE/
import os

corpus_root = os.getcwd() + '/'
file_ids = '.*.txt'
wordlists = PlaintextCorpusReader(corpus_root, file_ids)
print(wordlists.fileids())
# note: words() expects a fileid relative to corpus_root rather than an absolute path
print(
    wordlists.words(
        r'C:\Users\nathani_n\Desktop\Nlp_Udemy\shakespeare-taming-of-the-shrew.txt'
    ))
import nltk
from nltk.corpus import PlaintextCorpusReader
import pickle

root = '../Charlotte/'
wordlist = PlaintextCorpusReader(root, '.*')
wordlist.fileids()
charlotte = wordlist.words('Charlotte.txt')
# other = wordlist.words('other.txt')
# join = charlotte + other

with open('nltk_german_classifier_data.pickle', 'rb') as f:
    tagger = pickle.load(f)

charlotte_tagged = tagger.tag(charlotte)

with open('tagged_charlotte_data.pickle', 'wb') as f:
    pickle.dump(charlotte_tagged, f)

charlotte_nouns = []
for word in charlotte_tagged:
    if word[1] == 'NN':
        charlotte_nouns.append(word[0])
def extract_network(file):
    # POS tag the text
    text = my_corpus.words(file)
    chunked = ne_chunk(pos_tag(text))

    # Extract list of people
    current_chunk = []
    for i in chunked:
        if type(i) == Tree:
            if i.label() == 'PERSON':
                current_chunk.append(" ".join([token for token, pos in i.leaves()]))

    # Create ordered list of how many mentions per name
    agg_list = Counter(current_chunk).most_common()
    agg_list.insert(0, ('Thinker', 'Frequency'))  # Insert headings for csv file
    # return agg_list

    # Write the list to a csv file with name in one column and frequency in the next
    myfile = open(file[:-4] + '_nechunk.csv', 'w', newline='')  # Name csv after the text file, removing .txt and adding .csv
    with myfile:
        writer = csv.writer(myfile)
        writer.writerows(agg_list)
    print('Writing complete')

for book in my_corpus.fileids():
    extract_network(book)
    'service_holiday_inn_london.txt.data', 'service_swissotel_hotel_chicago.txt.data',
    'staff_bestwestern_hotel_sfo.txt.data', 'staff_swissotel_chicago.txt.data'
])
wsj = PlaintextCorpusReader(corpus_root, [
    'accuracy_garmin_nuvi_255W_gps.txt.data',
    'directions_garmin_nuvi_255W_gps.txt.data',
    'display_garmin_nuvi_255W_gps.txt.data',
    'satellite_garmin_nuvi_255W_gps.txt.data',
    'screen_garmin_nuvi_255W_gps.txt.data',
    'speed_garmin_nuvi_255W_gps.txt.data',
    'updates_garmin_nuvi_255W_gps.txt.data',
    'voice_garmin_nuvi_255W_gps.txt.data'
])
print wordlists.fileids()
print wsj.fileids()

print(len(wordlists.sents()))
senLengths1 = [len(s) for s in wordlists.sents()]
freqDist1 = nltk.FreqDist(senLengths1)
print(len(wsj.sents()))
senLengths2 = [len(s) for s in wsj.sents()]
freqDist2 = nltk.FreqDist(senLengths2)

propDist1 = nltk.DictionaryProbDist(freqDist1, normalize=True)
propDist2 = nltk.DictionaryProbDist(freqDist2, normalize=True)

myfile = open('../Thesis/wsjdist2.dat', 'wb')
wr = csv.writer(myfile, quoting=csv.QUOTE_NONE)