import os
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
# 'assignment' is the surrounding project's own draft/final pair class.

def parseFolder(dirPath):
    """Pair each draft with its final version; return a dict keyed by final fileid."""
    assignments = {}
    draftReader = PlaintextCorpusReader(dirPath, r'\d+draft\d*.*')
    finalReader = PlaintextCorpusReader(dirPath, r'\d+final\d*.*')
    numFiles = len(os.listdir(dirPath))
    assert numFiles % 2 == 0  # every draft should have a matching final
    # fileids() returns sorted ids, so the i-th draft pairs with the i-th final.
    # (Looking up a final fileid in draftReader would raise, since each reader
    # only matches its own filename pattern.)
    for draftId, finalId in zip(draftReader.fileids(), finalReader.fileids()):
        final = finalReader.paras(finalId)
        draft = draftReader.paras(draftId)
        assignments[finalId] = assignment(draft, final)
    return assignments

def parseFolder(dirPath):
    """Variant of parseFolder() that returns a list instead of a dict."""
    assignments = []
    draftReader = PlaintextCorpusReader(dirPath, r'\d+draft\d*.*')
    finalReader = PlaintextCorpusReader(dirPath, r'\d+final\d*.*')
    numFiles = len(os.listdir(dirPath))
    assert numFiles % 2 == 0
    finalIdsSortedList = finalReader.fileids()
    draftIdsSortedList = draftReader.fileids()
    for i in range(len(finalIdsSortedList)):
        final = finalReader.paras(finalIdsSortedList[i])
        draft = draftReader.paras(draftIdsSortedList[i])
        assignments.append(assignment(draft, final))
    return assignments

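A minimal driver for the dict-returning variant; the directory layout (paired files such as 1draft.txt / 1final.txt) and the assignment class come from the surrounding project, so this is only a usage sketch:

# Hypothetical usage of parseFolder(); path and file names are placeholders.
assignments = parseFolder('data/essays/')
for fileid, assn in sorted(assignments.items()):
    print(fileid, assn)
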
def token_in_coverage(self):
    """Fraction of résumés (one per corpus paragraph) containing a token from self.tokens_in."""
    corpusdir = 'data/cv_corpus'
    corpa = PlaintextCorpusReader(corpusdir, '.*', encoding='windows-1252')
    # Flatten each paragraph to a single list of tokens; one résumé per paragraph.
    resumes = [[item for sent in paras for item in sent] for paras in corpa.paras()]
    cpt = 0
    for resume in resumes:
        resume_text = " ".join(resume)
        resume_sents = nltk.sent_tokenize(resume_text)
        # sent is already a string here, so pass it to spaCy directly
        # (the original " ".join(sent) would have spaced out every character).
        resume_words = set(token.lemma_ for sent in resume_sents
                           for token in nlp(sent.lower()))
        if not resume_words.isdisjoint(self.tokens_in):
            cpt += 1
    coverage = cpt * 1.0 / len(resumes)
    print("token_in coverage: {}".format(coverage))

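token_in_coverage() (and the Doc2Vec builder at the end of this section) lean on module-level names that are not shown. A minimal sketch of the assumed setup; the model and stopword choices are guesses from how the names are used:

import nltk
import spacy
from nltk.corpus import stopwords
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

nlp = spacy.load('en_core_web_sm')         # spaCy pipeline providing token.lemma_
stopset = set(stopwords.words('english'))  # stopword set used by build_d2v_model()
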
def processFile(newCorpusDir):
    if not os.path.isdir(newCorpusDir):
        os.mkdir(newCorpusDir)
    txt1 = getText('sample_feed.txt')
    txt2 = pdf.getTextPDF('VirtualBoxTroubleshooting.pdf')
    txt3 = word.getTextWord('my_doc.docx')
    files = [txt1, txt2, txt3]
    for idx, f in enumerate(files):
        with open(newCorpusDir + str(idx) + '.txt', 'w') as fout:
            fout.write(f)
    newCorpus = PlaintextCorpusReader(newCorpusDir, '.*')
    print(newCorpus.words())
    print(newCorpus.sents(newCorpus.fileids()[1]))
    print(newCorpus.paras(newCorpus.fileids()[0]))

def Read_corpus(path_c, fname_c, fo1):
    import nltk
    import re
    import spacy
    import en_core_web_sm
    import fileinput
    from nltk.corpus.reader.plaintext import PlaintextCorpusReader

    nlp = spacy.load('en_core_web_sm')
    pcorpus = PlaintextCorpusReader(path_c, fname_c, encoding="utf8")

    # HTML tags to file
    fappend(fo1, P_htmltag.writehtmltag1(fname_c), fname_c)

    # Iterate through each paragraph
    for para in pcorpus.paras():
        L0 = rep_tags(para)
        L1 = L0.split("\n")
        for i, w in enumerate(L1):
            if w != "":
                # drop the leading character of the line before running NLP
                ApplyNLP(nlp(str(w[1:])), fo1)

    fappend(fo1, P_htmltag.writehtmltag3(fname_c), fname_c)

import itertools
import string

import matplotlib.pyplot as plt
from gensim import corpora
from gensim.models.wrappers import LdaVowpalWabbit
from nltk import FreqDist
from nltk.corpus import stopwords
from nltk.corpus.reader.plaintext import PlaintextCorpusReader as PCR
from nltk.corpus.reader.util import read_line_block
from nltk.stem import WordNetLemmatizer as WNL
from wordcloud import WordCloud

class Contract_Reader():
    def __init__(self, config):
        print('Filepath for texts = ', config.textpath)
        self.corpus = PCR(config.textpath, r'.*\.txt',
                          encoding='utf-16',
                          para_block_reader=read_line_block)
        if config.clean_paragraphs == 'yes':
            self.clean(config, mode='para')
        if config.clean_sentences == 'yes':
            self.clean(config, mode='sent')
        # Corpus summaries
        self.corpus_info()
        self.LDA(config.num_topics, config.num_words)
        self.plot(config.num_words)

    def clean(self, config, mode='sent'):
        stop = set(stopwords.words('english'))
        exclude = set(string.punctuation)
        lemma = WNL()
        if mode == 'para':
            # Paragraphs are lists of sentences, each of which is a list of
            # tokens. Reduce each paragraph to a single list of tokens.
            self.para_list = [list(itertools.chain.from_iterable(para))
                              for para in self.corpus.paras()]
            for index, paragraph in enumerate(self.para_list):
                paragraph = " ".join(paragraph)
                stop_free = " ".join(i for i in paragraph.lower().split() if i not in stop)
                punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
                normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
                self.para_list[index] = normalized
            print(self.para_list[0])
            self.para_list = [para.split() for para in self.para_list]
            print(self.para_list[0])
        if mode == 'sent':
            # Obtain a list of strings, one per sentence, rather than a list of lists.
            self.sents_list = [" ".join(sent) for sent in self.corpus.sents()]
            for index, sentence in enumerate(self.sents_list):
                stop_free = " ".join(i for i in sentence.lower().split() if i not in stop)
                punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
                normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
                self.sents_list[index] = normalized
            print(self.sents_list[0])
            self.sents_list = [sentence.split() for sentence in self.sents_list]
            print(self.sents_list[0])

    def LDA(self, num_topics, num_words):
        # Requires clean(mode='para') to have populated self.para_list.
        dictionary = corpora.Dictionary(self.para_list)
        doc_term_matrix = [dictionary.doc2bow(para) for para in self.para_list]
        path = '/mnt/APPDATA/Project_Mafia/omkhalil/vowpal_binaries/vw-7.20150623'
        self.ldamodel = LdaVowpalWabbit(path, doc_term_matrix,
                                        num_topics=num_topics,
                                        id2word=dictionary)
        self.ldamodel.save('model/lda_model')
        print(self.ldamodel.print_topics(num_topics=10, num_words=num_words))

    def plot(self, num_words):
        for t in range(self.ldamodel.num_topics):
            plt.figure()
            # show_topic() yields (probability, word) pairs; flip them for fit_words().
            tuples = [reversed(x) for x in self.ldamodel.show_topic(t, num_words)]
            plt.imshow(WordCloud().fit_words(dict(tuples)))
            plt.axis("off")
            plt.title("Topic #" + str(t))
            plt.savefig('plots/topic' + str(t))

    def corpus_info(self):
        """Summary information about the status of a corpus."""
        fids = len(self.corpus.fileids())
        paras = len(self.corpus.paras())
        sents = len(self.corpus.sents())
        sperp = sum(len(para) for para in self.corpus.paras()) / float(paras)
        tokens = FreqDist(self.corpus.words())
        count = sum(tokens.values())
        vocab = len(tokens)
        lexdiv = float(count) / float(vocab)
        print(("Text corpus contains {} files\n"
               "Composed of {} paragraphs and {} sentences.\n"
               "{:0.3f} sentences per paragraph\n"
               "Word count of {} with a vocabulary of {}\n"
               "lexical diversity is {:0.3f}").format(fids, paras, sents,
                                                      sperp, count, vocab, lexdiv))

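Contract_Reader only reads attributes off its config argument, so any object exposing these fields works. A minimal usage sketch with placeholder values:

# Hypothetical config for Contract_Reader; the field names mirror how
# __init__ uses them, the values are placeholders.
from types import SimpleNamespace

config = SimpleNamespace(
    textpath='data/contracts',   # directory of .txt files
    clean_paragraphs='yes',
    clean_sentences='yes',
    num_topics=10,
    num_words=20,
)
reader = Contract_Reader(config)
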
import os
import word, pdf
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

def getText(txtFileName):
    with open(txtFileName, 'r') as file:
        return file.read()

newCorpusDir = 'mycorpus/'
if not os.path.isdir(newCorpusDir):
    os.mkdir(newCorpusDir)

txt1 = getText('sample_feed.txt')
txt2 = pdf.getTextPDF('sample-pdf.pdf')
txt3 = word.getTextWord('sample-one-line.docx')

files = [txt1, txt2, txt3]
for idx, f in enumerate(files):
    with open(newCorpusDir + str(idx) + '.txt', 'w') as fout:
        fout.write(f)

newCorpus = PlaintextCorpusReader(newCorpusDir, '.*')
print(newCorpus.words())
print(newCorpus.sents(newCorpus.fileids()[1]))
print(newCorpus.paras(newCorpus.fileids()[0]))

with newcorpus.open(infile) as fin:  # Opens the file.
    print fin.read().strip()         # Prints the content of the file.
print

# Access the plaintext; outputs pure string/basestring.
print newcorpus.raw().strip()
print

# Access paragraphs in the corpus. (list of list of list of strings)
# NOTE: NLTK automatically calls nltk.tokenize.sent_tokenize and
# nltk.tokenize.word_tokenize.
#
# Each element in the outermost list is a paragraph, and
# each paragraph contains sentence(s), and
# each sentence contains token(s).
print newcorpus.paras()
print

# To access paragraphs of a specific fileid.
print newcorpus.paras(newcorpus.fileids()[0])

# Access sentences in the corpus. (list of list of strings)
# NOTE: the texts are flattened into sentences that contain tokens.
print newcorpus.sents()
print

# To access sentences of a specific fileid.
print newcorpus.sents(newcorpus.fileids()[0])

# Access just tokens/words in the corpus. (list of strings)
print newcorpus.words()

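The fragment above assumes newcorpus and infile are already defined. A minimal sketch of that setup; the directory name is a placeholder:

from nltk.corpus.reader.plaintext import PlaintextCorpusReader

corpusdir = 'newcorpus/'          # hypothetical directory of plain-text files
newcorpus = PlaintextCorpusReader(corpusdir, '.*')
infile = newcorpus.fileids()[0]   # fileids are relative to the corpus root
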
# Assumes: import configparser; from collections import Counter; plus the
# project-level helpers CorpusReader, Tokenizer and load_nlp (presumably NLTK's
# PlaintextCorpusReader and RegexpTokenizer, and spacy.load; the script
# excerpt below uses the same names).
class TextAnalizer:
    def __init__(self, my_input_file):
        self.config = configparser.ConfigParser()
        self.config.read("text_analysis.cfg")
        self.input_file = my_input_file
        self.nlp_model = self.config["DEFAULT"]["nlp_model"]
        # The output file name
        self.output_file = self.config["DEFAULT"]["output_file"]
        self.nlp = load_nlp(self.nlp_model)
        self.corpus = CorpusReader(".", self.input_file)
        self.raw_text = self.corpus.raw()
        self.nlp_text = self.nlp(self.raw_text)
        # Collect the information needed for text analysis with spaCy.
        self.analysis_dictionary = Counter()
        self.word_count = 0
        self.get_word_count_nltk()

    def get_paragraph(self):
        return self.corpus.paras()

    def get_sentence(self):
        return self.corpus.sents()

    def get_word(self):
        return self.corpus.words()

    def get_word_count_nltk(self):
        tokenizer = Tokenizer(r'\w+')
        counts = Counter()
        word_count = 0
        for sentence in self.get_sentence():
            tokens = tokenizer.tokenize(" ".join(sentence))
            word_count += len(tokens)
            filtered = [w for w in sentence if w.isalnum()]
            counts = counts + Counter(filtered)
        # Recomputed from scratch so repeated calls don't inflate the count.
        self.word_count = word_count
        return counts, word_count

    def analize_nlp(self):
        analized_data_str = self.config["ANALIZED"]["POS"]
        analized_data = analized_data_str.split(",")
        result_dict = {}
        diff_str, tot_str = (self.config["DEFAULT"]["diff_tot_string"]).split(",")
        lemma_counter = Counter()
        pos_counter = Counter()
        tag_counter = Counter()
        for token in self.nlp_text:
            lemma_counter = lemma_counter + Counter([token.lemma_])
            pos_counter = pos_counter + Counter([token.pos_])
            tag_counter = tag_counter + Counter([token.tag_])
            my_key = token.lemma_ + "_" + token.tag_ + "_" + token.pos_
            self.analysis_dictionary[my_key] += 1
        for pos in analized_data:
            instance_counter = 0
            total_counter = 0
            for key in self.analysis_dictionary.keys():
                try:
                    my_lemma, my_tag, my_pos = key.split("_")
                except ValueError:
                    print("Warning: array has an empty line")  # add logging
                    continue  # skip malformed keys instead of reusing stale values
                if pos == my_pos:
                    instance_counter += 1
                    total_counter = total_counter + self.analysis_dictionary.get(key)
            result_dict[pos + diff_str] = instance_counter
            result_dict[pos + tot_str] = total_counter
        # Add the counts from NLTK.
        diff_word, word_count = self.get_word_count_nltk()
        result_dict["WORDS" + tot_str] = word_count
        result_dict["WORDS" + diff_str] = len(diff_word)
        result_dict["PARAGRAPHS"] = len(self.get_paragraph())
        result_dict["SENTENCES"] = len(self.get_sentence())
        return result_dict

    def write_output(self):
        # get_word_count_nltk() returns a (counts, total) tuple; unpack it
        # rather than taking len() of the tuple itself.
        counts, _ = self.get_word_count_nltk()
        num_sentences = len(self.get_sentence())
        with open(self.output_file, "w+") as f:
            f.write("Number of paragraphs: " + str(len(self.get_paragraph())) + "\n")
            f.write("Number of sentences: " + str(num_sentences) + "\n")
            f.write("Number of words: " + str(self.word_count) + "\n")
            f.write("Average words per sentence: " +
                    str(round(self.word_count / num_sentences, 2)) + "\n")
            f.write("Number of different words: " + str(len(counts)) + "\n")
            f.write("Text variety (different words / total words): " +
                    str(round(len(counts) / self.word_count, 2)) + "\n")

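A short usage sketch, assuming a text_analysis.cfg like the one sketched after the script excerpt below, plus a plain-text input file; both names are placeholders:

# Hypothetical driver for TextAnalizer.
analyzer = TextAnalizer("sample.txt")
print(analyzer.analize_nlp())
analyzer.write_output()
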
def getText(txtFileName):
    with open(txtFileName, 'r') as file:
        return file.read()

# Create the corpus folder
newCorpusDir = 'mycorpus/'
if not os.path.isdir(newCorpusDir):  # does the corpus folder already exist?
    os.mkdir(newCorpusDir)

# Read the files
# Plain text file
txt1 = getText('./Files/sample_feed.txt')
# PDF file
txt2 = pdf.getTextPDF('./Files/sample-pdf.pdf')
# DOCX file
txt3 = word.getTextWord('./Files/sample-one-line.docx')

# Write the files
files = [txt1, txt2, txt3]
for idx, f in enumerate(files):
    with open(newCorpusDir + str(idx) + '.txt', 'w') as fout:
        fout.write(f)

# Build the custom corpus:
# read every file in the folder and create a corpus from them
newCorpus = PlaintextCorpusReader(newCorpusDir, '.*')

# Check that the custom corpus was built correctly
print(newCorpus.words())                        # array of every word in the corpus
print(newCorpus.sents(newCorpus.fileids()[1]))  # all sentences in 1.txt
print(newCorpus.paras(newCorpus.fileids()[0]))  # all paragraphs in 0.txt

# Filename of the text file to analyze
input_file = config["DEFAULT"]["input_file"]
# The NLP model used
nlp_model = config["DEFAULT"]["nlp_model"]
# The output file name
output_file = config["DEFAULT"]["output_file"]

# Load the NLP model
nlp = load_nlp(nlp_model)

# This section generates a corpus (for NLTK) and a string text (for spaCy)
corpus = CorpusReader(".", input_file)
my_text = corpus.raw()

# This section deals with the NLTK side of the analysis
paragraphs = corpus.paras()
sentences = corpus.sents()
words = corpus.words()

tokenizer = Tokenizer(r'\w+')
word_count = 0
counts = Counter()
for sentence in sentences:
    tokens = tokenizer.tokenize(" ".join(sentence))
    word_count = word_count + len(tokens)
    filtered = [w for w in sentence if w.isalnum()]
    counts = counts + Counter(filtered)

nlp_text = nlp(my_text)

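Both this script and the TextAnalizer class read the same keys out of text_analysis.cfg. A sketch that writes a compatible file; the key names follow the lookups in the code above, while every value is a placeholder:

import configparser

config = configparser.ConfigParser()
config["DEFAULT"] = {
    "input_file": "sample.txt",
    "nlp_model": "en_core_web_sm",
    "output_file": "analysis.txt",
    "diff_tot_string": "_diff,_tot",   # split on "," into (diff_str, tot_str)
}
config["ANALIZED"] = {"POS": "NOUN,VERB,ADJ"}  # POS tags analize_nlp() reports on
with open("text_analysis.cfg", "w") as cfg_file:
    config.write(cfg_file)
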
def try_out_some_functionalities():
    corpusdir = "/media/benzro/OS/Users/benzro/Desktop/Studium Uni/2)" \
                "ZweitesSemester/27)PCL-2/Uebungen/Uebung03/Enron/test/"
    newcorpus = PCR(corpusdir, '.*')
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print "access one file in the corpus"
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    # open() expects a fileid relative to the corpus root, not an
    # absolute path, so the first assignment is left commented out:
    # infile = corpusdir + "0001.1999-12-10.farmer.ham.txt"
    infile = "0004.1999-12-14.farmer.ham.txt"
    fin = newcorpus.open(infile)
    print fin.read().strip()
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print "all file ids"
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print newcorpus.fileids()
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print "access each file in the corpus"
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    # (reduced output: [0:2])
    for infile in sorted(newcorpus.fileids()):
        # the fileid of each file
        print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
        print infile
        # opens the file and prints its content
        fin = newcorpus.open(infile)
        print fin.read().strip()
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print "access the plaintext; outputs pure string of all files"
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print newcorpus.raw().strip()
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print "Access paragraphs in the corpus. (list of list of list of strings)"
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    # NOTE: NLTK automatically calls nltk.tokenize.sent_tokenize and
    # nltk.tokenize.word_tokenize.
    #
    # Each element in the outermost list is a paragraph, and
    # each paragraph contains sentence(s), and
    # each sentence contains token(s).
    print newcorpus.paras()
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print "To access paragraphs of a specific fileid."
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print newcorpus.paras(newcorpus.fileids()[0])
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print "Access sentences in the corpus. (list of list of strings)"
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    # NOTE: the texts are flattened into sentences that contain tokens.
    print newcorpus.sents()
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print "To access sentences of a specific fileid."
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print newcorpus.sents(newcorpus.fileids()[0])
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print "Access just tokens/words in the corpus. (list of strings)"
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print newcorpus.words()
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print "To access tokens of a specific fileid."
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print newcorpus.words(newcorpus.fileids()[0])
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"

def build_d2v_model(self):
    print("Starting to build the Doc2Vec model")
    corpusdir = 'data/cv_corpus'
    corpa = PlaintextCorpusReader(corpusdir, '.*', encoding='windows-1252')
    print("tokenizing...")
    # One document per paragraph: clean each sentence, lowercase it,
    # lemmatize with spaCy, and drop stopwords.
    resumes = [[token.lemma_
                for sent in paras
                for token in nlp(" ".join(self.clean(sent)).lower())
                if token.lemma_ not in stopset]
               for paras in corpa.paras()]
    # print(resumes[0:3])
    print("tokenization completed")
    documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(resumes)]
    model = Doc2Vec(documents, vector_size=self.cv_length, window=5,
                    min_count=1, workers=4)
    print("Finished building the Doc2Vec model")
    return model

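A sketch of how the returned model might be queried, reusing the same nlp pipeline; the owning object and the query text are placeholders:

# Hypothetical query against the trained Doc2Vec model; 'builder' stands in
# for whatever object owns build_d2v_model().
model = builder.build_d2v_model()
tokens = [t.lemma_ for t in nlp("senior python developer")]
vector = model.infer_vector(tokens)  # embedding for an unseen résumé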