def get_phrase():
    root_dir = r'E:\github_repo\python_basic\pythonbasictest\self_nltk\files'
    wordlists = PlaintextCorpusReader(root_dir, ".*")
    x = nltk.Text(wordlists.words("test.txt"))
    print(x)
    x.collocations()  # collocations() prints its results itself and returns None
def get_corpus_words():
    '''Returns all the words from the corpus.'''
    reader = PlaintextCorpusReader(settings.CORPUS_ROOT, settings.CORPUS_FILES_GLOBB)
    if reader:
        return reader.words()
    return []
def load_corpus(self):
    if len(self.corpus) == 0:
        raise Exception('No corpus defined.')
    if os.path.isdir(self.corpusdir) is False:
        self.generate_corpus_files()
    newcorpus = PlaintextCorpusReader(self.corpusdir, '.*')
    # bard.sents = newcorpus.sents
    bard.tokens = newcorpus.words()
    print(len(bard.tokens))
    # print('init markov NLG text generator')
    self.generator = bard.generators.markov.IntelligentMarkovGenerator(bard.tokens)
def processFile(newCorpusDir):
    if not os.path.isdir(newCorpusDir):
        os.mkdir(newCorpusDir)

    txt1 = getText('sample_feed.txt')
    txt2 = pdf.getTextPDF('VirtualBoxTroubleshooting.pdf')
    txt3 = word.getTextWord('my_doc.docx')

    files = [txt1, txt2, txt3]
    for idx, f in enumerate(files):
        with open(newCorpusDir + str(idx) + '.txt', 'w') as fout:
            fout.write(f)

    newCorpus = PlaintextCorpusReader(newCorpusDir, '.*')
    print(newCorpus.words())
    print(newCorpus.sents(newCorpus.fileids()[1]))
    print(newCorpus.paras(newCorpus.fileids()[0]))
def _strip_tags(self, title):
    new_title = ''
    custom_corpus = PlaintextCorpusReader('../custom_corpora/', '.*')
    # For each word in the title
    for word in title.split():
        # Remove all punctuation
        noPunc = ''.join(c for c in word if c not in string.punctuation)
        # If this word isn't a stopword and isn't just a single letter
        if noPunc.lower() not in stopwords.words('english') and len(noPunc) > 1:
            stripped_word = self._strip_word(word)
            if stripped_word not in custom_corpus.words('media') and len(stripped_word) > 1:
                new_title = ' '.join([new_title, stripped_word])
    return new_title[1:]
def corpus_reader(filepath):
    """
    Takes a filepath (including the filename), reformats the file if it is CSV,
    and loads it into a PlaintextCorpusReader.
    """
    print("TEST: corpus_reader call")
    csv_file = open(filepath, 'rb')  # use test_1.csv as test case
    csv_data = csv.reader(csv_file)
    global csv_read
    csv_read = open('uploads/tmp/read.tmp', 'w')
    for line in csv_data:
        line_to_write = re.sub('[\s\t]+', ' ', str(line))
        line_to_write = line_to_write.lstrip('[\'')
        line_to_write = line_to_write.rstrip('\']')
        csv_read.write(str(line_to_write) + "\n\n")
    root = 'uploads/'
    corpus = PlaintextCorpusReader(root, 'tmp/read.tmp')
    # response = corpus.paras()
    words = corpus.words()
    return words
import os
import nltk
import pickle
import zlib
import base64
from nltk.classify.naivebayes import NaiveBayesClassifier
from nltk.classify import PositiveNaiveBayesClassifier
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

corpusdir = "./text"
newcorpus = PlaintextCorpusReader(corpusdir, ".*")

labeled_names = (
    [(name, "comp") for name in newcorpus.words("comp.txt")]
    + [(name, "animal") for name in newcorpus.words("animal.txt")]
    + [(word, "ignore") for word in newcorpus.words("ignorethese.txt")]
)
features = [({n: n}, thing) for (n, thing) in labeled_names]
training = features[:]
testing = "What color is the mouse?".lower().split(" ")

classifier = NaiveBayesClassifier.train(training)

pickleclf = pickle.dumps(classifier)
compressed = base64.b64encode(zlib.compress(pickleclf, 9))
with open("PickledClassifier.txt", "wb") as outobj:
    outobj.write(compressed)

compScore = 0
animalScore = 0
for word in testing:
    if (
        word[len(word) - 1] == "."
        or word[len(word) - 1] == ","
        or word[len(word) - 1] == "?"
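# Not part of the original snippet: a minimal sketch showing how the compressed,
# base64-encoded classifier written to PickledClassifier.txt above could be restored.
# Only the file name comes from the snippet; the rest is an assumption.
import base64
import pickle
import zlib

with open("PickledClassifier.txt", "rb") as inobj:
    restored_clf = pickle.loads(zlib.decompress(base64.b64decode(inobj.read())))
# e.g. restored_clf.classify({"mouse": "mouse"}) labels a single-word feature dict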
    return file.read()

# Create the corpus folder
newCorpusDir = 'mycorpus/'
if not os.path.isdir(newCorpusDir):  # does the corpus folder already exist?
    os.mkdir(newCorpusDir)

# Read the files
# Plain text file
txt1 = getText('./Files/sample_feed.txt')
# PDF file
txt2 = pdf.getTextPDF('./Files/sample-pdf.pdf')
# DOCX file
txt3 = word.getTextWord('./Files/sample-one-line.docx')

# Write the files
files = [txt1, txt2, txt3]
for idx, f in enumerate(files):
    with open(newCorpusDir + str(idx) + '.txt', 'w') as fout:
        fout.write(f)

# Build the custom corpus
# Read every file in the folder and build a corpus from those files
newCorpus = PlaintextCorpusReader(newCorpusDir, '.*')

# Check that the custom corpus was built correctly
print(newCorpus.words())                        # array containing every word in the corpus
print(newCorpus.sents(newCorpus.fileids()[1]))  # print the array of all sentences in 1.txt
print(newCorpus.paras(newCorpus.fileids()[0]))  # print the array of all paragraphs in 0.txt
import nltk
from nltk.corpus import stopwords
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

stop_words = set(stopwords.words('english'))  # not interested in stop words
stop_words.update(['.', ',', "',", '"', "'", '?', '!', ':', ';',
                   '(', ')', '[', ']', '{', '}', '-'])  # ... or punctuation

corpusdir = 'lyrics/'  # Directory of corpus.
oLedZeppelinCorpus = PlaintextCorpusReader(corpusdir, '.*')

lNoStopWords = []
for sWord in oLedZeppelinCorpus.words():
    if sWord not in stop_words:
        lNoStopWords.append(sWord)

lNoStopWordsLength = len(lNoStopWords)
wordCounts = Counter(lNoStopWords)
wordCountsLower = Counter(i.lower() for i in lNoStopWords)

# top 25
lCountLabels, lCountValues = zip(*wordCountsLower.most_common(50)[0:25])
lCountIndexes = np.arange(len(lCountLabels))
iCountWidth = 1

barlist = plt.bar(lCountIndexes, lCountValues)
for i in range(0, len(barlist)):  # all bars to black
    barlist[i].set_color('black')
plt.xticks(lCountIndexes, lCountLabels)
focal_word = sys.argv[1]
senses = [sys.argv[2], sys.argv[3]]
# focal_word = "plant"
# senses = ["manufacturing", "life"]

corpus = PlaintextCorpusReader('outcorpus/', '.*')
collocations = [
    wsd.BigramLeft(senses, 0),
    wsd.BigramRight(senses, 1),
    wsd.BigramScope(senses, 2, [2, 10])
]

decision_list = wsd.DecisionList()
decision_list.load("senses_bootstrap_" + focal_word + ".csv")

i = 0
for infile in sorted(corpus.fileids()):
    print(i, "/", len(corpus.fileids()))
    i += 1
    words = corpus.words(infile)
    text = Text(words)
    c = nltk.ConcordanceIndex(text.tokens)
    offsets = c.offsets(focal_word)
    for offset in offsets:
        for collocation in collocations:
            tokens = collocation.get_collocation(text, offset)
            if tokens is None:
                continue
            sense = decision_list.get_sense(tokens, collocation.index)
            if sense is None:
                continue
            collocation.add_collocation(text, offset, sense)
            collocation.update_decision_list(decision_list)
            # decision_list.add_sense(sense, tokens, collocation.index, score)
            print(sense)
arxiu_diccionari = codecs.open("diccionari2-cat.txt", "r", encoding="utf-8")
for entrada in arxiu_diccionari:
    entrada = entrada.rstrip()
    camps = entrada.split(" ")
    forma = camps[0]
    lema = camps[1]
    etiqueta = camps[2]
    if forma in diccionari:
        diccionari[forma] = diccionari.get(forma, "") + " " + lema + " " + etiqueta
    else:
        diccionari[forma] = lema + " " + etiqueta

segmentador = nltk.data.load("catalan-mod.pickle")
tokenitzador = RegexpTokenizer('[ldsmLDSM]\'|\w+|[^\w\s]+')
corpus = PlaintextCorpusReader(".", 'noticia.txt',
                               word_tokenizer=tokenitzador,
                               sent_tokenizer=segmentador)

for forma in corpus.words():
    if forma in diccionari:
        info = diccionari[forma]
    elif forma.lower() in diccionari:
        info = diccionari[forma.lower()]
    else:
        info = "DESCONEGUDA"
    print(forma + " " + info)
import random
import os
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk import bigrams, trigrams
from nltk.tokenize import sent_tokenize, word_tokenize
from collections import Counter, defaultdict

# create a folder for your corpus
corpusdir = 'miscme/'
newcorpus = PlaintextCorpusReader(corpusdir, '.*')
# tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
# tokenizer.tokenize(newcorpus.strip())

words = newcorpus.words()
sents = newcorpus.sents()
words = [w.lower() for w in words]
sents = [[w.lower() for w in sent] for sent in sents]

trigram_counts = defaultdict(lambda: Counter())
for sentence in sents:
    for w1, w2, w3 in trigrams(sentence, pad_right=True, pad_left=True):
        trigram_counts[(w1, w2)][w3] += 1

trigram_probs = defaultdict(lambda: Counter())
for w1_w2 in trigram_counts:
    total_count = float(sum(trigram_counts[w1_w2].values()))
    trigram_probs[w1_w2] = Counter(
        {w3: c / total_count for w3, c in trigram_counts[w1_w2].items()})

for i in range(10):
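# Not part of the original snippet: a minimal sketch of what the truncated
# generation loop above might do with trigram_probs, assuming the (None, None)
# padding produced by trigrams(..., pad_left=True, pad_right=True).
import random

def generate_sentence(trigram_probs):
    w1, w2 = None, None
    out = []
    while True:
        candidates = trigram_probs.get((w1, w2))
        if not candidates:
            break
        w3 = random.choices(list(candidates.keys()),
                            weights=list(candidates.values()))[0]
        if w3 is None:  # right padding marks the end of a sentence
            break
        out.append(w3)
        w1, w2 = w2, w3
    return ' '.join(out)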
###############################################################################
# ATTENTION: if there are temporary files such as .DS_Store (Mac OS X) in the
# corpus directory, they must be removed first.

# Reading corpus
corpusdir = '/Users/marceloschiessl/RDF_text_project/corpus/WikiRisk/test/glossAnnotated/'  # Directory of corpus.
# corpusdir = '/Users/marceloschiessl/RDF_text_project/corpus/WikiRisk/test/test1/'  # Directory of corpus.
risco = PlaintextCorpusReader(corpusdir, '.*')
risco.fileids()

raw_text = risco.raw('gloss533.txt')
# print(raw_text[0:])

# Some statistics
print('Number of terms: ', len(risco.words()))
print('Number of unique terms: ', len(set(risco.words())))

fd = nltk.FreqDist(risco.words())
print(fd.freq('bem'))
print(fd['bem'])

# Presenting n-grams that contain the target term
target_word = 'bem como'
fd = nltk.FreqDist(ng for ng in nltk.ngrams(risco.words(), 6) if target_word in ng)
for hit in fd:
    print(' '.join(hit))

txt = nltk.Text(risco.words())
              encoding='utf-8') as f:
        for tweet in tweets:
            text = tweet.get("full_text")
            text = re.sub(r"http\S+", "", text)  # remove links from corpora
            f.write(text)

create_corpus(tweet_folder)

# Create NLTK corpus from txt files
corpus_folder = Path("./corpus/")
corpus = PlaintextCorpusReader('./corpus/', '.*')
print(corpus.words('anime.txt'))

# Task a)
def filter_corpus(corpus, file=None):
    '''
    Removes English stopwords
    :param tokens:
    :return:
    '''
    if file is not None:
        tokens = corpus.raw(file).split(' ')  # Using split to keep hashtags
    else:
        tokens = corpus.raw(corpus.fileids()).split(' ')
    filtered_words = []
def getText(textFileName):
    # Read the txt file
    file = open(textFileName)
    return file.read()

# Create the corpus folder
newCorpusDir = 'mycorpus/'
if not os.path.isdir(newCorpusDir):
    os.mkdir(newCorpusDir)

# Read the files
txt1 = getText('sample_feed.txt')
txt2 = pdf.getTextPDF('sample-pdf.pdf')
txt3 = word.getTextWord('sample-one-line.docx')

files = [txt1, txt2, txt3]
for idx, f in enumerate(files):
    with open(newCorpusDir + str(idx) + '.txt', 'w') as fout:
        fout.write(f)

newCorpus = PlaintextCorpusReader(newCorpusDir, '.*')
print(newCorpus.words()[:20])
print(newCorpus.sents(newCorpus.fileids()[1]))
print(newCorpus.paras(newCorpus.fileids()[0]))
print(newCorpus.fileids())
print(newCorpus.words(['1.txt', '2.txt']))

word = newCorpus.words(['0.txt'])
fDist = nltk.FreqDist(word)
print(fDist.most_common(10))
import nltk
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.tokenize import RegexpTokenizer

segmentador = nltk.data.load("catalan.pickle")
tokenitzador = RegexpTokenizer('[ldsmLDSM]\'|\w+|[^\w\s]+')
corpus = PlaintextCorpusReader(".", 'DOGC-2015-cat.txt',
                               word_tokenizer=tokenitzador,
                               sent_tokenizer=segmentador)

for paraula in corpus.words():
    print(paraula)

print("TOTAL PALABRAS:", len(corpus.words()))
import nltk
import os
from os import listdir
from os.path import isfile, join
from nltk.collocations import *
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

bigram_measures = nltk.collocations.BigramAssocMeasures()
trigram_measures = nltk.collocations.TrigramAssocMeasures()

# read in corpus, find all the 3-grams above the min frequency
print("Reading in corpus from", CORPUS_ROOT)
my_corpus = PlaintextCorpusReader(CORPUS_ROOT, CORPUS_EXTENSION)
print("Read in " + str(len(my_corpus.fileids())) + " files")
print("Finding 3-grams")
finder_3gram = TrigramCollocationFinder.from_words(my_corpus.words())
print("Filtering out 3-grams of frequency less than", MIN_FREQUENCY)
finder_3gram.apply_freq_filter(MIN_FREQUENCY)

# combine all the 3-grams meeting the PMI threshold
print("Looking for 3-grams with a PMI of at least", MIN_3GRAM_PMI)
filelist = [join(CORPUS_ROOT, f) for f in listdir(CORPUS_ROOT)
            if isfile(join(CORPUS_ROOT, f))]
gen = finder_3gram.above_score(trigram_measures.pmi, MIN_3GRAM_PMI)
processGrams(gen, filelist)

# now let's do the same for the 2-grams
# our previous step altered the corpus so let's read it in again
print("Reading in corpus from", CORPUS_ROOT)
my_corpus = PlaintextCorpusReader(CORPUS_ROOT, CORPUS_EXTENSION)
print("Finding 2-grams")
finder_2gram = BigramCollocationFinder.from_words(my_corpus.words())
import os
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk import word_tokenize
import re

corpusdir = 'python/'  # Directory of corpus.
newcorpus = PlaintextCorpusReader(corpusdir, '.*')
print(newcorpus.fileids()[0])
print(type(newcorpus))
# print(newcorpus.raw())
print(newcorpus.words(newcorpus.fileids()[0]))
print(len(newcorpus.words()))

tokens = word_tokenize(newcorpus.raw())
# type(tokens)
print(len(tokens))
print(tokens[:50])
# tokens[:10]
print(newcorpus.sents())
print()

# to remove comments
def removeComments(string):
    # remove all occurrences of stream comments (/* COMMENT */) from string
    string = re.sub(re.compile("/\*.*?\*/", re.DOTALL), "", string)
    # remove all occurrences of single-line comments (// COMMENT\n) from string
    string = re.sub(re.compile("//.*?\n"), "", string)
    return string

# removeComments expects a string, so pass the raw corpus text
print(removeComments(newcorpus.raw()))
class Misunderstood_artificial_poet:
    def __init__(self, master):
        '''
        Constructor. master is a string that names a directory in the same
        repository that contains all the work from inspiration
        '''
        self.master = 'masters/' + master
        self.reader = PlaintextCorpusReader(self.master, r'.*', encoding='utf-8')
        self.text = self.reader.words()

    def generate_model(self, word, num=50):
        '''
        Writes a text based on the most probable word to appear after each word.
        Prone to looping
        '''
        bigrams = nltk.bigrams(self.text)
        cfdist = nltk.ConditionalFreqDist(bigrams)
        print(cfdist[word].pformat())
        for i in range(num):
            print(word, end=' ')
            word = cfdist[word].max()

    def count_foot(self, word):
        '''
        Counts the number of feet in word (doesn't account for junctions for now)
        '''
        startsWithVowel = False
        if re.match('[aeiouyàéèùôûâêîïöüäëÿ]', word):
            startsWithVowel = True
        splitword = re.split(pattern='[aeiouyàéèùôûâêîïöüäëÿ]',
                             string=word, flags=re.IGNORECASE)
        cleansplit = [el for el in splitword if el != '']
        if startsWithVowel:
            cleansplit.append('vowel')
        return len(cleansplit)

    def check_rhyme(self, language, string, substring):
        '''
        Checks if the end of a string rhymes with a substring
        (could this be implemented via machine learning?)
        '''
        pattern = re.sub(
            '(.*?)([aeiouyàéèùôûâêîïöüäëÿ][zrtpmlkjhgfdsqwxcvbn]*?$)',
            r'\2', string)
        if language == 'french':
            pattern = re.sub('', '', pattern)
        elif language == 'english':
            pass

    def text_generator(self, word, num=10):
        '''
        Writes a text based on a random choice of words that appear in
        collocation in master's work
        '''
        verse = ""
        bigrams = nltk.bigrams(self.text)
        cfdist = nltk.ConditionalFreqDist(bigrams)
        for i in range(num):
            verse += word + ' '
            word_collocates = []
            for w in cfdist[word]:
                word_collocates.append(w)
            word = random.choice(word_collocates)
        return verse

    def rhyme_generator(self, inputWord, rhyme=None, foot=12):
        '''
        Writes a verse based on the previous word used
        '''
        verse = ""
        counted_foot = 0
        bigrams = nltk.bigrams(self.text)
        cfdist = nltk.ConditionalFreqDist(bigrams)
        continueWriting = True
        if rhyme is None:
            rhyme = random.choice([
                word for word in self.text
                if (len(word) > 3 and re.match('[a-zA-Z]', word))
            ])[-3:]
        while continueWriting:
            word_collocates = []
            for w in cfdist[inputWord]:
                word_collocates.append(w)
            if counted_foot < foot - 2:
                word = random.choice(word_collocates)
                verse += word + ' '
            else:
                rhyming_collocates = [
                    word for word in word_collocates
                    if (word.endswith(rhyme) and self.count_foot(word) == 2
                        and word != inputWord)
                ]
                if not rhyming_collocates:
                    rhyming_collocates = [
                        word for word in self.text
                        if (word.endswith(rhyme) and self.count_foot(word) == 2
                            and word != inputWord)
                    ]
                word = random.choice(rhyming_collocates)
                verse += word + ' '
                continueWriting = False
            counted_foot += self.count_foot(word)
        verse = re.sub(' $', '', verse)
        return verse

    def compose_standard_poem(self, length):
        '''
        Writes a poem with each verse starting with the most common words
        in master's work
        '''
        poem = ''
        all_word_dist = nltk.FreqDist(w.lower() for w in self.text)
        mostcommon = all_word_dist.most_common(length)
        for word in [
                x[0] for x in mostcommon
                if re.search('[a-zA-Z]', x[0]) is not None
        ]:
            verse = self.text_generator(word)
            poem += verse + '\n'
        return poem

    def compose_random_poem(self, length):
        '''
        Writes a poem with each verse starting with random words from
        master's work
        '''
        poem = ''
        for word in random.sample(
                [x for x in self.text if re.search('[a-zA-Z]', x) is not None],
                length):
            verse = self.text_generator(word)
            poem += verse + '\n'
        return poem
    def compose_prose_poem(self, length):
        '''
        Write a text that jumps to a new line after every n words,
        but is composed of one block only
        '''
        final_work = ""
        first_word = random.choice([
            w.lower() for w in self.text
            if re.search('[a-zA-Z]', w) is not None
        ])
        paragraph = self.text_generator(word=first_word, num=length)
        paragraphlist = paragraph.split(' ')
        for i in range(1, len(paragraphlist)):
            final_work += paragraphlist[i] + ' '
            if i % 10 == 0:
                final_work += '\n'
        return final_work

    def compose_rhyming_poem(self, length, foot):
        '''
        Write a poem that rhymes. For now we use a simple rhyming technique:
        AABBCCDD (it's boring but it's simple).
        Also the phonetics aren't fully implemented yet, so for now rhymes are
        based on the 3 last letters from the previous verse.
        EXTRA tricky for French with all the usual mute letters that we love so much
        '''
        final_work = []
        first_word = random.choice([
            w.lower() for w in self.text
            if re.search('[a-zA-Z]', w) is not None
        ])
        verse = self.rhyme_generator(inputWord=first_word, foot=12, rhyme=None)
        final_work.append(verse)
        first_word = verse.split(' ')[-1]
        previous_rhyme = verse[-3:]
        for i in range(2, length):
            if i % 2 != 0:
                verse = self.rhyme_generator(inputWord=first_word, rhyme=None)
                previous_rhyme = verse[-3:]
                final_work.append(verse)
            else:
                verse = self.rhyme_generator(inputWord=first_word,
                                             rhyme=previous_rhyme)
                final_work.append(verse)
        final_work = "\n".join(final_work)
        return final_work

    def find_title(self):
        '''
        Find the best title to capture the essence of his work,
        through a random search into words
        '''
        first_word = random.choice([
            w.lower() for w in self.text
            if re.search('[a-zA-Z]', w) is not None
        ])
        length = random.choice([1, 2, 3, 4, 5, 6, 7, 8, 9])
        title = self.text_generator(word=first_word, num=length)
        return title

    def draft_manuscript(self, title, func, **kwargs):
        '''
        Write a piece of text to a file, send it to everyone in town and
        wait for the letters of rejection
        '''
        masterpiece = func(**kwargs)
        with open('failed_attempts/' + title + '.txt', 'w',
                  encoding='utf-8') as manuscript:
            manuscript.write(masterpiece)
            signature = re.sub("(^[a-zA-Z])(/)([a-zA-Z])(/)(.)*?", r"\3",
                               self.master.capitalize())
            manuscript.write('\n\n\t\t\t\t' + signature)
print("Usage: %s <filelist> <wordlist>" % (sys.argv[0])) sys.exit(1) files = [] with open(sys.argv[1], 'r') as fileList: files.extend([x for x in fileList.readlines() if not x.startswith("#")]) ROOT = "/home/ngilbert/xspace/data/" docs = [] for f in files: f=f.strip() f=f.replace(ROOT, "") docs.append(f+"/raw.txt") corpus = PlaintextCorpusReader(ROOT, docs) #sys.stderr.write(str(corpus.fileids())+"\n") unigrams = [token.lower() for token in corpus.words()] unigram_fd = nltk.FreqDist(unigrams) bigrams = nltk.bigrams(list(map(string.lower, corpus.words()))) #sys.stderr.write(str(len(bigrams))) bigram_fd = nltk.FreqDist(bigrams) #print bigram_fd #print bigram_fd[("the", "emotional")] #print unigram_fd["the"] #print bigrams sys.stderr.write(">>>finished counting unigrams and bigrams\n") wordlist = [] with open(sys.argv[2], 'r') as wordList: wordlist.extend([x.strip() for x in wordList.readlines()]) #process the documents, find the documents where each of these words
import os
import word, pdf
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

def getText(txtFileName):
    file = open(txtFileName, 'r')
    return file.read()

newCorpusDir = 'mycorpus/'
if not os.path.isdir(newCorpusDir):
    os.mkdir(newCorpusDir)

txt1 = getText('sample_feed.txt')
txt2 = pdf.getTextPDF('sample-pdf.pdf')
txt3 = word.getTextWord('sample-one-line.docx')

files = [txt1, txt2, txt3]
for idx, f in enumerate(files):
    with open(newCorpusDir + str(idx) + '.txt', 'w') as fout:
        fout.write(f)

newCorpus = PlaintextCorpusReader(newCorpusDir, '.*')
print(newCorpus.words())
print(newCorpus.sents(newCorpus.fileids()[1]))
print(newCorpus.paras(newCorpus.fileids()[0]))
import os
import word, pdf
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

def getText(txtFileName):
    file = open(txtFileName, 'r')
    return file.read()

# Create a new corpus folder/directory
newCorpusDir = 'mycorpus/'
if not os.path.isdir(newCorpusDir):
    os.mkdir(newCorpusDir)

txt1 = getText('sample_feed.txt')
txt2 = pdf.getTextPDF('sample-pdf.pdf')
txt3 = word.getTextWord('sample-one-line.docx')

# Write the contents of the three string objects to files on disk (write mode)
files = [txt1, txt2, txt3]
for idx, f in enumerate(files):
    with open(newCorpusDir + str(idx) + '.txt', 'w') as fout:
        fout.write(f)

# Create a plaintext corpus object from the directory the files were saved to
newCorpus = PlaintextCorpusReader(newCorpusDir, '.*')
print(newCorpus.words())                        # all words in the corpus
print(newCorpus.sents(newCorpus.fileids()[1]))  # sentences of 1.txt
print(newCorpus.paras(newCorpus.fileids()[0]))  # paragraphs of 0.txt
class DumbClusterer():
    """A rather dumb clusterer."""
    def __init__(self, corpus_dir=None, mwes=[], setup_mwes=True, **kwargs):
        self.mwes = mwes
        if corpus_dir is not None:
            self.setup_corpus(corpus_dir, '.*')
            if setup_mwes:
                self.setup_mwes(**kwargs)

    def setup_corpus(self, corpus_dir, paths='.*'):
        """Setting up a corpus.

        Args:
            corpus_dir(str): Path to corpus directory.
        """
        self.corpus = PlaintextCorpusReader(corpus_dir, paths)
        return self.corpus

    def extract_expressions(self, document, features=None):
        """Returns expressions from given features and multi-word expressions.

        In addition to passing a document into this method, MWEs or Multi-Word
        Expressions can be given to treat some multi words as one expression.

        >>> from document import ArthurDocument
        >>> pdf_path = base_path + '/test/test.pdf'
        >>> with open(pdf_path, 'rb') as f:
        ...     document = ArthurDocument(f.read())
        >>> features = document.get_features()[730:816,:]
        >>> print(document.get_text(features)) # doctest:+ELLIPSIS
        VICTORIA'S CROWN JEWEL OF WATERFRONT ESTATES. Nestled on a quiet cove in the exclusive

        Multi-word expression should be detected:
        >>> clusterer = DumbClusterer(mwes=['crown jewel', 'waterfront estates'])
        >>> expressions = clusterer.extract_expressions(document, features)
        >>> print(expressions[2]['text'])
        CROWN JEWEL

        x position should equal x of "C" from "CROWN JEWEL":
        >>> expressions[2]['x'] == features[11, ArthurDocument.get_feature_id('x')]
        True

        and width should equal the width of "CROWN JEWEL":
        >>> expr_width = expressions[2]['x1']-expressions[2]['x']
        >>> ftr_width = features[21, ArthurDocument.get_feature_id('x1')] - features[11, ArthurDocument.get_feature_id('x')]
        >>> expr_width == ftr_width
        True

        Args:
            document(ArthurDocument): Document to extract data fields from.
            features(list): List of features containing data fields to extract.
                            If not given, use all document features.
            mwes(list): List of Multi-Word Expressions. Example value:
                        `['property type', 'single family']`. With that list, both
                        "property type" and "single family" will each be treated
                        as single expressions.

        Returns:
            np.array: An array of data_fields.
        """
        mwes = self.mwes
        if features is None:
            features = document.get_features()
        text = document.get_text(features)
        for idx, mwe in enumerate(mwes):
            if isinstance(mwe, str):
                mwes[idx] = word_tokenize(mwe.lower())
            elif hasattr(mwe, '__iter__'):
                mwes[idx] = [x.lower() for x in mwe]
        tokenizer = MWETokenizer(mwes, separator=' ')
        tokenized = tokenizer.tokenize(word_tokenize(text.lower()))

        expressions = []
        pos = 0
        for token in tokenized:
            # token could be "deez nutz" but text contains multiple spaces e.g. "deez  nutz",
            # so we need to split the token and find position of first and last characters.
            words = token.split()
            start_pos = text.lower().index(words[0], pos)
            for word in words:
                ipos = text.lower().index(word, pos)
                end_pos = ipos + len(word)
                pos = end_pos

            min_x = 0
            max_x = 0
            min_y = 0
            max_y = 0
            page = 0
            if len(features[start_pos:end_pos, :]) > 0:
                min_x = np.amin(features[start_pos:end_pos, :], axis=0)[ArthurDocument.get_feature_id('x')]
                max_x = np.amax(features[start_pos:end_pos, :], axis=0)[ArthurDocument.get_feature_id('x1')]
                min_y = np.amin(features[start_pos:end_pos, :], axis=0)[ArthurDocument.get_feature_id('y')]
                max_y = np.amax(features[start_pos:end_pos, :], axis=0)[ArthurDocument.get_feature_id('y1')]
                page = features[start_pos, ArthurDocument.get_feature_id('page')]

            expressions.append({
                'text': text[start_pos:end_pos],
                'x': min_x,
                'x1': max_x,
                'y': min_y,
                'y1': max_y,
                'page': page
            })
        return expressions

    def setup_mwes(self, trigram_nbest=100, bigram_nbest=2000):
        """Create multi-word expressions by learning a corpus located in a corpus directory.

        Testing setting up mwes with custom path and setting it up twice (correct when no exception):
        >>> corpus_dir = os.path.join(base_path, 'test', 'corpus')
        >>> clusterer = DumbClusterer(corpus_dir=corpus_dir, mwes=['custom mwe'])
        >>> mwes = clusterer.setup_mwes(trigram_nbest=1000, bigram_nbest=15000)
        >>> 'custom mwe' not in mwes
        True
        >>> 'custom mwe' in clusterer.mwes
        True

        Args:
            trigram_nbest(int): Number of highest ranked trigrams to acquire.
            bigram_nbest(int): Number of highest ranked bigrams to acquire.

        Returns:
            list: List of multi-word expressions.
        """
        if self.corpus is None:
            raise Exception("Corpus not found. Run method `setup_corpus` with given corpus directory first.")

        bigram_measures = BigramAssocMeasures()
        trigram_measures = TrigramAssocMeasures()

        # The following is not used since ne_chunk takes too much time.
        # Text processing before bigrams and trigrams are calculated
        # words = []
        # for sent in self.corpus.sents():
        #     for chunk in nltk.ne_chunk(nltk.pos_tag(sent)):
        #         if not isinstance(chunk, nltk.Tree):
        #             w = chunk[0]
        #             # - Removal of words containing numbers or punctuation
        #             if not any((ch.isdigit() or ch in string.punctuation) for ch in w):
        #                 # - Lowercasing all words
        #                 words.append(w.lower())
        #                 print(w.lower().encode("utf-8")),

        # Text processing before bigrams and trigrams are calculated
        words = []
        for w in self.corpus.words():
            # - Removal of words containing numbers or punctuation
            if not any((ch.isdigit() or ch in string.punctuation) for ch in w):
                # - Lowercasing all words
                words.append(w.lower())

        bigram_finder = BigramCollocationFinder.from_words(words)
        trigram_finder = TrigramCollocationFinder.from_words(words)
        mwes = (trigram_finder.nbest(trigram_measures.pmi, trigram_nbest) +
                bigram_finder.nbest(bigram_measures.pmi, bigram_nbest))

        # Basically combining two lists by turning them into sets to make sure a union is
        # returned, i.e. `set1 | set2` where set1 could be a list of strings or lists, and
        # if the latter, they need to be converted into sets.
        set1 = {(tuple(mwe) if isinstance(mwe, list) else mwe) for mwe in self.mwes}
        set2 = set(mwes)
        self.mwes = list(set1 | set2)
        return mwes
class Contract_Reader():
    def __init__(self, config):
        print('Filepath for texts = ', config.textpath)
        self.corpus = PCR(config.textpath, '.*\.txt',
                          encoding='utf-16',
                          para_block_reader=read_line_block)
        if config.clean_paragraphs == 'yes':
            self.clean(config, mode='para')
        if config.clean_sentences == 'yes':
            self.clean(config, mode='sent')
        # Corpus summaries
        self.corpus_info()
        self.LDA(config.num_topics, config.num_words)
        self.plot(config.num_words)

    def clean(self, config, mode='sent'):
        stop = set(stopwords.words('english'))
        exclude = set(string.punctuation)
        lemma = WNL()
        if mode == 'para':
            # Paragraphs are lists of sentences, each of which is a list of tokens.
            # Reducing to a list of strings.
            self.para_list = [
                list(itertools.chain.from_iterable(para))
                for para in self.corpus.paras()
            ]
            for index, paragraph in enumerate(self.para_list):
                paragraph = " ".join(paragraph)
                stop_free = " ".join(
                    [i for i in paragraph.lower().split() if i not in stop])
                punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
                normalized = " ".join(
                    lemma.lemmatize(word) for word in punc_free.split())
                self.para_list[index] = normalized
            print(self.para_list[0])
            self.para_list = [para.split() for para in self.para_list]
            print(self.para_list[0])
        if mode == 'sent':
            # Obtain a list of strings, each one a sentence, rather than a list of lists.
            self.sents_list = [" ".join(sent) for sent in self.corpus.sents()]
            for index, sentence in enumerate(self.sents_list):
                stop_free = " ".join(
                    [i for i in sentence.lower().split() if i not in stop])
                punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
                normalized = " ".join(
                    lemma.lemmatize(word) for word in punc_free.split())
                self.sents_list[index] = normalized
            print(self.sents_list[0])
            self.sents_list = [
                sentence.split() for sentence in self.sents_list
            ]
            print(self.sents_list[0])

    def LDA(self, num_topics, num_words):
        dictionary = corpora.Dictionary(self.para_list)
        doc_term_matrix = [dictionary.doc2bow(para) for para in self.para_list]
        path = '/mnt/APPDATA/Project_Mafia/omkhalil/vowpal_binaries/vw-7.20150623'
        self.ldamodel = LdaVowpalWabbit(path, doc_term_matrix,
                                        num_topics=num_topics,
                                        id2word=dictionary)
        self.ldamodel.save('model/lda_model')
        print(self.ldamodel.print_topics(num_topics=10, num_words=num_words))

    def plot(self, num_words):
        for t in range(self.ldamodel.num_topics):
            plt.figure()
            tuples = [
                reversed(x) for x in self.ldamodel.show_topic(t, num_words)
            ]
            plt.imshow(WordCloud().fit_words(dict(tuples)))
            plt.axis("off")
            plt.title("Topic #" + str(t))
            plt.savefig('plots/topic' + str(t))

    def corpus_info(self):
        """
        Summary information about the status of a corpus.
        """
        fids = len(self.corpus.fileids())
        paras = len(self.corpus.paras())
        sents = len(self.corpus.sents())
        sperp = sum(len(para) for para in self.corpus.paras()) / float(paras)
        tokens = FreqDist(self.corpus.words())
        count = sum(tokens.values())
        vocab = len(tokens)
        lexdiv = float(count) / float(vocab)
        print((
            "Text corpus contains {} files\n"
            "Composed of {} paragraphs and {} sentences.\n"
            "{:0.3f} sentences per paragraph\n"
            "Word count of {} with a vocabulary of {}\n"
            "lexical diversity is {:0.3f}").format(fids, paras, sents, sperp,
                                                   count, vocab, lexdiv))
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

filecontent1 = "This is a cow"
filecontent2 = "This is a Dog"

corpusdir = 'nltk_data/'
with open(corpusdir + 'content1.txt', 'w') as text_file:
    text_file.write(filecontent1)
with open(corpusdir + 'content2.txt', 'w') as text_file:
    text_file.write(filecontent2)

text_corpus = PlaintextCorpusReader(corpusdir, ["content1.txt", "content2.txt"])

no_of_words_corpus1 = len(text_corpus.words("content1.txt"))
print(no_of_words_corpus1)
no_of_unique_words_corpus1 = len(set(text_corpus.words("content1.txt")))

no_of_words_corpus2 = len(text_corpus.words("content2.txt"))
no_of_unique_words_corpus2 = len(set(text_corpus.words("content2.txt")))
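# Not part of the original snippet: a quick sanity check of the counts computed above.
# With the default word tokenizer, "This is a cow" yields 4 tokens, all unique,
# and likewise for "This is a Dog".
assert no_of_words_corpus1 == 4 and no_of_unique_words_corpus1 == 4
assert no_of_words_corpus2 == 4 and no_of_unique_words_corpus2 == 4
print(no_of_words_corpus2, no_of_unique_words_corpus2)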
class Corpus(object):
    def __init__(self, data_root):
        self.data_root = data_root
        self.data = PlaintextCorpusReader(data_root, '.*')
        self.words = [i for i in self.data.words() if i.isalpha()]
        self.text = Text(self.words)
        self.stop = set(stopwords.words('english')).union({
            'cid', 'et', 'al', 'also', 'and', 'editingboston', 'arxiv',
            'pages', 'trackboston', 'preprint', 'page', 'vol', 'volume',
            'march', 'boston', 'table'
        })
        with open('bib.json') as fi:
            self.bib = json.load(fi)

    def documents(self):
        """Return a list of all documents in the corpus"""
        return sorted([i for i in os.listdir(self.data_root)])

    def words_in_file(self, filename):
        """Given a file, return a list of tokenized words"""
        try:
            text = self.data.open(filename).read()
        except FileNotFoundError:
            print("The file does not exist.")
        return word_tokenize(text)

    def sentences_in_file(self, filename):
        """Given a file, return a list of sentences"""
        try:
            text = self.data.open(filename).read()
        except FileNotFoundError:
            print("The file does not exist.")
        return sent_tokenize(text)

    def tokenized_sentences_in_file(self, filename):
        """Given a file name, return a list of word-tokenized sentences"""
        try:
            text = self.data.open(filename).read()
            sent = [word_tokenize(s) for s in sent_tokenize(text)]
        except FileNotFoundError:
            print("The file does not exist.")
        return sent

    def most_frequent_content_words(self, n_words):
        """Return a list with the most frequent content words and their
        frequencies in (word, frequency) pairs ordered by frequency"""
        content_words = [
            w for w in self.words
            if w.lower() not in self.stop and w.isalpha() and len(w) > 1
        ]
        content_words_dict = FreqDist(content_words)
        return content_words_dict.most_common(n_words)

    def most_frequent_bigrams(self, n_bigrams):
        """Return a list with the most frequent bigrams of content words in the
        form of pairs where the first element is the bigram and the second is
        its frequency"""
        bigram_dict = FreqDist([
            k for k in bigrams(self.words)
            if k[0].isalpha() and k[1].isalpha()
            and len(k[0]) > 1 and len(k[1]) > 1
            and k[0].lower() not in self.stop and k[1].lower() not in self.stop
        ])
        return bigram_dict.most_common(n_bigrams)

    def most_frequent_trigrams(self, n_trigrams):
        trigram_dict = FreqDist([
            k for k in trigrams(self.words)
            if k[0].isalpha() and k[1].isalpha()
            and len(k[0]) > 1 and len(k[1]) > 1
            and k[0].lower() not in self.stop and k[1].lower() not in self.stop
            and k[2].lower() not in self.stop
        ])
        return trigram_dict.most_common(n_trigrams)

    def get_info(self, fileID):
        """Return metadata associated with a file, indexed by the following fields:
        author, title, booktitle, year, publisher, pages, location, doi, url"""
        return self.bib[fileID]

    def print_reference(self, fileID):
        """Print metadata (author, title of paper, title of book, publishing year)
        associated with each file as a reference"""
        d = self.bib[fileID]
        print("%s. %s. %s, %s" % (' '.join(d['author'].split('\n')),
                                  d['title'], d['booktitle'], d['year']))

    def concordance(self, word):
        self.text.concordance(word)
# nltk.download()
# nltk.download('gutenberg')
# text1.concordance("water")
# print(FreqDist(text1).most_common(50))
# FreqDist(text1).plot(50, cumulative=True)
# print(set(text1))

corpus_root = '/Users/devindyson/Desktop/nltk/corpora'
corpora = PlaintextCorpusReader(corpus_root, '.*')
# print(corpora.raw('meditations.txt'))
# print(SentimentIntensityAnalyzer().polarity_scores("NLTK is pretty dope."))

print(sorted(corpora.fileids()))
print(len(corpora.words('meditations.txt')))
print(len(corpora.words('benjamin.txt')))

meditations = Text(corpora.words('meditations.txt'))
benjamin = Text(corpora.words('benjamin.txt'))

def lexical_diversity(text_data):
    word_count = len(text_data)
    vocab_size = len(set(text_data))
    diversity_score = vocab_size / word_count
    return diversity_score

print(lexical_diversity(meditations))
print(lexical_diversity(benjamin))
corpusdir = 'newcorpus2/'
if not os.path.isdir(corpusdir):
    os.mkdir(corpusdir)

# Reading the content of the file placed inside the directory newcorpus
newcorpus = PlaintextCorpusReader('newcorpus/', '.*')
print("This is the text file inside newcorpus directory:")
print(newcorpus.raw())

# Reading the content of the file placed inside the directory newcorpus2
newcorpus2 = PlaintextCorpusReader('newcorpus2/', '.*')
print("This is the text file inside newcorpus2 directory:")
print(newcorpus2.raw())

file_1_count = newcorpus.words()
print()
print("Display of each word of the file inside the directory newcorpus:")
print(file_1_count)

# Count the frequency distribution of each word in the text file
fre_count_file_1 = nltk.FreqDist(file_1_count)
print()
print("Please see the frequency distribution of each word:")
print(fre_count_file_1)

most_common_word = fre_count_file_1.most_common(2)
print()
print("See the two most commonly used words in the file:")
print(most_common_word)
import nltk
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.tokenize import RegexpTokenizer

segmentador = nltk.data.load("catalan.pickle")
tokenitzador = RegexpTokenizer('[ldsmLDSM]\'|\w+|[^\w\s]+')
corpus = PlaintextCorpusReader(".", 'DOGC-2015-cat.txt',
                               word_tokenizer=tokenitzador,
                               sent_tokenizer=segmentador)

ocurrencies = corpus.words()
tipus = set(ocurrencies)
print("OCURRENCIES:", len(ocurrencies))
print("TIPUS:", len(tipus))
import nltk
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.tokenize import RegexpTokenizer

segmentador = nltk.data.load("catalan.pickle")
tokenitzador = RegexpTokenizer('[ldsmLDSM]\'|\w+|[^\w\s]+')
corpus = PlaintextCorpusReader(".", 'DOGC-2015-cat.txt',
                               word_tokenizer=tokenitzador,
                               sent_tokenizer=segmentador)

frequencia = {}
for paraula in corpus.words():
    frequencia[paraula] = frequencia.get(paraula, 0) + 1

for clau in frequencia.keys():
    print(frequencia[clau], clau)
def detect(request):
    # Data input
    if request.method == 'POST':
        identificacion = request.POST.get('dni')
        a = request.FILES['document']
        documento = str(a)
        datos_doc = documento.split('.')
        nombre_doc = datos_doc[0]
        tipo_doc = datos_doc[1]
        if tipo_doc == 'txt':
            name = request.FILES['document'].read().lower()
            print(datos_doc)
            # mul = set(stopwords.words("spanish"))
            mul = codecs.open('mul.txt', "r", encoding='UTF-8').read()
            remove('muletillas.txt')
            discurso = (name.decode('UTF-8'))

            # Separate filler words from common words
            text_completo = wordpunct_tokenize(discurso)
            m = []
            m = [w for w in text_completo if w in mul]
            muletillas = codecs.open('muletillas.txt', "a")
            for i in m:
                muletillas.write(i)
                muletillas.write(" ")
            muletillas.close()

            # Count the filler words
            tokenizador = RegexpTokenizer('\w+|[^\w\s]+')
            corpus = PlaintextCorpusReader(".", 'muletillas.txt',
                                           word_tokenizer=tokenizador,
                                           encoding='Latin-1')
            frecuencia = FreqDist(corpus.words())
            salida = codecs.open("muletillasR.txt", "w", encoding="utf-8")
            palabras = []
            repeticiones = []

            # Write the extracted data to a txt file for later presentation
            for mc in frecuencia.most_common():
                palabra = mc[0]
                frecuencia_absoluta = mc[1]
                frecuencia_relativa = frecuencia.freq(palabra)
                cadena = str(frecuencia_absoluta) + "\t" + str(frecuencia_relativa) + "\t" + palabra
                palabras.append(palabra.upper())
                repeticiones.append(frecuencia_absoluta)
                salida.write(cadena + "\n")

            try:
                collection.insert_one({
                    'identificacion': identificacion,
                    'documento': documento,
                    'discurso': discurso,
                    'muletillas': palabras
                })
            except Exception as e:
                print("Error : ", type(e), e)

            # Send the data to the front end
            context = {
                'documento': nombre_doc,
                'muletillas': palabras[0:10],
                'repeticiones': repeticiones[0:10]
            }
            return render(request, 'responde.html', context)
        else:
            messages.warning(request, "Verifique el tipo de archivo", extra_tags='file')
            return render(request, 'home.html')
    return render(request, 'home.html')

# class LineChartJSONView(BaseLineChartView):
#     def get_labels():
#         """Return 7 labels for the x-axis."""
#         return ["January", "February", "March", "April", "May", "June",
#                 "July", "August", "September", "October"]
#
#     def get_providers(self):
#         """Return names of datasets."""
#         return ["Repeticiones"]
#
#     def get_data(self):
#         """Return 3 datasets to plot."""
#         return [[75, 44, 92, 11, 44, 95, 35, 11, 44, 95, 35]]
#
# line_chart = TemplateView.as_view(template_name='responde.html')
# line_chart_json = LineChartJSONView.as_view()
corpusdir = 'files/'  # Directory of corpus.
root = os.getcwd()
newcorpus = PlaintextCorpusReader(corpusdir, '.*', encoding="latin-1")
print(len(onlyfiles))

fhand = open('stopWords.txt', 'r')
stopWords = fhand.read()
stopWords = stopWords.split('\n')

is_noun = lambda pos: pos[:2] == 'NN'
is_adject = lambda pos: pos[:2] == 'JJ'

for file in onlyfiles:
    print(file)
    text = newcorpus.words(file)
    print(nltk.pos_tag(text))
    print(len(text))
    filename = root + "/coupusFiles/" + file
    print(filename)
    f = open(filename, 'w')
    for words in text:
        print(is_noun(words))
        # if is_noun(words):
        #     if words.lower() not in stopWords:
        #         f.write(words)
        #         f.write("\n")
        # if is_adject(words):
        #     if words.lower() not in stopWords:
        #         f.write(words)
        #         f.write("\n")
import nltk
import os
import glob
from os.path import join
from nltk.collocations import *
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

bigram_measures = nltk.collocations.BigramAssocMeasures()
trigram_measures = nltk.collocations.TrigramAssocMeasures()

# read in corpus, find all the 3-grams above the min frequency
print("Reading in corpus from", CORPUS_ROOT)
my_corpus = PlaintextCorpusReader(CORPUS_ROOT, CORPUS_EXTENSION)
print("Read in " + str(len(my_corpus.fileids())) + " files")
print("Finding 3-grams")
finder_3gram = TrigramCollocationFinder.from_words(my_corpus.words())
print("Filtering out 3-grams of frequency less than", MIN_FREQUENCY)
finder_3gram.apply_freq_filter(MIN_FREQUENCY)

# combine all the 3-grams meeting the PMI threshold
print("Looking for 3-grams with a PMI of at least", MIN_3GRAM_PMI)
filelist = [f for f in glob.glob(CORPUS_ROOT + CORPUS_OUTPUT_EXTENSION)]
gen = finder_3gram.above_score(trigram_measures.pmi, MIN_3GRAM_PMI)
processGrams(gen, filelist)

# now let's do the same for the 2-grams
# our previous step altered the corpus so let's read it in again
print("Reading in corpus from", CORPUS_ROOT)
my_corpus = PlaintextCorpusReader(CORPUS_ROOT, CORPUS_EXTENSION)
print("Finding 2-grams")
# STEP 1
# This is the variable name for the target file to read. Note it is useful to
# copy and paste everything from the .PDF into a .TXT file to read.
File_to_Read = 'Sample_from_PDF.txt'

from nltk.corpus.reader.plaintext import PlaintextCorpusReader

# Read file
corpus = PlaintextCorpusReader(os.getcwd(), File_to_Read)
# print(corpus.raw())

# Count total sentences in the document and create a list of words in the document
sentences = corpus.sents()
print("\n Total sentences in this corpus : ", len(sentences))
print("\n Words in this corpus : ", corpus.words())

# Find the frequency distribution of words in the document
course_freq_dist = nltk.FreqDist(corpus.words())
print("\n Top 30 words in the corpus : ", course_freq_dist.most_common(30))

# Calculate the distribution for a specific word
print("\n Distribution for \"hydrogen\" : ", course_freq_dist.get('hydrogen'))

# Tokenization
# Read the base file into a raw text variable
base_file = open(os.getcwd() + "/" + File_to_Read, mode='rt', encoding='utf-8')
raw_text = base_file.read()
base_file.close()
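# Not part of the original snippet: a minimal sketch of the tokenization step the
# "Tokenization" comment above leads into, assuming NLTK's punkt data is installed.
from nltk.tokenize import sent_tokenize, word_tokenize

token_sentences = sent_tokenize(raw_text)  # raw_text is read in the snippet above
token_words = word_tokenize(raw_text)
print("\n Sentences after tokenization : ", len(token_sentences))
print("\n First 20 word tokens : ", token_words[:20])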
class TextAnalizer:
    def __init__(self, my_input_file):
        self.config = configparser.ConfigParser()
        self.config.read("text_analysis.cfg")
        self.input_file = my_input_file
        self.nlp_model = self.config["DEFAULT"]["nlp_model"]
        # The output file name
        self.output_file = self.config["DEFAULT"]["output_file"]
        self.nlp = load_nlp(self.nlp_model)
        self.corpus = CorpusReader(".", self.input_file)
        self.raw_text = self.corpus.raw()
        self.nlp_text = self.nlp(self.raw_text)
        # Here, let's put together the info for text analysis with spaCy.
        self.analysis_dictionary = Counter()
        self.word_count = 0
        self.get_word_count_nltk()

    def get_paragraph(self):
        return self.corpus.paras()

    def get_sentence(self):
        return self.corpus.sents()

    def get_word(self):
        return self.corpus.words()

    def get_word_count_nltk(self):
        tokenizer = Tokenizer(r'\w+')
        counts = Counter()
        sentences = self.get_sentence()
        for sentence in sentences:
            tokens = tokenizer.tokenize(" ".join(sentence))
            self.word_count = self.word_count + len(tokens)
            filtered = [w for w in sentence if w.isalnum()]
            counts = counts + Counter(filtered)
        return counts, self.word_count

    def analize_nlp(self):
        analized_data_str = (self.config["ANALIZED"]["POS"])
        analized_data = (analized_data_str.split(","))
        result_dict = {}
        diff_str, tot_str = (
            self.config["DEFAULT"]["diff_tot_string"]).split(",")
        lemma_counter = Counter()
        pos_counter = Counter()
        tag_counter = Counter()
        for token in self.nlp_text:
            lemma_counter = lemma_counter + Counter([token.lemma_])
            pos_counter = pos_counter + Counter([token.pos_])
            tag_counter = tag_counter + Counter([token.tag_])
            my_key = token.lemma_ + "_" + token.tag_ + "_" + token.pos_
            self.analysis_dictionary[my_key] += 1
        for pos in analized_data:
            instance_counter = 0
            total_counter = 0
            for key in self.analysis_dictionary.keys():
                try:
                    my_lemma, my_tag, my_pos = key.split("_")
                except ValueError:
                    print("Warning: Array has an empty line")  # add logging
                if pos == my_pos:
                    instance_counter += 1
                    total_counter = total_counter + self.analysis_dictionary.get(key)
            result_dict[pos + diff_str] = instance_counter
            result_dict[pos + tot_str] = total_counter
        # Add the figures from NLTK
        diff_word, word_count = self.get_word_count_nltk()
        result_dict["WORDS" + tot_str] = word_count
        result_dict["WORDS" + diff_str] = len(diff_word)
        result_dict["PARAGRAPHS"] = len(self.get_paragraph())
        result_dict["SENTENCES"] = len(self.get_sentence())
        return result_dict

    def write_output(self):
        with open(self.output_file, "w+") as f:
            f.write("Number of paragraphs: " +
                    str(len(self.get_paragraph())) + "\n")
            f.write("Number of sentences: " +
                    str(len(self.get_sentence())) + "\n")
            f.write("Number of words: " + str(self.word_count) + "\n")
            f.write("Average words per sentence: " +
                    str(round(self.word_count / len(self.get_sentence()), 2)) + "\n")
            f.write("Number of different words: " +
                    str(len(self.get_word_count_nltk())) + "\n")
            f.write("Text variety (different words/total words): " +
                    str(round(len(self.get_word_count_nltk()) / self.word_count, 2)) + "\n")
            f.close()
import nltk
import numpy as np
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

# Get raw text as string.
corpusdir = 'lyrics/'  # Directory of corpus.
oLedZeppelinCorpus = PlaintextCorpusReader(corpusdir, '.*')
lCorpus = oLedZeppelinCorpus.words()  # every word in the corpus

def make_pairs(lCorpus):
    for i in range(len(lCorpus) - 1):
        yield (lCorpus[i], lCorpus[i + 1])

pairs = make_pairs(lCorpus)

word_dict = {}
for word_1, word_2 in pairs:
    if word_1 in word_dict.keys():
        word_dict[word_1].append(word_2)
    else:
        word_dict[word_1] = [word_2]

first_word = np.random.choice(lCorpus)
while first_word.islower():
    first_word = np.random.choice(lCorpus)
chain = [first_word]
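# Not part of the original snippet: a minimal sketch of how the chain started
# above could be extended word by word; the 30-word length is an arbitrary assumption.
for _ in range(30):
    next_words = word_dict.get(chain[-1])
    if not next_words:  # the last word was never seen with a successor
        break
    chain.append(np.random.choice(next_words))
print(' '.join(chain))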
from nltk.tokenize import word_tokenize, sent_tokenize
import nltk
import os
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.probability import FreqDist

# Create a corpus
corpusdir = "/home/erdinc/nltk/cs290f_proj/tos/"
newcorpus = PlaintextCorpusReader(corpusdir, '.*')
corpusWords = nltk.Text(newcorpus.words())
posTags = nltk.pos_tag(corpusWords)

# Total number of words in corpus
def getTotalNumberOfWords(words):
    return len(words)

# Number of unique words in corpus
def getNumberOfUniqueWords(words):
    return len(set(words))

# Most frequently used 25 words
def getMostFreqWords(words):
    fdist = FreqDist(words)
    vocab = fdist.keys()
    return vocab[:25]

# Name list
def getNameList(tags):
    nameList = []
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
import nltk

# Might need the line below once
# nltk.download('punkt')

corpusDir = 'own_corpus/'
newCorpus = PlaintextCorpusReader(corpusDir, '.*\.txt')

for file in sorted(newCorpus.fileids()):
    words = newCorpus.words(file)
    text = nltk.Text(words)
    print(text)
import nltk
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.tokenize import RegexpTokenizer
from nltk import FreqDist

segmentador = nltk.data.load("catalan.pickle")
tokenitzador = RegexpTokenizer('[ldsmLDSM]\'|\w+|[^\w\s]+')
corpus = PlaintextCorpusReader(".", 'DOGC-2015-cat.txt',
                               word_tokenizer=tokenitzador,
                               sent_tokenizer=segmentador)

frequencia = FreqDist(corpus.words())
for mc in frequencia.most_common():
    print(mc)
# Access the plaintext; outputs a pure string/basestring.
print(newcorpus.raw().strip())
print()

# Access paragraphs in the corpus. (list of list of list of strings)
# NOTE: NLTK automatically calls nltk.tokenize.sent_tokenize and
#       nltk.tokenize.word_tokenize.
#
# Each element in the outermost list is a paragraph, and
# each paragraph contains sentence(s), and
# each sentence contains token(s).
print(newcorpus.paras())
print()

# To access paragraphs of a specific fileid.
print(newcorpus.paras(newcorpus.fileids()[0]))

# Access sentences in the corpus. (list of list of strings)
# NOTE: The texts are flattened into sentences that contain tokens.
print(newcorpus.sents())
print()

# To access sentences of a specific fileid.
print(newcorpus.sents(newcorpus.fileids()[0]))

# Access just the tokens/words in the corpus. (list of strings)
print(newcorpus.words())

# To access tokens of a specific fileid.
print(newcorpus.words(newcorpus.fileids()[0]))