# Count the runs of whitespace in `text` (the result is not stored).
# NOTE(review): `re`, `text`, `remover_acentos` and `obo` are not defined in
# this snippet; they must already be in scope when it runs.
len(re.findall(r'\s+', text))

# Chapter I: read the raw chapter text, then strip its accents.
# NOTE(review): no explicit encoding is passed, so the platform default is
# used -- confirm the encoding of the source file.
with open("bases/Iracema-jose-de-alencar-Cap1.txt") as c1:
    Cap1 = c1.read()
Cap1 = remover_acentos(Cap1)

# Chapter I with every whitespace character removed.
Cap1SE = re.sub(r'\s', '', Cap1)

# N-grams (training?): trigrams over the whitespace-separated words.
allMyWords = Cap1.split()
Cap1_nGram = obo.getNGrams(allMyWords, 3)

# Show the trigrams. Reuses the list built above instead of computing it a
# second time (assumes obo.getNGrams is deterministic -- TODO confirm).
print(Cap1_nGram)

# Using Markov
import random


class MarkovChain:
    """Map each key to the list of values that were learned for it."""

    def __init__(self):
        # key -> list of values, in the order they were learned
        # (duplicates are kept).
        self.memory = {}

    def _learn_key(self, key, value):
        """Append ``value`` to the list stored under ``key``.

        The list is created on first use of ``key``.
        """
        self.memory.setdefault(key, []).append(value)
# Sliding-window n-gram idiom: every slice of length `n` taken from `sent`.
# NOTE(review): `sent` and `n` are not defined in this snippet and the result
# is discarded; they must already be in scope when this runs.
[sent[i:i+n] for i in range(len(sent)-n+1)]

##########
# N-Gram #
##########
# http://www.ling.helsinki.fi/kit/2008s/clt231/nltk-0.9.5/doc/en/book.html#n_gram_tagger_index_term
# http://tetration.xyz/Ngram-Tutorial/
# https://programminghistorian.org/lessons/keywords-in-context-using-n-grams
import obo

wordstring = 'it was the best of times it was the worst of times '
wordstring += 'it was the age of wisdom it was the age of foolishness'
allMyWords = wordstring.split()
print(obo.getNGrams(allMyWords, 3))

# Details: wordfreq[i] is the number of times allMyWords[i] occurs in the
# whole word list, so the two lists stay index-aligned.
wordfreq = [allMyWords.count(w) for w in allMyWords]

print("-> DETALHES <-")
print("String\n" + wordstring +"\n")
print("List\n" + str(allMyWords) + "\n")
# Print the counts themselves; the word list was printed here by mistake.
print("Frequencies\n" + str(wordfreq) + "\n")
# zip() is lazy on Python 3, so materialise it to show the actual pairs
# instead of "<zip object at 0x...>".
print("Pairs\n" + str(list(zip(allMyWords, wordfreq))))

#############################
# Split and join characters #
#############################
# html-to-kwic.py
# Fetch a web page, build n-grams over its words, and write the
# keyword-in-context (KWIC) lines for one target word into an HTML page.
import obo

# create dictionary of n-grams
n = 7
url = 'http://www.oldbaileyonline.org/browse.jsp?id=t17800628-33&div=t17800628-33'

text = obo.webPageToText(url)

# Surround the word list with n//2 '#' placeholders on each side
# (presumably so words near either end can still be the centre of an
# n-gram -- confirm against obo.nGramsToKWICDict).
fullwordlist = ['#'] * (n // 2)
fullwordlist += obo.stripNonAlphaNum(text)
fullwordlist += ['#'] * (n // 2)

ngrams = obo.getNGrams(fullwordlist, n)
worddict = obo.nGramsToKWICDict(ngrams)

# output KWIC and wrap with html
target = 'black'
# dict.has_key() was removed in Python 3; use the `in` operator instead.
if target in worddict:
    # One pretty-printed KWIC line per n-gram, each followed by a line break.
    outstr = '<pre>' + ''.join(
        obo.prettyPrintKWIC(k) + '<br />' for k in worddict[target]
    )
else:
    outstr = '<pre>' + 'Keyword not found in source'
outstr += '</pre>'

obo.wrapStringInHTMLMac('html-to-kwic', url, outstr)
# get-keywords.py
# Build 5-grams from a sample sentence and print the KWIC dictionary that
# obo derives from them.
import obo

test = 'this test sentence has eight words in it'

# Split on whitespace first, then window the words into 5-grams.
words = test.split()
ngrams = obo.getNGrams(words, 5)

kwic = obo.nGramsToKWICDict(ngrams)
print(kwic)