from nltk.text import ConcordanceIndex


def concordance(text, word):
    cindex = ConcordanceIndex(text, key=lambda x: x.lower())
    offsetList = cindex.offsets(word)
    contexts = []
    for i in offsetList[:10]:
        pre = max(0, i - 10)  # clamp so early matches do not wrap around via a negative slice index
        post = i + 10
        contextStr = " ".join(text[pre:post])
        contexts.append(contextStr)
    return contexts
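A minimal usage sketch for the helper above, assuming `text` is a list of tokens (here produced with `nltk.word_tokenize`); the sample sentence is invented for illustration.

# Hypothetical usage of the concordance() helper above; the sample text is made up.
import nltk

raw = "The quick brown fox jumps over the lazy dog. The fox was quick."
tokens = nltk.word_tokenize(raw)           # ConcordanceIndex expects a token list
for line in concordance(tokens, "fox"):    # at most 10 windows of ~10 tokens on each side
    print(line)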
def getContext(vocabTokenizado, contextos=False):
    # With this function I can tokenize 100%.
    # It is an imitation of the .concordance() function. Make the most of it :D
    contextoIzq = []
    contextoDer = []
    ci = ConcordanceIndex(vocabTokenizado)  # Hashes all the tokens
    palabra = input("\n\nEnter the word to compare: ")
    palabra = palabra.replace('\n', '')
    palabra = palabra.lower()
    resultados = concordance(ci, palabra)
    if contextos != False:  # If returning the contexts is not requested, this block is skipped and we jump straight to `return resultados`
        palAnt = 0
        palSig = ""
        for renglon in resultados:
            palAnt = 0
            renglon = renglon.split()
            for w in renglon:
                w = w.lower()
                if w == palabra:
                    break  # The loop breaks when the word matches
                palAnt += 1  # The last value stored is the position of w
            #print("len(renglon)->"+str(len(renglon)))
            #print("pal->"+str(palAnt))
            contextoIzq.append(renglon[palAnt - 1])
            contextoDer.append(renglon[palAnt + 1])
        return [contextoIzq, contextoDer, resultados]  # Uncomment if you want the full left and right context plus the result.
    return resultados
def concordanceFunction(data):
    showedText2.delete('1.0', END)
    arrCon = ConcordanceIndex(nltk.Text(dataAnalysisCorpus.words()))
    arr = concordance(arrCon, data)
    for i in arr:
        showedText2.insert(END, i + '\n')
def _build_concordance(self, text_str):
    """
    Inherit or mimic the logic of ConcordanceIndex() at
    http://www.nltk.org/_modules/nltk/text.html
    and/or ConcordanceSearchView() & SearchCorpus() at
    https://github.com/nltk/nltk/blob/develop/nltk/app/concordance_app.py

    :param text_str: Text to be turned into a concordance
    :type text_str: str
    :return: list
    """
    p = PunktLanguageVars()
    orig_tokens = p.word_tokenize(text_str)
    c = ConcordanceIndex(orig_tokens)

    #! rm dupes after index, before loop
    tokens = set(orig_tokens)
    tokens = [x for x in tokens
              if x not in [',', '.', ';', ':', '"', "'", '[', ']']]  # this needs to be changed or rm'ed

    return c.return_concordance_all(tokens)
from nltk.text import ConcordanceIndex


def phrasesContenant_bkp(corpus, mots):
    """
    :DEPRECATED use phrasesContenant() instead
    """
    print("start of phrasesContenant()")
    phrasesContenant = []
    index = ConcordanceIndex(corpus)
    # For each word
    for mot in mots:
        positions = index.offsets(mot)
        print("positions:", positions)
        for position in positions:
            phrasesContenant.append(corpus[position])
    print("number of tokens:", len(phrasesContenant))
    # print("tokens:", phrasesContenant)
    print("end of phrasesContenant()")
    return phrasesContenant
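A small hypothetical call of the deprecated helper above, assuming `corpus` is a plain list of tokens (which is what `ConcordanceIndex` expects); the toy token list is invented for illustration.

# Hypothetical usage; note that the helper returns the matching tokens themselves,
# one per occurrence, not whole sentences.
corpus = ["le", "chat", "dort", "et", "le", "chien", "dort"]
print(phrasesContenant_bkp(corpus, ["dort", "chat"]))  # -> ['dort', 'dort', 'chat']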
def form_example():
    if request.method == 'POST':
        term = request.form.get('term')
        ci = ConcordanceIndex(corpus.tokens)
        results = concordance(ci, term)
        return '''<h1>The concordance result is:</h1>
                  {}
                  <br>
                  Click <a href='/'>here</a> to try again'''.format(results)

    return '''<form method="POST">
def context(request):
    error = False
    tokens = tokenize(request.session['raw'])
    if 'list' not in request.session:
        job = queue.fetch_job(request.session['jid'])
        # tagged = tags_to_token(request.session['raw'])
        # emoList = NewList(filter_words(negations(tagged)), EMOLEX)
        emoList = job.result
        job.delete()
        request.session['list'] = emoList
    else:
        emoList = request.session['list']
    text = convert_to_text(tokens)
    filtered = filter_words(tokens)
    max_token = max_dist(emoList)
    ngrams = n_grams(text, max_token, 5)
    colls = collocations(filtered)
    context = concordance(ConcordanceIndex(tokens), max_token)
    if request.method == 'POST':
        tree = getDict(text, request.POST['word'])
        if not tree['name']:
            error = True
    else:
        tree = getDict(text, max_token)
    return render(
        request, 'text_mining/context.html', {
            'max': max_token,
            'ngrams': random.sample(ngrams, 10),
            'collocations': colls,
            'context': context,
            'treeword': tree,
            'error': error
        })
#!/usr/bin/python
# -*- encoding: utf-8 -*-
from bamana import test, wl, wl_detone
from nltk.text import ConcordanceIndex
from orthograph import convertw, detone
from morphology import lemmatize, dict_disambiguate, print_gloss
import re

ci = ConcordanceIndex(test.words(), key=lambda s: s.lower())
types = list(set([s.lower() for s in set(test.words())]))
types.sort()

for word in types:
    if not re.search(r'[0-9.,;:!?]', word):
        ci.print_concordance(word, lines=15)
        print
        nw = convertw(word)
        nwl = [w for w in nw if w in wl]
        if nwl:
            formlist = nwl
        else:
            formlist = nw
        result = []
        for form in formlist:
            if form != detone(form):
                stage, gl = lemmatize(form, wl)
            else:
                stage, gl = lemmatize(form, wl_detone)
            result.extend(gl)
import re
from argparse import ArgumentParser

from nltk.text import ConcordanceIndex

from tokenizer import tokenize_string
from util import scan_all_files

if __name__ == '__main__':
    parser = ArgumentParser(description='Simple NLTK-based concordancer.')
    parser.add_argument('root_dir', help='Root directory to scan for files.')
    parser.add_argument('word_regex')
    parser.add_argument('-w', '--width', type=int, default=80)
    args = parser.parse_args()
    print(args)

    word_regex = re.compile(args.word_regex, flags=re.IGNORECASE)
    file_paths = scan_all_files(args.root_dir)
    for file_path in file_paths:
        with open(file_path) as f:
            tokens = tokenize_string(f.read())
        concordance_index = ConcordanceIndex(tokens, key=lambda s: s.lower())
        for search_token in filter(word_regex.fullmatch, concordance_index._offsets):
            concordance_list = concordance_index.find_concordance(
                search_token, width=args.width)
            if concordance_list:
                for concordance_line in concordance_list:
                    print(" ".join([
                        concordance_line.left_print,
                        concordance_line.query.upper(),
                        concordance_line.right_print
                    ]))
print("Type: 'texts()' to list the materials.") ptext1 = Text(machado.words('romance/marm05.txt'), name="Memórias Póstumas de Brás Cubas (1881)") ptext2 = Text(machado.words('romance/marm08.txt'), name="Dom Casmurro (1899)") ptext3 = Text(genesis.words('portuguese.txt'), name="Gênesis") ptext4 = Text(mac_morpho.words('mu94se01.txt'), name="Folha de Sao Paulo (1994)") machado_fileids = machado.fileids() machado_words = machado.words( ['romance/marm05.txt', 'cronica/macr04.txt', 'critica/mact15.txt']) #machado_words = machado.words(machado_fileids) + mac_morpho.words('mu94se01.txt') + genesis.words('portuguese.txt') machado_text = Text(machado_words) machado_ci = ConcordanceIndex(machado_text) def texts(): print("ptext1:", ptext1.name) print("ptext2:", ptext2.name) print("ptext3:", ptext3.name) print("ptext4:", ptext4.name) def common_vocab(text, n=50): words = [word.lower() for word in text if word.isalpha()] fdist = FreqDist(words) common = fdist.most_common(n) word_list = [w for (w, n) in common] return word_list
def __init__(self, tokens, cites, key=lambda x: x):
    ConcordanceIndex.__init__(self, tokens, key)
    if len(tokens) != len(cites):
        raise ValueError("Tokens and citations do not seem to match")
    self._cites = cites
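The `__init__` above belongs to a subclass of `ConcordanceIndex` that keeps a citation for every token; the wrapper class name and the toy data below are hypothetical, added only to show how such a class could be instantiated.

# Hypothetical wrapper; only the __init__ body is taken from the snippet above.
from nltk.text import ConcordanceIndex


class CitedConcordanceIndex(ConcordanceIndex):
    def __init__(self, tokens, cites, key=lambda x: x):
        ConcordanceIndex.__init__(self, tokens, key)
        if len(tokens) != len(cites):
            raise ValueError("Tokens and citations do not seem to match")
        self._cites = cites  # one citation per token, aligned by position


tokens = ["arma", "virumque", "cano", "Troiae", "qui", "primus", "ab", "oris"]
cites = ["Aen. 1.1"] * len(tokens)          # parallel list of citations
ci = CitedConcordanceIndex(tokens, cites)
for offset in ci.offsets("cano"):
    print(offset, ci._cites[offset])        # -> 2 Aen. 1.1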