Code example #1
File: nltk_funcs.py  Project: ssami/python
from nltk.text import ConcordanceIndex


def concordance(text, word):
    """Return up to ten ~20-token context windows around occurrences of word (case-insensitive)."""
    cindex = ConcordanceIndex(text, key=lambda x: x.lower())
    offsetList = cindex.offsets(word)
    contexts = []
    for i in offsetList[:10]:
        pre = max(0, i - 10)  # clamp to 0 so a negative slice start does not wrap around the list
        post = i + 10
        contextStr = " ".join(text[pre:post])
        contexts.append(contextStr)
    return contexts
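A minimal usage sketch for the helper above; the Gutenberg corpus and the query word are illustrative assumptions, not part of the original project, and the corpus must be downloaded first:

import nltk
from nltk.corpus import gutenberg

nltk.download("gutenberg", quiet=True)  # assumption: the corpus is not yet available locally
tokens = list(gutenberg.words("austen-emma.txt"))
for window in concordance(tokens, "surprise")[:3]:
    print(window)  # each window is up to 20 tokens of raw context around one hit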
Code example #2
File: nltk_funcs.py  Project: ssami/python
def concordance(text, word):
    cindex = ConcordanceIndex(text, key=lambda x: x.lower())
    offsetList = cindex.offsets(word)
    contexts = []
    for i in offsetList[:10]:
        pre = max(0, i - 10)  # clamp as above so a negative slice start does not wrap around
        post = i + 10
        contextStr = ' '.join(text[pre:post])
        contexts.append(contextStr)
    return contexts
Code example #3
def getContext(vocabTokenizado,
               contextos=False):  # With this function I can tokenize 100%.
    # It is an imitation of the .concordance() function. Make good use of it :D
    contextoIzq = []
    contextoDer = []
    ci = ConcordanceIndex(vocabTokenizado)  # Hashes all the tokens
    palabra = input("\n\nEnter the word to compare: ")
    palabra = palabra.replace('\n', '')
    palabra = palabra.lower()
    resultados = concordance(ci, palabra)

    if contextos:  # If contexts are not requested, skip this block and just return the results
        for renglon in resultados:
            palAnt = 0
            renglon = renglon.split()
            for w in renglon:
                w = w.lower()
                if w == palabra:
                    break  # The loop stops when the word is found
                palAnt += 1  # The last value stored is the position of w

            # Previous and next token around the match (assumes the match is not at the line edge)
            contextoIzq.append(renglon[palAnt - 1])
            contextoDer.append(renglon[palAnt + 1])
        # Left context, right context, plus the raw concordance lines
        return [contextoIzq, contextoDer, resultados]
    return resultados
Code example #4
def concordanceFunction(data):
    # Clear the Tkinter output widget, then insert one concordance line per row for the queried term.
    showedText2.delete('1.0', END)
    arrCon = ConcordanceIndex(nltk.Text(dataAnalysisCorpus.words()))
    arr = concordance(arrCon, data)

    for i in arr:
        showedText2.insert(END, i + '\n')
Code example #5
File: philology.py  Project: Akirato/cltk
    def _build_concordance(self, text_str):
        """
        Inherit or mimic the logic of ConcordanceIndex() at http://www.nltk.org/_modules/nltk/text.html
        and/or ConcordanceSearchView() & SearchCorpus() at https://github.com/nltk/nltk/blob/develop/nltk/app/concordance_app.py
        :param text_str: Text to be turned into a concordance
        :type text_str: str
        :return: list
        """
        p = PunktLanguageVars()
        orig_tokens = p.word_tokenize(text_str)
        c = ConcordanceIndex(orig_tokens)

        # Remove duplicate tokens after building the index, before the loop.
        tokens = set(orig_tokens)
        tokens = [x for x in tokens if x not in [',', '.', ';', ':', '"', "'", '[', ']']]  # this filter needs to be changed or removed

        return c.return_concordance_all(tokens)
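return_concordance_all() comes from CLTK's own ConcordanceIndex subclass; for comparison, a rough sketch of the same idea against the stock NLTK class could look like the following (the function name build_all_concordances, the tokenizer choice, and the use of find_concordance(), available in newer NLTK releases, are assumptions for illustration):

from nltk.text import ConcordanceIndex
from nltk.tokenize import word_tokenize  # requires the 'punkt' tokenizer models


def build_all_concordances(text_str, width=80):
    """Collect concordance lines for every distinct non-punctuation token."""
    tokens = word_tokenize(text_str)
    index = ConcordanceIndex(tokens, key=lambda t: t.lower())
    skip = {',', '.', ';', ':', '"', "'", '[', ']'}
    all_lines = {}
    for token in sorted({t.lower() for t in tokens} - skip):
        hits = index.find_concordance(token, width=width)
        if hits:
            all_lines[token] = [hit.line for hit in hits]
    return all_lines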
Code example #6
File: fn2.py  Project: denfer57/ProjetTal
def phrasesContenant_bkp(corpus, mots):
    """
    :DEPRECATED: use phrasesContenant() instead
    """
    print("start of phrasesContenant()")
    phrasesContenant = []
    index = ConcordanceIndex(corpus)

    # For each word
    for mot in mots:
        positions = index.offsets(mot)
        print("positions:", positions)
        for position in positions:
            phrasesContenant.append(corpus[position])

    print("number of tokens:", len(phrasesContenant))
    # print("tokens:", phrasesContenant)
    print("end of phrasesContenant()")
    return phrasesContenant
Code example #7
File: philology.py  Project: linearregression/cltk
    def _build_concordance(self, text_str):
        """
        Inherit or mimic the logic of ConcordanceIndex() at http://www.nltk.org/_modules/nltk/text.html
        and/or ConcordanceSearchView() & SearchCorpus() at https://github.com/nltk/nltk/blob/develop/nltk/app/concordance_app.py
        :param text_str: Text to be turned into a concordance
        :type text_str: str
        :return: list
        """
        p = PunktLanguageVars()
        orig_tokens = p.word_tokenize(text_str)
        c = ConcordanceIndex(orig_tokens)

        # Remove duplicate tokens after building the index, before the loop.
        tokens = set(orig_tokens)
        tokens = [
            x for x in tokens
            if x not in [',', '.', ';', ':', '"', "'", '[', ']']
        ]  # this filter needs to be changed or removed

        return c.return_concordance_all(tokens)
Code example #8
def form_example():
    if request.method == 'POST':
        term = request.form.get('term')
        ci = ConcordanceIndex(corpus.tokens)
        results = concordance(ci, term)

        return '''<h1>The concordance result is:</h1>
            {}
        <br>
        Click <a href='/'>here</a> to try again'''.format(results)

    return '''<form method="POST">
Code example #9
File: views.py  Project: ellfiscina/EmotionAnalysis
def context(request):
    error = False
    tokens = tokenize(request.session['raw'])

    if 'list' not in request.session:
        job = queue.fetch_job(request.session['jid'])
        # tagged = tags_to_token(request.session['raw'])
        # emoList = NewList(filter_words(negations(tagged)), EMOLEX)
        emoList = job.result
        job.delete()
        request.session['list'] = emoList
    else:
        emoList = request.session['list']

    text = convert_to_text(tokens)
    filtered = filter_words(tokens)
    max_token = max_dist(emoList)
    ngrams = n_grams(text, max_token, 5)
    colls = collocations(filtered)
    context = concordance(ConcordanceIndex(tokens), max_token)

    if request.method == 'POST':
        tree = getDict(text, request.POST['word'])
        if not tree['name']:
            error = True
    else:
        tree = getDict(text, max_token)

    return render(
        request, 'text_mining/context.html', {
            'max': max_token,
            'ngrams': random.sample(ngrams, 10),
            'collocations': colls,
            'context': context,
            'treeword': tree,
            'error': error
        })
Code example #10
File: parser-concordance.py  Project: eldams/daba
#!/usr/bin/python
# -*- encoding: utf-8 -*-

from bamana import test, wl, wl_detone
from nltk.text import ConcordanceIndex
from orthograph import convertw, detone
from morphology import lemmatize, dict_disambiguate, print_gloss
import re

ci = ConcordanceIndex(test.words(), key=lambda s: s.lower())
types = list(set([s.lower() for s in set(test.words())]))
types.sort()

for word in types:
    if not re.search(r'[0-9.,;:!?]', word):
        ci.print_concordance(word, lines=15)
        print()  # blank line after each concordance block
        nw = convertw(word)
        nwl = [w for w in nw if w in wl]
        if nwl:
            formlist = nwl
        else:
            formlist = nw
        result = []
        for form in formlist:
            if form != detone(form):
                stage, gl = lemmatize(form, wl)
            else:
                stage, gl = lemmatize(form, wl_detone)
            result.extend(gl)
Code example #11
File: parser-concordance.py  Project: Mompolice/daba
#!/usr/bin/python
# -*- encoding: utf-8 -*-

from bamana import test, wl, wl_detone
from nltk.text import ConcordanceIndex
from orthograph import convertw, detone
from morphology import lemmatize, dict_disambiguate, print_gloss
import re

ci = ConcordanceIndex(test.words(), key=lambda s: s.lower())
types = list(set([s.lower() for s in set(test.words())]))
types.sort()

for word in types:
    if not re.search(r'[0-9.,;:!?]', word):
        ci.print_concordance(word, lines=15)
        print()  # blank line after each concordance block
        nw = convertw(word)
        nwl = [w for w in nw if w in wl]
        if nwl:
            formlist = nwl
        else:
            formlist = nw
        result = []
        for form in formlist:
            if form != detone(form):
                stage, gl = lemmatize(form, wl)
            else:
                stage, gl = lemmatize(form, wl_detone)
            result.extend(gl)
Code example #12
import re
from argparse import ArgumentParser

from nltk.text import ConcordanceIndex

from tokenizer import tokenize_string
from util import scan_all_files

if __name__ == '__main__':
    parser = ArgumentParser(description='Simple NLTK-based concordancer.')
    parser.add_argument('root_dir', help='Root directory to scan for files.')
    parser.add_argument('word_regex')
    parser.add_argument('-w', '--width', type=int, default=80)
    args = parser.parse_args()
    print(args)
    word_regex = re.compile(args.word_regex, flags=re.IGNORECASE)
    file_paths = scan_all_files(args.root_dir)
    for file_path in file_paths:
        with open(file_path) as f:
            tokens = tokenize_string(f.read())
            concordance_index = ConcordanceIndex(tokens,
                                                 key=lambda s: s.lower())
            for search_token in filter(word_regex.fullmatch,
                                       concordance_index._offsets):
                concordance_list = concordance_index.find_concordance(
                    search_token, width=args.width)
                if concordance_list:
                    for concordance_line in concordance_list:
                        print(" ".join([
                            concordance_line.left_print,
                            concordance_line.query.upper(),
                            concordance_line.right_print
                        ]))
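If the script above were saved as, say, concordancer.py (both the filename and the corpus directory here are assumptions), it would be invoked along the lines of python concordancer.py ./corpus 'colou?r' --width 100, printing every concordance line whose token matches the case-insensitive regular expression.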
Code example #13
print("Type: 'texts()' to list the materials.")

ptext1 = Text(machado.words('romance/marm05.txt'),
              name="Memórias Póstumas de Brás Cubas (1881)")
ptext2 = Text(machado.words('romance/marm08.txt'), name="Dom Casmurro (1899)")
ptext3 = Text(genesis.words('portuguese.txt'), name="Gênesis")
ptext4 = Text(mac_morpho.words('mu94se01.txt'),
              name="Folha de Sao Paulo (1994)")

machado_fileids = machado.fileids()
machado_words = machado.words(
    ['romance/marm05.txt', 'cronica/macr04.txt', 'critica/mact15.txt'])
#machado_words = machado.words(machado_fileids) + mac_morpho.words('mu94se01.txt') + genesis.words('portuguese.txt')
machado_text = Text(machado_words)
machado_ci = ConcordanceIndex(machado_text)


def texts():
    print("ptext1:", ptext1.name)
    print("ptext2:", ptext2.name)
    print("ptext3:", ptext3.name)
    print("ptext4:", ptext4.name)


def common_vocab(text, n=50):
    words = [word.lower() for word in text if word.isalpha()]
    fdist = FreqDist(words)
    common = fdist.most_common(n)
    word_list = [w for (w, n) in common]
    return word_list
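A brief usage sketch for the objects defined above (the query word "olhos" is only an illustrative choice, not from the original script):

texts()                                         # list the loaded materials
print(common_vocab(ptext1, n=20))               # the 20 most frequent alphabetic words in ptext1
machado_ci.print_concordance("olhos", lines=5)  # concordance lines from the combined Machado sample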
Code example #14
File: text.py  Project: peithous/PerseusNLPToolkit
    def __init__(self, tokens, cites, key=lambda x: x):
        ConcordanceIndex.__init__(self, tokens, key)
        if len(tokens) != len(cites):
            raise ValueError("Tokens and citations do not seem to match")
        self._cites = cites
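The excerpt above only shows the constructor; a self-contained sketch of how such a citation-aware index might be completed and used follows (the class name CitedConcordanceIndex, the cited_offsets() helper, and the sample data are all invented for illustration):

from nltk.text import ConcordanceIndex


class CitedConcordanceIndex(ConcordanceIndex):  # hypothetical name; the original class name is not shown
    def __init__(self, tokens, cites, key=lambda x: x):
        ConcordanceIndex.__init__(self, tokens, key)
        if len(tokens) != len(cites):
            raise ValueError("Tokens and citations do not seem to match")
        self._cites = cites

    def cited_offsets(self, word):
        """Return (offset, citation) pairs for every occurrence of word."""
        return [(i, self._cites[i]) for i in self.offsets(word)]


tokens = ["arma", "virumque", "cano", "Troiae"]
cites = ["Aen. 1.1"] * 4  # one citation per token, same length as tokens
index = CitedConcordanceIndex(tokens, cites, key=lambda t: t.lower())
print(index.cited_offsets("cano"))  # [(2, 'Aen. 1.1')]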