Code example #1
File: QuestionParser.py  Project: aludi/langTech
def parse_regex(self):
    # self.specs.patterns['triples'] is a dict mapping a regex pattern to the
    # order of its arguments.
    for key in self.specs.patterns['triples']:
        matchObj = re.search(key, self.question, re.M | re.I | re.U)
        if matchObj:
            # Instead of a more complicated regex, strip everything from each
            # captured group that is not a noun or adjective; only those words are
            # meaningful for looking up Wikidata IDs. The empty string at the end
            # is a placeholder for the variable, which does not appear in the text,
            # so that the indexing matches the argument order in specs.
            triple = [
                TextNormalizer(matchObj.group(1)).allowedTagKeeper('noun_adjective'),
                TextNormalizer(matchObj.group(2)).allowedTagKeeper('noun_adjective'),
                "",
            ]
            T = Triple(triple, self.specs.patterns['triples'][key])
            # Set the question variables equal to the triple variables.
            # TODO: selection in multiple triples
            self.variable = T.variable
            self.targetVariable = T.targetVariable
            self.query_list.append(T.SQL)
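
For context, a minimal sketch of the data shape this method assumes: specs.patterns['triples'] maps a regex with two capture groups to the argument order of the resulting triple. The pattern and the ('predicate', 'subject', 'variable') ordering below are illustrative assumptions, not taken from the aludi/langTech project.

import re

# Hypothetical shape of specs.patterns['triples']: regex pattern -> argument order.
patterns = {
    r"who is the (.*) of (.*)\?": ("predicate", "subject", "variable"),
}

question = "Who is the director of Inception?"
for pattern, order in patterns.items():
    match = re.search(pattern, question, re.M | re.I | re.U)
    if match:
        # Groups 1 and 2 come from the question text; "" is the placeholder for the
        # unknown variable, mirroring the indexing used in parse_regex above.
        triple = [match.group(1), match.group(2), ""]
        print(order, triple)  # ('predicate', 'subject', 'variable') ['director', 'Inception', '']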
Code example #2
    text = tcleaner.clean_code(text)
    text = tcleaner.clean_text(text)
    tcleaner.output(text)

    tokenizer = Tokenizer.JanomeTokenizer()
    words = tokenizer.wakati(text)
    #words = tokenizer.filter_by_pos(text, pos=('名詞'))  # keep only nouns (名詞)
    tokenizer.output(words)

    # Alternative: the same pipeline with MeCab instead of Janome
    #tokenizer = Tokenizer.MeCabTokenizer()
    #words = tokenizer.wakati(text)
    #words = tokenizer.filter_by_pos(text, pos=('名詞'))
    #tokenizer.output(words)

    tnormalizer = TextNormalizer.TextNormalizer()
    nwords = []
    for w in words:
        nw = tnormalizer.normalize(w)
        nw = tnormalizer.lemmatize_term(nw, pos='v')
        nwords.append(nw)
    tnormalizer.output(nwords)

    stw_remover = StopwordRemover.StopwordRemover()
    stw_remover.load_stopword_file("./slothlib/stopwords.txt")
    stw_remover.load_stopword_file("./slothlib/stopwords_extend.txt")
    stw_remover.find_stopwords(nwords)
    stwords = stw_remover.remove_stopwords(nwords)
    stwords = stw_remover.remove_noisewords(stwords)

    stw_remover.output(stwords)
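
The Tokenizer, TextNormalizer and StopwordRemover classes above are project-specific wrappers, but the tokenization step maps directly onto the Janome library. Below is a minimal sketch of what the wakati and filter_by_pos(pos=('名詞')) calls likely correspond to; the wrapper internals are an assumption, only the Janome API itself is real.

from janome.tokenizer import Tokenizer

text = "自然言語処理の前処理を行います。"

# wakati-style tokenization: surface forms only.
wakati = [token.surface for token in Tokenizer().tokenize(text)]

# POS filtering: keep only tokens whose part of speech starts with 名詞 (noun).
nouns = [
    token.surface
    for token in Tokenizer().tokenize(text)
    if token.part_of_speech.split(',')[0] == '名詞'
]

print(wakati)
print(nouns)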
Code example #3
File: tests.py  Project: yatrex/TT
import ModeloVectorial
import TextNormalizer
import UsualTools
import os
from ModeloVectorial import *
from TextNormalizer import *
from UsualTools import *
maxLen = 1000000  # Maximum length in characters supported per book
exp = TextNormalizer()
llemmas = []
#----------------------------------------------------- Get the book objects and save them -----------------------------------
#books = UsualTools.getLibros("./Libros de Goodreads")
#UsualTools.saveObject(books,"./Recursos/BooksList.json")
#-----------------------------------------------------
books = UsualTools.loadObject("./Recursos/BooksList.json")
exp.setVocabulary(UsualTools.loadObject("./Recursos/vocabulary.json"))
#----------------------------------------------------- Lemmatize the books and save the results -------------------------------------------

#for book in books:
#	if (not (str(book.num)+".json" in os.listdir("./Recursos/lemmas/")) ):
#		print("-----------------")
#		print("Title:", book.nombre)
#		book.texto = exp.delExtraInfoPG("./Libros de Goodreads/"+str(book.num)+".txt")
#		book.texto = exp.deleteSpecialChars(book.texto)
#		booktam = len(book.texto)
#		lemmas = []
#		if (len(book.texto) < maxLen):  # maxLen is the maximum number of characters handled per pass
#			exp.setText(book.texto)
#			lemmas = exp.lemmatize_delSW()
#		else:  # if the text exceeds maxLen, split it into blocks and join the lemmas afterwards
#			print("Large book")
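
The commented-out loop above breaks off here, but its comments state the intent: books longer than maxLen are split into blocks, each block is lemmatized separately, and the lemma lists are joined afterwards. A hedged sketch of that chunking idea, reusing the exp.setText / exp.lemmatize_delSW calls shown above; the helper name and the fixed-size block boundaries are illustrative assumptions, not the project's actual code.

def lemmatize_in_blocks(exp, texto, max_len=1000000):
    # Split the text into blocks of at most max_len characters, lemmatize each
    # block on its own, and concatenate the resulting lemma lists.
    lemmas = []
    for start in range(0, len(texto), max_len):
        exp.setText(texto[start:start + max_len])
        lemmas.extend(exp.lemmatize_delSW())
    return lemmas

Splitting on a fixed character count can cut a word at a block boundary, so a real implementation would more likely split on sentence or paragraph breaks.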