import re

# TextNormalizer and Triple are project classes assumed to be imported elsewhere in this module.

def parse_regex(self):
    # self.specs.patterns['triples'] is a dict with a regex pattern as key and the order of arguments as value.
    for key in self.specs.patterns['triples']:
        matchObj = re.search(key, self.question, re.M | re.I | re.U)
        if matchObj:
            # Instead of a complicated regex, everything that is not a noun/adjective is removed from each
            # captured group; only those words are meaningful for Wikidata IDs.
            # An empty element is appended as a placeholder for the variable, which is obviously not in the
            # text, so that the indexing matches the argument order in specs.
            triple = [
                TextNormalizer(matchObj.group(1)).allowedTagKeeper('noun_adjective'),
                TextNormalizer(matchObj.group(2)).allowedTagKeeper('noun_adjective'),
                "",
            ]
            T = Triple(triple, self.specs.patterns['triples'][key])
            # Set the question variables to be equal to the triple variables.
            # TODO: selection in multiple triples
            self.variable = T.variable
            self.targetVariable = T.targetVariable
            self.query_list.append(T.SQL)
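# For reference, below is a minimal, standalone sketch of the same pattern-driven idea using only the
# standard library. The pattern dict, the example question, and keep_nouns() are hypothetical
# illustrations, not the project's API: the real code delegates POS filtering to
# TextNormalizer.allowedTagKeeper() and query building to the Triple class.
import re

patterns = {
    # pattern -> order of the captured arguments in the resulting triple
    r"who (?:is|was) the (.+) of (.+)\?": ("predicate", "subject", "variable"),
}

def keep_nouns(phrase):
    # Placeholder for the POS-based filtering done by allowedTagKeeper('noun_adjective').
    return phrase.strip()

question = "Who is the author of Dune?"
for pattern, order in patterns.items():
    m = re.search(pattern, question, re.M | re.I | re.U)
    if m:
        # The third slot stays empty: it stands for the query variable, which never appears in the question text.
        triple = [keep_nouns(m.group(1)), keep_nouns(m.group(2)), ""]
        print(dict(zip(order, triple)))  # {'predicate': 'author', 'subject': 'Dune', 'variable': ''}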
# Cleaning
text = tcleaner.clean_code(text)
text = tcleaner.clean_text(text)
tcleaner.output(text)

# Tokenization (Janome)
tokenizer = Tokenizer.JanomeTokenizer()
words = tokenizer.wakati(text)
#words = tokenizer.filter_by_pos(text, pos=('名詞'))  # 名詞 = keep nouns only
tokenizer.output(words)

# Tokenization (MeCab), kept as an alternative
#tokenizer = Tokenizer.MeCabTokenizer()
#words = tokenizer.wakati(text)
#words = tokenizer.filter_by_pos(text, pos=('名詞'))
#tokenizer.output(words)

# Normalization and lemmatization
tnormalizer = TextNormalizer.TextNormalizer()
nwords = []
for w in words:
    nw = tnormalizer.normalize(w)
    nw = tnormalizer.lemmatize_term(nw, pos='v')
    nwords.append(nw)
tnormalizer.output(nwords)

# Stopword and noise-word removal
stw_remover = StopwordRemover.StopwordRemover()
stw_remover.load_stopword_file("./slothlib/stopwords.txt")
stw_remover.load_stopword_file("./slothlib/stopwords_extend.txt")
stw_remover.find_stopwords(nwords)
stwords = stw_remover.remove_stopwords(nwords)
stwords = stw_remover.remove_noisewords(stwords)
stw_remover.output(stwords)
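# For reference, a minimal sketch of what the wrapper classes above presumably do, calling Janome
# directly. The example sentence, the stopword-file path, and its one-word-per-line format are
# assumptions; the project's Tokenizer/StopwordRemover wrappers may behave differently.
from janome.tokenizer import Tokenizer as JanomeTokenizer

sample = "これはストップワード除去の簡単な例です。"
wakati = JanomeTokenizer(wakati=True)        # wakati mode yields surface strings only
tokens = list(wakati.tokenize(sample))

with open("./slothlib/stopwords.txt", encoding="utf-8") as f:
    stopwords = {line.strip() for line in f if line.strip()}

filtered = [t for t in tokens if t not in stopwords]
print(filtered)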
import os
import UsualTools
from ModeloVectorial import *
from TextNormalizer import *

maxLen = 1000000  # Maximum number of characters processed per book in one pass
exp = TextNormalizer()
llemmas = []

#----------------------------------------------------- Build the book objects and save them (run once) -----------------------------------------------------
#books = UsualTools.getLibros("./Libros de Goodreads")
#UsualTools.saveObject(books, "./Recursos/BooksList.json")
#------------------------------------------------------------------------------------------------------------------------------------------------------------
books = UsualTools.loadObject("./Recursos/BooksList.json")
exp.setVocabulary(UsualTools.loadObject("./Recursos/vocabulary.json"))

#----------------------------------------------------- Lemmatize the books and save the result (run once) --------------------------------------------------
#for book in books:
#    if not (str(book.num) + ".json" in os.listdir("./Recursos/lemmas/")):
#        print("-----------------")
#        print("Title:", book.nombre)
#        book.texto = exp.delExtraInfoPG("./Libros de Goodreads/" + str(book.num) + ".txt")
#        book.texto = exp.deleteSpecialChars(book.texto)
#        booktam = len(book.texto)
#        lemmas = []
#        if len(book.texto) < maxLen:  # maxLen is the maximum number of characters supported per processing pass
#            exp.setText(book.texto)
#            lemmas = exp.lemmatize_delSW()
#        else:  # If the text exceeds the limit, it is split into blocks and the lemmas are joined afterwards
#            print("Large book")
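# For reference, a minimal sketch of the "large book" branch hinted at in the commented-out loop above:
# split the text into blocks below maxLen, lemmatize each block, and join the results.
# lemmatize_block() is a hypothetical stand-in for exp.setText(...) followed by exp.lemmatize_delSW().
def lemmatize_block(block):
    # Placeholder lemmatizer: the real project uses TextNormalizer for this step.
    return block.split()

def lemmatize_large_text(text, max_len=maxLen):
    lemmas = []
    for start in range(0, len(text), max_len):
        lemmas.extend(lemmatize_block(text[start:start + max_len]))
    return lemmas

print(len(lemmatize_large_text("palabra " * 10)))  # small usage check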