import glob
import nltk
from nltk.corpus.reader import XMLCorpusReader
from nltk.probability import LidstoneProbDist
from nltk.model import NgramModel  # available in NLTK 2.x only

def train():
    # parse the XML files and load up words
    print("Loading words from XML files...")
    sentences = []
    files = glob.glob("data/*.xml")
    i = 0
    for file in files:
        if i > 0 and i % 500 == 0:
            print("%d/%d files loaded, #-sentences: %d" %
                  (i, len(files), len(sentences)))
            break  # stop after the first 500 files
        dir, file = file.split("/")
        reader = XMLCorpusReader(dir, file)
        sentences.extend(nltk.sent_tokenize(" ".join(reader.words())))
        i += 1
    words = []
    for sentence in sentences:
        words.append(nltk.word_tokenize(sentence))
    # build a trigram language model over the tokenized sentences,
    # using Lidstone smoothing via the estimator argument
    print("Building language model...")
    est = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    langModel = NgramModel(3, words, estimator=est)
    # langModel = NgramModel(3, words)
    # cPickle.dump(langModel, open("lm.bin", 'wb'))
    return langModel
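Note that NgramModel and its estimator argument come from the old nltk.model module, which was dropped in NLTK 3.x. To get a roughly comparable trigram model with additive (Lidstone) smoothing on a current NLTK, the nltk.lm package can be used; the sketch below is an adaptation under that assumption, and build_trigram_lm and its parameters are hypothetical names, not part of the original code.

from nltk.lm import Lidstone
from nltk.lm.preprocessing import padded_everygram_pipeline

def build_trigram_lm(tokenized_sentences, gamma=0.2, order=3):
    # tokenized_sentences: the same list of token lists built in train() above
    train_data, vocab = padded_everygram_pipeline(order, tokenized_sentences)
    lm = Lidstone(gamma, order)  # additive smoothing, analogous to LidstoneProbDist(fdist, 0.2)
    lm.fit(train_data, vocab)
    return lm

The resulting model exposes lm.score(word, context) and lm.perplexity(ngrams) rather than the old NgramModel interface, so any calling code would need to be adjusted accordingly.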
import os
import glob
import cPickle  # use pickle on Python 3
import nltk
from nltk.corpus.reader import XMLCorpusReader

def train():
    # skip training if a pickled model already exists
    if os.path.isfile("lm.bin"):
        return
    files = glob.glob("data/*.xml")
    sentences = []
    i = 0
    for file in files:
        if i > 0 and i % 500 == 0:
            print("%d/%d files loaded, #-sentences: %d" %
                  (i, len(files), len(sentences)))
        dir, file = file.split("/")
        reader = XMLCorpusReader(dir, file)
        sentences.extend(nltk.sent_tokenize(" ".join(reader.words())))
        i += 1
    # LangModel is a class defined elsewhere in the same project
    lm = LangModel(3, 0.4, sentences)
    cPickle.dump(lm, open("lm.bin", "wb"))
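For completeness, a matching load step would unpickle the model that train() wrote; load_model is a hypothetical helper name, not part of the original code.

import cPickle  # use pickle on Python 3

def load_model(path="lm.bin"):
    # read back the LangModel instance dumped by train()
    with open(path, "rb") as f:
        return cPickle.load(f)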
import os
from nltk import Text, word_tokenize
from nltk.corpus import stopwords
from nltk.corpus.reader import XMLCorpusReader

def buildWordList(corpus_root):
    stop_words = set(stopwords.words('english'))
    fileids = '.xml'
    xmlreader = XMLCorpusReader(corpus_root, fileids)
    # collect the words of every .xml file under corpus_root
    termList = []
    for file in os.listdir(corpus_root):
        if file.endswith(".xml"):
            terms = Text(xmlreader.words(file))
            termList.append(terms)
    # re-tokenize and keep only alphabetic, non-stopword tokens
    terms = word_tokenize(str(termList))
    newTerms = []
    for w in terms:
        if w not in stop_words and w.isalpha():
            newTerms.append(w)
    return newTerms
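A possible way to exercise the helper is to feed the returned list into a frequency distribution; corpus_root here is a made-up directory name and is assumed to contain the .xml files.

from nltk import FreqDist

corpus_root = "data/xml"              # hypothetical corpus directory
terms = buildWordList(corpus_root)
fdist = FreqDist(terms)
print(fdist.most_common(20))          # 20 most frequent filtered tokens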
import os
from glob import glob
from nltk.corpus.reader import XMLCorpusReader

# get the paths of all the documents in the collection
texts = glob('conjuntoDatos/ingles/us*')

terminos = []            # list of terms
terminosUnicos = []      # unique terms
terminosUnicos2 = []     # unique terms
terminosProhibidos = []  # terms to exclude (filled below)

# get the vocabulary of each document
for item_path in texts:
    destino = os.path.basename(item_path)
    reader = XMLCorpusReader('conjuntoDatos/ingles', destino)
    palabras = reader.words()                       # get the words
    palabrasUnicas = sorted(set(palabras))          # drop duplicates
    terminos.extend(palabrasUnicas)
    terminosUnicos.extend(sorted(set(terminos)))    # accumulate unique terms
    terminosUnicos2 = sorted(set(terminosUnicos))   # unique terms
    reader = None  # release the reader
    terminos = None
    terminos = []

# collect the terms that contain an apostrophe
a = "'"
for x in terminosUnicos2:
    if a in x:
        terminosProhibidos.append(x)
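The extend/sort/reset pattern above can also be expressed with a single set; this is only an alternative sketch over the same assumed directory layout, not the original author's code.

import os
from glob import glob
from nltk.corpus.reader import XMLCorpusReader

vocabulario = set()
for item_path in glob('conjuntoDatos/ingles/us*'):
    reader = XMLCorpusReader('conjuntoDatos/ingles', os.path.basename(item_path))
    vocabulario.update(reader.words())   # accumulate unique terms across files

# terms containing an apostrophe, equivalent to terminosProhibidos above
terminos_prohibidos = sorted(t for t in vocabulario if "'" in t)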
def words(self, fileids=None, categories=None):
    # map any category names to their fileids, then concatenate the
    # word lists that XMLCorpusReader produces for each file
    words = []
    fileids = self._resolve(fileids, categories)
    for fileid in fileids:
        words += XMLCorpusReader.words(self, fileid)
    return words
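The _resolve() call follows the pattern of NLTK's categorized corpus readers, which translate a categories argument into the matching fileids. A hypothetical reader class that this method could belong to mixes CategorizedCorpusReader with XMLCorpusReader; the class name and constructor below are assumptions, not taken from the original code.

from nltk.corpus.reader import CategorizedCorpusReader, XMLCorpusReader

class CategorizedXMLCorpusReader(CategorizedCorpusReader, XMLCorpusReader):
    def __init__(self, root, fileids, **kwargs):
        # CategorizedCorpusReader consumes cat_pattern/cat_map/cat_file from kwargs
        CategorizedCorpusReader.__init__(self, kwargs)
        XMLCorpusReader.__init__(self, root, fileids)

    # the words() override above would be defined here; a call such as
    # reader.words(categories=['news']) then resolves to the files in that category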