Code example #1
import glob
import os

import nltk
from nltk.corpus.reader import XMLCorpusReader
from nltk.probability import LidstoneProbDist
from nltk.model import NgramModel  # removed in NLTK 3.0; requires NLTK 2.x


def train():
    # parse XML and load up words
    print("Loading words from XML files...")
    sentences = []
    files = glob.glob("data/*.xml")
    for i, file in enumerate(files):
        if i > 0 and i % 500 == 0:
            print("%d/%d files loaded, #-sentences: %d" %
                  (i, len(files), len(sentences)))
            break  # debugging cap: stop after the first 500 files
        dirname, fname = os.path.split(file)
        reader = XMLCorpusReader(dirname, fname)
        sentences.extend(nltk.sent_tokenize(" ".join(reader.words())))
    words = []
    for sentence in sentences:
        words.append(nltk.word_tokenize(sentence))
    # build a trigram language model (using Lidstone smoothing)
    # with the words array
    print("Building language model...")
    est = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    langModel = NgramModel(3, words, estimator=est)
    return langModel
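A minimal usage sketch for the model returned by train(), assuming the NLTK 2.x NgramModel API, where prob(word, context) gives the conditional probability of a word given the preceding words:

lm = train()

# P("model" | "the", "language") under the trigram model (NLTK 2.x API)
print(lm.prob("model", ["the", "language"]))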
Code example #2
import glob
import os

import nltk
from nltk.corpus.reader import XMLCorpusReader

try:
    import cPickle  # Python 2
except ImportError:
    import pickle as cPickle  # Python 3


def train():
    # skip training if a pickled model already exists on disk
    if os.path.isfile("lm.bin"):
        return
    files = glob.glob("data/*.xml")
    sentences = []
    for i, file in enumerate(files):
        if i > 0 and i % 500 == 0:
            print("%d/%d files loaded, #-sentences: %d" %
                  (i, len(files), len(sentences)))
        dirname, fname = os.path.split(file)
        reader = XMLCorpusReader(dirname, fname)
        sentences.extend(nltk.sent_tokenize(" ".join(reader.words())))
    # LangModel is a project-specific class (order 3, 0.4 smoothing
    # parameter) defined elsewhere in the same codebase
    lm = LangModel(3, 0.4, sentences)
    with open("lm.bin", "wb") as f:
        cPickle.dump(lm, f)
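The listing caches the trained model but never shows the loader; a minimal counterpart sketch that unpickles the lm.bin file written above:

def load():
    # load the model cached by train() above
    with open("lm.bin", "rb") as f:
        return cPickle.load(f)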
Code example #3
import os

from nltk import Text
from nltk.corpus import stopwords
from nltk.corpus.reader import XMLCorpusReader


def buildWordList(corpus_root):
    stop_words = set(stopwords.words('english'))

    # treat every .xml file under corpus_root as part of the corpus
    xmlreader = XMLCorpusReader(corpus_root, r'.*\.xml')

    termList = []
    for file in os.listdir(corpus_root):
        if file.endswith(".xml"):
            terms = Text(xmlreader.words(file))
            termList.append(terms)

    # keep alphabetic, non-stopword tokens from every document
    newTerms = []
    for terms in termList:
        for w in terms:
            if w not in stop_words and w.isalpha():
                newTerms.append(w)
    return newTerms
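A hypothetical call site for buildWordList; the "corpus" directory name is an assumption, and FreqDist simply profiles the filtered vocabulary:

from nltk import FreqDist

words = buildWordList("corpus")  # "corpus" is a hypothetical root dir
fdist = FreqDist(words)
print(fdist.most_common(10))     # ten most frequent content words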
Code example #4
# Collect every document name in the collection
from glob import glob
import os

from nltk.corpus.reader import XMLCorpusReader

texts = glob('conjuntoDatos/ingles/us*')

# Corpus-wide term collections
terminos = []            # terms of the current document
terminosUnicos = []      # accumulated per-document unique terms
terminosUnicos2 = []     # globally unique terms
terminosProhibidos = []  # forbidden terms (those containing an apostrophe)

# Build the vocabulary of each document
for item_path in texts:
    destino = os.path.basename(item_path)
    reader = XMLCorpusReader('conjuntoDatos/ingles', destino)
    palabras = reader.words()               # all tokens in the document
    palabrasUnicas = sorted(set(palabras))  # this document's tokens, deduplicated
    terminos.extend(palabrasUnicas)
    terminosUnicos.extend(sorted(set(terminos)))
    terminosUnicos2 = sorted(set(terminosUnicos))  # global unique terms
    reader = None   # release the reader
    terminos = []   # reset for the next document

# Flag terms containing an apostrophe as forbidden
a = "'"
for x in terminosUnicos2:
    if a in x:
        terminosProhibidos.append(x)
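The extend/sort/reset cycle above can be collapsed into one growing set; a sketch of the equivalent computation (same inputs, same terminosUnicos2 result):

vocab = set()
for item_path in glob('conjuntoDatos/ingles/us*'):
    reader = XMLCorpusReader('conjuntoDatos/ingles', os.path.basename(item_path))
    vocab.update(reader.words())  # the set handles deduplication directly
terminosUnicos2 = sorted(vocab)
terminosProhibidos = [t for t in terminosUnicos2 if "'" in t]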
Code example #5
def words(self, fileids=None, categories=None):
    # map the requested categories to fileids, then concatenate
    # the word lists of every matching file
    words = []
    fileids = self._resolve(fileids, categories)
    for fileid in fileids:
        words += XMLCorpusReader.words(self, fileid)
    return words
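This method only runs inside a class that provides _resolve; a hypothetical minimal host, assuming NLTK's CategorizedCorpusReader mixin (which defines _resolve) combined with XMLCorpusReader:

from nltk.corpus.reader import CategorizedCorpusReader, XMLCorpusReader


class CategorizedXMLCorpusReader(CategorizedCorpusReader, XMLCorpusReader):
    """Hypothetical reader combining category lookup with XML parsing."""

    def __init__(self, root, fileids, **kwargs):
        # CategorizedCorpusReader pops the cat_* options out of kwargs
        CategorizedCorpusReader.__init__(self, kwargs)
        XMLCorpusReader.__init__(self, root, fileids)

    # the words() method from the listing above would live here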