def read_articles(loc_corpus, article_length):
    """Collect the articles of a corpus whose text exceeds a length threshold.

    Every ``*.xml`` file two directory levels below ``loc_corpus`` is parsed
    (with a recovering parser), and each ``<artikel>`` element is turned into
    an ``Article`` carrying the ``artikel-id``, the ``titel`` and the
    concatenated ``absatz`` paragraphs found inside ``inhalt/text``.

    Args:
        loc_corpus: Root directory of the corpus.
        article_length: An article is kept only if its concatenated paragraph
            text is strictly longer than this many characters.

    Returns:
        A list of ``Article`` objects, in file order.

    NOTE(review): the "online" and "magazine" file lists are built from the
    same glob pattern, so every file is visited twice and every qualifying
    article appears twice in the result. This is kept as-is; confirm whether
    two different sub-directories were intended.
    """

    def leading_paragraph_text(text_node):
        # Concatenate paragraph texts, stopping at the first empty paragraph.
        pieces = []
        for paragraph in text_node.iter(tag='absatz'):
            if paragraph.text is None:
                break
            pieces.append(paragraph.text)
        return "".join(pieces)

    texts_online = glob(loc_corpus + '/*/*')
    texts_magazine = glob(loc_corpus + '/*/*')

    years_online = [
        hit
        for candidate in texts_online
        if candidate.endswith('.xml')
        for hit in glob(candidate)
    ]
    years_magazine = [
        hit
        for candidate in texts_magazine
        if candidate.endswith('.xml')
        for hit in glob(candidate)
    ]

    reader_online = XMLCorpusReader(loc_corpus, years_online)
    reader_magazine = XMLCorpusReader(loc_corpus, years_magazine)

    # Magazine files first, then online files.
    fileid_list = list(reader_magazine.fileids()) + list(reader_online.fileids())

    articles = []
    for fileid in fileid_list:
        tree = ET.parse(fileid, parser=ET.XMLParser(recover=True))
        for artikel in tree.iter(tag='artikel'):
            add_article = Article()

            for metadaten in artikel.iter(tag='metadaten'):
                for id_node in metadaten.iter(tag='artikel-id'):
                    add_article.id = id_node.text

            for inhalt in artikel.iter(tag='inhalt'):
                if inhalt.tag is None:
                    break
                for text_node in inhalt.iter(tag='text'):
                    if text_node.tag is None:
                        break
                    for titel_liste in text_node.iter(tag='titel-liste'):
                        for title in titel_liste.iter(tag='titel'):
                            add_article.title = title.text

                    article_text = leading_paragraph_text(text_node)
                    if len(article_text) > article_length:
                        add_article.content = article_text
                        articles.append(add_article)
    return articles
def train():
    """Build a trigram language model from the XML files under ``data/``.

    Each file is read with ``XMLCorpusReader``, its words are joined into one
    string and split into sentences, and every sentence is word-tokenized.
    The resulting token lists feed a 3-gram ``NgramModel`` that uses a
    Lidstone estimator with gamma 0.2.

    Returns:
        The trained ``NgramModel``.
    """
    import os  # local import: the file-level import block is not visible here

    print("Loading words from XML files...")
    sentences = []
    files = glob.glob("data/*.xml")
    for i, path in enumerate(files):
        if i > 0 and i % 500 == 0:
            print("%d/%d files loaded, #-sentences: %d" % (i, len(files), len(sentences)))
            # Loading deliberately stops at the first 500-file checkpoint,
            # so at most 500 files contribute to the model.
            break
        # os.path.split copes with any separator and nesting depth, unlike
        # splitting on "/" and unpacking exactly two parts.
        root, fileid = os.path.split(path)
        reader = XMLCorpusReader(root, fileid)
        sentences.extend(nltk.sent_tokenize(" ".join(reader.words())))

    words = [nltk.word_tokenize(sentence) for sentence in sentences]

    # Build a trigram language model with Lidstone smoothing (gamma = 0.2);
    # the estimator callback ignores the bin count it is handed.
    print("Building language model...")
    est = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    langModel = NgramModel(3, words, estimator=est)
    return langModel
def doc_path_to_dict(path):
    """Parse the XML document at ``path`` and run it through ``process_doc``.

    Args:
        path: Path of a single XML file; it is split into directory and file
            name to build the ``XMLCorpusReader``.

    Returns:
        Whatever ``process_doc`` returns for the parsed document. If
        ``process_doc`` raises ``ValueError``, the first argument of that
        exception is returned instead (or its string form when it carries
        no arguments).
    """
    directory, fname = os.path.split(path)
    reader = XMLCorpusReader(directory, fname)
    doc = reader.xml()
    try:
        return process_doc(doc)
    except ValueError as e:  # "as" form is valid on Python 2.6+ and Python 3
        # Avoid masking the original error with an IndexError on empty args.
        return e.args[0] if e.args else str(e)
def train():
    """Train a language model from ``data/*.xml`` and pickle it to ``lm.bin``.

    Does nothing when ``lm.bin`` already exists. Otherwise every XML file is
    read with ``XMLCorpusReader``, its words are joined and split into
    sentences, and the sentences are handed to ``LangModel(3, 0.4, ...)``.
    Progress is printed every 500 files.

    Returns:
        None. The trained model is written to ``lm.bin`` in the current
        working directory.
    """
    if os.path.isfile("lm.bin"):
        return

    files = glob.glob("data/*.xml")
    sentences = []
    for i, path in enumerate(files):
        if i > 0 and i % 500 == 0:
            print("%d/%d files loaded, #-sentences: %d" % (i, len(files), len(sentences)))
        # os.path.split copes with any separator and nesting depth, unlike
        # splitting on "/" and unpacking exactly two parts.
        root, fileid = os.path.split(path)
        reader = XMLCorpusReader(root, fileid)
        sentences.extend(nltk.sent_tokenize(" ".join(reader.words())))

    lm = LangModel(3, 0.4, sentences)
    # Close the output file deterministically so the pickle is fully flushed.
    with open("lm.bin", "wb") as out:
        cPickle.dump(lm, out)
def buildWordList(corpus_root):
    """Return the alphabetic, non-stopword tokens of every XML file in a folder.

    Only files directly inside ``corpus_root`` whose name ends in ``.xml`` are
    read. Tokens come straight from ``XMLCorpusReader.words``; a token is kept
    when it is purely alphabetic and is not in NLTK's English stopword list
    (the comparison is case-sensitive).

    Args:
        corpus_root: Directory containing the XML files.

    Returns:
        A flat list of the kept tokens, in file-listing order.
    """
    stop_words = set(stopwords.words('english'))
    # The fileids argument is a regular expression; match any name ending
    # in ".xml" rather than the four-character pattern '.xml'.
    xmlreader = XMLCorpusReader(corpus_root, r'.*\.xml')

    newTerms = []
    for file in os.listdir(corpus_root):
        if not file.endswith(".xml"):
            continue
        # Iterate the tokens themselves. Stringifying a list of Text objects
        # only yields their truncated "<Text: ...>" previews, which silently
        # discards almost every word of each document.
        for w in xmlreader.words(file):
            if w not in stop_words and w.isalpha():
                newTerms.append(w)
    return newTerms
# Script fragment: walks the XML files listed in `texts`, reads each
# document's desk metadata and byline, and counts "Editorial Desk" items.
# NOTE(review): the source was whitespace-collapsed and is cut off inside the
# final `try:` block; the indentation below is a reconstruction — confirm.

# Author names of interest, written in lower case.
authlist = [
    'bob herbert', 'david brooks', 'nicholas d. kristof',
    'thomas l. friedman', 'paul krugman', 'maureen dowd', 'frank rich',
    'verlyn klinkenborg', 'adam cohen', 'lawrence downes'
]
# The next names are not used in the visible portion of the script.
roottest = './nyt_corpus/data/2005/**/**/'
nottestmode = False
authord = defaultdict(list)
icount = 0  # incremented once per document whose desk is "Editorial Desk"
ncount = 0
acount = 0
for filename in texts:
    # One reader per file: directory as corpus root, base name as file id.
    reader = XMLCorpusReader(os.path.dirname(filename), os.path.basename(filename))
    xml = reader.xml()
    ptext = ""
    desk = ""
    body = xml.find('body')
    head = xml.find('head')
    # NOTE(review): raises AttributeError if <body> or <body.head> is
    # missing, because find() returns None in that case.
    auth = body.find('body.head').find('byline')
    # Look through the children of <head> for the one named "dsk" and keep
    # its content attribute as the desk name.
    for d in head:
        if d.get("name") == "dsk":
            desk = d.get("content")
    if desk == "Editorial Desk":
        icount += 1
        try:
            if auth is not None:
                # Replace the byline element with its text (may be None).
                auth = auth.text
                if auth is not None:
# How to use the Spanish Wordnet in NLTK? from nltk.corpus.reader import XMLCorpusReader reader = XMLCorpusReader(dir, file)
from cStringIO import StringIO

from nltk.corpus.reader import XMLCorpusReader

# Build the vocabulary of the English document collection and single out
# the vocabulary entries that contain an apostrophe.

# Paths of every document in the collection.
texts = glob('conjuntoDatos/ingles/us*')

terminos = []            # per-document scratch list; empty once the loop ends
terminosUnicos = []      # each document's sorted unique words, concatenated
terminosUnicos2 = []     # sorted vocabulary of the whole collection
terminosProhibidos = []  # vocabulary entries that contain an apostrophe

# Gather the unique words of each document.
for item_path in texts:
    destino = os.path.basename(item_path)
    reader = XMLCorpusReader('conjuntoDatos/ingles', destino)
    palabras = reader.words()
    palabrasUnicas = sorted(set(palabras))
    terminosUnicos.extend(palabrasUnicas)
    reader = None  # drop the reference to the reader between documents

# De-duplicate and sort once, after all documents have been read, instead of
# re-sorting the whole accumulated vocabulary on every iteration.
terminosUnicos2 = sorted(set(terminosUnicos))

a = "'"
terminosProhibidos.extend(x for x in terminosUnicos2 if a in x)
#Obtener todos los nombres de los documentos texts = glob('conjuntoDatos/espanol/us*') from nltk.corpus.reader import XMLCorpusReader #declaro el corpus de toda la coleccion terminos = [] #Lista de terminos terminosUnicos = [] #Terminos unicos terminosUnicos2 = [] #Terminos unicos terminosProhibidos = [] palabrasTotales = 0 #obtener el vocabulario de cada documnento for item_path in texts: destino = os.path.basename(item_path) reader = XMLCorpusReader('conjuntoDatos/espanol', destino) palabras = reader.words() #obtengo las palabras palabrasTotales = palabrasTotales + len(palabras) palabrasUnicas = sorted( set(palabras)) #obtengo las palabras sin repeticion terminos.extend(palabrasUnicas) terminosUnicos.extend(sorted( set(terminos))) #obtengo las palabras sin repeticion terminosUnicos2 = sorted(set(terminosUnicos)) #Terminos unicos reader = None #Hago null a el apuntador terminos = None terminos = [] a = "'" for x in terminosUnicos2: if a in x: