def train():
    """Build a trigram language model from the XML files under ``data/``.

    Each ``data/*.xml`` file is read with ``XMLCorpusReader``; its words are
    joined with spaces, split into sentences, and every sentence is then
    word-tokenized.  The token lists are used to train a 3-gram
    ``NgramModel`` whose estimator is ``LidstoneProbDist`` with gamma 0.2.

    Loading stops as soon as 500 files have been read (a progress line is
    printed at that point), so at most 500 files feed the model.

    Returns:
        The trained ``NgramModel`` instance.
    """
    # Local import so the block does not rely on a module-level ``import os``.
    import os

    # Parse XML and load up words.
    print("Loading words from XML files...")
    sentences = []
    files = glob.glob("data/*.xml")
    for loaded, path in enumerate(files):
        if loaded > 0 and loaded % 500 == 0:
            print("%d/%d files loaded, #-sentences: %d"
                  % (loaded, len(files), len(sentences)))
            break
        # os.path.split copes with any separator / nesting depth, whereas
        # splitting on "/" fails to unpack unless there is exactly one "/".
        root, fileid = os.path.split(path)
        reader = XMLCorpusReader(root, fileid)
        sentences.extend(nltk.sent_tokenize(" ".join(reader.words())))

    words = [nltk.word_tokenize(sentence) for sentence in sentences]

    # Build a trigram language model with Lidstone smoothing (gamma = 0.2)
    # from the tokenized sentences.
    print("Building language model...")
    est = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    langModel = NgramModel(3, words, estimator=est)
    return langModel
def doc_path_to_dict(path):
    """Parse the XML document at *path* and convert it with ``process_doc``.

    Args:
        path: Filesystem path of a single XML document.

    Returns:
        Whatever ``process_doc`` returns for the parsed document.  If
        ``process_doc`` raises ``ValueError``, the first argument of that
        exception is returned instead of propagating the error.
    """
    directory, fname = os.path.split(path)
    reader = XMLCorpusReader(directory, fname)
    doc = reader.xml()
    try:
        return process_doc(doc)
    except ValueError as e:  # "as" form is valid on Python 2.6+ and 3
        return e.args[0]
def __init__(self, root, fileids):
    """Initialise the reader and its lazily-populated lookup state.

    Args:
        root: Corpus root, forwarded to ``XMLCorpusReader.__init__``.
        fileids: File identifiers, forwarded to ``XMLCorpusReader.__init__``.
    """
    XMLCorpusReader.__init__(self, root, fileids)

    # Corpus sub-directories: frame XML files, lexical-unit XML files and
    # full-text annotation XML files, respectively.
    self._frame_dir, self._lu_dir, self._fulltext_dir = "frame", "lu", "fulltext"

    # Indexes used for faster look-ups; they start out unset.
    self._frame_idx = self._lu_idx = self._fulltext_idx = None
    self._semtypes = None
def train():
    """Train a language model on ``data/*.xml`` and pickle it to ``lm.bin``.

    Does nothing (returns ``None`` immediately) when ``lm.bin`` already
    exists.  Otherwise every XML file is read with ``XMLCorpusReader``, its
    words are joined and split into sentences, and the sentences are handed
    to ``LangModel(3, 0.4, sentences)``.  A progress line is printed every
    500 files.
    """
    if os.path.isfile("lm.bin"):
        return

    files = glob.glob("data/*.xml")
    sentences = []
    for loaded, path in enumerate(files):
        if loaded > 0 and loaded % 500 == 0:
            print("%d/%d files loaded, #-sentences: %d"
                  % (loaded, len(files), len(sentences)))
        # os.path.split copes with any separator / nesting depth, whereas
        # splitting on "/" fails to unpack unless there is exactly one "/".
        root, fileid = os.path.split(path)
        reader = XMLCorpusReader(root, fileid)
        sentences.extend(nltk.sent_tokenize(" ".join(reader.words())))

    lm = LangModel(3, 0.4, sentences)
    # Context manager guarantees the pickle file is flushed and closed.
    with open("lm.bin", "wb") as out:
        cPickle.dump(lm, out)
def Get_text(corpus_root = '/release/', file_IDs = '.*'):
    """Return the lower-cased alphabetic tokens of an XML corpus.

    Args:
        corpus_root: Root directory handed to ``XMLCorpusReader``.
        file_IDs: File-id pattern handed to ``XMLCorpusReader``.

    Returns:
        A list of lower-cased tokens that consist only of alphabetic
        characters, with every occurrence of the token ``'source'`` removed.
    """
    wordlists = XMLCorpusReader(corpus_root, file_IDs)
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print("processing " + corpus_root + file_IDs)

    raw = wordlists.raw()
    tokens = nltk.word_tokenize(raw)
    text = nltk.Text(tokens)

    # Lower-case the alphabetic tokens lazily, then drop the keyword
    # 'source' in the same pass.
    lowered = (word.lower() for word in text if word.isalpha())
    simple_md = [word for word in lowered if word != 'source']

    print("DONE!")
    return simple_md
def buildWordList(corpus_root):
    """Collect the alphabetic, non-stop-word tokens of every XML file.

    Args:
        corpus_root: Directory containing the ``.xml`` corpus files.

    Returns:
        A list of tokens from all ``.xml`` files directly inside
        *corpus_root*, keeping only tokens that are purely alphabetic and
        not in NLTK's English stop-word list.
    """
    stop_words = set(stopwords.words('english'))
    # Proper regular expression for "any file ending in .xml".
    xmlreader = XMLCorpusReader(corpus_root, r'.*\.xml')

    newTerms = []
    for fname in os.listdir(corpus_root):
        if not fname.endswith(".xml"):
            continue
        # Filter the reader's tokens directly.  Stringifying a list of
        # Text objects and re-tokenizing it would only see each Text's
        # repr (a short preview), not the document's words.
        for w in xmlreader.words(fname):
            if w not in stop_words and w.isalpha():
                newTerms.append(w)
    return newTerms
def read_articles(loc_corpus, article_length):
    """Read ``artikel`` elements from the XML files below *loc_corpus*.

    Args:
        loc_corpus: Corpus root; XML files are looked up two directory
            levels below it (``loc_corpus/*/*``).
        article_length: Minimum text length; only texts strictly longer
            than this are kept.

    Returns:
        A list of ``Article`` objects with ``id``, ``title`` and
        ``content`` filled in from the ``artikel-id``, ``titel`` and
        ``absatz`` elements.
    """
    # NOTE(review): both globs use the same pattern, so every file is listed
    # (and parsed) twice -- confirm whether two different locations were
    # intended.
    texts_online = glob(loc_corpus + '/*/*')
    texts_magazine = glob(loc_corpus + '/*/*')

    years_online = []
    for path in texts_online:
        if path.endswith('.xml'):
            years_online += glob(path)
    years_magazine = []
    for path in texts_magazine:
        if path.endswith('.xml'):
            years_magazine += glob(path)

    reader_online = XMLCorpusReader(loc_corpus, years_online)
    reader_magazine = XMLCorpusReader(loc_corpus, years_magazine)

    # Magazine files first, then online files.
    fileid_list = list(reader_magazine.fileids())
    fileid_list.extend(reader_online.fileids())

    articles = []
    for fileid in fileid_list:
        parser = ET.XMLParser(recover=True)
        tree = ET.parse(fileid, parser=parser)
        for elem in tree.iter(tag='artikel'):
            add_article = Article()
            for metadaten in elem.iter(tag='metadaten'):
                for article_id in metadaten.iter(tag='artikel-id'):
                    add_article.id = article_id.text
            for inhalt in elem.iter(tag='inhalt'):
                for child in inhalt.iter(tag='text'):
                    for titel_liste in child.iter(tag='titel-liste'):
                        for title in titel_liste.iter(tag='titel'):
                            add_article.title = title.text
                    # Concatenate paragraph texts; collection stops at the
                    # first paragraph without text.
                    # NOTE(review): stopping (rather than skipping) truncates
                    # the article -- confirm this is intended.
                    paragraphs = []
                    for absatz in child.iter(tag='absatz'):
                        if absatz.text is None:
                            break
                        paragraphs.append(absatz.text)
                    article_text = "".join(paragraphs)
                    if len(article_text) > article_length:
                        add_article.content = article_text
                        articles.append(add_article)
    return articles
authlist = [ 'bob herbert', 'david brooks', 'nicholas d. kristof', 'thomas l. friedman', 'paul krugman', 'maureen dowd', 'frank rich', 'verlyn klinkenborg', 'adam cohen', 'lawrence downes' ] roottest = './nyt_corpus/data/2005/**/**/' nottestmode = False authord = defaultdict(list) icount = 0 ncount = 0 acount = 0 for filename in texts: reader = XMLCorpusReader(os.path.dirname(filename), os.path.basename(filename)) xml = reader.xml() ptext = "" desk = "" body = xml.find('body') head = xml.find('head') auth = body.find('body.head').find('byline') for d in head: if d.get("name") == "dsk": desk = d.get("content") if desk == "Editorial Desk": icount += 1 try: if auth is not None: auth = auth.text if auth is not None:
# How to use the Spanish Wordnet in NLTK? from nltk.corpus.reader import XMLCorpusReader reader = XMLCorpusReader(dir, file)
from cStringIO import StringIO

# Collect the paths of every document in the collection.
texts = glob('conjuntoDatos/ingles/us*')

from nltk.corpus.reader import XMLCorpusReader

# Vocabulary accumulators for the whole collection.
terminos = []            # scratch list of terms (left empty after loading)
terminosUnicos = []      # unique terms of each document, in document order
terminosUnicos2 = []     # unique terms across all documents, sorted
terminosProhibidos = []  # terms that contain an apostrophe

# Gather the vocabulary of each document.
for item_path in texts:
    destino = os.path.basename(item_path)
    reader = XMLCorpusReader('conjuntoDatos/ingles', destino)
    palabras = reader.words()               # all words of the document
    palabrasUnicas = sorted(set(palabras))  # the same words, de-duplicated
    terminosUnicos.extend(palabrasUnicas)
    reader = None                           # release the reader

# De-duplicate across documents once, after everything has been loaded,
# instead of re-sorting the whole growing list for every document.
terminosUnicos2 = sorted(set(terminosUnicos))

# Flag every term that contains an apostrophe.
a = "'"
for x in terminosUnicos2:
    if a in x:
        terminosProhibidos.append(x)
def words(self, fileids=None, categories=None):
    """Return the words of the selected files as one flat list.

    Args:
        fileids: File ids to read, passed to ``self._resolve``.
        categories: Categories to read, passed to ``self._resolve``.

    Returns:
        A list holding the words of every resolved file, in the order the
        files were resolved.
    """
    collected = []
    for resolved_id in self._resolve(fileids, categories):
        # XMLCorpusReader.words is called with one file id at a time.
        collected.extend(XMLCorpusReader.words(self, resolved_id))
    return collected
def raw(self, fileids=None, categories=None):
    """Return the raw content of the selected files.

    Args:
        fileids: File ids to read, passed to ``self._resolve``.
        categories: Categories to read, passed to ``self._resolve``.

    Returns:
        The result of ``XMLCorpusReader.raw`` for the resolved file ids.
    """
    resolved = self._resolve(fileids, categories)
    return XMLCorpusReader.raw(self, resolved)
def __init__(self, *args, **kwargs):
    """Initialise both base classes of this reader.

    Positional and keyword arguments are forwarded to
    ``XMLCorpusReader.__init__``.
    """
    # The keyword dict is passed as a single positional argument (not
    # unpacked) to the categorized base class, before the XML reader is
    # initialised.
    # NOTE(review): presumably the categorized base removes the category
    # options it understands from this dict so they are not forwarded to
    # XMLCorpusReader -- confirm against MyCategorizedCorpusReader.
    MyCategorizedCorpusReader.__init__(self, kwargs)
    XMLCorpusReader.__init__(self, *args, **kwargs)