# Natural Language Toolkit: Europarl Corpus Readers
#
# Copyright (C) 2001-2009 NLTK Project
# Author: Nitin Madnani <*****@*****.**>
# URL: <http://www.nltk.org/>
# For license information, see LICENSE.TXT

import re
from util import LazyCorpusLoader
from reader import *

# One module-level loader per European language.  Every loader is given
# its own 'europarl_raw/<language>' name, the EuroparlCorpusReader class,
# an 'ep-' file pattern ending in the language's suffix, and
# encoding='utf-8'.
danish = LazyCorpusLoader(
    'europarl_raw/danish',
    EuroparlCorpusReader,
    r'ep-.*\.da',
    encoding='utf-8')

dutch = LazyCorpusLoader(
    'europarl_raw/dutch',
    EuroparlCorpusReader,
    r'ep-.*\.nl',
    encoding='utf-8')

english = LazyCorpusLoader(
    'europarl_raw/english',
    EuroparlCorpusReader,
    r'ep-.*\.en',
    encoding='utf-8')

finnish = LazyCorpusLoader(
    'europarl_raw/finnish',
    EuroparlCorpusReader,
    r'ep-.*\.fi',
    encoding='utf-8')
stored using U{Open Language Archives Community (OLAC) <http://www.language-archives.org/>} metadata records. These records can be accessed using C{nltk.corpus.I{corpus}.olac()}. """ import re from nltk.tokenize import RegexpTokenizer from nltk.tag import simplify_brown_tag, simplify_wsj_tag,\ simplify_alpino_tag, simplify_indian_tag,\ simplify_tag from util import LazyCorpusLoader from reader import * abc = LazyCorpusLoader('abc', PlaintextCorpusReader, r'(?!\.).*\.txt') alpino = LazyCorpusLoader('alpino', AlpinoCorpusReader, tag_mapping_function=simplify_alpino_tag) brown = LazyCorpusLoader('brown', CategorizedTaggedCorpusReader, r'c[a-z]\d\d', cat_file='cats.txt', tag_mapping_function=simplify_brown_tag) cess_cat = LazyCorpusLoader('cess_cat', BracketParseCorpusReader, r'(?!\.).*\.tbf', tag_mapping_function=simplify_tag) cess_esp = LazyCorpusLoader('cess_esp', BracketParseCorpusReader, r'(?!\.).*\.tbf',