def imdb(dbcollection):
    """Feed the IMDB review archive to ``extract_documents``.

    Calls ``extract_documents`` once per archive listed below, passing the
    target ``dbcollection``, a compiled pattern for archive member names and
    an empty list of extra (regex, replacement) pairs.

    Args:
        dbcollection: Forwarded unchanged to ``extract_documents``.
    """
    fnames = ['aclImdb_v1.tar.gz']
    # Member paths of the form
    #   aclImdb/<train|test>/<pos|neg|unsup>/<digits>_<digits>.txt
    # The dot before "txt" is escaped so that only a literal ".txt" suffix
    # matches; an unescaped "." would accept any character in that position.
    regex_fname = re.compile(
        r'aclImdb/(train|test)/(pos|neg|unsup)/\d+_\d+\.txt$')
    # No text substitutions are requested for this corpus.
    extrax_regexes = []
    for fname in fnames:
        extract_documents(dbcollection=dbcollection,
                          regex_fname=regex_fname,
                          fname=fname,
                          extrax_regexes=extrax_regexes)
def enwik8(dbcollection):
    """Feed the enwik8 archive to ``extract_documents``.

    Passes a compiled pattern for archive member names (``enwik8/wiki_NN``)
    and a single (regex, replacement) pair that maps the listed markup
    fragments to one space.

    Args:
        dbcollection: Forwarded unchanged to ``extract_documents``.
    """
    member_pattern = re.compile(r'enwik8\/wiki_\d\d')
    # Case-insensitive match for <doc ...>, </doc>, <br...>, [[...]] and <pre>.
    markup_pattern = re.compile(
        r"<doc\s.*>|<\/doc>|<br.*>|\[\[.*\]\]|<pre>", re.IGNORECASE)
    substitutions = [(markup_pattern, " ")]

    for archive_name in ('enwik8_MattMahoney.tar.gz',):
        extract_documents(
            fname=archive_name,
            dbcollection=dbcollection,
            regex_fname=member_pattern,
            extrax_regexes=substitutions)
def blog(dbcollection):
    """Feed the blogger.com archive to ``extract_documents``.

    Passes a compiled pattern for archive member names and a single
    (regex, replacement) pair that maps the listed XML tags to one space.

    Args:
        dbcollection: Forwarded unchanged to ``extract_documents``.
    """
    # Member names look like blogs/<digits>.<male|female>.<NN>.<...>.<word>.xml
    # where NN is two characters: the first in 2-9, the second in 4-9.
    # NOTE(review): NN is presumably the author's age; the character classes
    # skip values such as 30-33 and 40-43 -- confirm that is intended.
    member_pattern = re.compile(
        r'blogs\/(\d+)\.(male|female)\.[2-9][4-9]\.(.+)\.(\w+)\.xml')
    # Case-insensitive match for <date>...</date> and the blog/post tags.
    tag_pattern = re.compile(
        r"<date>.*</date>|<blog>|<\/blog>|<post>|<\/post>", re.IGNORECASE)
    substitutions = [(tag_pattern, " ")]

    for archive_name in ('bloggercom_mkopper.tar.gz',):
        extract_documents(
            dbcollection=dbcollection,
            regex_fname=member_pattern,
            fname=archive_name,
            extrax_regexes=substitutions)
def usenet(dbcollection):
    """Feed the Usenet archive to ``extract_documents`` via ``UsenetCorpus``.

    Builds a ``UsenetCorpus`` from a fixed archive path and passes it as the
    ``corpus`` keyword, with ``dbcollection`` as the first positional argument.

    Args:
        dbcollection: Forwarded unchanged to ``extract_documents``.
    """
    archive_path = "/archives/corpus/usenet_westburylab.splits.tar.gz"
    extract_documents(dbcollection, corpus=UsenetCorpus(archive_path))