Example #1
0
def imdb(dbcollection):
    """Run ``extract_documents`` over the aclImdb review archive.

    The pattern handed over as ``regex_fname`` matches paths of the form
    ``aclImdb/<train|test>/<pos|neg|unsup>/<digits>_<digits>.txt``.
    An empty list of replacement rules is passed as ``extrax_regexes``.

    Args:
        dbcollection: Passed through unchanged to ``extract_documents``
            (presumably the destination collection -- confirm against
            that helper).
    """
    fnames = ['aclImdb_v1.tar.gz']

    # The dot before "txt" is escaped so it matches a literal "." only;
    # left unescaped it would accept any character in that position.
    regex_fname = re.compile(
        r'aclImdb/(train|test)/(pos|neg|unsup)/\d+_\d+\.txt$')
    extrax_regexes = []  # no (pattern, replacement) rules for this corpus

    for fname in fnames:
        extract_documents(dbcollection=dbcollection,
                          regex_fname=regex_fname,
                          fname=fname,
                          extrax_regexes=extrax_regexes)
Example #2
0
def enwik8(dbcollection):
    """Run ``extract_documents`` over the enwik8 archive.

    Two patterns are prepared: one matching ``enwik8/wiki_NN`` paths,
    passed as ``regex_fname``, and a case-insensitive markup pattern
    (``<doc ...>``, ``</doc>``, ``<br...>``, ``[[...]]``, ``<pre>``) paired
    with a single space as its replacement, passed in ``extrax_regexes``.

    Args:
        dbcollection: Passed through unchanged to ``extract_documents``.
    """
    wiki_path_pattern = re.compile(r'enwik8\/wiki_\d\d')
    # NOTE(review): the ``.*`` parts are greedy, so on a line holding
    # several tags/links one match can span from the first to the last.
    # Confirm that is acceptable for this corpus.
    markup_pattern = re.compile(
        r"<doc\s.*>|<\/doc>|<br.*>|\[\[.*\]\]|<pre>", re.IGNORECASE)
    substitutions = [(markup_pattern, " ")]

    for archive_name in ['enwik8_MattMahoney.tar.gz']:
        extract_documents(
            fname=archive_name,
            dbcollection=dbcollection,
            regex_fname=wiki_path_pattern,
            extrax_regexes=substitutions,
        )
Example #3
0
def blog(dbcollection):
    """Run ``extract_documents`` over the blogger.com archive.

    The ``regex_fname`` pattern matches paths shaped like
    ``blogs/<digits>.<male|female>.<NN>.<anything>.<word>.xml``. A
    case-insensitive pattern covering ``<date>...</date>`` and the
    ``<blog>``/``<post>`` open and close tags is paired with a single
    space as its replacement and passed in ``extrax_regexes``.

    Args:
        dbcollection: Passed through unchanged to ``extract_documents``.
    """
    # NOTE(review): the two-character field ``[2-9][4-9]`` accepts e.g.
    # 24-29 and 34-39 but rejects 30-33 -- confirm this is the intended
    # range.
    blog_path_pattern = re.compile(
        r'blogs\/(\d+)\.(male|female)\.[2-9][4-9]\.(.+)\.(\w+)\.xml')
    tag_pattern = re.compile(
        r"<date>.*</date>|<blog>|<\/blog>|<post>|<\/post>", re.IGNORECASE)
    substitutions = [(tag_pattern, " ")]

    for archive_name in ['bloggercom_mkopper.tar.gz']:
        extract_documents(
            dbcollection=dbcollection,
            regex_fname=blog_path_pattern,
            fname=archive_name,
            extrax_regexes=substitutions,
        )
Example #4
0
def usenet(dbcollection):
    """Run ``extract_documents`` over a ``UsenetCorpus``.

    A ``UsenetCorpus`` is built from a fixed absolute archive path and
    handed to ``extract_documents`` via the ``corpus`` keyword, with
    ``dbcollection`` as the first positional argument.

    Args:
        dbcollection: Passed through unchanged to ``extract_documents``.
    """
    archive_path = "/archives/corpus/usenet_westburylab.splits.tar.gz"
    usenet_corpus = UsenetCorpus(archive_path)
    extract_documents(dbcollection, corpus=usenet_corpus)