Example #1
0
def load_corpus(corpus_name):
    """Load the saved corpus configured under ``corpus_name``.

    The path is read from the ``filename`` option of the ``corpus_name``
    section. Corpora flagged as compressed are loaded through
    ``corpus.Corpus``; all others through ``corpus.MaskedCorpus``.
    """
    path = bk._get_corpora().get(corpus_name, "filename")
    # Pick the loader class that matches how the corpus is flagged.
    loader = corpus.Corpus if _is_compressed(corpus_name) else corpus.MaskedCorpus
    return loader.load(path)
Example #2
0
def _is_compressed(corpus_name):
    """Return the boolean ``compressed`` option for ``corpus_name``.

    A corpus section without a ``compressed`` option is treated as not
    compressed (``False``).
    """
    config = bk._get_corpora()
    try:
        flag = config.getboolean(corpus_name, "compressed")
    except bk.cfg.NoOptionError:
        # Option absent from this section: default to uncompressed.
        flag = False
    return flag
Example #3
0
def _get_masking_fns(corpus_name):
    """Build the list of masking callables enabled for ``corpus_name``.

    Three boolean options are consulted in the corpus' config section, in
    this order:

    * ``freq1`` -- adds ``corpus.mask_f1``
    * ``nltk``  -- adds a stoplist mask built from ``load_stoplist("nltk")``
    * ``jones`` -- adds a stoplist mask built from ``load_stoplist("jones")``

    An option that is missing from the section is treated as disabled.

    Returns:
        A list of one-argument callables; each takes the corpus object to
        mask. The list is empty when no option is enabled.
    """
    corpora = bk._get_corpora()
    masking_fns = []

    try:
        if corpora.getboolean(corpus_name, "freq1"):
            masking_fns.append(corpus.mask_f1)
    except bk.cfg.NoOptionError:
        pass

    for stoplist_name in ("nltk", "jones"):
        try:
            if corpora.getboolean(corpus_name, stoplist_name):
                stop = load_stoplist(stoplist_name)
                # Bind ``stop`` as a default argument so each callable keeps
                # its own stoplist. A plain closure would look the variable
                # up at call time, so with both options enabled every mask
                # would end up using the last stoplist loaded.
                masking_fns.append(
                    lambda c, stop=stop: corpus.mask_from_stoplist(c, stop)
                )
        except bk.cfg.NoOptionError:
            pass

    return masking_fns
Example #4
0
def write_corpus(corpus_name):
    """Tokenize, mask and save the corpus configured under ``corpus_name``.

    Reads the ``plain_dir`` and ``filename`` options of the corpus section,
    tokenizes the plain-text source with ``util.MultipleArticleTokenizer``,
    wraps the result in a ``corpus.MaskedCorpus``, applies every masking
    function returned by ``_get_masking_fns``, optionally converts the
    result to a compressed corpus, and saves it to the configured filename.
    """
    config = bk._get_corpora()
    source_dir = config.get(corpus_name, "plain_dir")
    target_path = config.get(corpus_name, "filename")

    tokenizer = util.MultipleArticleTokenizer(source_dir)
    result = corpus.MaskedCorpus(
        corpus=tokenizer.words,
        tok_names=tokenizer.tok_names,
        tok_data=tokenizer.tok_data,
    )

    # Masks are applied for their effect on the corpus object; any return
    # value is ignored.
    for apply_mask in _get_masking_fns(corpus_name):
        apply_mask(result)

    if _is_compressed(corpus_name):
        result = result.to_corpus(compress=True)

    result.save(target_path)
Example #5
0
def corpus_names():
    """Return the section names of the corpora configuration."""
    return bk._get_corpora().sections()