Example #1
import os

import ankura


def get_newsgroups():
    """Retrieves the 20 newsgroups dataset"""
    datadir = '/local/jlund3/data/'

    news_glob = datadir + 'newsgroups-dedup/*/*'
    engl_stop = datadir + 'stopwords/english.txt'
    news_stop = datadir + 'stopwords/newsgroups.txt'
    name_stop = datadir + 'stopwords/malenames.txt'

    # Read the corpus, labeling each document by its newsgroup directory name
    dataset = ankura.read_glob(news_glob,
                               tokenizer=ankura.tokenize.news,
                               labeler=ankura.label.title_dirname)
    # Remove generic English and newsgroup-specific stopwords
    dataset = ankura.filter_stopwords(dataset, engl_stop)
    dataset = ankura.filter_stopwords(dataset, news_stop)
    # Collapse the listed male names into the single token '<name>'
    dataset = ankura.combine_words(dataset, name_stop, '<name>')
    # Prune words that are too rare or too common to be informative
    dataset = ankura.filter_rarewords(dataset, 100)
    dataset = ankura.filter_commonwords(dataset, 1500)

    # Rewrite each title relative to the data directory and write the
    # processed token stream to a parallel 'newsgroups-processed' tree
    for doc, title in enumerate(dataset.titles):
        title = title[len(datadir + 'newsgroups-dedup/'):]
        dataset.titles[doc] = title
        outpath = os.path.join(datadir, 'newsgroups-processed', title)
        os.makedirs(os.path.dirname(outpath), exist_ok=True)

        with open(outpath, 'w') as outfile:
            tokens = [dataset.vocab[v] for v in dataset.doc_tokens(doc)]
            print(' '.join(tokens), file=outfile)

    return dataset
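
The function above both filters the corpus and writes each processed document back to disk. A minimal usage sketch, assuming ankura is installed and the hard-coded datadir paths exist; it relies only on the dataset.titles, dataset.vocab, and dataset.doc_tokens attributes already used above:

dataset = get_newsgroups()

# Basic corpus statistics from the attributes the function itself uses
print('documents:', len(dataset.titles))
print('vocabulary size:', len(dataset.vocab))

# Reconstruct the token stream of the first document, as the write loop does
print(' '.join(dataset.vocab[v] for v in dataset.doc_tokens(0)))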
Example #2
import os

import ankura


def get_amazon():
    """Retrieves the Amazon dataset"""
    # args, label, and find_display_candidates are assumed to be defined
    # elsewhere at module level in the source file
    text_path = os.path.join(args.data_prefix, 'amazon', 'amazon.txt')
    engl_stop = os.path.join(args.data_prefix, 'stopwords/english.txt')
    curse_stop = os.path.join(args.data_prefix, 'stopwords/profanity.txt')

    # Read the corpus (label.text is assumed to label documents with their text)
    dataset = ankura.read_file(text_path, labeler=label.text)
    # Remove English stopwords and collapse profanity into one placeholder
    dataset = ankura.filter_stopwords(dataset, engl_stop)
    dataset = ankura.combine_words(dataset, curse_stop, '<profanity>')
    # Prune words that are too rare or too common to be informative
    dataset = ankura.filter_rarewords(dataset, 150)
    dataset = ankura.filter_commonwords(dataset, 3000)
    dataset = find_display_candidates(dataset, curse_stop)

    return dataset
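
Here combine_words folds every word in the profanity list into the single placeholder token '<profanity>' before the frequency filters run. A quick check of that effect, assuming the same module-level args and the dataset attributes used in Example #1:

dataset = get_amazon()

# The placeholder should appear in the vocabulary in place of the individual
# profanity entries (unless the frequency filters later removed it)
print('<profanity>' in dataset.vocab)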
Example #3
import ankura


def get_newsgroups():
    """Retrieves the 20 newsgroups dataset"""
    datadir = '/local/jlund3/data/'

    news_glob = datadir + 'newsgroups-dedup/*/*'
    engl_stop = datadir + 'stopwords/english.txt'
    news_stop = datadir + 'stopwords/newsgroups.txt'
    name_stop = datadir + 'stopwords/malenames.txt'

    # Same read-and-filter pipeline as Example #1, without the write-back loop
    dataset = ankura.read_glob(news_glob,
                               tokenizer=ankura.tokenize.news,
                               labeler=ankura.label.title_dirname)
    dataset = ankura.filter_stopwords(dataset, engl_stop)
    dataset = ankura.filter_stopwords(dataset, news_stop)
    dataset = ankura.combine_words(dataset, name_stop, '<name>')
    dataset = ankura.filter_rarewords(dataset, 100)
    dataset = ankura.filter_commonwords(dataset, 1500)

    return dataset
Example #4
import functools
import os

import ankura


def get_newsgroups():
    """Retrieves the 20 newsgroups dataset"""
    # args, label, and find_display_candidates are assumed to be defined
    # elsewhere at module level in the source file
    news_glob = os.path.join(args.data_prefix, 'newsgroups/*/*')
    engl_stop = os.path.join(args.data_prefix, 'stopwords/english.txt')
    news_stop = os.path.join(args.data_prefix, 'stopwords/newsgroups.txt')
    name_stop = os.path.join(args.data_prefix, 'stopwords/malenames.txt')
    curse_stop = os.path.join(args.data_prefix, 'stopwords/profanity.txt')
    news_text = functools.partial(label.text, formatter=label.news_formatter)

    dataset = ankura.read_glob(news_glob,
                               tokenizer=ankura.tokenize.news,
                               labeler=news_text)
    dataset = ankura.filter_stopwords(dataset, engl_stop)
    dataset = ankura.filter_stopwords(dataset, news_stop)
    # Collapse male names and profanity into single placeholder tokens
    dataset = ankura.combine_words(dataset, name_stop, '<name>')
    dataset = ankura.combine_words(dataset, curse_stop, '<profanity>')
    dataset = ankura.filter_rarewords(dataset, 100)
    dataset = ankura.filter_commonwords(dataset, 1500)
    dataset = find_display_candidates(dataset, curse_stop)

    return dataset
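
Examples #2 and #4 read their paths from a module-level args object that the snippets do not define. A hypothetical sketch of how such an object might be built with argparse; only the data_prefix attribute used above is assumed:

import argparse

parser = argparse.ArgumentParser(description='Import and preprocess ankura datasets')
parser.add_argument('data_prefix', help='root directory containing the corpora and stopword lists')
args = parser.parse_args()

dataset = get_newsgroups()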