def get_newsgroups():
    """Retrieve, filter, and export the 20 newsgroups dataset.

    Reads the deduplicated newsgroups corpus from the hard-coded data
    directory, applies English/newsgroup stopword filtering, collapses
    male names into a single '<name>' token, drops rare (<100) and
    common (>1500) words, then writes each processed document's tokens
    to a mirrored path under 'newsgroups-processed'.

    Returns:
        The processed ankura dataset (titles rewritten relative to the
        'newsgroups-dedup/' root).
    """
    datadir = '/local/jlund3/data/'
    news_glob = datadir + 'newsgroups-dedup/*/*'
    engl_stop = datadir + 'stopwords/english.txt'
    news_stop = datadir + 'stopwords/newsgroups.txt'
    name_stop = datadir + 'stopwords/malenames.txt'

    dataset = ankura.read_glob(news_glob,
                               tokenizer=ankura.tokenize.news,
                               labeler=ankura.label.title_dirname)
    dataset = ankura.filter_stopwords(dataset, engl_stop)
    dataset = ankura.filter_stopwords(dataset, news_stop)
    dataset = ankura.combine_words(dataset, name_stop, '<name>')
    dataset = ankura.filter_rarewords(dataset, 100)
    dataset = ankura.filter_commonwords(dataset, 1500)

    # Hoist the loop-invariant prefix length out of the loop.
    prefix_len = len(datadir + 'newsgroups-dedup/')
    for doc, title in enumerate(dataset.titles):
        # Rewrite the title relative to the corpus root, in place.
        title = title[prefix_len:]
        dataset.titles[doc] = title
        outpath = os.path.join(datadir, 'newsgroups-processed', title)
        # exist_ok=True replaces the original try/except FileExistsError.
        os.makedirs(os.path.dirname(outpath), exist_ok=True)
        with open(outpath, 'w') as outfile:
            tokens = [dataset.vocab[v] for v in dataset.doc_tokens(doc)]
            print(' '.join(tokens), file=outfile)

    return dataset
def get_newsgroups():
    """Retrieves the 20 newsgroups dataset.

    Loads the deduplicated newsgroups corpus, strips English and
    newsgroup-specific stopwords, merges male names into a single
    '<name>' token, and prunes rare and overly common words before
    returning the resulting dataset.
    """
    root = '/local/jlund3/data/'
    corpus_glob = root + 'newsgroups-dedup/*/*'
    stop_english = root + 'stopwords/english.txt'
    stop_news = root + 'stopwords/newsgroups.txt'
    stop_names = root + 'stopwords/malenames.txt'

    corpus = ankura.read_glob(corpus_glob,
                              tokenizer=ankura.tokenize.news,
                              labeler=ankura.label.title_dirname)

    # Stopword filtering happens in the same order as listed here.
    for stoplist in (stop_english, stop_news):
        corpus = ankura.filter_stopwords(corpus, stoplist)
    corpus = ankura.combine_words(corpus, stop_names, '<name>')

    # Vocabulary pruning: drop words in fewer than 100 or more than
    # 1500 documents.
    corpus = ankura.filter_rarewords(corpus, 100)
    corpus = ankura.filter_commonwords(corpus, 1500)
    return corpus
def get_newsgroups():
    """Retrieves the 20 newsgroups dataset.

    Builds the corpus from args.data_prefix, filters English,
    newsgroup, and profanity stopwords (names and profanity are
    collapsed into placeholder tokens), prunes rare and common words,
    and tags display candidates before returning the dataset.
    """
    prefix = args.data_prefix
    corpus_glob = os.path.join(prefix, 'newsgroups/*/*')
    stop_english = os.path.join(prefix, 'stopwords/english.txt')
    stop_news = os.path.join(prefix, 'stopwords/newsgroups.txt')
    stop_names = os.path.join(prefix, 'stopwords/malenames.txt')
    stop_curses = os.path.join(prefix, 'stopwords/profanity.txt')

    # Label each document with its formatted text for display.
    news_text = functools.partial(label.text, formatter=label.news_formatter)
    corpus = ankura.read_glob(corpus_glob,
                              tokenizer=ankura.tokenize.news,
                              labeler=news_text)

    for stoplist in (stop_english, stop_news):
        corpus = ankura.filter_stopwords(corpus, stoplist)

    # Collapse sensitive word classes into single placeholder tokens.
    corpus = ankura.combine_words(corpus, stop_names, '<name>')
    corpus = ankura.combine_words(corpus, stop_curses, '<profanity>')

    # Vocabulary pruning: drop words in fewer than 100 or more than
    # 1500 documents.
    corpus = ankura.filter_rarewords(corpus, 100)
    corpus = ankura.filter_commonwords(corpus, 1500)

    corpus = find_display_candidates(corpus, stop_curses)
    return corpus