def iterator(dataset_fn): """ Provides an iterator of parsed documents from the 20 Newsgroups dataset. :param dataset_fn: Path to Newsgroups dataset archive file. :type dataset_fn: unicode|str :rtype : generator """ ng = datasets.fetch_20newsgroups() for article, group, target, filename in zip( ng['data'], [ng['target_names'][x] for x in ng['target']], ng['target'], ng['filenames']): article = twenty_newsgroups.strip_newsgroup_header(article) article = twenty_newsgroups.strip_newsgroup_footer(article) article = twenty_newsgroups.strip_newsgroup_quoting(article) doc_id = os.path.basename(filename) yield { 'doc_id': doc_id, 'article': article, 'group': group, 'target': target, 'filename': filename }
def iterator(dataset_fn): """ Provides an iterator of parsed documents from the 20 Newsgroups dataset. :param dataset_fn: Path to Newsgroups dataset archive file. :type dataset_fn: unicode|str :rtype : generator """ ng = datasets.fetch_20newsgroups() for article, group, target, filename in zip(ng['data'], [ng['target_names'][x] for x in ng['target']], ng['target'], ng['filenames']): article = twenty_newsgroups.strip_newsgroup_header(article) article = twenty_newsgroups.strip_newsgroup_footer(article) article = twenty_newsgroups.strip_newsgroup_quoting(article) doc_id = os.path.basename(filename) yield {'doc_id': doc_id, 'article': article, 'group': group, 'target': target, 'filename': filename}
import pickle
import tarfile
from os.path import join, exists

from sklearn.datasets import load_files
from sklearn.datasets.twenty_newsgroups import (
    strip_newsgroup_header, strip_newsgroup_footer, strip_newsgroup_quoting)

# download_file, create_if_not_exist and _index_by_label are project-local helpers
# assumed to be importable from the surrounding package.


def fetch_sraa(DOWNLOAD_URL='http://people.cs.umass.edu/~mccallum/data/sraa.tar.gz',
               dataset_home='../datasets/SRAA',
               dopickle=False):
    """
    Fetches a version of the SRAA (Simulated/Real Aviation/Auto) dataset for
    cross-domain adaptation, as defined in:
    Dai, W., Xue, G. R., Yang, Q., & Yu, Y. (2007, August). Co-clustering based
    classification for out-of-domain documents. In Proceedings of the 13th ACM SIGKDD
    international conference on Knowledge discovery and data mining (pp. 210-219). ACM.
    """
    picklepath = join(dataset_home, 'sraa.pkl')
    if dopickle and exists(picklepath):
        print('...loading pickle from {}'.format(picklepath))
        return pickle.load(open(picklepath, 'rb'))

    dataset_path = join(dataset_home, 'sraa.tar.gz')
    if not exists(dataset_path):
        create_if_not_exist(dataset_home)
        print("downloading SRAA dataset (once and for all) into %s" % dataset_path)
        download_file(DOWNLOAD_URL, dataset_path)
        print("untarring dataset...")
        tarfile.open(dataset_path, 'r:gz').extractall(dataset_home)

    sraa = load_files(join(dataset_home, 'sraa'), encoding='latin1')

    remove = ('headers', 'footers')  # , 'quotes')
    if 'headers' in remove:
        sraa.data = [strip_newsgroup_header(text) for text in sraa.data]
    if 'footers' in remove:
        sraa.data = [strip_newsgroup_footer(text) for text in sraa.data]
    if 'quotes' in remove:
        sraa.data = [strip_newsgroup_quoting(text) for text in sraa.data]

    sraa = _index_by_label(sraa, min_words=10)

    if dopickle:
        print('...pickling the dataset into {} to speed-up next calls'.format(picklepath))
        pickle.dump(sraa, open(picklepath, 'wb'), pickle.HIGHEST_PROTOCOL)

    return sraa
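# A minimal usage sketch, assuming the project-local helpers noted above are importable;
# the first call downloads and caches the archive, later calls reuse the pickle.
if __name__ == '__main__':
    sraa = fetch_sraa(dopickle=True)
    print('SRAA loaded:', type(sraa))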
from sklearn.datasets.twenty_newsgroups import (
    strip_newsgroup_header, strip_newsgroup_quoting, strip_newsgroup_footer)


def preprocess_input(text):
    # Remove the header block, quoted replies and the signature footer.
    text = strip_newsgroup_header(text)
    text = strip_newsgroup_quoting(text)
    text = strip_newsgroup_footer(text)
    return text
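# A small, hypothetical usage example (the sample posting below is made up):
sample_post = "From: someone@example.com\nSubject: test\n\n> quoted reply\nactual body\n-- \nsignature"
print(preprocess_input(sample_post))  # prints only the body text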
import matplotlib.pyplot as plt
import sagemaker
import seaborn as sns
from sklearn.datasets import fetch_20newsgroups
from sklearn.datasets.twenty_newsgroups import strip_newsgroup_header, strip_newsgroup_quoting, strip_newsgroup_footer

newsgroups_train = fetch_20newsgroups(subset='train')['data']
newsgroups_test = fetch_20newsgroups(subset='test')['data']

NUM_TOPICS = 30
NUM_NEIGHBORS = 10
BUCKET = 'sagemaker-sbn'
PREFIX = '20newsgroups'


# In[ ]:


for i in range(len(newsgroups_train)):
    newsgroups_train[i] = strip_newsgroup_header(newsgroups_train[i])
    newsgroups_train[i] = strip_newsgroup_quoting(newsgroups_train[i])
    newsgroups_train[i] = strip_newsgroup_footer(newsgroups_train[i])


# In[ ]:


newsgroups_train[1]


# In[ ]:


get_ipython().system('pip install nltk')
import nltk
nltk.download('punkt')
nltk.download('wordnet')
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
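# In[ ]:


# A minimal sketch (not part of the original notebook) of how the NLTK imports above
# could be applied to the stripped training articles; the alphabetic-token filter and
# the 100-document slice are assumptions made for brevity.
lemmatizer = WordNetLemmatizer()

def lemmatize_doc(doc):
    # Tokenize, keep alphabetic tokens only, and lemmatize each lowercased token.
    return [lemmatizer.lemmatize(tok.lower()) for tok in word_tokenize(doc) if tok.isalpha()]

lemmatized_train = [lemmatize_doc(doc) for doc in newsgroups_train[:100]]
lemmatized_train[0][:10]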