def fetch_reuters21578(data_path=None, subset='train'):
    """Fetch the Reuters-21578 corpus and return the requested subset as a LabelledDocuments object.

    On the first call, the .sgm files found in `data_path` are parsed and the train/test subsets
    are cached as pickles; subsequent calls load the cached version.
    """
    if data_path is None:
        data_path = os.path.join(get_data_home(), 'reuters21578')
    reuters_pickle_path = os.path.join(data_path, "reuters." + subset + ".pickle")

    if not os.path.exists(reuters_pickle_path):
        # parse all SGML files, splitting them into training and test documents
        parser = ReutersParser(data_path=data_path)
        for filename in glob(os.path.join(data_path, "*.sgm")):
            parser.parse(open(filename, 'rb'))

        # index category names with a unique numerical code
        # (only considering categories with training examples)
        tr_categories = np.unique(np.concatenate([doc['topics'] for doc in parser.tr_docs])).tolist()

        def pickle_documents(docs, subset):
            # map topic names to their numerical codes and cache the subset as a pickle
            for doc in docs:
                doc['topics'] = [tr_categories.index(t) for t in doc['topics'] if t in tr_categories]
            pickle_docs = {'categories': tr_categories, 'documents': docs}
            pickle.dump(pickle_docs,
                        open(os.path.join(data_path, "reuters." + subset + ".pickle"), 'wb'),
                        protocol=pickle.HIGHEST_PROTOCOL)
            return pickle_docs

        pickle_tr = pickle_documents(parser.tr_docs, "train")
        pickle_te = pickle_documents(parser.te_docs, "test")
        requested_subset = pickle_tr if subset == 'train' else pickle_te
    else:
        requested_subset = pickle.load(open(reuters_pickle_path, 'rb'))

    # each text is the concatenation of the document's title, body, and unprocessed content
    data = [(u'{title}\n{body}\n{unproc}'.format(**doc), doc['topics'])
            for doc in requested_subset['documents']]
    text_data, topics = zip(*data)
    return LabelledDocuments(data=text_data, target=topics, target_names=requested_subset['categories'])
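

# Hedged usage sketch (hypothetical helper, not part of the original API): shows how the
# LabelledDocuments returned by fetch_reuters21578 is typically consumed. It assumes the .sgm
# files are already available under get_data_home()/reuters21578.
def _demo_reuters():
    train = fetch_reuters21578(subset='train')
    test = fetch_reuters21578(subset='test')
    print(f'Reuters-21578: {len(train.data)} training and {len(test.data)} test documents, '
          f'{len(train.target_names)} categories')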
def fetch_ohsumed50k(data_path=None, subset='train', train_test_split=0.7):
    """Fetch the Ohsumed (all-docs) corpus and return the requested split as a LabelledDocuments object.

    The archive is downloaded and extracted on the first call; documents are then split into
    train/test per category according to `train_test_split` and both splits are cached as pickles.
    """
    _dataname = 'ohsumed50k'
    if data_path is None:
        data_path = join(os.path.expanduser('~'), _dataname)
    create_if_not_exist(data_path)

    pickle_file = join(data_path, _dataname + '.' + subset + str(train_test_split) + '.pickle')
    if not os.path.exists(pickle_file):
        DOWNLOAD_URL = 'http://disi.unitn.it/moschitti/corpora/ohsumed-all-docs.tar.gz'
        archive_path = os.path.join(data_path, 'ohsumed-all-docs.tar.gz')
        download_file_if_not_exists(DOWNLOAD_URL, archive_path)
        untardir = 'ohsumed-all'
        if not os.path.exists(os.path.join(data_path, untardir)):
            print("untarring ohsumed...")
            tarfile.open(archive_path, 'r:gz').extractall(data_path)

        # read all documents, keeping track of the (possibly multiple) categories of each document
        target_names = []
        doc_classes = dict()
        class_docs = dict()
        content = dict()
        doc_ids = set()
        for cat_id in os.listdir(join(data_path, untardir)):
            target_names.append(cat_id)
            class_docs[cat_id] = []
            for doc_id in os.listdir(join(data_path, untardir, cat_id)):
                doc_ids.add(doc_id)
                text_content = open(join(data_path, untardir, cat_id, doc_id), 'r').read()
                if doc_id not in doc_classes:
                    doc_classes[doc_id] = []
                doc_classes[doc_id].append(cat_id)
                if doc_id not in content:
                    content[doc_id] = text_content
                class_docs[cat_id].append(doc_id)
        target_names.sort()
        print('Read %d different documents' % len(doc_ids))

        # assign documents to train/test category by category, so that roughly
        # `train_test_split` of each category's yet-unassigned documents go to training
        splitdata = dict({'train': [], 'test': []})
        for cat_id in target_names:
            free_docs = [d for d in class_docs[cat_id]
                         if (d not in splitdata['train'] and d not in splitdata['test'])]
            if len(free_docs) > 0:
                split_point = int(math.floor(len(free_docs) * train_test_split))
                splitdata['train'].extend(free_docs[:split_point])
                splitdata['test'].extend(free_docs[split_point:])

        # build and cache both splits
        for split in ['train', 'test']:
            dataset = LabelledDocuments([], [], target_names)
            for doc_id in splitdata[split]:
                dataset.data.append(content[doc_id])
                dataset.target.append([target_names.index(cat_id) for cat_id in doc_classes[doc_id]])
            pickle.dump(dataset,
                        open(join(data_path, _dataname + '.' + split + str(train_test_split) + '.pickle'), 'wb'),
                        protocol=pickle.HIGHEST_PROTOCOL)

    print(pickle_file)
    return pickle.load(open(pickle_file, 'rb'))
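

# Hedged usage sketch (hypothetical helper, not part of the original API): fetches both Ohsumed
# splits with the default 0.7 train/test proportion and prints their sizes. Assumes the archive
# can be downloaded from the URL hard-coded in fetch_ohsumed50k.
def _demo_ohsumed50k():
    train = fetch_ohsumed50k(subset='train')
    test = fetch_ohsumed50k(subset='test')
    print(f'Ohsumed: {len(train.data)} training and {len(test.data)} test documents, '
          f'{len(train.target_names)} categories')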
def fetch_RCV1(data_path, subset='all'):
    """Fetch the RCV1 corpus from locally available zip archives and return it as a LabelledDocuments object.

    RCV1 cannot be downloaded automatically; the numbered .zip archives obtained under a formal
    permission must already be present in `data_path`.
    """
    assert subset in ['train', 'test', 'all'], 'split should either be "train", "test", or "all"'

    request = []
    labels = set()
    read_documents = 0
    training_documents = 23149
    test_documents = 781265

    # the train/test split is defined in terms of document-id ranges
    if subset == 'all':
        split_range = (2286, 810596)
        expected = training_documents + test_documents
    elif subset == 'train':
        split_range = (2286, 26150)
        expected = training_documents
    else:
        split_range = (26151, 810596)
        expected = test_documents

    for part in list_files(data_path):
        if not re.match(r'\d+\.zip', part):
            continue
        target_file = join(data_path, part)
        assert exists(target_file), \
            "You don't seem to have the file " + part + " in " + data_path + ", and the RCV1 corpus cannot be " \
            "downloaded without a formal permission. Please refer to " + RCV1_BASE_URL + " for more information."
        zipfile = ZipFile(target_file)
        for xmlfile in zipfile.namelist():
            xmlcontent = zipfile.open(xmlfile).read()
            try:
                doc = parse_document(xmlcontent, valid_id_range=split_range)
                labels.update(doc.categories)
                request.append(doc)
                read_documents += 1
            except (IDRangeException, ValueError):
                # the document falls outside the requested id range (or is malformed) and is skipped
                pass
            print('\r[{}] read {} documents'.format(part, len(request)), end='')
            if read_documents == expected:
                break
        if read_documents == expected:
            break
    print()

    return LabelledDocuments(data=[d.text for d in request],
                             target=[d.categories for d in request],
                             target_names=list(labels))
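

# Hedged usage sketch (hypothetical helper, not part of the original API): loads the RCV1 training
# split from a user-provided directory containing the numbered .zip archives and reports how many
# distinct labels were observed. The default path below is illustrative.
def _demo_rcv1(rcv1_home='../datasets/RCV1'):
    train = fetch_RCV1(rcv1_home, subset='train')
    print(f'RCV1: {len(train.data)} training documents, {len(train.target_names)} labels')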
def fetch_IMDB(subset, data_home='../datasets/IMDB'):
    """Fetch the IMDB sentiment corpus (aclImdb_v1) and return the requested subset as a LabelledDocuments object."""
    assert subset in ['train', 'test'], 'subset should either be "train" or "test"'

    data_path = os.path.join(data_home, 'aclImdb_v1')
    data_tar = f'{data_path}.tar.gz'
    if not os.path.exists(data_path):
        download_file_if_not_exists(IMDB_URL, data_tar)
        tarfile.open(data_tar, 'r:gz').extractall(data_path)

    # binary labels: 0 -> 'neg', 1 -> 'pos' (target_names is indexed by the class code)
    dataset = LabelledDocuments(data=[], target=[], target_names=['neg', 'pos'])
    for label in ['pos', 'neg']:
        path = f'{data_path}/aclImdb/{subset}/{label}'
        docs = [open(os.path.join(path, file)).read() for file in list_files(path)]
        dataset.data.extend(docs)
        dataset.target.extend([1 if label == 'pos' else 0] * len(docs))
    dataset.target = np.asarray(dataset.target)
    return dataset
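

# Hedged usage sketch (hypothetical helper, not part of the original API): loads both IMDB subsets
# and prints their sizes together with the number of positive training reviews (class code 1).
def _demo_imdb():
    train = fetch_IMDB('train')
    test = fetch_IMDB('test')
    print(f'IMDB: {len(train.data)} training and {len(test.data)} test documents, '
          f'{int(train.target.sum())} positive training reviews')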