def fetch_reuters21578(data_path=None, subset='train'):
    """Fetch the Reuters-21578 collection, parsing the original SGML files once and caching the train/test splits as pickles."""
    if data_path is None:
        data_path = os.path.join(get_data_home(), 'reuters21578')
    reuters_pickle_path = os.path.join(data_path, "reuters." + subset + ".pickle")
    if not os.path.exists(reuters_pickle_path):
        parser = ReutersParser(data_path=data_path)
        for filename in glob(os.path.join(data_path, "*.sgm")):
            with open(filename, 'rb') as f:
                parser.parse(f)
        # index category names with a unique numerical code (only considering categories with training examples)
        tr_categories = np.unique(np.concatenate([doc['topics'] for doc in parser.tr_docs])).tolist()

        def pickle_documents(docs, subset):
            for doc in docs:
                doc['topics'] = [tr_categories.index(t) for t in doc['topics'] if t in tr_categories]
            pickle_docs = {'categories': tr_categories, 'documents': docs}
            with open(os.path.join(data_path, "reuters." + subset + ".pickle"), 'wb') as f:
                pickle.dump(pickle_docs, f, protocol=pickle.HIGHEST_PROTOCOL)
            return pickle_docs

        pickle_tr = pickle_documents(parser.tr_docs, "train")
        pickle_te = pickle_documents(parser.te_docs, "test")
        requested_subset = pickle_tr if subset == 'train' else pickle_te
    else:
        with open(reuters_pickle_path, 'rb') as f:
            requested_subset = pickle.load(f)

    data = [(u'{title}\n{body}\n{unproc}'.format(**doc), doc['topics']) for doc in requested_subset['documents']]
    text_data, topics = zip(*data)
    return LabelledDocuments(data=text_data, target=topics, target_names=requested_subset['categories'])
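# A minimal usage sketch for fetch_reuters21578 (hypothetical; assumes the Reuters-21578
# *.sgm files are already under the data path and the module-level helpers used above,
# e.g. get_data_home and ReutersParser, are importable). The first call parses and
# pickles both splits; later calls just load the cached pickle.
reuters_train = fetch_reuters21578(subset='train')
reuters_test = fetch_reuters21578(subset='test')
print(len(reuters_train.data), 'training documents,', len(reuters_train.target_names), 'categories')
print(len(reuters_test.data), 'test documents')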
def fetch_ohsumed50k(data_path=None, subset='train', train_test_split=0.7):
    """Fetch the OHSUMED collection, downloading the 'ohsumed-all-docs' archive if needed and caching train/test pickles for the requested split proportion."""
    _dataname = 'ohsumed50k'
    if data_path is None:
        data_path = join(os.path.expanduser('~'), _dataname)
    create_if_not_exist(data_path)

    pickle_file = join(data_path, _dataname + '.' + subset + str(train_test_split) + '.pickle')
    if not os.path.exists(pickle_file):
        DOWNLOAD_URL = ('http://disi.unitn.it/moschitti/corpora/ohsumed-all-docs.tar.gz')
        archive_path = os.path.join(data_path, 'ohsumed-all-docs.tar.gz')
        download_file_if_not_exists(DOWNLOAD_URL, archive_path)
        untardir = 'ohsumed-all'
        if not os.path.exists(os.path.join(data_path, untardir)):
            print("untarring ohsumed...")
            with tarfile.open(archive_path, 'r:gz') as tar:
                tar.extractall(data_path)

        target_names = []
        doc_classes = dict()
        class_docs = dict()
        content = dict()
        doc_ids = set()
        for cat_id in os.listdir(join(data_path, untardir)):
            target_names.append(cat_id)
            class_docs[cat_id] = []
            for doc_id in os.listdir(join(data_path, untardir, cat_id)):
                doc_ids.add(doc_id)
                with open(join(data_path, untardir, cat_id, doc_id), 'r') as f:
                    text_content = f.read()
                if doc_id not in doc_classes: doc_classes[doc_id] = []
                doc_classes[doc_id].append(cat_id)
                if doc_id not in content: content[doc_id] = text_content
                class_docs[cat_id].append(doc_id)
        target_names.sort()
        print('Read %d different documents' % len(doc_ids))

        splitdata = {'train': [], 'test': []}
        for cat_id in target_names:
            # a document belonging to several categories is assigned to a split only once
            free_docs = [d for d in class_docs[cat_id] if (d not in splitdata['train'] and d not in splitdata['test'])]
            if len(free_docs) > 0:
                split_point = int(math.floor(len(free_docs) * train_test_split))
                splitdata['train'].extend(free_docs[:split_point])
                splitdata['test'].extend(free_docs[split_point:])
        for split in ['train', 'test']:
            dataset = LabelledDocuments([], [], target_names)
            for doc_id in splitdata[split]:
                dataset.data.append(content[doc_id])
                dataset.target.append([target_names.index(cat_id) for cat_id in doc_classes[doc_id]])
            pickle.dump(dataset,
                        open(join(data_path, _dataname + '.' + split + str(train_test_split) + '.pickle'), 'wb'),
                        protocol=pickle.HIGHEST_PROTOCOL)

    print('loading %s' % pickle_file)
    with open(pickle_file, 'rb') as f:
        return pickle.load(f)
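# A minimal usage sketch for fetch_ohsumed50k (hypothetical; the archive is downloaded
# from the hard-coded URL above on first use, and both split pickles are cached under
# ~/ohsumed50k by default; 0.7 is just the default proportion).
ohsumed_train = fetch_ohsumed50k(subset='train', train_test_split=0.7)
ohsumed_test = fetch_ohsumed50k(subset='test', train_test_split=0.7)
print(len(ohsumed_train.data), 'training documents over', len(ohsumed_train.target_names), 'categories')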
def fetch_RCV1(data_path, subset='all'):
    """Fetch the RCV1 collection from locally available numbered zip parts (the corpus cannot be downloaded automatically)."""

    assert subset in ['train', 'test', 'all'], 'subset should either be "train", "test", or "all"'

    request = []
    labels = set()
    read_documents = 0

    training_documents = 23149
    test_documents = 781265

    if subset == 'all':
        split_range = (2286, 810596)
        expected = training_documents + test_documents
    elif subset == 'train':
        split_range = (2286, 26150)
        expected = training_documents
    else:
        split_range = (26151, 810596)
        expected = test_documents

    for part in list_files(data_path):
        if not re.match(r'\d+\.zip', part): continue
        target_file = join(data_path, part)
        assert exists(target_file), \
            "You don't seem to have the file " + part + " in " + data_path + ", and the RCV1 corpus cannot be" + \
            " downloaded without formal permission. Please refer to " + RCV1_BASE_URL + " for more information."
        archive = ZipFile(target_file)
        for xmlfile in archive.namelist():
            xmlcontent = archive.open(xmlfile).read()
            try:
                doc = parse_document(xmlcontent, valid_id_range=split_range)
                labels.update(doc.categories)
                request.append(doc)
                read_documents += 1
            except (IDRangeException, ValueError):
                pass
            print('\r[{}] read {} documents'.format(part, len(request)),
                  end='')
            if read_documents == expected: break
        if read_documents == expected: break

    print()

    return LabelledDocuments(data=[d.text for d in request],
                             target=[d.categories for d in request],
                             target_names=list(labels))
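# A minimal usage sketch for fetch_RCV1 (hypothetical; the numbered *.zip parts of RCV1
# must already be present under rcv1_path, since the corpus cannot be downloaded
# automatically; the path below is only illustrative).
rcv1_path = '../datasets/RCV1'
rcv1_train = fetch_RCV1(rcv1_path, subset='train')
print(len(rcv1_train.data), 'training documents,', len(rcv1_train.target_names), 'labels')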
def fetch_IMDB(subset, data_home='../datasets/IMDB'):
    """Fetch the IMDB sentiment dataset (aclImdb), downloading and extracting it on first use."""
    assert subset in ['train', 'test'], 'subset should either be "train" or "test"'
    data_path = os.path.join(data_home, 'aclImdb_v1')
    data_tar = f'{data_path}.tar.gz'

    if not os.path.exists(data_path):
        download_file_if_not_exists(IMDB_URL, data_tar)
        with tarfile.open(data_tar, 'r:gz') as tar:
            tar.extractall(data_path)

    dataset = LabelledDocuments(data=[],
                                target=[],
                                target_names=['pos', 'neg'])
    for label in ['pos', 'neg']:
        path = f'{data_path}/aclImdb/{subset}/{label}'
        docs = []
        for file in list_files(path):
            with open(os.path.join(path, file)) as f:
                docs.append(f.read())
        dataset.data.extend(docs)
        dataset.target.extend([1 if label == 'pos' else 0] * len(docs))

    dataset.target = np.asarray(dataset.target)

    return dataset
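# A minimal usage sketch for fetch_IMDB (hypothetical; IMDB_URL is the module-level
# download URL assumed by the function above). Targets are binary: 1 = pos, 0 = neg.
imdb_train = fetch_IMDB('train')
imdb_test = fetch_IMDB('test')
print(imdb_train.target.sum(), 'positive reviews out of', len(imdb_train.target))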