def extract_multilingual_documents(inv_dict, langs, text_path, out_path):
    if not os.path.exists(out_path):
        os.makedirs(out_path)
    for lang in langs:
        if lang not in inv_dict:
            raise ValueError("Lang %s is not in the dictionary" % lang)

    docs_created = len(list_files(out_path))
    print("%d multilingual documents found." % docs_created)
    for doc, lang in _doc_generator(text_path, langs):
        title = _extract_title(doc)

        if title in inv_dict[lang]:
            ids = inv_dict[lang][title]
            for id in ids:
                target_file = join(out_path, id) + ".xml"
                if os.path.exists(target_file):
                    _append_doc(target_file, doc, lang)
                else:
                    _create_doc(target_file, id, doc, lang)
                    docs_created += 1
        else:
            if not re.match('[A-Za-z]+', title):
                print("Title <%s> for lang <%s> not in dictionary" % (title, lang))
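
# Usage sketch (illustrative only): inv_dict is expected to map each language to a
# {title: [ids]} index, as used above; the pickle file name, paths, and language codes
# below are assumptions for the example, not values shipped with this module.
#
#   inv_dict = pickle.load(open('inverted_dict.pickle', 'rb'))
#   extract_multilingual_documents(inv_dict, langs=['en', 'it', 'de'],
#                                  text_path='/data/wikipedia/text',
#                                  out_path='/data/wikipedia/multilingual_docs')
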
def _doc_generator(text_path, langs):
    dotspace = re.compile(r'\.(?!\s)')
    for l, lang in enumerate(langs):
        print("Processing language <%s> (%d/%d)" % (lang, l, len(langs)))
        lang_dir = join(text_path, lang)
        split_dirs = list_dirs(lang_dir)
        for sd, split_dir in enumerate(split_dirs):
            print("\tprocessing split_dir <%s> (%d/%d)" % (split_dir, sd, len(split_dirs)))
            split_files = list_files(join(lang_dir, split_dir))
            for sf, split_file in enumerate(split_files):
                print("\t\tprocessing split_file <%s> (%d/%d)" % (split_file, sf, len(split_files)))
                with BZ2File(join(lang_dir, split_dir, split_file), 'r', buffering=1024 * 1024) as fi:
                    while True:
                        doc_lines = list(islice(fi, 3))
                        if doc_lines:
                            # some sentences are not followed by a space after the dot
                            doc_lines[1] = dotspace.sub('. ', doc_lines[1])
                            # [workaround] the "&nbsp;" HTML entity was not treated upstream, and unescaping it now might not help;
                            # replace it with a plain space and re-escape the text before writing it out as XML
                            doc_lines[1] = escape(doc_lines[1].replace("&nbsp;", " "))
                            yield doc_lines, lang
                        else:
                            break
def fetch_wikipedia_multilingual(wiki_multi_path, langs, min_words=100, deletions=False, max_documents=-1, pickle_name=None):
    if pickle_name and os.path.exists(pickle_name):
        print("unpickling %s" % pickle_name)
        return pickle.load(open(pickle_name, 'rb'))

    multi_docs = list_files(wiki_multi_path)
    mling_documents = {l: [] for l in langs}
    valid_documents = 0
    minwords_exception = 0
    wrongdoc_exception = 0
    for d, multi_doc in enumerate(multi_docs):
        print("\rProcessed %d/%d documents, valid %d/%d, few_words=%d, few_langs=%d" %
              (d, len(multi_docs), valid_documents, len(multi_docs), minwords_exception, wrongdoc_exception), end="")
        doc_path = join(wiki_multi_path, multi_doc)
        try:
            m_doc = _load_multilang_doc(doc_path, langs, min_words)
            valid_documents += 1
            for l in langs:
                mling_documents[l].append(m_doc[l])
        except MinWordsNotReached:
            minwords_exception += 1
            if deletions:
                os.remove(doc_path)
        except WrongDocumentFormat:
            wrongdoc_exception += 1
            if deletions:
                os.remove(doc_path)
        if max_documents > 0 and valid_documents >= max_documents:
            break

    if pickle_name:
        print("Pickling wikipedia documents object in %s" % pickle_name)
        pickle.dump(mling_documents, open(pickle_name, 'wb'), pickle.HIGHEST_PROTOCOL)

    return mling_documents
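
# Usage sketch (illustrative only): reads back the per-id XML files produced by
# extract_multilingual_documents() and groups the texts by language; the paths below
# are placeholders, and pickle_name merely caches the result for later runs.
#
#   docs = fetch_wikipedia_multilingual('/data/wikipedia/multilingual_docs',
#                                       langs=['en', 'it', 'de'], min_words=100,
#                                       pickle_name='/data/wikipedia/wiki_docs.pickle')
#   english_texts = docs['en']
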
def fetch_RCV1(data_path, split='all'):
    assert split in ['train', 'test', 'all'], 'split should be "train", "test", or "all"'

    request = []
    labels = set()
    read_documents = 0
    lang = 'en'

    training_documents = 23149
    test_documents = 781265

    if split == 'all':
        split_range = (2286, 810596)
        expected = training_documents + test_documents
    elif split == 'train':
        split_range = (2286, 26150)
        expected = training_documents
    else:
        split_range = (26151, 810596)
        expected = test_documents

    global nwords
    nwords = []
    for part in list_files(data_path):
        if not re.match(r'\d+\.zip', part):
            continue
        target_file = join(data_path, part)
        assert exists(target_file), \
            "You don't seem to have the file " + part + " in " + data_path + ", and the RCV1 corpus cannot be " \
            "downloaded without a formal permission. Please refer to " + RCV1_BASE_URL + " for more information."
        zipfile = ZipFile(target_file)
        for xmlfile in zipfile.namelist():
            xmlcontent = zipfile.open(xmlfile).read()
            try:
                doc = parse_document(xmlcontent, assert_lang=lang, valid_id_range=split_range)
                labels.update(doc.categories)
                request.append(doc)
                read_documents += 1
            except ValueError:
                print('\n\tskipping document {} with inconsistent language label: expected language {}'
                      .format(part + '/' + xmlfile, lang))
            except (IDRangeException, ExpectedLanguageException):
                pass
            print('\r[{}] read {} documents'.format(part, len(request)), end='')
            if read_documents == expected:
                break
        if read_documents == expected:
            break
    print()
    print('ave:{} std {} min {} max {}'.format(np.mean(nwords), np.std(nwords), np.min(nwords), np.max(nwords)))
    return request, list(labels)
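
# Usage sketch (illustrative only): RCV1 must already be available locally as the
# numbered zip parts matched above (e.g. 000.zip, 001.zip, ...); the path below is a
# placeholder, since the corpus itself requires a formal permission to obtain.
#
#   rcv1_train, rcv1_labels = fetch_RCV1('/data/rcv1', split='train')
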
def fetch_RCV2(data_path, languages=None):
    if not languages:
        languages = list(RCV2_LANG_DIR.keys())
    else:
        assert set(languages).issubset(set(RCV2_LANG_DIR.keys())), 'languages not in scope'

    request = []
    labels = set()

    global nwords
    nwords = []
    for lang in languages:
        path = join(data_path, RCV2_LANG_DIR[lang])
        lang_docs_read = 0
        for part in list_files(path):
            target_file = join(path, part)
            assert exists(target_file), \
                "You don't seem to have the file " + part + " in " + path + ", and the RCV2 corpus cannot be " \
                "downloaded without a formal permission. Please refer to " + RCV2_BASE_URL + " for more information."
            zipfile = ZipFile(target_file)
            for xmlfile in zipfile.namelist():
                xmlcontent = zipfile.open(xmlfile).read()
                try:
                    doc = parse_document(xmlcontent, assert_lang=lang)
                    labels.update(doc.categories)
                    request.append(doc)
                    lang_docs_read += 1
                except ValueError:
                    print('\n\tskipping document {} with inconsistent language label: expected language {}'
                          .format(RCV2_LANG_DIR[lang] + '/' + part + '/' + xmlfile, lang))
                except (IDRangeException, ExpectedLanguageException):
                    pass
                print('\r[{}] read {} documents, {} for language {}'.format(
                    RCV2_LANG_DIR[lang] + '/' + part, len(request), lang_docs_read, lang), end='')
    print()
    print('ave:{} std {} min {} max {}'.format(np.mean(nwords), np.std(nwords), np.min(nwords), np.max(nwords)))
    return request, list(labels)
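
# Usage sketch (illustrative only): RCV2 is read from the per-language directories
# listed in RCV2_LANG_DIR; the path and the language selection below are placeholders.
#
#   rcv2_docs, rcv2_labels = fetch_RCV2('/data/rcv2', languages=['it', 'de'])
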
def fetch_jrcacquis(langs=None, data_path=None, years=None, ignore_unclassified=True,
                    cat_filter=None, cat_threshold=0, parallel=None, most_frequent=-1,
                    DOWNLOAD_URL_BASE='http://optima.jrc.it/Acquis/JRC-Acquis.3.0/corpus/'):
    assert parallel in [None, 'force', 'avoid'], 'parallel mode not supported'
    if not langs:
        langs = JRC_LANGS
    else:
        if isinstance(langs, str):
            langs = [langs]
        for l in langs:
            if l not in JRC_LANGS:
                raise ValueError('Language %s is not among the valid languages in JRC-Acquis v3' % l)

    if not data_path:
        data_path = get_data_home()

    if not os.path.exists(data_path):
        os.mkdir(data_path)

    request = []
    total_read = 0
    for l in langs:
        file_name = 'jrc-' + l + '.tgz'
        archive_path = join(data_path, file_name)

        if not os.path.exists(archive_path):
            print("downloading language-specific dataset (once and for all) into %s" % data_path)
            DOWNLOAD_URL = join(DOWNLOAD_URL_BASE, file_name)
            download_file(DOWNLOAD_URL, archive_path)
            print("untarring dataset...")
            tarfile.open(archive_path, 'r:gz').extractall(data_path)

        documents_dir = join(data_path, l)

        print("Reading documents...")
        read = 0
        for dir in list_dirs(documents_dir):
            year = int(dir)
            if years is None or year in years:
                year_dir = join(documents_dir, dir)
                pickle_name = join(data_path, 'jrc_' + l + '_' + dir + '.pickle')
                if os.path.exists(pickle_name):
                    print("loading from file %s" % pickle_name)
                    l_y_documents = pickle.load(open(pickle_name, "rb"))
                    read += len(l_y_documents)
                else:
                    l_y_documents = []
                    all_documents = list_files(year_dir)
                    empty = 0
                    for i, doc_file in enumerate(all_documents):
                        try:
                            jrc_doc = parse_document(join(year_dir, doc_file), year)
                        except ValueError:
                            jrc_doc = None

                        if jrc_doc and (not ignore_unclassified or jrc_doc.categories):
                            l_y_documents.append(jrc_doc)
                        else:
                            empty += 1
                        if len(all_documents) > 50 and ((i + 1) % (len(all_documents) // 50) == 0):
                            print('\r\tfrom %s: completed %d%%' %
                                  (year_dir, int((i + 1) * 100.0 / len(all_documents))), end='')
                        read += 1
                    print('\r\tfrom %s: completed 100%% read %d documents (discarded %d without categories or empty fields)\n' %
                          (year_dir, i + 1, empty), end='')
                    print("\t\t(Pickling object for future runs in %s)" % pickle_name)
                    pickle.dump(l_y_documents, open(pickle_name, 'wb'), pickle.HIGHEST_PROTOCOL)
                request += l_y_documents
        print("Read %d documents for language %s\n" % (read, l))
        total_read += read
    print("Read %d documents in total" % total_read)

    if parallel == 'force':
        request = _force_parallel(request, langs)
    elif parallel == 'avoid':
        request = random_sampling_avoiding_parallel(request)

    final_cats = _get_categories(request)

    if cat_filter:
        request = _filter_by_category(request, cat_filter)
        final_cats = _get_categories(request)
    if cat_threshold > 0:
        request, final_cats = _filter_by_frequency(request, cat_threshold)
    if most_frequent != -1 and len(final_cats) > most_frequent:
        request, final_cats = _most_common(request, most_frequent)

    return request, final_cats
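
# Usage sketch (illustrative only): downloads (if not already cached) and parses the
# English and Italian portions of JRC-Acquis v3, keeps only the given years, and samples
# so as to avoid parallel versions of the same document (parallel='avoid'); the data
# path and the years below are placeholders.
#
#   jrc_docs, jrc_cats = fetch_jrcacquis(langs=['en', 'it'], data_path='/data/jrc_acquis',
#                                        years=[2004, 2005], parallel='avoid')
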