def main(argv):
    config_file = os.path.join('..', 'config.ini')
    config = configparser.ConfigParser()
    config.read(config_file)

    for iso_639_3 in config.options("LanguagesISOMap"):
        iso_639_1 = config.get("LanguagesISOMap", iso_639_3)
        wiki_prefix = "{0}wiki".format(iso_639_1)
        new_wiki_prefix = "{0}wiki".format(iso_639_3)

        # Skip languages that are not in ``languages``, the list of
        # "<iso 639-3>wiki" prefixes to prepare. This also means that the
        # ISO 639-3 code of a language to prepare has to appear in the config
        # file and has to be the name of its folder.
        if new_wiki_prefix not in languages:
            print("Skipping {0}, since it does not belong to the languages "
                  "to prepare".format(new_wiki_prefix))
            continue

        print("Processing wikipedia {0} -> {1}...".format(
            iso_639_1, iso_639_3))

        url = "http://dumps.wikimedia.org/backup-index.html"
        html_page = urllib2.urlopen(url)
        soup = BeautifulSoup(html_page)
        page = None
        for link in soup('a'):
            if link.string == wiki_prefix:
                page = urlparse.urljoin(url, link['href'])

        # get the link for the dump file
        wiki_date, dump_link = helpers.dump_link(wiki_prefix, page)
        if not dump_link:
            sys.stderr.write("Could not find dump link. Abort.")
            sys.exit(1)

        # check if there is already a build for this Wikipedia dump
        #output_file = os.path.join(
        #    '..', 'build', 'corpus',
        #    "{0}.zip".format(new_wiki_prefix, wiki_date))
        #if os.path.exists(output_file):
        #    print("Output file already exists. Skipping.")
        #    continue

        # download dump
        print("Downloading {0}...".format(dump_link))
        file_path = helpers.download_dump(dump_link, wiki_prefix,
                                          new_wiki_prefix)

        print("Running WikiExtractor...")
        helpers.wikipedia_extractor(file_path, new_wiki_prefix)

        # Concatenate output files
        helpers.concatenate(new_wiki_prefix)

        # Calling clean scripts
        print("Cleaning...")
        helpers.clean_1(new_wiki_prefix)
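# The loop above reads a [LanguagesISOMap] section whose keys are ISO 639-3
# codes and whose values are the ISO 639-1 codes used as Wikipedia dump
# prefixes. The following is a minimal sketch of that layout, exercised with
# configparser. The mapping "deu = de" (German) is only an illustration; the
# real config.ini may list different languages.
import configparser

config = configparser.ConfigParser()
config.read_string("""
[LanguagesISOMap]
# ISO 639-3 key -> ISO 639-1 value used as the Wikipedia dump prefix
deu = de
""")

for iso_639_3 in config.options("LanguagesISOMap"):
    iso_639_1 = config.get("LanguagesISOMap", iso_639_3)
    print("{0}wiki -> {1}wiki".format(iso_639_1, iso_639_3))  # dewiki -> deuwiki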
def main(argv):
    config_file = os.path.join('..', 'config.ini')
    config = configparser.ConfigParser()
    config.read(config_file)

    for iso_639_3 in config.options("LanguagesISOMap"):
        iso_639_1 = config.get("LanguagesISOMap", iso_639_3)
        wiki_prefix = "{0}wiki".format(iso_639_1)
        new_wiki_prefix = "{0}wiki".format(iso_639_3)

        print("Processing wikipedia {0} -> {1}...".format(
            iso_639_1, iso_639_3))

        # check if we already have a clean script for this language
        if not os.path.exists(os.path.join(new_wiki_prefix, "clean2.py")):
            print("No clean script found. Skipping.")
            continue

        url = "http://dumps.wikimedia.org/backup-index.html"
        html_page = urllib2.urlopen(url)
        soup = BeautifulSoup(html_page)
        page = None
        for link in soup('a'):
            if link.string == wiki_prefix:
                page = urlparse.urljoin(url, link['href'])

        # get the link for the dump file
        wiki_date, dump_link = helpers.dump_link(wiki_prefix, page)
        if not dump_link:
            sys.stderr.write("Could not find dump link. Abort.")
            sys.exit(1)

        # check if there is already a build for this Wikipedia dump
        output_file = os.path.join(
            '..', 'build', 'corpus', "{0}.zip".format(new_wiki_prefix))
        #if os.path.exists(output_file):
        #    print("Output file already exists. Skipping.")
        #    continue

        # download dump
        print("Downloading {0}...".format(dump_link))
        file_path = helpers.download_dump(dump_link, wiki_prefix,
                                          new_wiki_prefix)

        print("Running WikiExtractor...")
        helpers.wikipedia_extractor(file_path, new_wiki_prefix)

        # Concatenate output files
        helpers.concatenate(new_wiki_prefix)

        # Calling clean scripts
        print("Cleaning...")
        helpers.clean_1(new_wiki_prefix)
        os.system("{0} {1}/clean2.py {2} {3}".format(
            sys.executable, new_wiki_prefix,
            os.path.join(new_wiki_prefix,
                         "{0}_cleaned1.xml".format(new_wiki_prefix)),
            os.path.join(new_wiki_prefix,
                         "{0}_cleaned2.xml".format(new_wiki_prefix))))
        os.system("{0} clean3.py {1} {2}".format(
            sys.executable,
            os.path.join(new_wiki_prefix,
                         "{0}_cleaned2.xml".format(new_wiki_prefix)),
            os.path.join(new_wiki_prefix,
                         "{0}_cleaned3.xml".format(new_wiki_prefix))))

        print("Converting to GrAF...")
        os.system("{0} to_graf.py {1} {2}".format(
            sys.executable,
            os.path.join(new_wiki_prefix,
                         "{0}_cleaned3.xml".format(new_wiki_prefix)),
            os.path.join(new_wiki_prefix,
                         "{0}-{1}.hdr".format(new_wiki_prefix, wiki_date))))

        # Zipping
        print("Zipping...")
        files = [
            os.path.join(new_wiki_prefix,
                         "{0}-{1}.hdr".format(new_wiki_prefix, wiki_date)),
            os.path.join(new_wiki_prefix,
                         "{0}-{1}.txt".format(new_wiki_prefix, wiki_date)),
            os.path.join(new_wiki_prefix,
                         "{0}-{1}-doc.xml".format(new_wiki_prefix, wiki_date))
        ]
        myzip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
        for f in files:
            myzip.write(f, os.path.basename(f))
        myzip.write("LICENSE.wikipedia", "LICENSE")
        myzip.close()

        print()
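# The per-language clean2.py, clean3.py and to_graf.py steps above are started
# with os.system, interpolating the interpreter and file paths into one shell
# string. As a sketch only (not the project's actual code), the same clean2.py
# step can be expressed with subprocess.call and an argument list, which makes
# explicit what gets executed and avoids shell quoting of the paths. It assumes
# the same layout as above: a clean2.py inside the <iso_639_3>wiki folder and
# the *_cleaned1.xml file produced by helpers.clean_1.
import os
import subprocess
import sys

new_wiki_prefix = "deuwiki"  # illustrative prefix, see the config example above

subprocess.call([
    sys.executable,
    os.path.join(new_wiki_prefix, "clean2.py"),
    os.path.join(new_wiki_prefix, "{0}_cleaned1.xml".format(new_wiki_prefix)),
    os.path.join(new_wiki_prefix, "{0}_cleaned2.xml".format(new_wiki_prefix)),
])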
url = "http://dumps.wikimedia.org/backup-index.html" html_page = urllib2.urlopen(url) soup = BeautifulSoup(html_page) lang_pages = [(link.string, urlparse.urljoin(url, link['href'])) for l in languages for link in soup('a') if link.string == l] for wiki_name, page in lang_pages: wiki_date, dump_link = helpers.dump_link(wiki_name, page) if not dump_link: print("Could not find dump link for {0}.".format(wiki_name)) sys.exit(1) print("Downloading {0}...".format(dump_link)) file_path = helpers.download_dump(dump_link, wiki_name) helpers.wikipedia_extractor(file_path, wiki_name) # Concatenate output files helpers.concatenate(wiki_name) # Calling first clean script print("Cleaning...") helpers.clean_1(wiki_name)
print("There are no languages to prepare that match your request...") sys.exit(1) else: print("Preparing the following languages: {0} ...".format(languages)) url = "http://dumps.wikimedia.org/backup-index.html" html_page = urllib2.urlopen(url) soup = BeautifulSoup(html_page) lang_pages = [(link.string, urlparse.urljoin(url, link['href'])) for l in languages for link in soup('a') if link.string == l] for wiki_name, page in lang_pages: wiki_date, dump_link = helpers.dump_link(wiki_name, page) if not dump_link: print("Could not find dump link for {0}.".format(wiki_name)) sys.exit(1) print("Downloading {0}...".format(dump_link)) file_path = helpers.download_dump(dump_link, wiki_name) helpers.wikipedia_extractor(file_path, wiki_name) # Concatenate output files helpers.concatenate(wiki_name) # Calling first clean script print("Cleaning...") helpers.clean_1(wiki_name)
def main(argv):
    arg_iso = None
    if len(argv) > 1:
        arg_iso = argv[1]

    script_path = os.path.dirname(os.path.realpath(__file__))
    os.chdir(script_path)

    config_file = os.path.join('..', 'config.ini')
    config = configparser.ConfigParser()
    config.read(config_file)

    processed = dict()
    processed_file = os.path.join('..', 'build', 'processed.pickle')
    if os.path.exists(processed_file):
        with open(processed_file, 'rb') as f:
            processed = pickle.load(f)
    if 'wikipedia' not in processed:
        processed['wikipedia'] = dict()

    for iso_639_3 in config.options("LanguagesISOMap"):
        if arg_iso and iso_639_3 != arg_iso:
            continue

        iso_639_1 = config.get("LanguagesISOMap", iso_639_3)
        wiki_prefix = "{0}wiki".format(iso_639_1)
        new_wiki_prefix = "{0}wiki".format(iso_639_3)

        print("Processing wikipedia {0} -> {1}...".format(
            iso_639_1, iso_639_3))

        # check if we already have a clean script for this language
        if not os.path.exists(os.path.join(new_wiki_prefix, "clean2.py")):
            print("No clean script found. Skipping.")
            continue

        url = "http://dumps.wikimedia.org/backup-index.html"
        html_page = requests.get(url)
        soup = BeautifulSoup(html_page.content)
        page = None
        for link in soup('a'):
            if link.string == wiki_prefix:
                page = urllib.parse.urljoin(url, link['href'])

        # get the link for the dump file
        wiki_date, dump_link = helpers.dump_link(wiki_prefix, page)
        if not dump_link:
            sys.stderr.write("Could not find dump link. Abort.")
            sys.exit(1)

        # check if there is already a build for this Wikipedia dump
        output_file = os.path.join(
            '..', 'build', 'corpus', "{0}.zip".format(new_wiki_prefix))
        if iso_639_3 in processed['wikipedia'] and \
                int(processed['wikipedia'][iso_639_3]) >= int(wiki_date) and \
                os.path.exists(output_file):
            print(" Wikipedia already processed, skipping.")
            continue

        # download dump
        print("Downloading {0}...".format(dump_link))
        file_path = helpers.download_dump(dump_link, wiki_prefix,
                                          new_wiki_prefix)

        print("Running WikiExtractor...")
        helpers.wikipedia_extractor(file_path, new_wiki_prefix)

        # Concatenate output files
        helpers.concatenate(new_wiki_prefix)

        # Calling clean scripts
        print("Cleaning...")
        helpers.clean_1(new_wiki_prefix)
        os.system("{0} {1}/clean2.py {2} {3}".format(
            sys.executable, new_wiki_prefix,
            os.path.join(new_wiki_prefix,
                         "{0}_cleaned1.xml".format(new_wiki_prefix)),
            os.path.join(new_wiki_prefix,
                         "{0}_cleaned2.xml".format(new_wiki_prefix))))
        os.system("{0} clean3.py {1} {2}".format(
            sys.executable,
            os.path.join(new_wiki_prefix,
                         "{0}_cleaned2.xml".format(new_wiki_prefix)),
            os.path.join(new_wiki_prefix,
                         "{0}_cleaned3.xml".format(new_wiki_prefix))))

        print("Converting to GrAF...")
        os.system("{0} to_graf.py {1} {2}".format(
            sys.executable,
            os.path.join(new_wiki_prefix,
                         "{0}_cleaned3.xml".format(new_wiki_prefix)),
            os.path.join(new_wiki_prefix,
                         "{0}-{1}.hdr".format(new_wiki_prefix, wiki_date))))

        # Zipping
        print("Zipping...")
        files = [
            os.path.join(new_wiki_prefix,
                         "{0}-{1}.hdr".format(new_wiki_prefix, wiki_date)),
            os.path.join(new_wiki_prefix,
                         "{0}-{1}.txt".format(new_wiki_prefix, wiki_date)),
            os.path.join(new_wiki_prefix,
                         "{0}-{1}-doc.xml".format(new_wiki_prefix, wiki_date))
        ]
        myzip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
        for f in files:
            myzip.write(f, os.path.basename(f))
        myzip.write("LICENSE.wikipedia", "LICENSE")
        myzip.close()

        # Delete all files
        # print("Cleaning up...")
        # files.append(file_path)
        # files.append(os.path.splitext(file_path)[0])
        # for i in range(3):
        #     files.append(os.path.join(
        #         new_wiki_prefix,
        #         "{0}_cleaned{1}.xml".format(new_wiki_prefix, i+1)))
        # files.append(os.path.join(
        #     new_wiki_prefix, "{0}.xml".format(new_wiki_prefix)))
        # for f in files:
        #     os.remove(f)
        # shutil.rmtree(os.path.join(new_wiki_prefix, "extracted"))

        processed['wikipedia'][iso_639_3] = wiki_date
        with open(processed_file, 'wb') as f:
            pickle.dump(processed, f)