def main():
    """Run ``process_file`` on every two-letter country HTML page.

    Directories are derived from ``factbook_ingest.properties`` (keys
    ``corpus.dir.root``, ``corpus.abbreviation`` and ``corpus.date``):

    * input:  ``<root>/intermediate/<abbreviation>-<date>/countries_html``
    * output: ``<root>/intermediate/<abbreviation>-<date>/countries_detag``

    The output directory is created when it does not exist. Directory entries
    whose name is not exactly seven characters long are reported as skipped;
    seven-character names that do not end in ``.html`` (case-insensitive) are
    ignored silently. For every remaining entry ``process_file`` is called
    with the input path and the output path ``<first two characters>.xml``.
    The number of processed files is printed at the end.
    """
    properties = factbook_utils.read_properties('factbook_ingest.properties')

    # Both directories live under the same corpus-specific prefix.
    corpus_dir = (properties['corpus.dir.root'] + '/intermediate/'
                  + properties['corpus.abbreviation'] + '-'
                  + properties['corpus.date'])

    # Input directory
    input_dir = corpus_dir + '/countries_html'
    print('Input directory: ' + input_dir)

    # Create output directory.
    output_dir = corpus_dir + '/countries_detag'
    print('Output directory: ' + output_dir)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    ext = ".html"
    # Files we are interested in are named 'xx.html': 2 + len('.html') == 7.
    expected_length = 7
    counter = 0

    for file_name in os.listdir(input_dir):
        # Compare by value: the previous identity test (`is not 7`) only
        # worked because CPython happens to cache small integers.
        if len(file_name) != expected_length:
            print('Skipping ' + file_name)
            continue

        # Files must end in '.html'
        if not file_name.lower().endswith(ext):
            continue

        infile = os.path.join(input_dir, file_name)
        outfile = os.path.join(output_dir, file_name[:2] + ".xml")
        process_file(infile, outfile)
        counter += 1

    print('Processed ' + str(counter) + ' files')
def main():
    """Extract the country HTML pages from the Factbook zip archive.

    Paths are derived from ``factbook_ingest.properties`` (keys
    ``corpus.dir.root``, ``corpus.abbreviation`` and ``corpus.date``):

    * archive: ``<root>/input/<abbreviation>-<date>/factbook.zip``
    * output:  ``<root>/intermediate/<abbreviation>-<date>/countries_html``

    Every archive member that ends in ``.html``, whose immediate parent
    directory name contains ``geos`` and whose name does not end in one of the
    excluded country suffixes is extracted as ``factbook/geos/<name>`` below
    the output directory. The extracted files are then moved directly into
    the output directory (with any ``countrytemplate_`` substring removed
    from the name) and the now-empty ``factbook/geos`` and ``factbook``
    directories are deleted.
    """
    properties = factbook_utils.read_properties("factbook_ingest.properties")
    corpus_name = properties["corpus.abbreviation"] + "-" + properties["corpus.date"]
    input_zipfile = (
        properties["corpus.dir.root"] + "/input/" + corpus_name + "/factbook.zip"
    )
    output_dir = (
        properties["corpus.dir.root"]
        + "/intermediate/"
        + corpus_name
        + "/countries_html"
    )

    # ignore the following countries, because the files contain no data
    excluded_suffixes = (
        "_hq.html",
        "_jq.html",
        "_mq.html",
        "_dq.html",
        "_lq.html",
        "_kq.html",
        "_fq.html",
        "_va.html",
        "_ss.html",
        "_xx.html",
    )

    # extract country files
    # The context manager closes the archive even when extraction fails.
    with zipfile.ZipFile(input_zipfile, "r") as archive:
        for member in archive.namelist():
            filename = os.path.basename(member)
            dirname = os.path.basename(os.path.dirname(member))
            # With the latest 2014 version of Factbook, the countrytemplate
            # files no longer exist. We will unzip any HTML files in the geos
            # directory (updated code on 2/24/2014).
            if (
                filename.endswith(".html")
                and "geos" in dirname
                and not filename.endswith(excluded_suffixes)
            ):
                archive.extract("factbook/geos/" + filename, output_dir)

    # move country files to correct location
    src_dir = output_dir + "/factbook/geos/"
    for file_name in os.listdir(src_dir):
        short_name = file_name.replace("countrytemplate_", "")
        shutil.move(src_dir + file_name, output_dir + "/" + short_name)

    # Remove the (now empty) directory skeleton left behind by the extraction.
    os.rmdir(output_dir + "/factbook/geos")
    os.rmdir(output_dir + "/factbook")
    print("Done.")
def main():
    """Run ``process_file`` over every file in the detagged-countries directory.

    Directories are derived from ``factbook_ingest.properties`` (keys
    ``corpus.dir.root``, ``corpus.abbreviation`` and ``corpus.date``):

    * input:  ``<root>/intermediate/<abbreviation>-<date>/countries_detag``
    * output: ``<root>/intermediate/<abbreviation>-<date>/xml-splitTrecTrim``

    e.g. /apps/preproc/r1.5.0/factbook/intermediate/fb-20130218/countries_detag

    The output directory is created when it does not exist. Input files are
    visited in sorted name order; for each one, ``process_file`` receives the
    open input file, an output file of the same name opened for writing, and
    the zero-based position of the file in that sorted order.
    """
    properties = factbook_utils.read_properties('factbook_ingest.properties')
    corpus_dir = (properties['corpus.dir.root'] + '/intermediate/'
                  + properties['corpus.abbreviation'] + '-'
                  + properties['corpus.date'])
    input_dir = corpus_dir + '/countries_detag'
    output_dir = corpus_dir + '/xml-splitTrecTrim'

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    for docno_counter, file_name in enumerate(sorted(os.listdir(input_dir))):
        # Close both handles after each file so that output is flushed
        # deterministically and descriptors are not leaked across the loop.
        with open(input_dir + '/' + file_name) as infile, \
                open(output_dir + '/' + file_name, 'w') as outfile:
            process_file(infile, outfile, docno_counter)

    print('Done')