def feed_to_db_files(directory, feed_file):
    """Convert a VIP XML feed into per-table db-style flat files.

    Iteratively parses *feed_file*, groups consecutive elements of the
    same tag into one file per table under *directory*, and scrubs known
    problem characters when a row fails to encode.
    """
    # Characters that have caused UnicodeEncodeError in real feeds, with
    # their replacements (the Unicode replacement char becomes "N", the
    # rest are dropped).
    replacements = (
        (u"\ufffd", "N"),
        (u'\u201d', ""),
        (u'\u201c', ""),
        (u'\u2019', ""),
        (u'\xbd', ""),
        (u'\xf3', ""),
        (u'\xa0', ""),
        (u'\xe9', ""),
        (u'\xe1', ""),
    )
    with open(feed_file) as xml_doc:
        context = etree.iterparse(xml_doc, events=("start", "end"))
        context = iter(context)
        # Pull the root element first to read the feed's schema version.
        event, root = next(context)
        feed_version = root.attrib["schemaVersion"]
        sp = SchemaProps(feed_version)
        db_props = sp.full_header_data("db")

        def process_element(element, writer):
            # Write one element's row; sub-elements belonging to other
            # tables are written through their own writers.
            # Fix: the original body read the loop variable `elem` from
            # the enclosing scope instead of the `element` parameter.
            elem_dict, extras = process_db_sub_elems(element, db_props)
            try:
                writer.writerow(elem_dict)
            except UnicodeEncodeError:
                # Scrub known troublesome characters and retry once.
                for key in elem_dict:
                    if elem_dict[key]:
                        for bad, good in replacements:
                            elem_dict[key] = elem_dict[key].replace(bad, good)
                writer.writerow(elem_dict)
            for extra in extras:
                with FileWriter(
                        directory, extra["table"],
                        db_props[extra["table"]]) as temp_writer:
                    temp_writer.writerow(extra["elements"])

        element_gen = extract_base_elements(context, db_props.keys())
        for elem in element_gen:
            with FileWriter(directory, elem.tag, db_props[elem.tag]) as writer:
                # process first row to handle lookahead for section ending
                process_element(elem, writer)
                if elem.getnext() is None or elem.getnext().tag != elem.tag:
                    continue
                # Drain the rest of this same-tag section into one writer.
                for elem in element_gen:
                    process_element(elem, writer)
                    if elem.getnext() is None or elem.getnext().tag != elem.tag:
                        break
    if feed_version != "3.0":
        update_version(directory, feed_version)
def feed_to_db_files(directory, feed_file):
    """Stream a VIP XML feed into db-style flat files, one per element type."""
    with open(feed_file) as xml_doc:
        context = iter(etree.iterparse(xml_doc, events=("start", "end")))
        # The root element carries the feed's schema version.
        event, root = context.next()
        feed_version = root.attrib["schemaVersion"]
        schema_props = SchemaProps(feed_version)
        db_props = schema_props.full_header_data("db")
        current_tag = ""
        writer = None
        for elem in extract_base_elements(context, db_props.keys()):
            # Start a fresh output file whenever the element type changes.
            if elem.tag != current_tag:
                current_tag = elem.tag
                writer = file_writer(directory, current_tag,
                                     db_props[current_tag])
            elem_dict, extras = process_db_sub_elems(elem, db_props)
            try:
                writer.writerow(elem_dict)
            except UnicodeEncodeError:
                # Strip characters known to break encoding, then retry.
                for key in elem_dict.keys():
                    value = elem_dict[key]
                    if value:
                        value = value.replace(u"\ufffd", "N")
                        for bad in (u'\u201d', u'\u201c', u'\u2019', u'\xbd',
                                    u'\xf3', u'\xa0', u'\xe9', u'\xe1'):
                            value = value.replace(bad, "")
                        elem_dict[key] = value
                writer.writerow(elem_dict)
            # Sub-elements that belong in other tables get their own writers.
            for extra in extras:
                extra_writer = file_writer(directory, extra["table"],
                                           db_props[extra["table"]])
                extra_writer.writerow(extra["elements"])
    if feed_version != "3.0":
        update_version(directory, feed_version)
def main(): print "setting up directories..." dt.clear_or_create(DIRECTORIES["temp"]) dt.create_directory(DIRECTORIES["archives"]) print "done setting up directories" ftype = ft.get_type(unpack_file) print "unpacking and flattening files..." unpack.unpack(unpack_file, DIRECTORIES["temp"]) unpack.flatten_folder(DIRECTORIES["temp"]) # I could have flatten_folder return a list of files in the directory, so that # we wouldn't have to search through the directory everytime for specific files # since os.walk is slow with directories with large files print "done unpacking and flattening" sp = SchemaProps(SCHEMA_URL) file_details = {"file":unpack_file, "process_time":process_time, "file_timestamp":file_timestamp} election_details = {} vip_id = None election_id = None print "converting to db style flat files...." if dt.file_by_name(CONFIG_FILE, DIRECTORIES["temp"]): file_details.update(process_config(DIRECTORIES["temp"], DIRECTORIES["temp"] + CONFIG_FILE, sp)) if dt.files_by_extension(".txt", DIRECTORIES["temp"]) > 0: file_details.update(process_flatfiles(DIRECTORIES["temp"], sp)) print "processing xml files..." xml_files = dt.files_by_extension(".xml", DIRECTORIES["temp"]) if len(xml_files) >= 1: ftff.feed_to_db_files(DIRECTORIES["temp"], xml_files[0], sp.full_header_data("db"), sp.version) os.remove(xml_files[0]) if "valid_files" in file_details: file_details["valid_files"].append(xml_files[0]) else: file_details["valid_files"] = [xml_files[0]] print "done processing xml files" print "getting feed details..." 
db = EasySQL("localhost","vip","username","password") try: with open(DIRECTORIES["temp"] + "source.txt", "r") as f: reader = csv.DictReader(f) row = reader.next() vip_id = row["vip_id"] election_details["vip_id"] = vip_id with open(DIRECTORIES["temp"] + "election.txt", "r") as f: reader = csv.DictReader(f) row = reader.next() election_details["election_date"] = row["date"] election_details["election_type"] = row["election_type"] except: er.report_summary(vip_id, election_id, file_details, election_details) return election_id = get_election_id(election_details, db) election_details["election_id"] = election_id print "done getting feed details" print "converting to full db files...." element_counts, error_data, warning_data = convert_to_db_files(vip_id, election_id, file_details["file_timestamp"], DIRECTORIES["temp"], sp) print "done converting to full db files" er.report_summary(vip_id, election_id, file_details, election_details, element_counts) if len(error_data) > 0: er.feed_issues(vip_id, file_details["file_timestamp"], error_data, "error") if len(warning_data) > 0: er.feed_issues(vip_id, file_details["file_timestamp"], warning_data, "warning") update_data(vip_id, election_id, file_details["file_timestamp"], db, element_counts, DIRECTORIES["temp"], DIRECTORIES["archives"]) db_validations(vip_id, election_id, db, sp) generate_feed(file_details)
# --- Script-level setup: database connection, directories, validation ---
# NOTE(review): `args`, `host`, `db_name`, `LOCATION`, `TMP_DIR`,
# `REPORT_DIR`, `DATA_DIR`, and `SCHEMA_URL` are presumably defined
# earlier in the file -- confirm against the full source.
username = args.username
password = args.password
conn = psycopg2.connect(host=host, database=db_name, user=username,
                        password=password)
# Rebuild the schema from scratch, then open the simplified SQL wrapper.
create_db.clear_setup_db(LOCATION, conn)
db = EasySQL(host, db_name, username, password)
process_dir = path.join(TMP_DIR, "vip_feed")
dt.clear_or_create(process_dir)
dt.clear_or_create(REPORT_DIR)
sp = SchemaProps(SCHEMA_URL)
#should have a call to get file type here sometime, and then call the necessary functions
#data_type = get_type(DIR)
#the db_flat type should not be an issue, because by the time it gets here, the data should have already been converted
#if data_type == "db_flat":
file_issues = check_db_flat(DATA_DIR, sp)
counts, errors, warnings = validation.file_validation(
    DATA_DIR, process_dir, file_issues["valid_files"], sp)
# Bulk-load each validated flat file into its matching table; the table
# name is the filename without its extension.
for f in os.listdir(process_dir):
    with open(path.join(process_dir, f), "r") as r:
        reader = DictReader(r)
        db.copy_upload(f.split(".")[0], reader.fieldnames, r.name)
def main(): print "setting up directories..." dt.clear_or_create(DIRECTORIES["temp"]) dt.create_directory(DIRECTORIES["archives"]) print "done setting up directories" ftype = ft.get_type(unpack_file) print "unpacking and flattening files..." unpack.unpack(unpack_file, DIRECTORIES["temp"]) unpack.flatten_folder(DIRECTORIES["temp"]) # I could have flatten_folder return a list of files in the directory, so that # we wouldn't have to search through the directory everytime for specific files # since os.walk is slow with directories with large files print "done unpacking and flattening" sp = SchemaProps(SCHEMA_URL) file_details = { "file": unpack_file, "process_time": process_time, "file_timestamp": file_timestamp } election_details = {} vip_id = None election_id = None print "converting to db style flat files...." if dt.file_by_name(CONFIG_FILE, DIRECTORIES["temp"]): file_details.update( process_config(DIRECTORIES["temp"], DIRECTORIES["temp"] + CONFIG_FILE, sp)) if dt.files_by_extension(".txt", DIRECTORIES["temp"]) > 0: file_details.update(process_flatfiles(DIRECTORIES["temp"], sp)) print "processing xml files..." xml_files = dt.files_by_extension(".xml", DIRECTORIES["temp"]) if len(xml_files) >= 1: ftff.feed_to_db_files(DIRECTORIES["temp"], xml_files[0], sp.full_header_data("db"), sp.version) os.remove(xml_files[0]) if "valid_files" in file_details: file_details["valid_files"].append(xml_files[0]) else: file_details["valid_files"] = [xml_files[0]] print "done processing xml files" print "getting feed details..." 
db = EasySQL("localhost", "vip", "username", "password") try: with open(DIRECTORIES["temp"] + "source.txt", "r") as f: reader = csv.DictReader(f) row = reader.next() vip_id = row["vip_id"] election_details["vip_id"] = vip_id with open(DIRECTORIES["temp"] + "election.txt", "r") as f: reader = csv.DictReader(f) row = reader.next() election_details["election_date"] = row["date"] election_details["election_type"] = row["election_type"] except: er.report_summary(vip_id, election_id, file_details, election_details) return election_id = get_election_id(election_details, db) election_details["election_id"] = election_id print "done getting feed details" print "converting to full db files...." element_counts, error_data, warning_data = convert_to_db_files( vip_id, election_id, file_details["file_timestamp"], DIRECTORIES["temp"], sp) print "done converting to full db files" er.report_summary(vip_id, election_id, file_details, election_details, element_counts) if len(error_data) > 0: er.feed_issues(vip_id, file_details["file_timestamp"], error_data, "error") if len(warning_data) > 0: er.feed_issues(vip_id, file_details["file_timestamp"], warning_data, "warning") update_data(vip_id, election_id, file_details["file_timestamp"], db, element_counts, DIRECTORIES["temp"], DIRECTORIES["archives"]) db_validations(vip_id, election_id, db, sp) generate_feed(file_details)
# Tail of the upload step: push the generated file to S3 and close the
# connection. NOTE(review): `k`, `output_file`, and `conn` come from code
# earlier in the file -- confirm whether these statements belong inside a
# function defined above this chunk.
print "sending files"
k.set_contents_from_filename(output_file)
conn.close()


def clean_directory(directory):
    # Normalize a directory path to end with exactly one trailing slash.
    if not directory.endswith("/"):
        return directory + "/"
    return directory


# --- Script-level configuration and local database setup ---
config = ConfigParser()
config.read(CONFIG_FILE)
schema_file = get_schema_file()
sp = SchemaProps(schema_file)
db_file_list = sp.key_list("db")
file_directory = config.get("local_settings", "file_directory")
file_directory = clean_directory(file_directory)
key = config.get("connection_settings", "key")
bucket = config.get("connection_settings", "bucket")
directory = config.get("connection_settings", "output_folder")
# "db_host" is passed to sqlite3.connect, so it holds a database path.
connection = sqlite3.connect(config.get("app_settings", "db_host"))
cursor = connection.cursor()
setup_db()
files_to_send = []
xml_file = None
config_file = None
# Tail of the upload step: create the S3 key under the configured output
# folder, push the generated file, and close the connection.
# NOTE(review): `b`, `output_file`, and `conn` come from code earlier in
# the file -- confirm whether these statements belong inside a function
# defined above this chunk.
k = Key(b)
k.key = directory + output_file
print "sending files"
k.set_contents_from_filename(output_file)
conn.close()


def clean_directory(directory):
    # Normalize a directory path to end with exactly one trailing slash.
    if not directory.endswith("/"):
        return directory + "/"
    return directory


# --- Script-level configuration and local database setup ---
config = ConfigParser()
config.read(CONFIG_FILE)
schema_file = get_schema_file()
sp = SchemaProps(schema_file)
db_file_list = sp.key_list("db")
file_directory = config.get("local_settings", "file_directory")
file_directory = clean_directory(file_directory)
key = config.get("connection_settings", "key")
bucket = config.get("connection_settings", "bucket")
directory = config.get("connection_settings", "output_folder")
# "db_host" is passed to sqlite3.connect, so it holds a database path.
connection = sqlite3.connect(config.get("app_settings", "db_host"))
cursor = connection.cursor()
setup_db()
files_to_send = []
xml_file = None
config_file = None