from lxml import etree  # elem.getnext() below requires lxml's etree

def feed_to_db_files(directory, feed_file):
    with open(feed_file) as xml_doc:
        context = etree.iterparse(xml_doc, events=("start", "end"))
        context = iter(context)

        event, root = next(context)
        feed_version = root.attrib["schemaVersion"]

        sp = SchemaProps(feed_version)
        db_props = sp.full_header_data("db")

        def process_element(elem, writer):
            elem_dict, extras = process_db_sub_elems(elem, db_props)
            try:
                writer.writerow(elem_dict)
            except UnicodeEncodeError:
                # the Unicode replacement character becomes "N"; other
                # known-bad characters (smart quotes, accented letters,
                # non-breaking spaces) are dropped
                bad_chars = [u"\u201d", u"\u201c", u"\u2019", u"\xbd",
                             u"\xf3", u"\xa0", u"\xe9", u"\xe1"]
                for key in elem_dict:
                    if elem_dict[key]:
                        elem_dict[key] = elem_dict[key].replace(u"\ufffd", "N")
                        for char in bad_chars:
                            elem_dict[key] = elem_dict[key].replace(char, "")
                writer.writerow(elem_dict)

            for extra in extras:
                with FileWriter(
                    directory, extra["table"],
                    db_props[extra["table"]]) as temp_writer:
                    temp_writer.writerow(extra["elements"])
        element_gen = extract_base_elements(context, db_props.keys())
        for elem in element_gen:
            with FileWriter(directory, elem.tag, db_props[elem.tag]) as writer:
                # process first row to handle lookahead for section ending
                process_element(elem, writer)
                if elem.getnext() is None or elem.getnext().tag != elem.tag:
                    continue
                for elem in element_gen:
                    process_element(elem, writer)
                    if elem.getnext() is None or elem.getnext().tag != elem.tag:
                        break

    if feed_version != "3.0":
        update_version(directory, feed_version)
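
# The lookahead grouping above (peeking at elem.getnext() to detect the end of
# a run of same-tag elements) can also be expressed with itertools.groupby,
# which groups consecutive items by key. A minimal sketch, assuming the same
# element_gen, FileWriter, and process_element helpers as the example above:
from itertools import groupby
from operator import attrgetter

def write_grouped(directory, element_gen, db_props):
    for tag, group in groupby(element_gen, key=attrgetter("tag")):
        with FileWriter(directory, tag, db_props[tag]) as writer:
            for elem in group:
                process_element(elem, writer)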
Example #2
def feed_to_db_files(directory, feed_file):
    with open(feed_file) as xml_doc:
        context = etree.iterparse(xml_doc, events=("start", "end"))
        context = iter(context)

        event, root = next(context)
        feed_version = root.attrib["schemaVersion"]

        sp = SchemaProps(feed_version)
        db_props = sp.full_header_data("db")

        e_name = ""
        for elem in extract_base_elements(context, db_props.keys()):
            if elem.tag != e_name:
                e_name = elem.tag
                writer = file_writer(directory, e_name, db_props[e_name])
            elem_dict, extras = process_db_sub_elems(elem, db_props)
            try:
                writer.writerow(elem_dict)
            except UnicodeEncodeError:
                # same scrub as in the previous example: the replacement
                # character becomes "N", the other known-bad characters are
                # dropped
                bad_chars = [u"\u201d", u"\u201c", u"\u2019", u"\xbd",
                             u"\xf3", u"\xa0", u"\xe9", u"\xe1"]
                for key in elem_dict:
                    if elem_dict[key]:
                        elem_dict[key] = elem_dict[key].replace(u"\ufffd", "N")
                        for char in bad_chars:
                            elem_dict[key] = elem_dict[key].replace(char, "")
                writer.writerow(elem_dict)

            for extra in extras:
                temp_writer = file_writer(directory, extra["table"],
                                          db_props[extra["table"]])
                temp_writer.writerow(extra["elements"])
    if feed_version != "3.0":
        update_version(directory, feed_version)
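
# This variant assumes a file_writer helper that returns a csv.DictWriter over
# <directory>/<table>.txt with the table's columns as the header row. A
# minimal sketch of what such a helper might look like (an assumption -- the
# project's actual helper may differ, e.g. by caching open files):
import csv
import os

def file_writer(directory, table, columns):
    path = os.path.join(directory, table + ".txt")
    is_new = not os.path.exists(path) or os.path.getsize(path) == 0
    f = open(path, "ab")
    writer = csv.DictWriter(f, fieldnames=columns)
    if is_new:
        writer.writeheader()  # first open of this table: emit the header row
    return writer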
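
# Both examples scrub the same hard-coded characters when a row fails to
# encode. A more general sketch keeps the replacement-character fix, then
# encodes to ASCII and drops anything else unencodable (slightly broader
# behavior: it drops all non-ASCII, not just the listed characters):
def scrub_row(row):
    for key, value in row.items():
        if value:
            value = value.replace(u"\ufffd", "N")
            row[key] = value.encode("ascii", "ignore")
    return row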
Example #3
def main():

	print "setting up directories..."
	
	dt.clear_or_create(DIRECTORIES["temp"])
	dt.create_directory(DIRECTORIES["archives"])
	
	print "done setting up directories"

	ftype = ft.get_type(unpack_file)

	print "unpacking and flattening files..."

	unpack.unpack(unpack_file, DIRECTORIES["temp"])
	unpack.flatten_folder(DIRECTORIES["temp"])
	# flatten_folder could return a list of the files in the directory, so we
	# wouldn't have to search through the directory every time for specific
	# files, since os.walk is slow on directories with large files (see the
	# sketch after this function)

	print "done unpacking and flattening"

	sp = SchemaProps(SCHEMA_URL)
	file_details = {"file": unpack_file, "process_time": process_time,
		"file_timestamp": file_timestamp}
	election_details = {}
	vip_id = None
	election_id = None

	print "converting to db style flat files...."

	if dt.file_by_name(CONFIG_FILE, DIRECTORIES["temp"]):
		file_details.update(process_config(DIRECTORIES["temp"],
			DIRECTORIES["temp"] + CONFIG_FILE, sp))
	# files_by_extension returns a list (see the .xml call below), so test its
	# length rather than comparing the list itself to 0
	if len(dt.files_by_extension(".txt", DIRECTORIES["temp"])) > 0:
		file_details.update(process_flatfiles(DIRECTORIES["temp"], sp))
	print "processing xml files..."
	xml_files = dt.files_by_extension(".xml", DIRECTORIES["temp"])
	if len(xml_files) >= 1:
		ftff.feed_to_db_files(DIRECTORIES["temp"], xml_files[0],
			sp.full_header_data("db"), sp.version)
		os.remove(xml_files[0])
		if "valid_files" in file_details:
			file_details["valid_files"].append(xml_files[0])
		else:
			file_details["valid_files"] = [xml_files[0]]

	print "done processing xml files"

	print "getting feed details..."
	db = EasySQL("localhost", "vip", "username", "password")
	try:
		with open(DIRECTORIES["temp"] + "source.txt", "r") as f:
			reader = csv.DictReader(f)
			row = next(reader)
			vip_id = row["vip_id"]
			election_details["vip_id"] = vip_id
		with open(DIRECTORIES["temp"] + "election.txt", "r") as f:
			reader = csv.DictReader(f)
			row = next(reader)
			election_details["election_date"] = row["date"]
			election_details["election_type"] = row["election_type"]
	except Exception:
		# source.txt or election.txt is missing or malformed: report what we
		# have and stop
		er.report_summary(vip_id, election_id, file_details, election_details)
		return

	election_id = get_election_id(election_details, db)
	election_details["election_id"] = election_id
	print "done getting feed details"

	print "converting to full db files...."
	element_counts, error_data, warning_data = convert_to_db_files(
		vip_id, election_id, file_details["file_timestamp"],
		DIRECTORIES["temp"], sp)
	print "done converting to full db files"
	
	er.report_summary(vip_id, election_id, file_details, election_details, element_counts)
	if len(error_data) > 0:
		er.feed_issues(vip_id, file_details["file_timestamp"], error_data, "error")
	if len(warning_data) > 0:
		er.feed_issues(vip_id, file_details["file_timestamp"], warning_data, "warning")

	update_data(vip_id, election_id, file_details["file_timestamp"], db,
		element_counts, DIRECTORIES["temp"], DIRECTORIES["archives"])

	db_validations(vip_id, election_id, db, sp)

	generate_feed(file_details)
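
# As the comment in main() notes, flatten_folder could return the list of
# flattened files so callers would not need to re-walk the directory. A
# minimal sketch of that variant (hypothetical -- the real
# unpack.flatten_folder may handle name collisions differently):
import os
import shutil

def flatten_folder(directory):
    moved = []
    for root, dirs, files in os.walk(directory, topdown=False):
        for name in files:
            src = os.path.join(root, name)
            dst = os.path.join(directory, name)
            if src != dst:
                shutil.move(src, dst)
            moved.append(dst)
        if root != directory:
            os.rmdir(root)  # subfolders are empty once their files move up
    return moved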
Example #4
username = args.username
password = args.password
conn = psycopg2.connect(host=host,
                        database=db_name,
                        user=username,
                        password=password)

create_db.clear_setup_db(LOCATION, conn)

db = EasySQL(host, db_name, username, password)

process_dir = path.join(TMP_DIR, "vip_feed")
dt.clear_or_create(process_dir)
dt.clear_or_create(REPORT_DIR)

sp = SchemaProps(SCHEMA_URL)

# should have a call to get the file type here sometime, and then call the
# necessary functions
# data_type = get_type(DIR)

# the db_flat type should not be an issue, because by the time it gets here,
# the data should have already been converted
# if data_type == "db_flat":

file_issues = check_db_flat(DATA_DIR, sp)
counts, errors, warnings = validation.file_validation(
    DATA_DIR, process_dir, file_issues["valid_files"], sp)

for f in os.listdir(process_dir):
    with open(path.join(process_dir, f), "r") as r:
        reader = DictReader(r)
        db.copy_upload(f.split(".")[0], reader.fieldnames, r.name)
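
# copy_upload presumably bulk-loads each flat file into the matching table. A
# sketch of how such a helper could be built on psycopg2's COPY support (an
# assumption about EasySQL's internals, not its actual code):
def copy_upload(conn, table, columns, file_path):
    # table and column names come from trusted schema data, so plain string
    # formatting is tolerable here
    sql = "COPY %s (%s) FROM STDIN WITH CSV HEADER" % (table, ", ".join(columns))
    with open(file_path, "r") as f:
        conn.cursor().copy_expert(sql, f)
    conn.commit()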
Example #5
	k = Key(b)
	k.key = directory + output_file
	print "sending files"
	k.set_contents_from_filename(output_file)
	conn.close()

def clean_directory(directory):
	if not directory.endswith("/"):
		return directory + "/"
	return directory

config = ConfigParser()
config.read(CONFIG_FILE)

schema_file = get_schema_file()
sp = SchemaProps(schema_file)
db_file_list = sp.key_list("db")

file_directory = config.get("local_settings", "file_directory")
file_directory = clean_directory(file_directory)
key = config.get("connection_settings", "key")
bucket = config.get("connection_settings", "bucket")
directory = config.get("connection_settings", "output_folder")

connection = sqlite3.connect(config.get("app_settings", "db_host"))
cursor = connection.cursor()
setup_db()

files_to_send = []
xml_file = None
config_file = None
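
# The indented lines at the top of this example are the tail of a truncated
# upload function. A self-contained sketch of that kind of upload with the
# classic boto S3 API (function name and parameters are assumptions):
from boto.s3.connection import S3Connection
from boto.s3.key import Key

def send_to_s3(access_key, secret_key, bucket_name, directory, output_file):
    conn = S3Connection(access_key, secret_key)
    b = conn.get_bucket(bucket_name)
    k = Key(b)
    k.key = directory + output_file
    print "sending files"
    k.set_contents_from_filename(output_file)  # upload the local file to S3
    conn.close()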