import os

import pandas as pd

from lib_cinci import data_folder

# Cleaning steps for the raw fire.csv extract.
# Progress messages use single-argument print(...) calls so the script parses
# (and prints the same text) on both Python 2 and Python 3.

# All I/O takes place in the 'tmp' subfolder of the folder returned by
# data_folder.for_file for this script
path_to_data_folder = data_folder.for_file(__file__)
os.chdir(os.path.join(path_to_data_folder, 'tmp'))
print('Working in folder: {}'.format(path_to_data_folder))

# Load csv file, parsing the incident_date column as dates
df = pd.read_csv("fire.csv", parse_dates=['incident_date'])
print('Raw file has {:,d} rows and {:,d} columns'.format(*df.shape))

# Lowercase column names (must happen before the rename below, which looks
# up the lowercase 'street_address')
df.columns = df.columns.map(lambda s: s.lower())
df.rename(columns={'street_address': 'address'}, inplace=True)

# Check how many rows have empty addresses
print('{:,d} rows with empty address, removing those'.format(df.address.isnull().sum()))

# Remove rows without address
df = df[df.address.notnull()]

# Check for duplicates (rows that repeat an earlier row in every column)
duplicates = df.duplicated()
n_duplicates = duplicates.sum()
print('Found {:,d} duplicates, dropping them'.format(n_duplicates))
df = df[~duplicates]

#Strip some columns
#csvsql has a bug that is not producing the
year = int(sys.argv[2])

# Set the folder where this file is located as the working directory
script_dir = os.path.abspath(os.path.dirname(__file__))
os.chdir(script_dir)
print("Loading definitions.yaml from: %s" % os.getcwd())

# safe_load only builds plain YAML types; yaml.load() without an explicit
# Loader can construct arbitrary Python objects and is rejected by recent
# PyYAML releases.
# NOTE(review): assumes definitions.yaml contains only plain mappings, lists
# and scalars - confirm.
with open("definitions.yaml") as f:
    definitions = yaml.safe_load(f)
# Column names for this year; the "names" mapping is keyed by the integer year
names = definitions["names"][year]

# Move the current directory so all I/O operations take place in the
# corresponding data folder.
# Note: this rebinds `data_folder` to the value returned by for_file, so
# for_file can no longer be reached through this name afterwards.
data_folder = data_folder.for_file(__file__)
os.chdir(data_folder)
print("Changing working dir to: %s" % os.getcwd())

# Create tmp folder if it does not exist
if not os.path.exists("tmp"):
    print("Creating tmp folder in %s" % os.getcwd())
    os.makedirs("tmp")

print("Loading data from %d..." % year)
# Force all columns to be read as strings to prevent pandas eliminating
# leading 0s and other weird stuff. There are some columns with only one
# blank space, interpret those as NA.
# The builtin `str` is used instead of `np.str`: the latter was only an alias
# of the builtin and has been removed from NumPy.
df = pd.read_csv(input_file, names=names, dtype=str, na_values=[" "])
import os
import sys

import pandas as pd

from lib_cinci import data_folder

# Cleaning steps for the raw diff_crime.csv extract.
# Progress messages use single-argument print(...) calls so the script parses
# (and prints the same text) on both Python 2 and Python 3.

# All I/O takes place in the 'tmp' subfolder of the folder returned by
# data_folder.for_file for this script
path_to_data_folder = data_folder.for_file(__file__)
os.chdir(os.path.join(path_to_data_folder, 'tmp'))
print('Working in folder: %s' % path_to_data_folder)

input_filename = "diff_crime.csv"
output_filename = "diff_crime_clean.csv"

# Load csv file; dtype=object keeps every column as read, with no type inference
df = pd.read_csv(input_filename, dtype=object)
print('Raw file has {:,d} rows and {:,d} columns'.format(*df.shape))

# Lowercase column names
df.columns = df.columns.map(lambda s: s.lower())

# Parse the occurrence timestamp and use it as the index so rows can be
# filtered by year below
df['occurredon'] = pd.to_datetime(df['occurredon'])
df = df.set_index('occurredon')

# Rename the index and some columns
mapping = {'location': 'address', 'addressstate': 'state'}
df.index.rename('occurred_on', inplace=True)
df.rename(columns=mapping, inplace=True)

# We are only using data starting from 2012
df = df[df.index.year >= 2012]
print('Subset from 2012 has {:,d} rows and {:,d} columns'.format(*df.shape))
import os

import pandas as pd

from lib_cinci import data_folder

# Run every I/O operation from inside the folder that data_folder.for_file
# returns for this script (the name `data_folder` is rebound to that value)
data_folder = data_folder.for_file(__file__)
os.chdir(data_folder)

# Make sure the tmp folder exists before moving into it
if not os.path.exists('tmp'):
    print('Creating tmp folder in %s' % os.getcwd())
    os.makedirs('tmp')

# From here on, work inside the tmp folder
os.chdir('tmp')
print('Changing working dir to: %s' % os.getcwd())


def read_for_year(year):
    """Read owners_<year>_resolved.csv and pull out its "entity" column,
    indexed by parcel_id and named "owner_<year>"."""
    owners_file = "owners_{}_resolved.csv".format(year)
    df = pd.read_csv(owners_file)
    # parcel ids are handled as strings
    df["parcel_id"] = df["parcel_id"].astype(str)
    #dupes = df.groupby(level=0).filter(lambda x: len(x) > 1)
    # Keep a single row per parcel_id, then index the frame by it
    df = df.drop_duplicates(subset='parcel_id').set_index("parcel_id")
    entities = df["entity"]
    entities.name = "owner_" + str(year)
# Step zero: read from yaml file
parser = argparse.ArgumentParser()
parser.add_argument("update_file",
                    help="Path to yaml file with configuration parameters")
args = parser.parse_args()

with open(args.update_file, 'r') as f:
    # safe_load only builds plain YAML types; yaml.load() without an explicit
    # Loader can construct arbitrary Python objects from the file and is
    # rejected by recent PyYAML releases.
    params = yaml.safe_load(f)

db_column = params['storage']['column']
file_column = params['source']['column']
schema = params['storage']['schema']

# Folder to use for I/O
folder = data_folder.for_file(args.update_file)
os.chdir(folder)
logger.info('Using {} for I/O operations'.format(folder))

# Step one: download file from remote server if user provided url
try:
    url = params['source']['url']
except KeyError:
    # A missing 'url' key is the only expected failure here: params['source']
    # was already indexed successfully above.
    logger.info('URL was not present in the configuration file...')
else:
    logger.info('Downloading file...')
    data_file = urllib2.urlopen(url)
    try:
        # Download file, replacing it if it already exists
        with open(params['source']['filename'], 'wb') as output:
            output.write(data_file.read())
    finally:
        # Release the connection even if writing the file fails
        data_file.close()
logger = logging.getLogger()

# Step zero: read from yaml file
parser = argparse.ArgumentParser()
parser.add_argument("update_file",
                    help="Path to yaml file with configuration parameters")
args = parser.parse_args()

with open(args.update_file, 'r') as f:
    # safe_load only builds plain YAML types; yaml.load() without an explicit
    # Loader can construct arbitrary Python objects from the file and is
    # rejected by recent PyYAML releases.
    params = yaml.safe_load(f)

db_column = params['storage']['column']
file_column = params['source']['column']
schema = params['storage']['schema']

# Folder to use for I/O
folder = data_folder.for_file(args.update_file)
os.chdir(folder)
logger.info('Using {} for I/O operations'.format(folder))

# Step one: download file from remote server if user provided url
try:
    url = params['source']['url']
except KeyError:
    # A missing 'url' key is the only expected failure here: params['source']
    # was already indexed successfully above.
    logger.info('URL was not present in the configuration file...')
else:
    logger.info('Downloading file...')
    data_file = urllib2.urlopen(url)
    try:
        # Download file, replacing it if it already exists
        with open(params['source']['filename'], 'wb') as output:
            output.write(data_file.read())
    finally:
        # Release the connection even if writing the file fails
        data_file.close()