Ejemplo n.º 1
0
import pandas as pd
from lib_cinci import data_folder
import os

# Ask lib_cinci for the data folder that goes with this script and work from
# its 'tmp' sub-folder, so the relative file names below resolve there.
path_to_data_folder = data_folder.for_file(__file__)

os.chdir(os.path.join(path_to_data_folder, 'tmp'))
# NOTE(review): the message names the data folder, while the actual working
# directory is its 'tmp' sub-folder.
print('Working in folder: {}'.format(path_to_data_folder))

# Load csv file, parsing the 'incident_date' column as datetimes
df = pd.read_csv("fire.csv", parse_dates=['incident_date'])
print('Raw file has {:,d} rows and {:,d} columns'.format(*df.shape))

# Lowercase column names
df.columns = df.columns.map(lambda s: s.lower())

df.rename(columns={'street_address': 'address'}, inplace=True)

# Check how many rows have empty addresses
print('{:,d} rows with empty address, removing those'.format(df.address.isnull().sum()))
# Remove rows without address
df = df[df.address.notnull()]

# Check for duplicates (rows identical across every column)
duplicates = df.duplicated()
n_duplicates = duplicates.sum()
print('Found {:,d} duplicates, dropping them'.format(n_duplicates))
# Keep only the rows that were not flagged as repeats
df = df[~duplicates]

#Strip some columns
#csvsql has a bug that is not producing the
Ejemplo n.º 2
0
# Year to process, taken from the second command-line argument.
# NOTE(review): the meaning of sys.argv[1] is not visible in this excerpt.
year = int(sys.argv[2])

# Set folder where this file is located as working directory
script_dir = os.path.abspath(os.path.dirname(__file__))
os.chdir(script_dir)

print("Loading definitions.yaml from: %s" % os.getcwd())

# SECURITY NOTE(review): yaml.load without an explicit safe loader can build
# arbitrary Python objects; consider yaml.safe_load if definitions.yaml only
# contains plain data.
with open("definitions.yaml") as f:
    definitions = yaml.load(f.read())

# Column names to use for the selected year
names = definitions["names"][year]

# Move current directory so all I/O operations take place in the corresponding
# data folder.
# NOTE(review): this rebinds the name `data_folder` to the value returned by
# for_file(), so `data_folder.for_file` is no longer reachable afterwards.
data_folder = data_folder.for_file(__file__)
os.chdir(data_folder)

print("Changing working dir to: %s" % os.getcwd())

# Create tmp folder if it does not exist
if not os.path.exists("tmp"):
    print("Creating tmp folder in %s" % os.getcwd())
    os.makedirs("tmp")

print("Loading data from %d..." % year)

# Force all columns to be read as strings to prevent pandas eliminating
# leading 0s and other weird stuff. There are some columns with only one blank
# space, interpret those as NA.
# The builtin `str` is used because the `np.str` alias was removed from NumPy.
df = pd.read_csv(input_file, names=names, dtype=str, na_values=[" "])
import pandas as pd
from lib_cinci import data_folder
import os
import sys

# Ask lib_cinci for the data folder that goes with this script and work from
# its 'tmp' sub-folder, so the relative file names below resolve there.
path_to_data_folder = data_folder.for_file(__file__)

os.chdir(os.path.join(path_to_data_folder, 'tmp'))
# NOTE(review): the message names the data folder, while the actual working
# directory is its 'tmp' sub-folder.
print('Working in folder: %s' % path_to_data_folder)

input_filename = "diff_crime.csv"
# NOTE(review): output_filename is not used in the visible part of the script.
output_filename = "diff_crime_clean.csv"

# Load csv file, reading every column as a generic object (no type inference)
df = pd.read_csv(input_filename, dtype=object)
print('Raw file has {:,d} rows and {:,d} columns'.format(*df.shape))

# Lowercase column names
df.columns = df.columns.map(lambda s: s.lower())

# Parse the occurrence timestamp and use it as the index
df['occurredon'] = pd.to_datetime(df['occurredon'])
df = df.set_index('occurredon')

# Rename the index and some columns
mapping = {'location': 'address', 'addressstate': 'state'}
df.index.rename('occurred_on', inplace=True)
df.rename(columns=mapping, inplace=True)

# We are only using data starting from 2012
df = df[df.index.year >= 2012]
print('Subset from 2012 has {:,d} rows and {:,d} columns'.format(*df.shape))
Ejemplo n.º 4
0
import os
import pandas as pd
from lib_cinci import data_folder

# Make the folder returned by data_folder.for_file() the working directory so
# that every relative path used for I/O below resolves inside it.
# NOTE(review): this rebinds the name `data_folder` to the returned value.
data_folder = data_folder.for_file(__file__)
os.chdir(data_folder)

# Ensure the 'tmp' sub-folder exists, then enter it
if not os.path.exists('tmp'):
    print('Creating tmp folder in %s' % os.getcwd())
    os.makedirs('tmp')
os.chdir('tmp')

# Report where the rest of the script will read and write files
print('Changing working dir to: %s' % os.getcwd())


def read_for_year(year):
    """Load the resolved owners file for ``year`` and pick out the owners.

    Reads ``owners_<year>_resolved.csv`` relative to the current working
    directory, keeps one row per ``parcel_id``, indexes the frame by
    ``parcel_id`` and takes its ``entity`` column, naming that Series
    ``owner_<year>``.

    NOTE(review): no return statement for ``entities`` is visible, and the
    statements that follow look unrelated to owner loading -- confirm the
    function's intended ending.
    """
    df = pd.read_csv("owners_{}_resolved.csv".format(year))
    # Treat parcel ids as strings from here on
    df["parcel_id"] = df["parcel_id"].astype(str)

    #dupes = df.groupby(level=0).filter(lambda x: len(x) > 1)
    # Keep a single row per parcel_id (drop_duplicates keeps the first one by
    # default)
    df = df.drop_duplicates(subset='parcel_id')

    df = df.set_index("parcel_id")
    entities = df["entity"]
    # Label the Series with the year it belongs to
    entities.name = "owner_" + str(year)
    # Step zero: read the configuration from the yaml file named on the
    # command line
    parser = argparse.ArgumentParser()
    parser.add_argument("update_file",
                        help="Path to yaml file with configuration parameters")
    args = parser.parse_args()

    # SECURITY NOTE(review): yaml.load without an explicit safe loader can
    # build arbitrary Python objects; consider yaml.safe_load if the
    # configuration only contains plain data.
    with open(args.update_file, 'r') as f:
        params = yaml.load(f)

    # NOTE(review): these three values are not used in the visible part of
    # this code; presumably they are consumed further down.
    db_column = params['storage']['column']
    file_column = params['source']['column']
    schema = params['storage']['schema']

    # Folder to use for I/O
    folder = data_folder.for_file(args.update_file)
    os.chdir(folder)
    logger.info('Using {} for I/O operations'.format(folder))

    # Step one: download file from remote server if user provided url
    try:
        url = params['source']['url']
    except KeyError:
        # 'url' is optional: without it no download is attempted
        logger.info('URL was not present in the configuration file...')
    else:
        logger.info('Downloading file...')
        data_file = urllib2.urlopen(url)
        try:
            # Download file, replacing it if it already exists
            with open(params['source']['filename'], 'wb') as output:
                output.write(data_file.read())
        finally:
            # Always release the HTTP response, even if the write fails
            data_file.close()
Ejemplo n.º 6
0
    # Root logger, used for the progress messages below
    logger = logging.getLogger()

    # Step zero: read the configuration from the yaml file named on the
    # command line
    parser = argparse.ArgumentParser()
    parser.add_argument("update_file", help="Path to yaml file with configuration parameters")
    args = parser.parse_args()

    # SECURITY NOTE(review): yaml.load without an explicit safe loader can
    # build arbitrary Python objects; consider yaml.safe_load if the
    # configuration only contains plain data.
    with open(args.update_file, 'r') as f:
        params = yaml.load(f)

    # NOTE(review): these three values are not used in the visible part of
    # this code; presumably they are consumed further down.
    db_column = params['storage']['column']
    file_column = params['source']['column']
    schema = params['storage']['schema']

    # Folder to use for I/O
    folder = data_folder.for_file(args.update_file)
    os.chdir(folder)
    logger.info('Using {} for I/O operations'.format(folder))

    # Step one: download file from remote server if user provided url
    try:
        url = params['source']['url']
    except KeyError:
        # 'url' is optional: without it no download is attempted
        logger.info('URL was not present in the configuration file...')
    else:
        logger.info('Downloading file...')
        data_file = urllib2.urlopen(url)
        try:
            # Download file, replacing it if it already exists
            with open(params['source']['filename'], 'wb') as output:
                output.write(data_file.read())
        finally:
            # Always release the HTTP response, even if the write fails
            data_file.close()