""" Run the script to clean the raw listings data. """ import csv import os import numpy as np import pandas as pd from modules.datareader import DataReader from modules.logger import Logger, DEBUG, INFO, WARNING, ERROR, CRITICAL from modules.exceptions import ColumnNotFoundException logfile = os.path.join(Logger.get_logfile_dir(__file__), 'logs', 'clean.log') _ = Logger.create(__file__, logfile, DEBUG) _.addHandler(Logger.create_file_handler(logfile, DEBUG)) _.addHandler(Logger.create_stream_handler(WARNING)) if __name__ == '__main__': _.info('Cleaning data...') data_reader = DataReader('./data/listings.csv') try: data_reader.filter_columns( 'id', 'description', 'host_since', 'host_is_superhost', 'host_listings_count', 'host_verifications', 'host_has_profile_pic', 'host_identity_verified', 'neighbourhood_cleansed', 'zipcode', 'property_type', 'room_type', 'accommodates', 'bathrooms', 'bedrooms', 'beds', 'bed_type', 'amenities', 'price', 'guests_included', 'extra_people',
# coding: utf-8 import csv import os import numpy as np import pandas as pd from modules.logger import Logger, DEBUG, INFO, WARNING, ERROR, CRITICAL from modules.exceptions import ColumnNotFoundException logfile = os.path.join(Logger.get_logfile_dir(__file__), '..', 'logs', 'datareader.log') _ = Logger.create(__file__, logfile, DEBUG) _.addHandler(Logger.create_file_handler(logfile, DEBUG)) _.addHandler(Logger.create_stream_handler(WARNING)) class DataReader(object): """ Reads the CSV data file then cleans it """ def __init__(self, fname, delim=","): self.delim = delim self.header = [] self.data = [] self.rows = 0 self.cols = 0 _.debug("Reading %s" % fname) with open(fname, 'r') as rf:
""" Available data formats: - Pandas DataFrame """ import os import numpy as np import pandas as pd from datetime import datetime from modules.datareader import DataReader from modules.logger import Logger, DEBUG, INFO, WARNING, ERROR, CRITICAL logfile = os.path.join(Logger.get_logfile_dir(__file__), '..', 'logs', 'wrangle.log') _ = Logger.create(__file__, logfile, DEBUG) _.addHandler(Logger.create_file_handler(logfile, DEBUG)) _.addHandler(Logger.create_stream_handler(WARNING)) def make_dataframe(fpath): df = pd.read_csv(fpath).dropna(subset=['host_since']) # Dates df['host_since_days'] = (pd.to_datetime('now') - pd.to_datetime(df['host_since'])).dt.days.astype(int) # Booleans df['host_is_superhost'] = df['host_is_superhost'] == 't' df['host_has_profile_pic'] = df['host_has_profile_pic'] == 't' df['host_identity_verified'] = df['host_identity_verified'] == 't'