Esempio n. 1
0
"""
Run the script to clean the raw listings data.
"""

import csv
import os

import numpy as np
import pandas as pd

from modules.datareader import DataReader
from modules.logger import Logger, DEBUG, INFO, WARNING, ERROR, CRITICAL
from modules.exceptions import ColumnNotFoundException

# Module-level logging setup for the cleaning script.
# The log path is built as <Logger.get_logfile_dir(__file__)>/logs/clean.log.
# NOTE(review): the sibling modules insert '..' before 'logs'; presumably this
# script sits one directory above them so all three resolve to the same logs
# directory -- confirm against the project layout.
logfile = os.path.join(Logger.get_logfile_dir(__file__), 'logs', 'clean.log')
# `_` is the logger object the rest of this script logs through (e.g. `_.info(...)`).
_ = Logger.create(__file__, logfile, DEBUG)
# Two handlers are attached: a file handler created with DEBUG for `logfile`,
# and a stream handler created with WARNING.
# NOTE(review): Logger.create() already receives `logfile`; if it attaches its
# own file handler, the explicit one below may duplicate records -- verify in
# modules.logger.
_.addHandler(Logger.create_file_handler(logfile, DEBUG))
_.addHandler(Logger.create_stream_handler(WARNING))

if __name__ == '__main__':
    _.info('Cleaning data...')
    data_reader = DataReader('./data/listings.csv')

    try:
        data_reader.filter_columns(
            'id', 'description', 'host_since', 'host_is_superhost',
            'host_listings_count', 'host_verifications', 'host_has_profile_pic',
            'host_identity_verified', 'neighbourhood_cleansed', 'zipcode',
            'property_type', 'room_type', 'accommodates', 'bathrooms', 'bedrooms',
            'beds', 'bed_type', 'amenities', 'price', 'guests_included', 'extra_people',
Esempio n. 2
0
# coding: utf-8
import csv
import os

import numpy as np
import pandas as pd

from modules.logger import Logger, DEBUG, INFO, WARNING, ERROR, CRITICAL
from modules.exceptions import ColumnNotFoundException

# Module-level logging setup for the data reader.
# The log path is built as <Logger.get_logfile_dir(__file__)>/../logs/datareader.log,
# i.e. a `logs` directory one level above the directory the helper returns.
logfile = os.path.join(Logger.get_logfile_dir(__file__), '..', 'logs', 'datareader.log')
# `_` is the logger object this module logs through (e.g. `_.debug(...)`).
_ = Logger.create(__file__, logfile, DEBUG)
# Two handlers are attached: a file handler created with DEBUG for `logfile`,
# and a stream handler created with WARNING.
# NOTE(review): Logger.create() already receives `logfile`; if it attaches its
# own file handler, the explicit one below may duplicate records -- verify in
# modules.logger.
_.addHandler(Logger.create_file_handler(logfile, DEBUG))
_.addHandler(Logger.create_stream_handler(WARNING))


class DataReader(object):
    """
    Reads the CSV data file then cleans it
    """

    def __init__(self, fname, delim=","):
        self.delim = delim
        
        self.header = []
        self.data = []
        self.rows = 0
        self.cols = 0

        _.debug("Reading %s" % fname) 
        with open(fname, 'r') as rf:
"""
Available data formats:
  - Pandas DataFrame
"""

import os

import numpy as np
import pandas as pd

from datetime import datetime
from modules.datareader import DataReader
from modules.logger import Logger, DEBUG, INFO, WARNING, ERROR, CRITICAL

# Module-level logging setup for the wrangling helpers.
# The log path is built as <Logger.get_logfile_dir(__file__)>/../logs/wrangle.log,
# i.e. a `logs` directory one level above the directory the helper returns.
logfile = os.path.join(Logger.get_logfile_dir(__file__), '..', 'logs', 'wrangle.log')
# `_` is the module-level logger object created for this file.
_ = Logger.create(__file__, logfile, DEBUG)
# Two handlers are attached: a file handler created with DEBUG for `logfile`,
# and a stream handler created with WARNING.
# NOTE(review): Logger.create() already receives `logfile`; if it attaches its
# own file handler, the explicit one below may duplicate records -- verify in
# modules.logger.
_.addHandler(Logger.create_file_handler(logfile, DEBUG))
_.addHandler(Logger.create_stream_handler(WARNING))


def make_dataframe(fpath):
	df = pd.read_csv(fpath).dropna(subset=['host_since'])

	# Dates
	df['host_since_days'] = (pd.to_datetime('now') - pd.to_datetime(df['host_since'])).dt.days.astype(int)

	# Booleans
	df['host_is_superhost'] = df['host_is_superhost'] == 't'
	df['host_has_profile_pic'] = df['host_has_profile_pic'] == 't'
	df['host_identity_verified'] = df['host_identity_verified'] == 't'