Example #1
def clean_merge_and_save(data_filename='df1.feather',
                         weather_filename='weather_df.feather',
                         save_as=None,
                         raw_dir=utils.path_to('data', 'raw'),
                         interim_dir=utils.path_to('data', 'interim')):
    """ Read and clean raw data, merge, and save DataFrame
    [Arg]   data_file_path
            weather_file_path
    """
    # read data
    df_data = pd.read_feather(os.path.join(raw_dir, data_filename))
    df_weather = pd.read_feather(os.path.join(raw_dir, weather_filename))

    # call clean function
    df_data = data.clean_dataframe_p1(df_data)

    df_weather = data.clean_weather_df(df_weather)

    # merge data
    df_concat = data.merge_df1_and_weather(df_data, df_weather)

    # save data
    if save_as is not None:
        feather_filepath = os.path.join(interim_dir, save_as)
        print('Writing feather file to {}'.format(feather_filepath))
        df_concat.to_feather(feather_filepath)

    return df_concat
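A minimal usage sketch for the function above, assuming the default feather files exist under data/raw; the output filename is a hypothetical example:

df_merged = clean_merge_and_save(save_as='df1_weather_merged.feather')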
Example #2
def unzipping_zip_files(file_name, unzip_dir=utils.path_to('data',
                                                           'external')):
    """Extract a zip archive located in unzip_dir into that same directory."""
    file_path = os.path.join(unzip_dir, file_name)
    with ZipFile(file_path, 'r') as zipObj:
        # extract all contents of the zip file into unzip_dir
        zipObj.extractall(unzip_dir)
    print('file unzipped')
Example #3
    def read(self):
        data = []
        # collect rows from the CSV file, skipping rows whose values are all empty
        with open(utils.path_to(self.filename), 'r', newline='') as f:
            reader = DictReader(f)
            for row in reader:
                if not all(v == '' for v in row.values()):
                    data.append(row)
        return data
Example #4
    def test_various(self):
        result = unittest.TestResult()
        suite = QUnitSuite(path_to('success.html'))
        suite(result)

        self.assertEqual(result.skipped, [])
        self.assertEqual(result.errors, [])
        self.assertEqual(result.failures, [])
        self.assertEqual(result.testsRun, 6)
Example #5
    def test_polyfills(self):
        result = unittest.TestResult()
        suite = QUnitSuite(path_to('polyfill.html'))
        suite(result)

        self.assertEqual(result.skipped, [])
        self.assertEqual(result.errors, [])
        self.assertEqual(result.failures, [])
        self.assertEqual(result.testsRun, 1)
Example #6
def execute_query_and_save_df(
        query_filename,
        feather_filename=None, *,
        query_dir=utils.path_to('src', 'data'),
        feather_dir=utils.path_to('data', 'raw')
):
    """read SQL query from file, execute query, return pandas dataframe and
    optionally save pandas dataframe at given file path in feather format

    Arguments:
    query_filename (str): name of the query file
    feather_filename (str, optional): name of the feather file to write

    Keyword Arguments:
    feather_dir (str, optional): directory where the feather file is written
    query_dir  (str, optional): directory where the query is stored

    Returns:
    pd.DataFrame: pandas dataframe with the query result
    """

    print('Opening database connection')
    db_connection = sql.connect(
              host='35.233.4.203',
              user='******',
              passwd='ier2rJZte8rt4fGHj2Sfi',
              database='s2ds'
    )

    query_filepath = os.path.join(query_dir, query_filename)
    print('Querying database with query in ' + query_filepath)
    query_string = utils.read_file_as_string(query_filepath)
    df = pd.read_sql(query_string, con=db_connection)

    print('Closing database connection')
    db_connection.close()

    if feather_filename is not None:
        feather_filepath = os.path.join(feather_dir, feather_filename)
        write_feather_file(df, feather_filepath)

    return df
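A hedged usage sketch; 'jobs.sql' and 'jobs.feather' are hypothetical names for a query file under src/data and an output file under data/raw:

df_jobs = execute_query_and_save_df('jobs.sql', feather_filename='jobs.feather')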
Example #7
    def test_various(self):
        result = unittest.TestResult()
        suite = QUnitSuite(path_to('failure.html'))
        suite(result)

        self.assertEqual(result.skipped, [])
        self.assertEqual(result.errors, [])
        self.assertEqual(len(result.failures), 10)
        # used to check messages, but source may be randomly added
        # based on exact phantomjs version so f**k that

        self.assertEqual(result.testsRun, 8)
Example #8
    def test_timeout(self):
        result = unittest.TestResult()
        # lower timeout to not blow up test suite runtime worse than now
        suite = QUnitSuite(path_to('timeout.html'), timeout=500)
        suite(result)

        self.assertEqual(result.skipped, [])
        self.assertEqual(result.testsRun, 1)
        self.assertEqual(result.failures, [])
        self.assertEqual(len(result.errors), 1)
        test, message = result.errors[0]
        self.assertEqual(str(test), "phantomjs: startup")
        self.assertTrue(message.startswith("PhantomJS timed out"))
Example #9
def save_model(classifier, save_as, model_dir=utils.path_to('models')):
    """Save a classifier to disk with pickle.

    Arguments:
        classifier (sklearn API classifier): classifier object
        save_as (string): filename to write
        model_dir (string): directory where the file is written
    """
    try:
        with open(os.path.join(model_dir, save_as), "wb") as f:
            pickle.dump(obj=classifier, file=f)
        print('Saved: {}.'.format(type(classifier)))
    except Exception as error:
        print('Not successful! {}: {}'.format(type(classifier), error))
Example #10
def get_point_heatmap(df, location_lng, location_lat,
                      shape_file_name='london_shape.bin',
                      shape_file_path=utils.path_to('src', 'viz')):
    """Convert longitude/latitude pairs into a GeoDataFrame of points and
    load the London shape file to plot them against.
    """

    # load shape files
    london_shp = _load_shape_file(shape_file_name, shape_file_path)

    # data points to gdp
    gdf = gpd.GeoDataFrame(df, crs={'init': 'epsg:4326'},
                           geometry=[Point(xy) for xy in zip(location_lng, location_lat)])

    return gdf, london_shp
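A usage sketch; df is assumed to be a DataFrame with pickup coordinates, and the column names pickup_lng and pickup_lat are hypothetical:

gdf, london_shp = get_point_heatmap(df, df['pickup_lng'], df['pickup_lat'])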
Example #11
    def test_ensure_directories(self):
        folder_name = '-temp-test-folder-should-be-removed'
        data_path = utils.path_to('data', folder_name, 'data.csv')

        # Make sure the path does not exists
        path = os.path.dirname(data_path)
        if os.path.exists(path):
            os.rmdir(path)

        utils.ensure_directories(data_path)

        self.assertTrue(os.path.exists(path))

        # Cleanup
        os.rmdir(path)
Example #12
def get_postcode_heatmap(df, target_str,
                         postcode='pickup_postcode_outer',
                         shape_file_name='london_shape.bin',
                         shape_file_path=utils.path_to('src', 'viz')):
    """Average target_str within each postcode and merge the result onto the
    London shape file, returning a GeoDataFrame ready for a heatmap plot.
    """
    # load shape files
    london_shp = _load_shape_file(shape_file_name, shape_file_path)

    # Average data within the postcode
    avg_target = df.groupby(postcode)[target_str].mean().round(1)

    # merge on postcode index
    heatmap_gdf = london_shp.merge(avg_target, left_index=True, right_index=True, how='left')

    return heatmap_gdf
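A usage sketch; df is assumed to carry the postcode column, and the target column name 'price' is hypothetical:

heatmap_gdf = get_postcode_heatmap(df, 'price')
# heatmap_gdf.plot(column='price') would then shade each outer postcode by its average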
Example #13
def load_model(filename, model_dir=utils.path_to('models')):
    """Load a classifier from disk with pickle.

    Arguments:
        filename (string): filename to load
        model_dir (string): directory containing the file

    Returns:
        classifier (sklearn API classifier): loaded classifier
    """
    try:
        with open(os.path.join(model_dir, filename), "rb") as f:
            classifier = pickle.load(f)
        print('Loaded: {}.'.format(type(classifier)))
        return classifier
    except Exception as error:
        print('Not Successful! {}'.format(error))
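A round-trip sketch for save_model and load_model, assuming scikit-learn is installed and the models directory exists; 'rf_model.pkl' is a hypothetical filename:

from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier()
save_model(clf, save_as='rf_model.pkl')
clf_loaded = load_model('rf_model.pkl')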
Example #14
            print(str_series[del_pattern].value_counts())
            postcode_df = postcode_df[~del_pattern]
            str_series = str_series[~del_pattern]

        # Delete districts outside London
        del_pattern = ~postcode_df.is_london
        if del_pattern.sum() > 0:
            print(
                'Deleting {} districts outside London, listing districts with freq >= 100'
                .format(del_pattern.sum()))
            print_series = postcode_df.district[del_pattern].value_counts()
            print(print_series[print_series >= 100])
            postcode_df = postcode_df[~del_pattern]
            postcode_df.drop(columns='is_london', inplace=True)
            str_series = str_series[~del_pattern]

        # Delete unrecognized London outcode
        del_pattern = postcode_df.outcode.isna()
        if del_pattern.sum() > 0:
            print('Deleting {} unrecognized London outcodes'.format(
                del_pattern.sum()))
            print(str_series[del_pattern].value_counts().head(20))
            postcode_df = postcode_df[~del_pattern]

    return postcode_df


if __name__ == "__main__":
    df = pd.read_feather(utils.path_to('data', 'raw', 'jobs.feather'))
    clean_dataframe_p1(df.head(100))
Example #15
def downloading_scripts(file_name, url,
                        download_dir=utils.path_to('data', 'external')):
    """Download a file from url into download_dir, creating directories as needed."""
    download_path = os.path.join(download_dir, file_name)
    utils.ensure_directories(download_path)
    urllib.request.urlretrieve(url, download_path)
    print('file downloaded')
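An end-to-end sketch combining downloading_scripts with unzipping_zip_files from Example #2; the URL and archive name are placeholders:

downloading_scripts('london_shapes.zip', 'https://example.com/london_shapes.zip')
unzipping_zip_files('london_shapes.zip')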
Example #16
import pandas as pd
import utils


def add_event_outcome(jobs, jobs_history):
    df = jobs.merge(jobs_history, on='job_id', suffixes=['', '_history'])
    df['event'] = pd.Categorical(df.event, categories=['accepted', 'rejected'])
    return df


if __name__ == '__main__':
    INPATH = utils.path_to('data', 'final', 'df_clean_jobs.feather')
    INPATH_HISTORY = utils.path_to('data', 'raw', 'jobs_history.feather')
    OUTPATH = utils.path_to('data', 'final', 'df_clean_event.feather')

    print('Reading feather file from ' + INPATH)
    jobs = pd.read_feather(INPATH)
    print('Reading feather file from ' + INPATH_HISTORY)
    jobs_history = pd.read_feather(INPATH_HISTORY)

    print('Merging dataframes')
    df = add_event_outcome(jobs, jobs_history)

    print('Writing feather file to ' + OUTPATH)
    utils.ensure_directories(OUTPATH)
    df.to_feather(OUTPATH)
Example #17
    print('Unchanged features: ' + ', '.join(colnames))
    # return unchanged columns of input dataframe if include_pass is True, empty dataframe otherwise
    df_out = pd.DataFrame()
    if include_pass:
        df_out = df[colnames]
    return df_out


def _timed_categories(df, colnames, *, cycletypes):
    print('Timed categorising: ' + ', '.join(colnames))
    # extract attributes of datetime vars
    df_out = pd.DataFrame()
    for col in colnames:
        for cycle in cycletypes:
            retriever = attrgetter(cycle.attribute)
            df_out['_'.join([col, cycle.section_name])] = retriever(df[col])

    return df_out
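_timed_categories expects each cycletypes entry to expose an attribute path into the pandas datetime accessor plus a name for the generated column; a minimal sketch with hypothetical cycle definitions, assuming df holds a datetime column named created_at:

from collections import namedtuple

Cycle = namedtuple('Cycle', ['attribute', 'section_name'])
cycles = [Cycle(attribute='dt.hour', section_name='hour'),
          Cycle(attribute='dt.dayofweek', section_name='dayofweek')]
time_features = _timed_categories(df, ['created_at'], cycletypes=cycles)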


if __name__ == "__main__":
    df = pd.read_feather(utils.path_to('data', 'interim', 'clean.feather'))

    print('Use case 1: Add columns to original df')
    feature_df, feature_names = generate_features(df.head(100))

    print('\nUse case 2: Return only feature columns')
    intermediate_df = pd.concat(
        [df.head(100), intermediate_variables(df.head(100))], axis=1)
    feature_df, feature_names = feature_encoding(intermediate_df)
Example #18
    def test_path_to(self):
        project_path = utils.project_path()
        data_path = utils.path_to('data')

        self.assertEqual(data_path, f'{project_path}/data')
Example #19
def read_postcode_csv(csv_file):
    """Read a postcode-to-zone CSV and return a dict mapping postcode to zone."""
    csv_table = pd.read_csv(utils.path_to('src', 'features', csv_file))

    zone_lookup = dict(zip(csv_table['postcode'], csv_table['zone']))

    return zone_lookup
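A usage sketch; 'postcode_zones.csv' is a hypothetical filename with 'postcode' and 'zone' columns:

zone_lookup = read_postcode_csv('postcode_zones.csv')
zone = zone_lookup.get('SW1A')  # look up the zone for one outer postcode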