def process_json_listens(self, filename, data_dir, tmp_hdfs_path, append, schema):
    """ Process a file containing listens from the ListenBrainz dump and add
    listens to appropriate dataframes.

    Args:
        filename (str): File name of JSON file.
        data_dir (str): Dir to save listens to in HDFS as parquet.
        tmp_hdfs_path (str): HDFS path where listens JSON has been uploaded.
        append (bool): If true append to end of parquet rather than write.
        schema: Schema of the listens.
    """
    start_time = time.monotonic()
    df = utils.read_json(tmp_hdfs_path, schema=schema)

    # 'invalid.json' holds listens that failed validation; everything else is
    # laid out as <data_dir>/<year>/<month>.parquet derived from the file path.
    if filename.split('/')[-1] == 'invalid.json':
        dest_path = os.path.join(data_dir, 'invalid.parquet')
    else:
        year = filename.split('/')[-2]
        month = filename.split('/')[-1][0:-5]  # strip the ".json" suffix
        dest_path = os.path.join(data_dir, year, '{}.parquet'.format(str(month)))

    # Log before the save so the message reflects what is about to happen
    # (previously "Uploading to ..." was emitted after the upload finished,
    # which made progress logs misleading).
    logger.info("Uploading to {}...".format(dest_path))
    # Appending is only possible when the destination parquet already exists;
    # otherwise fall back to an overwrite write.
    if append and utils.path_exists(dest_path):
        utils.save_parquet(df, dest_path, mode="append")
    else:
        utils.save_parquet(df, dest_path, mode="overwrite")

    logger.info(
        "File processed in {:.2f} seconds!".format(time.monotonic() - start_time))
def process_json_listens(self, filename, data_dir, tmp_hdfs_path, schema):
    """ Process a file containing listens from the ListenBrainz dump and add
    listens to appropriate dataframes.

    Args:
        filename (str): File name of JSON file.
        data_dir (str): Dir to save listens to in HDFS as parquet.
        tmp_hdfs_path (str): HDFS path where listens JSON has been uploaded.
        schema: Schema of the listens.
    """
    # monotonic() is immune to system clock adjustments (NTP, DST), unlike
    # time.time(), and matches the timing used by the sibling implementations.
    start_time = time.monotonic()
    df = utils.read_json(tmp_hdfs_path, schema=schema)
    current_app.logger.info("Processing {} listens...".format(df.count()))

    # 'invalid.json' holds listens that failed validation; everything else is
    # laid out as <data_dir>/<year>/<month>.parquet derived from the file path.
    if filename.split('/')[-1] == 'invalid.json':
        dest_path = os.path.join(data_dir, 'invalid.parquet')
    else:
        year = filename.split('/')[-2]
        month = filename.split('/')[-1][0:-5]  # strip the ".json" suffix
        dest_path = os.path.join(data_dir, year, '{}.parquet'.format(str(month)))

    current_app.logger.info("Uploading to {}...".format(dest_path))
    utils.save_parquet(df, dest_path)
    current_app.logger.info(
        "File processed in {:.2f} seconds!".format(time.monotonic() - start_time))
def process_json(self, _, dest_path, tmp_hdfs_path, __, schema):
    """ Read JSON from HDFS as a dataframe and upload to HDFS as a parquet.

    Args:
        dest_path (str): HDFS path to upload JSON as parquet.
        tmp_hdfs_path (str): HDFS path where JSON has been uploaded.
        schema: Schema to apply when reading the JSON.
    """
    started = time.monotonic()

    dataframe = utils.read_json(tmp_hdfs_path, schema=schema)
    logger.info("Processing {} rows...".format(dataframe.count()))

    logger.info("Uploading to {}...".format(dest_path))
    utils.save_parquet(dataframe, dest_path)

    elapsed = time.monotonic() - started
    logger.info("File processed in {:.2f} seconds!".format(elapsed))