Code example #1
    def process_json_listens(self, filename, data_dir, tmp_hdfs_path, schema):
        """ Process a file containing listens from the ListenBrainz dump and add listens to
            appropriate dataframes.

            Args:
                filename (str): File name of JSON file.
                data_dir (str): Dir to save listens to in HDFS as parquet.
                tmp_hdfs_path (str): HDFS path where listens JSON has been uploaded.
                schema: Schema of the listens.
        """
        start_time = time.time()
        df = utils.read_json(tmp_hdfs_path, schema=schema)
        current_app.logger.info("Processing {} listens...".format(df.count()))

        if filename.split('/')[-1] == 'invalid.json':
            dest_path = os.path.join(data_dir, 'invalid.parquet')
        else:
            year = filename.split('/')[-2]
            month = filename.split('/')[-1][0:-5]
            dest_path = os.path.join(data_dir, year,
                                     '{}.parquet'.format(str(month)))

        current_app.logger.info("Uploading to {}...".format(dest_path))
        utils.save_parquet(df, dest_path)
        current_app.logger.info(
            "File processed in {:.2f} seconds!".format(time.time() -
                                                       start_time))
Code example #2
    def upload_test_mapped_listens_to_hdfs(cls, listens_path, mapping_path, mapped_listens_path):
        partial_listen_df = dataframe_utils.get_listens_for_training_model_window(cls.date, cls.date, listens_path)
        df = utils.read_files_from_HDFS(mapping_path)
        mapping_df = mapping_utils.get_unique_rows_from_mapping(df)

        mapped_listens = dataframe_utils.get_mapped_artist_and_recording_mbids(partial_listen_df, mapping_df)
        utils.save_parquet(mapped_listens, mapped_listens_path)
Code example #3
    def test_delete_model(self):
        df = utils.create_dataframe(Row(col1=1, col2=1), None)
        utils.save_parquet(df, path.RECOMMENDATION_RECORDING_DATA_DIR)
        train_models.delete_model()

        dir_exists = utils.path_exists(path.RECOMMENDATION_RECORDING_DATA_DIR)
        self.assertFalse(dir_exists)
Code example #4
def save_candidate_sets(top_artist_candidate_set_df,
                        similar_artist_candidate_set_df):
    """ Save candidate sets to HDFS.

        Args:
            top_artist_candidate_set_df (dataframe): recording ids that belong to top artists
                                                     corresponding to user ids.
            similar_artist_candidate_set_df (dataframe): recording ids that belong to similar artists
                                                         corresponding to user ids.
    """
    try:
        utils.save_parquet(
            top_artist_candidate_set_df,
            path.RECOMMENDATION_RECORDING_TOP_ARTIST_CANDIDATE_SET)
    except FileNotSavedException as err:
        logger.error(str(err), exc_info=True)
        raise

    try:
        utils.save_parquet(
            similar_artist_candidate_set_df,
            path.RECOMMENDATION_RECORDING_SIMILAR_ARTIST_CANDIDATE_SET)
    except FileNotSavedException as err:
        logger.error(str(err), exc_info=True)
        raise
Code example #5
    def save_dataframe(self):
        now = datetime.now()

        with open(self.path_to_data_file('user_top_artists.json')) as f:
            data = json.load(f)

        schema = StructType([
            StructField('user_name', StringType()),
            StructField('artist_name', StringType()),
            StructField('artist_msid', StringType()),
            StructField('artist_mbids', ArrayType(StringType()))
        ])
        df = None
        for entry in data:
            for idx in range(0, entry['count']):
                # Repeat the row once per listen so the artist's play count is preserved
                row = utils.create_dataframe(
                    Row(user_name=entry['user_name'],
                        artist_name=entry['artist_name'],
                        artist_msid=entry['artist_msid'],
                        artist_mbids=entry['artist_mbids']),
                    schema=schema)
                df = df.union(row) if df else row

        utils.save_parquet(
            df,
            os.path.join(self.path_,
                         '{}/{}.parquet'.format(now.year, now.month)))
Code example #6
File: upload.py  Project: mhor/listenbrainz-server
    def process_json_listens(self, filename, data_dir, tmp_hdfs_path, append,
                             schema):
        """ Process a file containing listens from the ListenBrainz dump and add listens to
            appropriate dataframes.

            Args:
                filename (str): File name of JSON file.
                data_dir (str): Dir to save listens to in HDFS as parquet.
                tmp_hdfs_path (str): HDFS path where listens JSON has been uploaded.
                append (bool): If true append to end of parquet rather than write.
                schema: Schema of the listens
        """
        start_time = time.monotonic()
        df = utils.read_json(tmp_hdfs_path, schema=schema)

        if filename.split('/')[-1] == 'invalid.json':
            dest_path = os.path.join(data_dir, 'invalid.parquet')
        else:
            year = filename.split('/')[-2]
            month = filename.split('/')[-1][0:-5]
            dest_path = os.path.join(data_dir, year,
                                     '{}.parquet'.format(str(month)))

        logger.info("Uploading to {}...".format(dest_path))
        if append and utils.path_exists(dest_path):
            utils.save_parquet(df, dest_path, mode="append")
        else:
            utils.save_parquet(df, dest_path, mode="overwrite")

        logger.info(
            "File processed in {:.2f} seconds!".format(time.monotonic() -
                                                       start_time))
Code example #7
    def upload_test_mapped_listens_to_HDFS(cls):
        partial_listen_df = create_dataframes.get_listens_for_training_model_window(
            cls.date, cls.date, {}, LISTENS_PATH)
        mapping_df = utils.read_files_from_HDFS(MAPPING_PATH)

        mapped_df = create_dataframes.get_mapped_artist_and_recording_mbids(
            partial_listen_df, mapping_df)
        utils.save_parquet(mapped_df, MAPPED_LISTENS_PATH)
Code example #8
    def test_create_dataframe(self):
        hdfs_path = self.path_ + '/test_df.parquet'
        df = utils.create_dataframe([Row(column1=1, column2=2)], schema=None)
        self.assertEqual(df.count(), 1)
        utils.save_parquet(df, hdfs_path)

        received_df = utils.read_files_from_HDFS(hdfs_path)
        self.assertEqual(received_df.count(), 1)
Code example #9
    def upload_test_mapped_listens_to_HDFS(cls):
        partial_listen_df = create_dataframes.get_listens_for_training_model_window(
            cls.date, cls.date, {}, cls.listens_path)
        mapping_df = utils.read_files_from_HDFS(cls.mapping_path)

        mapped_listens = create_dataframes.get_mapped_artist_and_recording_mbids(
            partial_listen_df, mapping_df)
        utils.save_parquet(mapped_listens, cls.mapped_listens_path)
Code example #10
    def upload_test_mapping_listens_subset_to_hdfs(cls):
        mapped_df = utils.read_files_from_HDFS(cls.mapped_listens_path)
        from_date = stats.offset_days(cls.date, 4)
        to_date = cls.date
        mapped_listens_subset = candidate_sets.get_listens_to_fetch_top_artists(
            mapped_df, from_date, to_date)
        utils.save_parquet(mapped_listens_subset,
                           cls.mapped_listens_subset_path)
Code example #11
    def test_save_parquet(self):
        path_ = 'test_df.parquet'
        hdfs_path = os.path.join(config.HDFS_CLUSTER_URI, path_)

        df = utils.create_dataframe(Row(column1=1, column2=2), schema=None)
        utils.save_parquet(df, hdfs_path)

        received_df = utils.read_files_from_HDFS(hdfs_path)
        self.assertEqual(received_df.count(), 1)
Code example #12
    def upload_test_mapping_to_hdfs(cls, mapping_path):
        with open(cls.path_to_data_file('msid_mbid_mapping.json')) as f:
            data = json.load(f)

        mapping_df = None
        for row in data:
            df = utils.create_dataframe(schema.convert_mapping_to_row(row), schema=schema.msid_mbid_mapping_schema)
            mapping_df = mapping_df.union(df) if mapping_df else df

        utils.save_parquet(mapping_df, mapping_path)
Code example #13
    def test_get_latest_listen_ts(self):
        date = datetime(2020, 5, 18)
        df = utils.create_dataframe(Row(listened_at=date), schema=None)
        df = df.union(
            utils.create_dataframe(Row(listened_at=offset_days(date, 7)),
                                   schema=None))
        utils.save_parquet(df, '{}/2020/5.parquet'.format(self.path_))

        result = stats_utils.get_latest_listen_ts()
        self.assertEqual(date, result)
Code example #14
    def test_get_latest_full_dump_no_full(self):
        """ Test to ensure 'None' is returned if not full import has been made. """
        # Remove full dump entries from parquet
        import_meta_df = read_files_from_HDFS(self.path_)
        result = import_meta_df.filter(import_meta_df.dump_type != "full")

        # We have to save the dataframe as a different file and move it as the df itself is read from the file
        save_parquet(result, '/temp.parquet')
        delete_dir(self.path_, recursive=True)
        rename('/temp.parquet', self.path_)

        self.assertIsNone(import_utils.get_latest_full_dump())
Code example #15
    def save_dataframe(self):
        df = utils.create_dataframe(Row(user_name='user2', artist_name='artist1', artist_msid='1', artist_mbids='1',
                                        track_name='test', recording_msid='1', recording_mbid='1',
                                        release_name='test', release_msid='1', release_mbid='1'), schema=None)
        df1 = utils.create_dataframe(Row(user_name='user1', artist_name='artist1', artist_msid='1', artist_mbids='1',
                                         track_name='test', recording_msid='1', recording_mbid='1',
                                         release_name='test', release_msid='1', release_mbid='1'), schema=None)
        df2 = utils.create_dataframe(Row(user_name='user1', artist_name='artist1', artist_msid='1', artist_mbids='1',
                                         track_name='test', recording_msid='1', recording_mbid='1',
                                         release_name='test', release_msid='1', release_mbid='1'), schema=None)
        df = df.union(df1).union(df2)
        utils.save_parquet(df, '/data/listenbrainz/2019/12.parquet')
Code example #16
    def upload_test_listen_to_hdfs(cls, listens_path):

        with open(cls.path_to_data_file('listens.json')) as f:
            data = json.load(f)

        listens_df = None
        for row in data:
            row['listened_at'] = datetime.strptime(row['listened_at'], '%d-%m-%Y')
            df = utils.create_dataframe(schema.convert_to_spark_json(row), schema=schema.listen_schema)
            listens_df = listens_df.union(df) if listens_df else df

        utils.save_parquet(listens_df, listens_path + '/{}/{}.parquet'.format(cls.date.year, cls.date.month))
Code example #17
def save_dataframe(df, dest_path):
    """ Save dataframe to HDFS.

        Args:
            df : Dataframe to save.
            dest_path (str): HDFS path to save dataframe.
    """
    try:
        save_parquet(df, dest_path)
    except FileNotSavedException as err:
        current_app.logger.error(str(err), exc_info=True)
        raise
Code example #18
    def upload_test_playcounts(cls):
        schema = StructType([
            StructField("user_id", IntegerType()),
            StructField("recording_id", IntegerType()),
            StructField("count", IntegerType())
        ])
        test_playcounts = []
        for _ in range(1, PLAYCOUNTS_COUNT // 2 + 1):
            test_playcounts.append([1, 1, 1])
        for _ in range(PLAYCOUNTS_COUNT // 2 + 1, PLAYCOUNTS_COUNT + 1):
            test_playcounts.append([2, 2, 1])
        test_playcounts_df = listenbrainz_spark.session.createDataFrame(
            test_playcounts, schema=schema)
        utils.save_parquet(test_playcounts_df, TEST_PLAYCOUNTS_PATH)
Code example #19
    def process_json(self, _, dest_path, tmp_hdfs_path, __, schema):
        """ Read JSON from HDFS as a dataframe and upload to
            HDFS as a parquet.

            Args:
                dest_path (str): HDFS path to upload JSON as parquet.
                tmp_hdfs_path (str): HDFS path where JSON has been uploaded.
                schema: Schema of the JSON data.
        """
        start_time = time.monotonic()
        df = utils.read_json(tmp_hdfs_path, schema=schema)
        logger.info("Processing {} rows...".format(df.count()))

        logger.info("Uploading to {}...".format(dest_path))
        utils.save_parquet(df, dest_path)
        logger.info("File processed in {:.2f} seconds!".format(time.monotonic() - start_time))
Code example #20
    def upload_test_mapping_to_HDFS(cls):
        test_mapping = {
            "msb_recording_msid": "cb6985cd-cc71-4d59-b4fb-2e72796af741",
            "mb_recording_mbid": "3acb406f-c716-45f8-a8bd-96ca3939c2e5",
            "msb_artist_msid": "a36d6fc9-49d0-4789-a7dd-a2b72369ca45",
            "mb_artist_credit_mbids": ["181c4177-f33a-441d-b15d-910acaf18b07"],
            "mb_artist_credit_id": 2157963,
            "mb_release_mbid": "xxxxx",
            "msb_release_msid": "xxxxx"
        }

        test_mapping_df = utils.create_dataframe(
            schema.convert_mapping_to_row(test_mapping),
            schema.msid_mbid_mapping_schema)
        utils.save_parquet(test_mapping_df, MAPPING_PATH)
Code example #21
    def setUp(self):
        """ Store the testdata as parquet in HDFS before each test. """
        with open(self.path_to_data_file("import_metadata.json")) as f:
            data = json.load(f)

        df = None
        for entry in data:
            row = create_dataframe(Row(dump_id=entry["dump_id"],
                                       dump_type=entry["dump_type"],
                                       imported_at=datetime.fromtimestamp(entry["imported_at"])),
                                   schema=import_metadata_schema)
            df = df.union(row) if df else row

        save_parquet(df, self.path_)

        return super().setUp()
Code example #22
    def test_get_most_recent_model_id(self):
        model_id_1 = "a36d6fc9-49d0-4789-a7dd-a2b72369ca45"
        model_metadata_dict_1 = self.get_model_metadata(model_id_1)
        df_1 = utils.create_dataframe(schema.convert_model_metadata_to_row(model_metadata_dict_1),
                                      schema.model_metadata_schema)

        model_id_2 = "bbbd6fc9-49d0-4789-a7dd-a2b72369ca45"
        model_metadata_dict_2 = self.get_model_metadata(model_id_2)
        df_2 = utils.create_dataframe(schema.convert_model_metadata_to_row(model_metadata_dict_2),
                                      schema.model_metadata_schema)

        model_metadata = df_1.union(df_2)
        utils.save_parquet(model_metadata, path.RECOMMENDATION_RECORDING_MODEL_METADATA)

        received_model_id = recommend.get_most_recent_model_id()
        self.assertEqual(received_model_id, model_id_2)
Code example #23
    def test_get_listens(self):
        from_date = datetime(2019, 10, 1)
        to_date = datetime(2019, 11, 1)
        path_ = 'test_df'
        hdfs_path = os.path.join(config.HDFS_CLUSTER_URI, path_)

        df = utils.create_dataframe(Row(column1=1, column2=2), schema=None)
        dest_path = hdfs_path + '/{}/{}.parquet'.format(from_date.year, from_date.month)
        utils.save_parquet(df, dest_path)

        df = utils.create_dataframe(Row(column1=3, column2=4), schema=None)
        dest_path = hdfs_path + '/{}/{}.parquet'.format(to_date.year, to_date.month)
        utils.save_parquet(df, dest_path)

        received_df = utils.get_listens(from_date, to_date, hdfs_path)
        self.assertEqual(received_df.count(), 2)
Code example #24
    def test_get_listens(self):
        from_date = datetime(2019, 10, 1)
        to_date = datetime(2019, 11, 1)

        df = utils.create_dataframe([Row(column1=1, column2=2)], schema=None)
        dest_path = self.path_ + '/{}/{}.parquet'.format(
            from_date.year, from_date.month)
        utils.save_parquet(df, dest_path)

        df = utils.create_dataframe([Row(column1=3, column2=4)], schema=None)
        dest_path = self.path_ + '/{}/{}.parquet'.format(
            to_date.year, to_date.month)
        utils.save_parquet(df, dest_path)

        received_df = utils.get_listens(from_date, to_date, self.path_)
        self.assertEqual(received_df.count(), 2)
Code example #25
    def upload_test_listen_to_HDFS(cls):
        month = cls.date.strftime('%m').lstrip('0')
        year = cls.date.strftime('%Y')

        test_listen = {
            "user_name": "vansika",
            "artist_msid": "a36d6fc9-49d0-4789-a7dd-a2b72369ca45",
            "artist_name": "Less Than Jake",
            "artist_mbids": [],
            "release_mbid": "",
            "track_name": "Al's War",
            "recording_msid": "cb6985cd-cc71-4d59-b4fb-2e72796af741",
            "tags": [],
            "listened_at": cls.date
        }

        test_listens_df = utils.create_dataframe(
            schema.convert_to_spark_json(test_listen), schema.listen_schema)
        utils.save_parquet(test_listens_df,
                           LISTENS_PATH + '{}/{}.parquet'.format(year, month))
Code example #26
def insert_dump_data(dump_id: int, dump_type: str, imported_at: datetime):
    """ Insert information about dump imported """
    import_meta_df = None
    try:
        import_meta_df = read_files_from_HDFS(IMPORT_METADATA)
    except PathNotFoundException:
        current_app.logger.info("Import metadata file not found, creating...")

    data = create_dataframe(Row(dump_id, dump_type, imported_at), schema=import_metadata_schema)
    if import_meta_df:
        result = import_meta_df \
            .filter(f"dump_id != '{dump_id}' OR dump_type != '{dump_type}'") \
            .union(data)
    else:
        result = data

    # We have to save the dataframe as a different file and move it as the df itself is read from the file
    save_parquet(result, "/temp.parquet")
    if path_exists(IMPORT_METADATA):
        delete_dir(IMPORT_METADATA, recursive=True)
    rename("/temp.parquet", IMPORT_METADATA)
Code example #27
    def upload_test_mapping_to_HDFS(cls):
        test_mapping = {
            "msb_recording_msid": "cb6985cd-cc71-4d59-b4fb-2e72796af741",
            "mb_recording_mbid": "3acb406f-c716-45f8-a8bd-96ca3939c2e5",
            "msb_artist_msid": "a36d6fc9-49d0-4789-a7dd-a2b72369ca45",
            "mb_artist_credit_mbids": ["181c4177-f33a-441d-b15d-910acaf18b07"],
            "mb_artist_credit_id": 2157963,
            "mb_release_mbid": "xxxxx",
            "msb_release_msid": "xxxxx",
            "mb_artist_credit_name": "Less Than Jake",
            "msb_artist_credit_name_matchable": "lessthanjake",
            "mb_recording_name": "Al's War",
            "msb_recording_name_matchable": "alswar",
            "mb_release_name": "Easier",
            "msb_release_name_matchable": "easier",
        }

        test_mapping_df = utils.create_dataframe(
            schema.convert_mapping_to_row(test_mapping),
            schema.msid_mbid_mapping_schema)
        utils.save_parquet(test_mapping_df, cls.mapping_path)
Code example #28
    def test_copy(self):
        # Test directories
        utils.create_dir(self.path_)
        utils.create_dir(os.path.join(self.path_, "a"))
        utils.create_dir(os.path.join(self.path_, "b"))

        # DataFrames to create parquets
        df_a = utils.create_dataframe([Row(column1=1, column2=2)], schema=None)
        df_b = utils.create_dataframe([Row(column1=3, column2=4)], schema=None)
        df_c = utils.create_dataframe([Row(column1=5, column2=6)], schema=None)

        # Save DataFrames in respective directories
        utils.save_parquet(df_a, os.path.join(self.path_, "a", "df_a.parquet"))
        utils.save_parquet(df_b, os.path.join(self.path_, "b", "df_b.parquet"))
        utils.save_parquet(df_c, os.path.join(self.path_, "df_c.parquet"))

        utils.copy(self.path_, self.temp_path_, overwrite=True)

        # Read copied DataFrame
        cp_df_a = utils.read_files_from_HDFS(os.path.join(self.temp_path_, "a", "df_a.parquet"))
        cp_df_b = utils.read_files_from_HDFS(os.path.join(self.temp_path_, "b", "df_b.parquet"))
        cp_df_c = utils.read_files_from_HDFS(os.path.join(self.temp_path_, "df_c.parquet"))

        # Check if both DataFrames are same
        self.assertListEqual(df_a.rdd.map(list).collect(), cp_df_a.rdd.map(list).collect())
        self.assertListEqual(df_b.rdd.map(list).collect(), cp_df_b.rdd.map(list).collect())
        self.assertListEqual(df_c.rdd.map(list).collect(), cp_df_c.rdd.map(list).collect())
Code example #29
def save_candidate_sets(top_artists_candidate_set_df,
                        similar_artists_candidate_set_df):
    """ Save candidate sets to HDFS.

        Args:
            top_artists_candidate_set_df (dataframe): Dataframe consisting of recording ids of
                top artists listened to by a user for all the users for whom recommendations shall
                be generated. Dataframe columns can be depicted as:
                    [
                        'user_id', 'recording_id'
                    ]
            similar_artists_candidate_set_df (dataframe): Dataframe consisting of recording ids of
                artists similar to top artists listened to by a user for all the users for whom
                recommendations shall be generated. Columns can be depicted as:
                    [
                        'user_id', 'recording_id'
                    ]
    """
    utils.save_parquet(top_artists_candidate_set_df,
                       path.TOP_ARTIST_CANDIDATE_SET)
    utils.save_parquet(similar_artists_candidate_set_df,
                       path.SIMILAR_ARTIST_CANDIDATE_SET)
Code example #30
    def test_process_json_listens_append(self, mock_read_json):
        fakeschema = StructType([StructField('column_1', StringType()), StructField('column_2', StringType())])

        # Save old dataframe in HDFS
        old_df = utils.create_dataframe(Row(column_1='row_a', column_2='row_a'), fakeschema)
        old_df = old_df.union(utils.create_dataframe(Row(column_1='row_b', column_2='row_b'), fakeschema))
        utils.save_parquet(old_df, os.path.join(self.path_, '2020/1.parquet'))

        # Mock read_json to return new dataframe
        new_df = utils.create_dataframe(Row(column_1='row_c', column_2='row_c'), fakeschema)
        mock_read_json.return_value = new_df

        ListenbrainzDataUploader().process_json_listens('/2020/1.json', self.path_, self.path_, append=True, schema=fakeschema)

        received = utils.read_files_from_HDFS(os.path.join(self.path_, '2020/1.parquet')) \
            .rdd \
            .map(list) \
            .collect()

        old_df = old_df.union(new_df)
        expected = old_df.rdd.map(list).collect()

        self.assertCountEqual(received, expected)
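
Every example on this page goes through the same thin wrapper around Spark's DataFrameWriter. For orientation, here is a minimal sketch of what such a save_parquet helper might look like; the import path of FileNotSavedException, its constructor arguments, and the exact error handling are assumptions made for illustration, not the project's actual implementation.

from py4j.protocol import Py4JJavaError

# Assumed import path for the project's exception class.
from listenbrainz_spark.exceptions import FileNotSavedException


def save_parquet(df, path, mode='overwrite'):
    """ Save a dataframe to HDFS as parquet.

        Args:
            df (dataframe): Dataframe to save.
            path (str): HDFS path to save the dataframe to.
            mode (str): Write mode, 'overwrite' (default) or 'append'.
    """
    try:
        # DataFrameWriter serializes the dataframe as parquet; `mode` decides
        # whether an existing dataset at `path` is replaced or appended to.
        df.write.mode(mode).parquet(path)
    except Py4JJavaError as err:
        # Exception constructor arguments are an assumption in this sketch.
        raise FileNotSavedException(str(err.java_exception), path)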