def run(dataset, item_sim, keys):
    """
    Main function. Take as input the GloVe embeddings, create clusters
    with similar words, and then plot using the t-sne algorithm.
    """
    glove_embs = os.path.join("../data", "tag_embeds", dataset + '.txt')
    pp = PreProcessing()

    DictEmbeds, DictKeyEmbeds = pp.import_embeddings(glove_embs)
    a_idx = pp.create_annoy_idx(DictEmbeds)

    embedding_clusters, word_clusters = create_clusters(
        DictEmbeds, DictKeyEmbeds, keys, a_idx, item_sim)

    embedding_clusters = np.array(embedding_clusters)
    n, m, k = embedding_clusters.shape

    tsne_model_en_2d = TSNE(perplexity=15,
                            n_components=2,
                            init='pca',
                            n_iter=3500,
                            random_state=32)

    embeddings_en_2d = np.array(
        tsne_model_en_2d.fit_transform(embedding_clusters.reshape(n * m,
                                                                  k))).reshape(
                                                                      n, m, 2)

    tsne_plot_similar_words('',
                            keys,
                            embeddings_en_2d,
                            word_clusters,
                            0.7,
                            filename=None)
Example #3
def retrain():
    if request.method == 'POST':
        data = request.get_json()

        try:
            training_set = joblib.load(
                "C:/Users/datta/Desktop/flask_api/notebooks/training_data.pkl")
            training_labels = joblib.load(
                "C:/Users/datta/Desktop/flask_api/notebooks/training_labels.pkl"
            )

            df = pd.read_json(data)

            df_training_set = df.drop(["Loan_Status"], axis=1)
            df_training_labels = df["Loan_Status"]

            df_training_set = pd.concat([training_set, df_training_set])
            df_training_labels = pd.concat(
                [training_labels, df_training_labels])

            pipe = make_pipeline(PreProcessing(), RandomForestClassifier())

            new_param_grid = {"randomforestclassifier__n_estimators" : [10, 20, 30],\
                 "randomforestclassifier__max_depth" : [None, 6, 8, 10],\
                 "randomforestclassifier__max_leaf_nodes": [None, 5, 10, 20], \
                 "randomforestclassifier__min_impurity_split": [0.1, 0.2, 0.3]}

            new_grid = GridSearchCV(pipe, param_grid=new_param_grid, cv=3)

            new_grid.fit(df_training_set, df_training_labels)

            os.remove(
                "C:/Users/datta/Desktop/flask_api/notebooks/finalized_model.pkl"
            )
            os.remove(
                "C:/Users/datta/Desktop/flask_api/notebooks/training_data.pkl")
            os.remove(
                "C:/Users/datta/Desktop/flask_api/notebooks/training_labels.pkl"
            )

            joblib.dump(
                new_grid,
                "C:/Users/datta/Desktop/flask_api/notebooks/finalized_model.pkl"
            )
            joblib.dump(
                df_training_set,
                "C:/Users/datta/Desktop/flask_api/notebooks/training_data.pkl")
            joblib.dump(
                df_training_labels,
                "C:/Users/datta/Desktop/flask_api/notebooks/training_labels.pkl"
            )

            rf_model = joblib.load(
                "C:/Users/datta/Desktop/flask_api/notebooks/finalized_model.pkl"
            )
        except ValueError as e:
            return jsonify("Error when retraining - {}".format(e))

        return jsonify("Retrained model successfully.")
def run(infile, min_tracks):
    """
    Main function. It imports the playlists from the dataset, filtering out
    the outliers. Then, it perform the track popularity analysis and the
    playlist popularity analysis
    """
    pp = PreProcessing()
    playlists = pp.filter_playlists(
        pp.import_playlists(infile, min_tracks))

    avg_playlist_len = int(np.mean([len(x) for x in playlists]))
    logging.info("Avg playlist lenght: {}".format(avg_playlist_len))

    DictTrackPop, min_pop, max_pop = track_popularity_analysis(playlists)

    playlist_popularity_analysis(playlists, DictTrackPop, min_pop, max_pop,
                                 avg_playlist_len)
def run(dataset):
    """
    Convert file with list of tags to a single line file with tags 
    concatenede, used for computing the GloVe embeddings.
    """
    pp = PreProcessing()

    # Define input files
    IN_DIR = os.path.join("../data", dataset)
    tracklist_file = os.path.join(IN_DIR, 'tracklist.tsv')
    lastfm_tags = os.path.join("../data", "lastfm_tags", "lastfm_tags.tsv")
    out_token = os.path.join(IN_DIR, "tracks_tags_token.txt")

    # Import data
    DictTrackTag = pp.import_track_tags(lastfm_tags)

    count_row = 0
    count_row_wrong = 0

    with open(tracklist_file, 'r+') as inf, open(out_token, 'w+') as outf:
        _reader = csv.reader(inf, delimiter='\t')
        _writer = csv.writer(outf, delimiter=' ')
        tag_final = []

        for row in _reader:
            # Skip badly formatted rows
            if len(row) != 3:
                count_row_wrong += 1
                continue

            idx, artist_name, track_name = row

            if artist_name and track_name:
                artist_name = pp.norm_str(artist_name)
                track_name = pp.norm_str(track_name)

                key = '|'.join([artist_name, track_name])

                if key in DictTrackTag:
                    if DictTrackTag[key]:
                        count_row += 1
                        tags_row = [
                            pp.norm_str(x[0]) for x in DictTrackTag[key]
                            if x[0] != 'n'
                        ]
                        if tags_row:
                            tag_final += tags_row

        _writer.writerow(tag_final)

    logging.info("Rows processed: {}/{}".format(count_row - count_row_wrong,
                                                count_row))
Example #6
def train(train_hr, train_lr, test_hr, test_lr, epochs=100):
    for epoch in range(epochs):
        for tr_hr, tr_lr in zip(train_hr, train_lr):
            tr_hr = tf.expand_dims(tr_hr, axis=0)
            tr_lr = tf.expand_dims(tr_lr, axis=0)
            train_step(tr_hr, tr_lr)
            print("Train Step Completed")

        # Save a checkpoint after every epoch (raise the modulus to save less often)
        if (epoch + 1) % 1 == 0:
            checkpoint.save(file_prefix=checkpoint_prefix)


print("Loading Images")
images = load_from_path('./dataset/')
print("Images Loaded from Path")

pre_processing = PreProcessing(downscale_factor=4)

images = pre_processing.normalize_images(images)
print("Images Normalized")

train_hr, test_hr = split_into_train_test(images)
print("Divided into Train Test")

train_lr = pre_processing.convert_high_resolution_to_low_resolution(train_hr)
test_lr = pre_processing.convert_high_resolution_to_low_resolution(test_hr)
print("Low Res images Created")

print("Started Training")
train(train_hr, train_lr, test_hr, test_lr)
class PlaylistDI(object):
    """
    Perform playlist diversity analysis. 
    """
    def __init__(self, dataset, min_tracks):
        """
        """
        # Inizialize classes
        self.pp = PreProcessing()
        self.st = Stats()

        # Define input files
        IN_DIR = os.path.join("../data", dataset)
        playlists_file = os.path.join(IN_DIR, "playlists.tsv")
        tracklist_file = os.path.join(IN_DIR, "tracklist.tsv")
        glove_embs = os.path.join("../data", "tag_embeds", dataset + ".txt")
        lastfm_tags = os.path.join("../data", "lastfm_tags", "lastfm_tags.tsv")

        # Import data
        DictEmbeds, self.DictKeyEmbeds = self.pp.import_embeddings(glove_embs)
        self.a_idx = self.pp.create_annoy_idx(DictEmbeds)
        self.DictTrackTag = self.pp.import_track_tags(lastfm_tags)
        self.playlists = self.pp.filter_playlists(
            self.pp.import_playlists(playlists_file, min_tracks))
        self.DictTrack = self.pp.import_tracklist(tracklist_file)

        # Define variables
        self.low_pDI_playlists = os.path.join(IN_DIR, 'low_pDI_playlists.tsv')
        self.high_pDI_playlists = os.path.join(IN_DIR,
                                               'high_pDI_playlists.tsv')
        self.rand_tracks_playlist = []

    def tag_distance(self, word1, word2):
        """
        Compute pairwise cosine distance between two tags, from the Annoy index.
        If the tags are not in the index, return -1.
        """
        word1 = self.pp.norm_str(word1)
        word2 = self.pp.norm_str(word2)
        try:
            dist = self.a_idx.get_distance(self.DictKeyEmbeds[word1],
                                           self.DictKeyEmbeds[word2])
        except KeyError:
            dist = -1

        return dist

    def TT_distance(self, v1, v2):
        """
        Compute Track-Tag distance between two tracks. If tracks have not the 
        same number of tags, return -1. 
        """
        if not v1 or not v2:
            return -1

        max_len = max(len(v1), len(v2))

        # TODO: improve propagation when information is incomplete
        if len(v1) < max_len:
            return -1
        elif len(v2) < max_len:
            return -1

        s = 0
        for i in range(max_len):
            max_weight = max(v1[i][1], v2[i][1])

            if max_weight == 0:
                s += 0
                max_len += -1
            else:
                dist = self.tag_distance(v1[i][0], v2[i][0])
                if dist == -1:
                    s += 0
                    max_len += -1
                else:
                    s += ((v1[i][1] + v2[i][1]) / float(2 * max_weight) * dist)

        if max_len == 0:
            return -1

        return (s / float(max_len))

    def log_results(self, results):
        """
        Print results of the diversity analysis.
        """
        logging.info("Mean pDI: {}".format(np.mean(results)))
        logging.info("Std pDI: {}".format(np.std(results)))
        logging.info("Max pDI: {}".format(max(results)))
        logging.info("Min pDI: {}".format(min(results)))
        logging.info("Gini pDI: {}".format(gini(np.array(results))))
        logging.info("QCD pDI: {}".format(self.st.qcd(results)))

    def analyze_playlist(self):
        """
        Analyze diversity of playlists from the dataset.
        """
        logging.info("Analyzing Playlists...")
        pDI = []
        pDI_idx = []
        playlist_analyzed = 0

        for c, playlist in enumerate(self.playlists):
            playlist_track_tags = []
            playlist_tracks_tags_count = 0
            for track in playlist:
                track = str(track).strip()
                try:
                    # Continue if track has at least 1 tag associated
                    if self.DictTrackTag[self.DictTrack[track]]:
                        playlist_track_tags.append(
                            self.DictTrackTag[self.DictTrack[track]])
                        playlist_tracks_tags_count += 1

                        # Get random tracks for evaluation
                        if random.randint(0, 9) > 5:
                            self.rand_tracks_playlist.append(
                                self.DictTrackTag[self.DictTrack[track]])
                # Skip if the track has no tags associated
                except KeyError:
                    pass

            # Skip playlists without complete information
            if playlist_tracks_tags_count >= int(1 * len(playlist)):
                playlist_analyzed += 1
                pDI_sum = 0

                tracks_comb = list(
                    itertools.combinations(playlist_track_tags, 2))

                valid_pairs = 0
                for track_tags in tracks_comb:
                    dist = self.TT_distance(track_tags[0], track_tags[1])
                    # Skip invalid pairs instead of removing them from
                    # tracks_comb while iterating over it
                    if dist == -1:
                        continue
                    pDI_sum += dist
                    valid_pairs += 1

                if valid_pairs > 0 and pDI_sum != 0:
                    pDI.append(pDI_sum / float(valid_pairs))
                    pDI_idx.append(c)

        self.log_results(pDI)

        logging.info("Playlists analyzed: {}/{}".format(
            playlist_analyzed, len(self.playlists)))

        return pDI, pDI_idx

    def analyze_random_playlist(self):
        """
        Analyze diversity of random playlists created
        with tracks from the dataset.
        """
        logging.info("Analyzing Random Playlists...")
        playlist_len_mean = int(np.mean([len(x) for x in self.playlists]))

        k = 0
        while k < 1:
            # Shuffle tracks at each iteration
            rand_tracks_playlist = random.sample(
                self.rand_tracks_playlist, len(self.rand_tracks_playlist))

            rand_pDI = []
            random_playlists = [
                rand_tracks_playlist[x:x + playlist_len_mean]
                for x in range(0, len(rand_tracks_playlist), playlist_len_mean)
            ]

            for el in random_playlists:
                rand_pDI_sum = 0
                tracks_comb = list(itertools.combinations(el, 2))
                valid_pairs = 0
                for track_tags in tracks_comb:
                    dist = self.TT_distance(track_tags[0], track_tags[1])
                    # Skip invalid pairs instead of removing them from
                    # tracks_comb while iterating over it
                    if dist == -1:
                        continue
                    rand_pDI_sum += dist
                    valid_pairs += 1

                if valid_pairs > 0 and rand_pDI_sum != 0:
                    rand_pDI.append(rand_pDI_sum / float(valid_pairs))

            self.log_results(rand_pDI)
            k += 1

    def write_playlist_qualia(self, pDI, pDI_idx):
        """
        Write out the files with the playlists for performing 
        the qualitative analysis
        """
        dist_10pct = int(0.1 * len(pDI))
        # Write most similar playlists
        with open(self.low_pDI_playlists, 'w+') as outf:
            _writer = csv.writer(outf, delimiter='\t')
            for idx in sorted(range(len(pDI)),
                              key=lambda i: pDI[i],
                              reverse=False)[:dist_10pct]:
                row = [pDI[idx], self.playlists[pDI_idx[idx]]]
                _writer.writerow(row)

        # Write least similar playlists
        with open(self.high_pDI_playlists, 'w+') as outf:
            _writer = csv.writer(outf, delimiter='\t')
            for idx in sorted(range(len(pDI)),
                              key=lambda i: pDI[i],
                              reverse=True)[:dist_10pct]:
                row = [pDI[idx], self.playlists[pDI_idx[idx]]]
                _writer.writerow(row)

    def run(self):
        """
        Main function. It performs the diversity on the playlists from the 
        dataset, then on random playlists. Finally, it writes the files with
        the playlists for performing a qualitative analysis
        """
        pDI, pDI_idx = self.analyze_playlist()
        self.analyze_random_playlist()
        self.write_playlist_qualia(pDI, pDI_idx)
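A minimal sketch of how the class might be driven from a script; the dataset name and the minimum-track threshold below are illustrative assumptions, not part of the original example:

import logging

logging.basicConfig(level=logging.INFO)

# Hypothetical dataset directory under ../data and playlist-length threshold
pdi = PlaylistDI(dataset="spotify", min_tracks=5)
pdi.run()  # analyzes real and random playlists, then writes the qualia files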
Example #8
def run(dataset, print_ex):
    """
    Analyze the 10% of playlists with the highest, and
    the 10% with the lowest diversity index.
    """
    pp = PreProcessing()
    st = Stats()

    # Define input files
    IN_DIR = os.path.join("../data", dataset)
    tracklist_file = os.path.join(IN_DIR, "tracklist.tsv")
    lastfm_tags = os.path.join("../data", "lastfm_tags", "lastfm_tags.tsv")

    # Import data
    DictTrackTag = pp.import_track_tags(lastfm_tags)
    DictTrack = pp.import_tracklist(tracklist_file)
    low_pDI_playlists = os.path.join(IN_DIR, 'low_pDI_playlists.tsv')
    high_pDI_playlists = os.path.join(IN_DIR, 'high_pDI_playlists.tsv')

    results_pd = []

    for input_file in [low_pDI_playlists, high_pDI_playlists]:

        # Initialize variables
        tag_no = []
        tag_common = []
        ratio_tag_track = []
        artist_no = []
        tracks_no = []
        ratio_track_art = []
        distances = []
        print_c = 0
        playlist_c = 0

        with open(input_file, 'r') as inf:
            _reader = csv.reader(inf, delimiter='\t')

            # Iterate over playlists
            for row in _reader:
                playlist_c += 1
                dist, playlist = row
                # Note: eval() on file contents is unsafe; ast.literal_eval
                # from the standard library would be a safer choice here
                playlist = eval(playlist)
                distances.append(float(dist))

                artistnames = set()
                total_tags = set()
                tags_list = []

                # Print playlist info
                if print_c < print_ex:
                    logging.info("Printing info new playlist...")
                    logging.info("Playlist pDI:{}".format(dist))
                    logging.info("Playlist Tracks:")

                # Iterate over playlist tracks
                for track in playlist:
                    track = str(track)
                    try:
                        artistname, trackname = DictTrack[track].split("|")
                    except ValueError:
                        continue
                    artistnames.add(artistname)
                    tags_tracks = set()

                    if DictTrack[track] in DictTrackTag:
                        for tag in DictTrackTag[DictTrack[track]]:
                            total_tags.add(pp.norm_str(tag[0]))
                            tags_tracks.add(pp.norm_str(tag[0]))

                        tags_list.append(tags_tracks)
                        if print_c < print_ex:
                            logging.info("{} {}".format(
                                DictTrack[track],
                                DictTrackTag[DictTrack[track]]))
                    else:
                        tags_list.append(set())
                        continue

                # Print playlist stats
                if print_c < print_ex:
                    logging.info("No. unique tags: {}".format(len(total_tags)))
                    logging.info("No. unique tags for tracks: {}".format(
                        len(total_tags) / float(len(playlist))))
                    logging.info("No. unique artists: {}".format(
                        len(artistnames)))
                    logging.info("No. unique tracks: {}".format(len(playlist)))
                    logging.info("No. unique tracks for artists: {}".format(
                        len(playlist) / float(len(artistnames))))

                print_c += 1

                tag_no.append(len(total_tags))
                ratio_tag_track.append(len(total_tags) / float(len(playlist)))
                artist_no.append(len(artistnames))
                tracks_no.append(len(playlist))
                ratio_track_art.append(len(playlist) / float(len(artistnames)))
                tag_common.append(set.intersection(*tags_list))

            common_tags = round(
                len([x for x in tag_common if len(x) > 1]) * 100 /
                float(playlist_c))
            single_artists = round(
                len([x
                     for x in artist_no if x == 1]) * 100 / float(playlist_c))

            # Print playlist dataset qualitative analysis results
            logging.info("")
            logging.info(
                "## Qualitative analysis of playlists from {} file ## ".format(
                    input_file))
            logging.info("Average pDI: {}".format(np.mean(distances)))
            logging.info("Average tag count: {}".format(round(
                np.mean(tag_no))))
            logging.info("Common tags(%): {}".format(common_tags))
            logging.info("Average tag over tracks: {}".format(
                round(np.mean(ratio_tag_track))))
            logging.info("Average artist count: {}".format(
                round(np.mean(artist_no))))
            logging.info("Single-artist(%): {}".format(single_artists))
            logging.info("Average tracks count: {}".format(
                round(np.mean(tracks_no))))
            logging.info("Average tracks over artists: {}".format(
                round(np.mean(ratio_track_art))))

            # Store results for computing the Percentage Difference
            results = [
                np.mean(distances),
                round(np.mean(tag_no)), common_tags,
                round(np.mean(ratio_tag_track)),
                round(np.mean(artist_no)), single_artists,
                round(np.mean(tracks_no)),
                round(np.mean(ratio_track_art))
            ]

            results_pd.append(results)

    logging.info("")
    logging.info("## Percentage Difference (PD) ## ".format(input_file))
    for c in range(0, 8):
        if c not in [2, 5]:
            logging.info(st.pdiff(results_pd[0][c], results_pd[1][c]))
        else:
            logging.info(abs(results_pd[0][c] - results_pd[1][c]))