def smarter_topkstems_feature_vectors(video_ids, k=50):
    """Build binary bag-of-words feature vectors from the top-k title stems.

    For each of the k most common title stems (channel-title stems and a
    small global ignore list excluded), produces one vector with a 1/0
    entry per video id indicating whether that video's title contains
    the stem.

    Args:
        video_ids: iterable of video ids to build features for.
        k: number of most-common stems to use as features.

    Returns:
        List of k lists, each of length len(video_ids).
    """
    custom_ignore_stems = ["com"]  # stems to ignore across all channels

    titles = query_videos("SELECT id, channelTitle, title FROM videos;")

    wanted = set(video_ids)  # O(1) membership instead of O(n) list scans
    vid_to_stems = {}

    for vid, channelTitle, title in titles:
        if vid in wanted:
            channelTitleProcessed = process_title(channelTitle)
            vid_to_stems[vid] = process_title(
                title,
                stems_to_ignore=custom_ignore_stems + channelTitleProcessed)

    # collapse all values in vid_to_stems to a single list of all stems
    stems_list = [
        stem for stemlist in vid_to_stems.values() for stem in stemlist
    ]
    top_stems = Counter(stems_list).most_common(k)

    # create the vectors!
    feature_vecs = []
    # BUG FIX: most_common() yields (stem, count) tuples; the original
    # tested the whole tuple for membership, which was always False.
    for stem, _count in top_stems:
        # binary feature of whether a given stem is in the video's stems;
        # .get() guards against ids that never appeared in the query
        feature_vec = [
            1 if stem in vid_to_stems.get(v, ()) else 0 for v in video_ids
        ]
        feature_vecs.append(feature_vec)

    return feature_vecs
# --- "Beispiel #2" / "0": paste/scrape artifact ("Beispiel" is German
# for "example"), not Python code — commented out to keep the file parseable.
def feature_vector__plain_duration(video_ids=None):
    """Return a single-row feature matrix of raw video durations.

    Only videos with a non-null viewCount are queried; the rows are
    narrowed to `video_ids` when one is provided.
    """
    rows = query_videos(
        "SELECT id, duration, viewCount FROM videos WHERE viewCount IS NOT NULL;"
    )
    objects = interpret_query_results(rows)
    selected = filter_durations(objects, video_ids=video_ids)

    durations = [video.duration for video in selected]
    return [durations]
def category_dict():
    """Return a dict mapping video id -> category id.

    (The original comment said "duration in seconds", but the query
    selects categoryId.)
    """
    rows = query_videos("SELECT id, categoryId FROM videos;")
    return {vid: cat for vid, cat in rows}
def title_topkstems(video_ids, k=10):
    """Return the k most common stems across the titles of `video_ids`."""
    rows = query_videos("SELECT id, title FROM videos;")
    # rows is assumed to be (id, title) tuples

    wanted = set(video_ids)
    counter = Counter()
    for vid, title in rows:
        if vid in wanted:
            counter.update(process_title(title))

    return counter.most_common(k)
# --- "Beispiel #5" / "0": paste/scrape artifact, not Python code ---
def feature_vector__distance_to_peak(video_ids=None):
    """Return a single-row feature matrix of |duration - peak duration|.

    The "peak" is the x-coordinate returned by peak_point() for the
    selected videos; each feature is that video's absolute distance
    from it.
    """
    rows = query_videos(
        "SELECT id, duration, viewCount FROM videos WHERE viewCount IS NOT NULL;"
    )
    objects = interpret_query_results(rows)
    selected = filter_durations(objects, video_ids=video_ids)

    peak_duration, _ = peak_point(selected)

    distances = [abs(peak_duration - v.duration) for v in selected]
    return [distances]
def feature_vector(video_ids):
    """Return one-hot category feature rows, one row per category id.

    Each row corresponds to a YouTube category id and holds a 1 for
    every queried video in that category, 0 otherwise.

    NOTE(review): the rows cover every video returned by the query, not
    just `video_ids` — the parameter is currently unused; confirm intent.

    Returns:
        List of lists, one per category id, each with one 0/1 entry per
        queried video.
    """
    # category ids tracked as features (see id_name tables elsewhere in
    # this file for their labels)
    category_ids = [
        1, 2, 10, 15, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
        29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44
    ]
    one_hot = {cid: [] for cid in category_ids}

    category_results = query_videos("SELECT id, categoryId FROM videos;")

    for _vid, cat in category_results:
        # BUG FIX: original looped over the undefined name `id_names`
        # (NameError at runtime; the dict was `id_name`). It also
        # shadowed the builtin `id` as its loop variable.
        for cid in category_ids:
            one_hot[cid].append(1 if cat == cid else 0)

    return list(one_hot.values())
def channeltitle_topkstems(video_ids, k=10, remove_single_occ=True):
    """Return up to k top stems across distinct channel titles.

    With `remove_single_occ` (the default) stems occurring only once are
    dropped, so fewer than k entries may come back.
    """
    # DISTINCT is required: otherwise channels with many uploads would
    # dominate the stem counts.
    rows = query_videos("SELECT DISTINCT channelTitle FROM videos;")
    # each row is a tuple whose first element is the channel title

    counter = Counter()
    for row in rows:
        counter.update(process_title(row[0]))

    top = counter.most_common(k)
    if remove_single_occ:
        top = [(stem, n) for stem, n in top if n > 1]
    return top
def smarter_topkstems(video_ids, k=10):
    """Top k title stems for `video_ids`, channel-title stems excluded.

    Like title_topkstems, but each title is processed with its own
    channel-title stems (plus a small global ignore list) filtered out.
    """
    custom_ignore_stems = ["com"]  # stems to ignore across all channels

    rows = query_videos("SELECT id, channelTitle, title FROM videos;")
    wanted = set(video_ids)
    counter = Counter()

    for vid, channel_title, title in rows:
        if vid not in wanted:
            continue
        ignore = custom_ignore_stems + process_title(channel_title)
        counter.update(process_title(title, stems_to_ignore=ignore))

    return counter.most_common(k)
def published_dict():
    """Return a dict mapping publish-hour bucket -> video count.

    Bucket keys look like "HH:00-HH:59". As in the original, hours with
    no videos get no key.
    """
    counts = {}
    published_results = query_videos("SELECT id, publishedAt FROM videos;")

    for _vid, published in published_results:
        # parser.parse handles the publishedAt timestamp strings
        hour = parser.parse(published).time().hour
        # Computed label replaces the original 24-branch if/elif chain.
        # Side fix: the chain's final `<= time(23, 59, 59)` test silently
        # dropped timestamps with fractional seconds past 23:59:59; they
        # now land in the "23:00-23:59" bucket.
        label = "%02d:00-%02d:59" % (hour, hour)
        counts[label] = counts.get(label, 0) + 1

    return counts
def duration_plot_averages():
    """Bar-plot the average view count per publish-hour bucket.

    No params or return; shows the plot via plt.show().
    NOTE(review): despite its name, this function buckets by publishedAt
    time, not duration — the name is kept for existing callers.
    """
    published_results = query_videos(
        "SELECT id, publishedAt, viewCount FROM videos WHERE viewCount IS NOT NULL;"
    )

    # "HH:00-HH:59" bucket label -> list of view counts
    views_by_bucket = {}

    for _vid, publishedAt, viewCount in published_results:
        hour = parser.parse(publishedAt).time().hour
        # Computed label replaces the original 24-branch if/elif chain.
        # Side fix: timestamps with fractional seconds past 23:59:59 were
        # previously dropped; they now land in the "23:00-23:59" bucket.
        label = "%02d:00-%02d:59" % (hour, hour)
        views_by_bucket.setdefault(label, []).append(viewCount)

    published_to_viewavg = {
        bucket: sum(views) / len(views)
        for bucket, views in views_by_bucket.items()
    }

    # sort buckets chronologically — the labels sort lexicographically
    x_avg = sorted(published_to_viewavg)
    y_avg = [published_to_viewavg[bucket] for bucket in x_avg]

    plt.bar(np.arange(len(x_avg)),
            np.array(y_avg),
            align='center',
            color='#2B8CBF')
    plt.xticks(np.arange(len(x_avg)), np.array(x_avg), rotation='vertical')

    plt.title("Average Views per Video for Each Published Time")
    plt.xlabel('Published Time')
    plt.ylabel('Average View Counts')

    plt.show()
# --- "Beispiel #11" / "0": paste/scrape artifact, not Python code.
# The lines that follow are the tail of a duplicated
# feature_vector__distance_to_peak whose `def` header was destroyed
# by the scrape and needs manual reconstruction. ---
    )
    duration_objects = interpret_query_results(duration_results)
    video_id_objects = filter_durations(duration_objects, video_ids=video_ids)
    # print("testing feature vector,len(duration_objects), video_id_objects)

    peak_x, _ = peak_point(video_id_objects)

    feature_vector = [
        abs(peak_x - video.duration) for video in video_id_objects
    ]
    return [feature_vector]


# NOTE(review): this __main__ block appears truncated/corrupted by the
# scrape — it ends with a `return` at module level (a SyntaxError) and
# the trailing lines look like the tail of a different function
# (smarter_topkstems_feature_vectors returns `feature_vecs`). Needs
# manual reconstruction; left byte-identical, comments only.
if __name__ == '__main__':
    duration_results = query_videos(
        "SELECT id, duration, viewCount FROM videos WHERE viewCount IS NOT NULL;"
    )
    duration_objects = interpret_query_results(duration_results)
    filtered_objects = filter_durations(duration_objects,
                                        dur_cutoff=4600,
                                        views_cutoff=None)
    # WARNING: this rebinding shadows the grouped_durations() function,
    # so it can only be called once in this scope.
    grouped_durations = grouped_durations(filtered_objects, max_duration=4600)

    # Generate Points:
    x_scatter, y_scatter = points_from_durations(filtered_objects)
    x_grouped, y_grouped = points_from_durations(grouped_durations)
    x_poly, y_poly = points_for_polynomial_curve(grouped_durations)
    peak_x, peak_y = peak_point(grouped_durations)

    # Plot Points
    plt.title('Video Duration x Average Number of Views')
    # invalid at module level (`return` outside a function) and
    # `feature_vecs` is never defined here — scrape damage, see above
    return feature_vecs


def channeltitle_topkstems(video_ids, k=10, remove_single_occ=True):
    """Return the top stems across distinct channel titles (at most k).

    By default, stems with only a single occurrence are filtered out,
    so the result may contain fewer than k entries.
    """
    # DISTINCT so the counts aren't skewed toward channels with the
    # most uploads.
    rows = query_videos("SELECT DISTINCT channelTitle FROM videos;")
    # each row is a tuple with the channel title at index 0

    all_stems = []
    for row in rows:
        all_stems += process_title(row[0])

    top = Counter(all_stems).most_common(k)
    if not remove_single_occ:
        return top
    return [(stem, count) for stem, count in top if count > 1]


# Script entry point: print and plot the 50 most common "smart" title
# stems across all videos.
# NOTE(review): smarter_topkstems as defined in this file returns a
# single list of (stem, count) pairs, yet it is unpacked here into two
# values — this main block seems written against a different
# (stem_to_words, counter) variant of that function; confirm which
# definition it belongs with.
if __name__ == '__main__':
    video_ids = [tup[0] for tup in query_videos("SELECT id FROM videos;")]
    stem_to_words, common_stem_counter = smarter_topkstems(video_ids, 50)
    print(common_stem_counter)
    plot_smarter_topkstems(stem_to_words, common_stem_counter)
    # print(channeltitle_topkstems(video_ids,50))
def category_plot_all():
    """Pie-chart the share of videos per category.

    No params or return. Counts videos (with non-null viewCount) per
    categoryId, drops a hard-coded set of excluded categories, and shows
    a pie chart of the remainder with percentage legend labels.
    """
    duration_results = query_videos(
        "SELECT id, categoryId, viewCount FROM videos WHERE viewCount IS NOT NULL;"
    )

    # TODO: get this info from youtube data API
    id_name = {
        1: 'Film & Animation',
        2: 'Autos & Vehicles',
        10: 'Music',
        15: 'Pets & Animals',
        17: 'Sports',
        18: 'Short Movies',
        19: 'Travel & Events',
        20: 'Gaming',
        21: 'Videoblogging',
        22: 'People & Blogs',
        23: 'Comedy',
        24: 'Entertainment',
        25: 'News & Politics',
        26: 'Howto & Style',
        27: 'Education',
        28: 'Science & Technology',
        29: 'Nonprofits & Activism',
        30: 'Movies',
        31: 'Anime/Animation',
        32: 'Action/Adventure',
        33: 'Classics',
        34: 'Comedy',
        35: 'Documentary',
        36: 'Drama',
        37: 'Family',
        38: 'Foreign',
        39: 'Horror',
        40: 'Sci-Fi/Fantasy',
        41: 'Thriller',
        42: 'Shorts',
        43: 'Shows',
        44: 'Trailers'
    }

    # count videos per category id
    category_count = Counter(cat for _vid, cat, _views in duration_results)

    # Categories excluded from the plot. Fixes: pop() with a default
    # instead of `del`, which raised KeyError whenever a category had no
    # videos; also removed a stray debug print of category 2's count.
    for excluded in (2, 29, 15, 10, 1, 17, 28, 25):
        category_count.pop(excluded, None)

    num_videos = sum(category_count.values())

    def round_to_1(x):
        # round to one significant figure for the legend labels
        return round(x, -int(floor(log10(abs(x)))))

    categories = [
        id_name[cid] + " " + str(round_to_1((count / num_videos) * 100)) + "%"
        for cid, count in category_count.items()
    ]

    vals = list(category_count.values())

    patches, texts = plt.pie(vals, startangle=90)

    plt.legend(patches, categories, loc="best")
    plt.axis('equal')
    plt.tight_layout()
    plt.show()
def category_plot_averages():
    """Bar-plot the average view count per category.

    No params or return. Groups view counts by category name and plots
    one bar per name.
    """
    category_results = query_videos(
        "SELECT id, categoryId, viewCount FROM videos WHERE viewCount IS NOT NULL;"
    )

    # TODO: get this info from youtube data API
    id_name = {
        1: 'Film & Animation',
        2: 'Autos & Vehicles',
        10: 'Music',
        15: 'Pets & Animals',
        17: 'Sports',
        18: 'Short Movies',
        19: 'Travel & Events',
        20: 'Gaming',
        21: 'Videoblogging',
        22: 'People & Blogs',
        23: 'Comedy',
        24: 'Entertainment',
        25: 'News & Politics',
        26: 'Howto & Style',
        27: 'Education',
        28: 'Science & Technology',
        29: 'Nonprofits & Activism',
        30: 'Movies',
        31: 'Anime/Animation',
        32: 'Action/Adventure',
        33: 'Classics',
        34: 'Comedy',
        35: 'Documentary',
        36: 'Drama',
        37: 'Family',
        38: 'Foreign',
        39: 'Horror',
        40: 'Sci-Fi/Fantasy',
        41: 'Thriller',
        42: 'Shorts',
        43: 'Shows',
        44: 'Trailers'
    }

    # Group view counts by category *name*. Fixes: ids sharing a label
    # (23 and 34 are both 'Comedy') now merge instead of one silently
    # overwriting the other's average in the final dict; unknown ids
    # fall back to their numeric id instead of raising KeyError.
    views_by_name = {}
    for _vid, category, views in category_results:
        name = id_name.get(category, str(category))
        views_by_name.setdefault(name, []).append(views)

    category_to_viewavg = {
        name: sum(viewlist) / len(viewlist)
        for name, viewlist in views_by_name.items()
    }

    x = list(category_to_viewavg.keys())
    y_avg = [category_to_viewavg[name] for name in x]

    y_pos = np.arange(len(y_avg))

    plt.bar(y_pos, y_avg, align='center')
    plt.xticks(y_pos, x, rotation='vertical')
    # (dropped an earlier duplicate xlabel/ylabel pair that these
    # immediately overwrote)
    plt.xlabel('Category')
    plt.ylabel('Average Number of Views')
    plt.title('Average Number of Views per Video for Each Category')
    plt.show()