예제 #1
0
def double_linked_channels():
    """Map every stored channel to the featured channels that link back to it.

    Every ``*.json`` file in ``CHANNEL_FOLDER`` is loaded. For each id listed
    under its ``"featuredChannelsUrls"`` entry, the file
    ``CHANNEL_FOLDER + <id> + ".json"`` is loaded as well:

    * if that channel lists the current channel among its own
      ``"featuredChannelsUrls"``, its ``"channel_id"`` is recorded;
    * if ``load_json`` yields ``None`` for it, the bare id is recorded anyway.

    The mapping is written to ``analytics/double_linked_channels.json`` (the
    path ``cluster_policy`` reads) and then returned.

    Returns:
        dict: channel id -> list of linked channel ids.
    """
    links = dict()

    for filename in os.listdir(CHANNEL_FOLDER):
        if not filename.endswith(".json"):
            continue

        data = load_json(CHANNEL_FOLDER + filename)
        channel_id = data["channel_id"]
        links[channel_id] = []

        # Channels without a "featuredChannelsUrls" entry keep an empty list.
        for featured_id in data.get("featuredChannelsUrls", []):
            featured = load_json(CHANNEL_FOLDER + featured_id + ".json")

            if featured is None:
                # NOTE(review): presumably load_json returns None when the
                # file cannot be loaded; such ids are kept, not dropped.
                links[channel_id].append(featured_id)
            elif channel_id in featured.get("featuredChannelsUrls", []):
                links[channel_id].append(featured["channel_id"])

    # Save before returning: the save used to sit after the return statement
    # and never ran. The relative path matches what cluster_policy loads.
    save_json(links, "analytics/double_linked_channels.json")
    return links
예제 #2
0
def main():
    """Generate, analyze and save the channel graph for one request file.

    Expects exactly one command-line argument: the path of a JSON request
    containing ``"request_id"``, ``"depth"`` and ``"seed"``. The request, the
    generated graph and its analytics are written together to
    ``response_<request_id>.json``.

    Exits with status 1 when the argument count is wrong.
    """
    print("Starting")

    if len(sys.argv) != 2:
        print("Usage: " + sys.argv[0] + " request_path.json")
        # sys.exit rather than the site-provided exit(), which is meant for
        # the interactive shell; non-zero status marks the usage error.
        sys.exit(1)

    request = load_json(str(sys.argv[1]))
    request_id = str(request["request_id"])
    depth = int(request["depth"])
    # generate_graph is called with a list of seeds; a request carries one.
    seed = [str(request["seed"])]

    print("Generating graph")
    graph = generate_graph(POLICY="cluster_policy", SEED=seed, DEPTH=depth)

    print("Analyzing graph")
    analytics = analyze_graph(graph)

    response = {
        "request": request,
        "graph": graph,
        "analytics": analytics,
    }

    print("Saving graph")
    save_json(response, "response_" + request_id + ".json")

    print("Statistics")
    print("vertices", len(graph["vertices"]))
    print("edges", len(graph["edges"]))
예제 #3
0
def gather_keywords():
    """Collect the keywords of every channel stored in ``CHANNEL_FOLDER``.

    Returns:
        dict: ``"channel_id"`` of each ``*.json`` channel file mapped to that
        file's ``"keywords"`` entry.
    """
    keywords_by_channel = {}

    json_names = (name for name in os.listdir(CHANNEL_FOLDER)
                  if name.endswith(".json"))
    for name in json_names:
        channel = load_json(CHANNEL_FOLDER + name)
        channel_keywords = channel["keywords"]
        keywords_by_channel[channel["channel_id"]] = channel_keywords

    return keywords_by_channel
예제 #4
0
def main():
    """Crawl channels for one request file and report what was crawled.

    Expects exactly one command-line argument: the path of a JSON request
    containing ``"request_id"``, ``"depth"`` and ``"seed"``. Before crawling,
    every ``*.json`` file in ``CHANNEL_FOLDER`` is registered in
    ``SAVED_CHANNELS`` (channel id -> file name without ``.json``).

    Exits with status 1 when the argument count is wrong.
    """
    print("Starting...")

    if len(sys.argv) != 2:
        print("Usage: " + sys.argv[0] + " request_path.json")
        # sys.exit rather than the site-provided exit(), which is meant for
        # the interactive shell; non-zero status marks the usage error.
        sys.exit(1)

    filepath = str(sys.argv[1])

    # Register the channels already stored on disk.
    for filename in os.listdir(CHANNEL_FOLDER):
        if filename.endswith(".json"):
            stored = load_json(CHANNEL_FOLDER + filename)
            SAVED_CHANNELS[stored["channel_id"]] = filename.replace(
                ".json", "")

    print("Loaded " + str(len(SAVED_CHANNELS)) + " saved channels")

    # Read in the request.
    request = load_json(filepath)
    request_id = str(request["request_id"])
    depth = int(request["depth"])
    # crawl is called with a list of seeds; a request carries exactly one.
    seed = [str(request["seed"])]

    print("Processing request", request_id)

    print("Starting crawl with seed", seed, " and depth", depth)
    crawl(SEED=seed, DEPTH=depth)

    print("Crawled", len(CRAWLED_CHANNELS), "with", NEW_CHANNELS,
          "new channels!")
    for url, title in CRAWLED_CHANNELS.items():
        # Drop non-ASCII characters, then decode back to text so the title is
        # printed as-is instead of as a bytes repr (b'...').
        ascii_title = title.encode("ascii", "ignore").decode("ascii")
        print(ascii_title + " -> " + url)

    print("Done!")
예제 #5
0
def cluster_policy(SEED, DEPTH):
    """Build a graph of linked channels, level by level, starting at a seed.

    The links are read from ``analytics/double_linked_channels.json``
    (channel id -> list of linked ids). Starting with ``SEED[0]`` at depth 0,
    each level creates a vertex for every id that has an entry in that
    mapping and an edge to every linked channel that was not handled on an
    earlier level and for which ``get_channel`` returns data. At the last
    level only edges between ids of that same level are added, so no edge
    points beyond ``DEPTH``.

    Args:
        SEED: sequence of seed channel ids; only the first one is used.
        DEPTH: deepest level to expand (levels ``0..DEPTH`` are visited).

    Returns:
        dict with the keys ``"vertices"``, ``"edges"``, ``"depth"`` and
        ``"vertex_count_at_depth"`` (one counter per visited level).
    """
    graph = {
        "vertices": [],
        "edges": [],
        "depth": DEPTH,
        "vertex_count_at_depth": [],
    }

    seed = SEED[0]
    linked_channels = load_json("analytics/double_linked_channels.json")

    traversed_channels_ids = set()
    current_channel_ids = [seed]
    # Undirected id pairs that already have an edge; replaces a scan over
    # graph["edges"] for every candidate edge.
    known_edges = set()

    current_depth = 0

    while current_depth <= DEPTH:
        print("current_depth " + str(current_depth))
        next_ids = []

        y_count = 0
        for current_id in current_channel_ids:
            if current_id in linked_channels:
                vertex = {
                    "id": current_id,
                    "coords": {"x": current_depth, "y": y_count},
                    "channel_data": get_channel(current_id),
                    "color": "white",
                }
                graph["vertices"].append(vertex)

                if C_P_INFO:
                    print("Created vertex for channel " +
                          vertex["channel_data"]["title"])

                for linked_id in linked_channels[current_id]:
                    if linked_id in traversed_channels_ids:
                        continue

                    linked_channel = get_channel(linked_id)
                    if linked_channel is None:
                        print("No data for id " + linked_id)
                        continue

                    if current_depth != DEPTH:
                        if (linked_id not in next_ids
                                and linked_id not in current_channel_ids):
                            next_ids.append(linked_id)
                    elif linked_id not in current_channel_ids:
                        # The edge would leave the deepest level: skip it.
                        continue

                    # An edge counts as redundant in either direction.
                    pair = frozenset((current_id, linked_id))
                    if pair in known_edges:
                        continue
                    known_edges.add(pair)

                    edge = {
                        "id": len(graph["edges"]) + 1,
                        "source_vertex_id": current_id,
                        "dest_vertex_id": linked_id,
                        "color": "white",
                    }
                    graph["edges"].append(edge)

                    # Reported only for edges that were actually created.
                    if C_P_INFO:
                        print("Created edge: " +
                              vertex["channel_data"]["title"] + " -> " +
                              linked_channel["title"])

            # Counted for every id of the level, including ids that have no
            # entry in linked_channels and therefore no vertex.
            y_count += 1

        graph["vertex_count_at_depth"].append(y_count)
        traversed_channels_ids.update(current_channel_ids)

        current_channel_ids = next_ids
        current_depth += 1

    return graph
예제 #6
0
def crawl_policy(SEED, DEPTH):
    """Build a graph by following featured-channel links from the seeds.

    Levels ``0..DEPTH`` are visited. Each not-yet-traversed url of a level
    becomes a vertex if ``CHANNEL_FOLDER`` contains ``<url>.json`` and
    ``load_json`` returns data for it. Above the last level, every featured
    channel that has a stored file and was not traversed yet is queued for
    the next level and connected with an edge.

    Args:
        SEED: iterable of channel urls/ids to start from.
        DEPTH: deepest level to expand.

    Returns:
        dict with the keys ``"vertices"``, ``"edges"``, ``"depth"`` and
        ``"vertex_count_at_depth"`` (one counter per visited level).
    """
    graph = {
        "vertices": [],
        "edges": [],
        "depth": DEPTH,
        "vertex_count_at_depth": [],
    }

    current_urls = list(SEED)
    traversed_urls = set()

    current_depth = 0
    while current_depth <= DEPTH:
        # List the folder once per level instead of once per url and once
        # per featured url; nothing in this loop writes to the folder.
        saved_files = set(os.listdir(CHANNEL_FOLDER))
        next_urls = []
        y_count = 0

        for url in current_urls:
            if url in traversed_urls:
                continue

            filename = CHANNEL_FOLDER + url + ".json"

            channel_data = None
            if url + ".json" in saved_files:
                channel_data = load_json(filename)

            if channel_data is None:
                print("Error while loading file " + str(filename))
            else:
                vertex = {
                    "id": url,
                    "coords": {"x": current_depth, "y": y_count},
                    "channel_data": channel_data,
                    "color": "white",
                }
                graph["vertices"].append(vertex)

                if current_depth != DEPTH:
                    # Channels without featured channels add no edges.
                    featured_urls = channel_data.get(
                        "featuredChannelsUrls", [])

                    for featured_url in featured_urls:
                        if featured_url + ".json" not in saved_files:
                            continue
                        if featured_url in traversed_urls:
                            continue

                        next_urls.append(featured_url)
                        edge = {
                            "id": len(graph["edges"]) + 1,
                            "source_vertex_id": url,
                            "dest_vertex_id": featured_url,
                            "color": "white",
                        }
                        graph["edges"].append(edge)

            # Counted even when the channel file could not be loaded.
            traversed_urls.add(url)
            y_count += 1

        graph["vertex_count_at_depth"].append(y_count)

        current_urls = next_urls
        current_depth += 1

    return graph
예제 #7
0
# Known channels by name; the values look like YouTube channel ids
# (NOTE(review): SUB_ID itself is defined outside this excerpt - confirm).
SUB_ID["OpenMind3000"] = "UCEtYtMoD26j2BJBJ4w_hM6w"
SUB_ID["EEVBlog"] = "UC2DjFE7Xf11URZqWBigcVOQ"
SUB_ID["HandOfBlood"] = "UC9YTp5M6yYgSd6t0SeL2GQw"

# Both dictionaries below map URL -> Title, where
#   URL   == channel id (the identifier used by the YouTube API)
#   Title == name of the channel
SAVED_CHANNELS = dict(
)  # channels that were crawled before and are already known to the system
CRAWLED_CHANNELS = dict()  # channels that were crawled in this session

# Statistics: counter reported as the number of new channels of a crawl.
NEW_CHANNELS = 0

# YouTube API setup. The developer key is read from the "key" entry of
# key.json when this module is loaded.
DEVELOPER_KEY = load_json("key.json")["key"]
YOUTUBE_API_SERVICE_NAME = "youtube"
YOUTUBE_API_VERSION = "v3"
# Shared API client used by the request helpers in this module
# (NOTE(review): build is presumably googleapiclient.discovery.build -
# confirm against the imports).
YOUTUBE = build(YOUTUBE_API_SERVICE_NAME,
                YOUTUBE_API_VERSION,
                developerKey=DEVELOPER_KEY)


def get_channel_id(USERNAME):
    """Return the channel id the YouTube API reports for a user name.

    Sends a ``channels().list`` request with ``forUsername=USERNAME`` and
    ``part="id"`` and returns the ``"id"`` of the first returned item.

    Raises:
        KeyError: if the response has no ``"items"`` entry.
        IndexError: if the ``"items"`` list is empty.
    """
    request = YOUTUBE.channels().list(forUsername=USERNAME, part="id")
    items = request.execute()["items"]
    first_item = items[0]
    return first_item["id"]


def get_featured_channels(CHANNEL_ID):
    response = YOUTUBE.channels().list(id=CHANNEL_ID,