예제 #1
0
def populate_node(root_node):
    """Attach all stored replies beneath ``root_node``, building the tree in place.

    The conversation is walked depth-first. For every node taken off the
    stack, ``collection`` is queried for documents whose
    ``in_reply_to_status_id`` equals that node's ``id``; each match becomes
    an ``AnyNode`` created with ``parent`` set to the node being processed.

    Args:
        root_node: Node exposing an ``id`` attribute. It is used as the
            starting point and as the parent of its direct replies.

    Returns:
        None. The result is the set of child nodes linked under ``root_node``.
    """
    # A plain list used as a LIFO stack: append()/pop() work on the tail in
    # O(1) and visit nodes in the same order as inserting/popping at index 0.
    pending = [root_node]
    while pending:
        current = pending.pop()
        # Every document that replies directly to the current node.
        replies = collection.find({'in_reply_to_status_id': current.id})
        for reply in replies:
            # AnyNode stores its keyword arguments as attributes, so ``id``
            # does not need to be assigned a second time after construction.
            reply_node = AnyNode(
                id=reply['id'],
                name=reply['user']['name'],
                text=reply['text'],
                parent=current,
            )
            pending.append(reply_node)
예제 #2
0
def _collect_new_roots(tweets, progress_message, root_tweets):
    """Append the root tweet of each item in ``tweets`` to ``root_tweets`` once.

    A root is appended only if its ``id`` is not yet in ``process_set``; the id
    is then recorded there. ``progress_message`` is a format template that is
    printed with the current index every 1000 tweets.
    """
    for index, tweet in enumerate(tweets):
        root = find_root_tweet(tweet)
        if root['id'] not in process_set:
            process_set.add(root['id'])
            root_tweets.append(root)
        # Report progress on every 1000th tweet, whether or not it produced
        # a new root; otherwise most progress lines are silently skipped.
        if index % 1000 == 0:
            print(progress_message.format(index))


def export_conversation_trees_to_db(user_id):
    """Build the conversation trees involving ``user_id`` and store them.

    Steps:
      1. Query ``collection`` for tweets written by the user that reply to
         somebody, and for tweets that reply to the user.
      2. Resolve each tweet to its root via ``find_root_tweet`` and keep each
         root once, using ``process_set`` to track the ids already seen.
      3. Create an ``AnyNode`` per root and fill it with ``populate_node``.
      4. Export every tree to JSON and insert it into ``collection_trees``.

    Progress and summary statistics are printed along the way.

    NOTE(review): ``process_set`` is not created here; presumably it is a
    module-level set, in which case roots recorded by earlier calls are
    skipped and counted in the "UNION" figure - confirm against the rest of
    the file.
    NOTE(review): ``Cursor.count()`` is deprecated in newer PyMongo releases;
    verify the installed version before upgrading.

    Args:
        user_id: Value matched against ``user.id`` and ``in_reply_to_user_id``.

    Returns:
        None.
    """
    root_tweets = []
    root_nodes_list = []

    print("Selecting all tweets from id: {}...".format(user_id))
    conversation_query = collection.find(
        {'user.id': user_id, 'in_reply_to_user_id': {"$ne": None}})  # Mentioning somebody
    conversation_query2 = collection.find({'in_reply_to_user_id': user_id})  # Mentioned by somebody

    print("SET A: {} B: {} ".format(conversation_query.count(), conversation_query2.count()))

    _collect_new_roots(conversation_query, "Processing mentions {}", root_tweets)
    _collect_new_roots(conversation_query2, "Processing mentioned {}", root_tweets)

    print("SET A: {} B: {} UNION: {}".format(conversation_query.count(), conversation_query2.count(), len(process_set)))

    for index, root_tweet in enumerate(root_tweets):
        # AnyNode keeps its keyword arguments as attributes, so ``id`` is
        # already available on the node without a second assignment.
        root_node = AnyNode(id=root_tweet['id'], name=root_tweet['user']['name'], text=root_tweet['text'])
        if index % 1000 == 0:
            pprint.pprint("Populating: {}".format(index))
        populate_node(root_node)
        root_nodes_list.append(root_node)
    pprint.pprint('Total sum of root nodes: {}'.format(len(root_tweets)))

    # The exporter only carries formatting options, so one instance is reused.
    exporter = JsonExporter(indent=2, sort_keys=True)
    count = 0
    for tree in root_nodes_list:
        # Each conversation counts its root plus all of its descendants.
        count += len(tree.descendants) + 1
        # Round-trip through JSON text to obtain a plain dict for insertion.
        collection_trees.insert_one(json.loads(exporter.export(tree)))

    print("Total conversation count: {}".format(count))
    # Guard against ZeroDivisionError when no root tweet was found.
    average_length = count / len(root_tweets) if root_tweets else 0
    print("Average conversation length: {}".format(average_length))