Example #1
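Scans the description HTML of every stored subreddit for /r/ links and records any linked subreddits that are not already in the database.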
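# Assumed to be in scope, since the example does not show its imports:
# HTMLParser is the Python 2 stdlib parser (html.parser.HTMLParser in
# Python 3), while Session, Subreddit, DiscoveredSub, DBIterator,
# find_sub_links and add_new_subs are helpers from the surrounding
# project (module paths unknown).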
def main(notify):

    parser = HTMLParser()
    session = Session()

    subreddit_count = session.query(Subreddit).count()
    start_count = session.query(DiscoveredSub).count()
    notify("discovering from %d exiting" % subreddit_count)

    discovered_subs = set()
    query = session.query(Subreddit.description_html).filter(Subreddit.description_html.isnot(None))
    dbi = DBIterator(query=query)

    for sub in dbi.results_iter():
        # Normalize every /r/ link in the description to the form "/r/name/".
        links = {u'/r/' + s.lower().strip() + u'/'
                 for s in find_sub_links(parser.unescape(sub.description_html))}

        if not links:
            continue

        # Split the links into already-tracked subreddits and previously
        # discovered ones; whatever remains is genuinely new.
        existing = {s.url.lower().strip()
                    for s in session.query(Subreddit.url).filter(Subreddit.url.in_(links))}
        found = {s.url.lower().strip()
                 for s in session.query(DiscoveredSub.url).filter(DiscoveredSub.url.in_(links))}
        new_subs = (links - existing) - found

        discovered_subs.update(new_subs)

        # Flush in batches, clearing the set so already-inserted subs are
        # not passed to add_new_subs again on the next flush.
        if len(discovered_subs) > 25:
            add_new_subs(session, discovered_subs)
            discovered_subs.clear()

    if discovered_subs:
        add_new_subs(session, discovered_subs)

    end_count = session.query(DiscoveredSub).count()
    notify("found additional %d" % (end_count - start_count))
def main(notify):
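    # Assumed to be in scope (imports not shown): networkx as nx,
    # HTMLParser, and the project helpers Session, Subreddit, DBIterator,
    # find_sub_links and initialize_node.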

    g = nx.Graph()
    out_filename = "data/subreddits_edged_by_description_links.gexf"
    parser = HTMLParser()
    session = Session()
    query = session.query(Subreddit)
    dbi = DBIterator(query=query)

    for subreddit in dbi.results_iter():
        # "/r/python/".split("/")[2] -> "python": take the name segment.
        sub = subreddit.url.split("/")[2].lower()

        # Ensure every subreddit appears as a node, even with no outgoing links.
        initialize_node(g, sub)

        if not subreddit.description_html:
            continue

        html = parser.unescape(subreddit.description_html)
        # Accumulate edge weight for repeated mentions between the same pair.
        for linked_sub in find_sub_links(html):
            if g.has_edge(sub, linked_sub):
                g[sub][linked_sub]["weight"] += 1
            else:
                g.add_edge(sub, linked_sub, weight=1)

    nx.write_gexf(g, out_filename)
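A quick sanity check on the result; a sketch, assuming only the output path used above (read_gexf, number_of_nodes and number_of_edges are standard NetworkX calls):

import networkx as nx

g = nx.read_gexf("data/subreddits_edged_by_description_links.gexf")
print("%d nodes, %d edges" % (g.number_of_nodes(), g.number_of_edges()))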