Example #1
def crawl_users():
    """
    Crawl all habrausers by enumerating the habrakarma pages.
    """
    page_num = 1
    users = set()

    def flush_userbase():
        ut.data2pickle(users, "../data/allusers.dat")
        for user in users:
            user = user.replace("\n", "")
            filename = "../data/users/" + user + "@.dat"
            if not os.path.exists(filename):
                print "crawling user: <%s>" % user
                user_data = cu.crawl_user(user)
                ut.data2pickle(user_data, filename)

    if 0:  # crawling pass disabled; the cached user list is loaded below instead
        while True:
            url_to_parse = "http://habrahabr.ru/people/page%d/" % page_num
            root = ut.doc4url(url_to_parse)
            if not root:
                break
            items = root.xpath('//div[@class="username"]//a')
            print "Page = ", page_num
            if len(items) > 0:
                new_users = set([ut.unicodeanyway(node.text) for node in items])
                users.update(new_users)
            page_num += 1
    users = ut.pickle2data("../data/allusers.dat")
    # if page_num % 1000 == 0:
    #    flush_userbase()

    flush_userbase()
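These examples lean on two pickle helpers from the project's ut module that are not shown here; a plausible minimal version of each (an assumption, not the project's actual code):

import pickle

def data2pickle(data, filename):
    # Hypothetical stand-in for ut.data2pickle(): serialize data to a file.
    with open(filename, 'wb') as f:
        pickle.dump(data, f)

def pickle2data(filename):
    # Hypothetical stand-in for ut.pickle2data(): load serialized data back.
    with open(filename, 'rb') as f:
        return pickle.load(f)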
Example #2
def smoke_test():
    """
    Simple smoke/acceptance test:
    crawl one user's info and store it in a pickle file.
    """
    habrauser = '******'
    user = crawl_user(habrauser)
    print user
    ut.data2pickle(user, '../data/users/' + habrauser + '.dat')
Example #4
def reload_users():
    """
    Re-serialize the full habrauser DB:
    re-read every pickled user and store it back
    wrapped in a SmartObject.
    """
    user_files_list = os.listdir(USERS_DIR)
    for filename in user_files_list:
        user = ut.pickle2data(os.path.join(USERS_DIR, filename))
        so_user = so.SmartObject(user)
        ut.data2pickle(so_user, os.path.join(USERS_DIR, filename))
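The so.SmartObject class is not among these examples; a plausible minimal version (an assumption, assuming the pickled data is a plain dict) that exposes dict keys as attributes and reads missing keys as None, which matches checks like "if user.friends:" below:

class SmartObject(object):
    # Hypothetical stand-in for so.SmartObject (assumed, not shown above):
    # wraps a plain dict, exposing its keys as attributes.
    def __init__(self, data):
        self.__dict__.update(data or {})

    def __getattr__(self, name):
        # Reached only for attributes that were not in the dict.
        return None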
Example #5
def build_friends_edgelist():
    """
     Build friends graph 
     from full habrauser DB and
     store it as edgelist (easy readable from igraph). 
    """
    users_dir = ha.get_users_dir()
    user_files_list = os.listdir(users_dir)
    user_files_list.sort()
    login2id = {}
    for userid, filename in enumerate(user_files_list):
        login = filename.split(".")[0].replace("@", "")
        login2id[login] = userid

    with open(os.path.join(ha.get_graph_dir(), "friends.edgelist"), "w") as edgefile:
        for userid, filename in enumerate(user_files_list):
            user = so.SmartObject(ut.pickle2data(os.path.join(users_dir, filename)))
            if user.friends:
                for friend in user.friends:
                    if friend in login2id:
                        friend_id = login2id[friend]
                        edgefile.write("%d %d\n" % (userid, friend_id))
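The docstring says the edge list is easy to read from igraph; loading it back could look like this (a sketch, assuming python-igraph is installed and using the friends.edgelist file produced above):

import igraph

# Read the "userid friend_id" pairs written above as a directed graph.
g = igraph.Graph.Read_Edgelist("friends.edgelist", directed=True)
print g.summary()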
Example #6
def cut_thread(postpath):
    """
      Build a graph matrix, set MAX-CUT problem
    """
    thepostsdir = os.path.join(ha.get_data_dir(), 'posts')
    filename = os.path.join(thepostsdir, postpath) + '.dat'
    thread_tree = ut.pickle2data(filename)
    
    def walk_tree_for_login2id(subtree, login2id):
        for login in subtree:
            if login not in login2id:
                login2id[login] = len(login2id) 
            walk_tree_for_login2id(subtree[login], login2id)
            
    login2id = {}
    walk_tree_for_login2id(thread_tree, login2id)
    
    N = len(login2id)
    weights = np.zeros((N, N), dtype=np.int16)

    def walk_tree_for_weights(root_login, subtree, weights):
        u = login2id[root_login]
        for login in subtree:
            v = login2id[login]
            if u != v:
                weights[u, v] += 1
                weights[v, u] += 1
            # recurse even on self-replies so deeper replies are not lost
            walk_tree_for_weights(login, subtree[login], weights)
    
    for root_login in thread_tree:
        walk_tree_for_weights(root_login, thread_tree[root_login], weights)

    id2login = {}
    for login in login2id:
        id2login[login2id[login]] = login

    y = greedy_max_cut(weights)

    def print_habrauser(uid):
        login = id2login[uid]
        print '* [http://' +  login + '.habrahabr.ru ' + login + ']'

    print "Analysis of http://habrahabr.ru/" + postpath
    print "----"
    print "Party 1"
    for i in xrange(N):
        if y[i] > 0:
            print_habrauser(i)

    print "----"
            
    print "Party 2"
    for i in xrange(N):
        if y[i] < 0:
            print_habrauser(i)
            
    print "----"
Example #7
def get_comment_tree(postpath):
    url_to_parse = 'http://habrahabr.ru/' + postpath
    root = ut.doc4url(url_to_parse)
    if not root:
        return None

    author = root.xpath('//div[@class="author"]/a')[0].text
    print author
    
    comment_root_tree = {}
    ##  Dictionary of nested dictionaries:
    ##  author
    ##  |→ comment author
    ##      |→ sub-comment author
    def dfs_process(node, tree):
        """
        Рекурсивно идет вглубь от node
        и набивает словарь-дерево tree
        """
        print node.get('id')
        comment_lists = node.xpath('.//div[@id="comments" or @class="reply_comments"]')
        if not comment_lists:
            return  # leaf comment: no replies container
        comments = comment_lists[0]
        for comment in comments.xpath('./div[@class="comment_item"]'):
            author = comment.xpath('.//a[@class="username"]')[0].text
            print author
            child_tree = {}
            dfs_process(comment, child_tree)
            tree[author] = deepcopy(child_tree)
    
    dfs_process(root, comment_root_tree)
    comment_tree = {author: comment_root_tree}
    print 'tree:', comment_tree
    
    thepostsdir = os.path.join(ha.get_data_dir(), 'posts')
    filename = os.path.join(thepostsdir, postpath) + '.dat'
    ut.createdir(os.path.split(filename)[0])
    ut.data2pickle(comment_tree, filename)
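Taken together, the two functions above form a small pipeline; hypothetical usage (the post path is a made-up placeholder):

postpath = 'post/123456'     # made-up post path, for illustration only
get_comment_tree(postpath)   # crawl the thread and pickle its comment tree
cut_thread(postpath)         # load the pickle and print the two parties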
Example #9
def crawl_user(username):
    """
     Crawl habrauser info,
     return dictionary with these attributes.
    """
    url_to_parse = 'http://habrahabr.ru/users/' + username + '/' 
    root    = ut.doc4url(url_to_parse)

    def get_set(css_class_name, set_num=0):
        """
        Find in the page list of some hyperlinked properties
        (such as friends, interests, etc)
        and return a set of them.
        """
        if not root:
            return None
        item = root.xpath('//dl[@class="%s"]/dd' % css_class_name)
        if len(item) <= set_num:
            return None
        sets_node = item[set_num]
        item_set = set([ut.unicodeanyway(node.text).replace('\n', '')
                        for node
                        in sets_node.xpath('.//a') if node.text is not None])
        return item_set

    user = so.SmartObject({
        'interests' : get_set('interests'),
        'companies' : get_set('companies_list'),
        'friends' : get_set('friends_list'),
        'hubs' : get_set('hubs_list'),
        'invitees': get_set('friends_list', 1)
    })
    return user
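Finally, ut.doc4url(), used by nearly every example, is also not shown; a plausible minimal version built on urllib2 and lxml (an assumption, not the project's actual helper), returning None on failure to match the "if not root:" checks above:

import urllib2
import lxml.html

def doc4url(url):
    # Hypothetical stand-in for ut.doc4url(): fetch a page and return
    # its parsed lxml document root, or None if the fetch fails.
    try:
        html = urllib2.urlopen(url).read()
    except (urllib2.URLError, IOError):
        return None
    return lxml.html.fromstring(html)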