def crawl_users(): """ Crawl all habrausers, enumerating habrakarma pages. """ page_num = 1 users = set() def flush_userbase(): ut.data2pickle(users, "../data/allusers.dat") for user in users: user = user.replace("\n", "") filename = "../data/users/" + user + "@.dat" if not os.path.exists(filename): print "crawling user: <%s>" % user user_data = cu.crawl_user(user) ut.data2pickle(user_data, filename) if 0: while True: url_to_parse = "http://habrahabr.ru/people/page%d/" % page_num root = ut.doc4url(url_to_parse) if not root: break items = root.xpath('//div[@class="username"]//a') print "Page = ", page_num if len(items) > 0: new_users = set([ut.unicodeanyway(node.text) for node in items]) users.update(new_users) page_num += 1 users = ut.pickle2data("../data/allusers.dat") # if page_num % 1000 == 0: # flush_userbase() flush_userbase()
def smoke_test(): """ Simple smoke/acceptance test. Crawl user info, store them in pickle file. """ habrauser = '******' user = crawl_user(habrauser) print user ut.data2pickle(user, '../data/users/' + habrauser + '.dat')
def flush_userbase(): ut.data2pickle(users, "../data/allusers.dat") for user in users: user = user.replace("\n", "") filename = "../data/users/" + user + "@.dat" if not os.path.exists(filename): print "crawling user: <%s>" % user user_data = cu.crawl_user(user) ut.data2pickle(user_data, filename)
def reload_users():
    """
    Re-save every pickled habrauser record as a SmartObject.

    For each file in USERS_DIR the pickled dict is loaded, wrapped in
    ``so.SmartObject`` and written back in place.  (The original
    docstring was copy-pasted from the edgelist builder; this function
    does not build any graph.)
    """
    for filename in os.listdir(USERS_DIR):
        path = os.path.join(USERS_DIR, filename)
        user = ut.pickle2data(path)
        so_user = so.SmartObject(user)
        ut.data2pickle(so_user, path)
def build_friends_edgelist():
    """
    Build friends graph from full habrauser DB and store it as
    edgelist (easy readable from igraph).

    Two passes over the sorted user files: first assign a stable
    integer id to every login, then emit one "src dst" line per
    friendship edge (edges to unknown logins are skipped).
    """
    def login_of(filename):
        # "vasya@.dat" -> "vasya"
        return filename.split(".")[0].replace("@", "")

    users_dir = ha.get_users_dir()
    user_files_list = os.listdir(users_dir)
    user_files_list.sort()  # sorted order keeps ids stable between runs

    login2id = {}
    for userid, filename in enumerate(user_files_list):
        login2id[login_of(filename)] = userid

    edgefile = open(os.path.join(ha.get_graph_dir(), "friends.edgelist"), "w")
    try:  # ensure the file is closed even if a pickle load fails
        for userid, filename in enumerate(user_files_list):
            user = so.SmartObject(ut.pickle2data(os.path.join(users_dir, filename)))
            if user.friends:
                for friend in user.friends:
                    if friend in login2id:
                        edgefile.write("%d %d\n" % (userid, login2id[friend]))
    finally:
        edgefile.close()
def cut_thread(postpath):
    """ Build a graph matrix, set MAX-CUT problem """
    # Load the previously pickled comment tree for this post
    # (written by get_comment_tree): a dict of nested dicts keyed by login.
    thepostsdir = os.path.join(ha.get_data_dir(), 'posts')
    filename = os.path.join(thepostsdir, postpath) + '.dat'
    thread_tree = ut.pickle2data(filename)

    def walk_tree_for_login2id(subtree, login2id):
        # Assign a dense integer id to every login seen anywhere in the tree.
        # NOTE(review): recursion placed at loop level so already-known
        # logins still get their subtrees visited — confirm against the
        # original (flattened) layout.
        for login in subtree:
            if login not in login2id:
                login2id[login] = len(login2id)
            walk_tree_for_login2id(subtree[login], login2id)

    login2id = {}
    walk_tree_for_login2id(thread_tree, login2id)
    N = len(login2id)
    # Symmetric interaction-count matrix: weights[u, v] counts replies
    # between users u and v (both directions accumulated).
    weights = np.zeros((N, N), dtype=np.int16)

    def walk_tree_for_weights(root_login, subtree, weights):
        # Add one unit of weight for each parent->child reply pair,
        # skipping self-replies (u == v).
        u = login2id[root_login]
        for login in subtree:
            v = login2id[login]
            if u != v:
                weights[u, v] += 1
                weights[v, u] += 1
            walk_tree_for_weights(login, subtree[login], weights)

    for root_login in thread_tree:
        walk_tree_for_weights(root_login, thread_tree[root_login], weights)

    # Inverse mapping for printing results.
    id2login = {}
    for login in login2id:
        id2login[login2id[login]] = login

    # y[i] in {-1, +1}: side of the cut assigned to user i
    # (presumably — depends on greedy_max_cut's contract; verify).
    y = greedy_max_cut(weights)

    def print_habrauser(uid):
        # Emit a wiki-style link for the given user id.
        login = id2login[uid]
        print '* [http://' + login + '.habrahabr.ru ' + login + ']'

    print "Analysis of http://habrahabr.ru/" + postpath
    print "----"
    print "Party 1"
    for i in xrange(N):
        if y[i] > 0:
            print_habrauser(i)
    print "----"
    print "Party 2"
    for i in xrange(N):
        if y[i] < 0:
            print_habrauser(i)
    print "----"
def get_comment_tree(postpath):
    # Fetch the post page and build a nested-dict comment tree keyed by
    # author login, then pickle it under <data>/posts/<postpath>.dat.
    url_to_parse = 'http://habrahabr.ru/' + postpath
    root = ut.doc4url(url_to_parse)
    if not root:
        return None
    # Post author is the root of the whole tree.
    author = root.xpath('//div[@class="author"]/a')[0].text
    print author
    comment_root_tree = {}
    ## Dictionary of nested dictionaries:
    ## author
    ##  |-> comment author
    ##       |-> sub-comment author
    def dfs_process(node, tree):
        """ Recursively descends from node and fills the tree dict """
        print node.get('id')
        comments = node.xpath('.//div[@id="comments" or @class="reply_comments"]')[0]
        for comment in comments.xpath('./div[@class="comment_item"]'):
            author = comment.xpath('.//a[@class="username"]')[0].text
            print author
            child_tree = {}
            dfs_process(comment, child_tree)
            # deepcopy detaches the child dict before reuse on the
            # next iteration.
            tree[author] = deepcopy(child_tree)
    dfs_process(root, comment_root_tree)
    comment_tree = {author: comment_root_tree}
    print 'tree:', comment_tree
    thepostsdir = os.path.join(ha.get_data_dir(), 'posts')
    filename = os.path.join(thepostsdir, postpath) + '.dat'
    # postpath may contain slashes, so create the directory first.
    ut.createdir(os.path.split(filename)[0])
    ut.data2pickle(comment_tree, filename)
def get_set(css_class_name, set_num=0):
    """
    Find in the page list of some hyperlinked properties (such as
    friends, interests, etc) and return a set of them.

    NOTE(review): reads ``root`` (a parsed page) from enclosing/global
    scope; ``crawl_user`` nests its own identical copy of this helper.
    """
    if not root:
        return None
    dd_nodes = root.xpath('//dl[@class="%s"]/dd' % css_class_name)
    if len(dd_nodes) <= set_num:
        return None
    anchors = dd_nodes[set_num].xpath('.//a')
    return set(ut.unicodeanyway(a.text).replace('\n', '')
               for a in anchors
               if a.text is not None)
def crawl_user(username):
    """
    Crawl habrauser info, return dictionary with these attributes.
    """
    url_to_parse = 'http://habrahabr.ru/users/' + username + '/'
    root = ut.doc4url(url_to_parse)

    def get_set(css_class_name, set_num=0):
        """
        Find in the page list of some hyperlinked properties (such as
        friends, interests, etc) and return a set of them.
        """
        if not root:
            return None
        dd_nodes = root.xpath('//dl[@class="%s"]/dd' % css_class_name)
        if len(dd_nodes) <= set_num:
            return None
        anchors = dd_nodes[set_num].xpath('.//a')
        return set(ut.unicodeanyway(a.text).replace('\n', '')
                   for a in anchors
                   if a.text is not None)

    # Invitees live in the second "friends_list" block on the page.
    return so.SmartObject({
        'interests': get_set('interests'),
        'companies': get_set('companies_list'),
        'friends': get_set('friends_list'),
        'hubs': get_set('hubs_list'),
        'invitees': get_set('friends_list', 1),
    })