def get_tweets_by_openid(open_id):
    """
    Return a list of strings, which is the tweet info for `open_id`.

    The first element is the raw sogou tweet-info page; each following
    element is the content of one tweet page.  Returns None when the
    sogou info page cannot be fetched or yields no tweet urls.
    """
    latest_tweets = []
    sogou_tweet_info_content = get_weixinsogou_tweet_info_by_openid(open_id)
    if sogou_tweet_info_content is None:
        return None
    tweet_urls = get_tweet_urls_by_content(sogou_tweet_info_content)
    # if no tweet info just return nothing
    if tweet_urls is None:
        return None
    # first add sogou_tweet_info_content to latest_tweets
    latest_tweets.append(sogou_tweet_info_content)
    for tweet_url in tweet_urls:
        log('crawl open_id --> %s, the url is %s' % (open_id, tweet_url))
        # connect to the website with proxy_ip and faked user-agent
        con_weixin_tweet_content = get_connect_by_proxyip_ua(tweet_url)
        # FIX: only append content that was actually fetched; previously a
        # failed connection could push None into the result, breaking the
        # "list of string" contract for callers.
        if con_weixin_tweet_content is not None:
            latest_tweets.append(con_weixin_tweet_content.read())
        # throttle between requests to avoid being blocked
        sleep_for_a_while_small()
    return latest_tweets
def get_tweets_by_openid(open_id):
    """
    Return a list of string, which is the tweet info

    NOTE(review): this is a duplicate of the get_tweets_by_openid
    definition above; in Python the later definition silently shadows
    the earlier one -- confirm which copy is intended and delete the other.
    """
    latest_tweets = []
    sogou_tweet_info_content = get_weixinsogou_tweet_info_by_openid(open_id)
    if sogou_tweet_info_content is None:
        return None
    tweet_urls = get_tweet_urls_by_content(sogou_tweet_info_content)
    # if no tweet info just return nothing
    if tweet_urls is None:
        return None
    # first add sogou_tweet_info_content to latest_tweets
    latest_tweets.append(sogou_tweet_info_content)
    for tweet_url in tweet_urls:
        log('crawl open_id --> %s, the url is %s' % (open_id, tweet_url))
        weixin_tweet_content = None
        # connect to the website with proxy_ip and faked user-agent
        con_weixin_tweet_content = get_connect_by_proxyip_ua(tweet_url)
        if con_weixin_tweet_content is not None:
            weixin_tweet_content = con_weixin_tweet_content.read()
        # NOTE(review): when the connection fails, weixin_tweet_content is
        # still None here and gets appended -- presumably callers tolerate
        # None entries; verify, or skip the append on failure.
        latest_tweets.append(weixin_tweet_content)
        # throttle between requests
        sleep_for_a_while_small()
    return latest_tweets
def get_info_by_single_nav_page(page_num, keywords, weibo_id): """ Find weixin info in single nav_page on weixin.sogou.com Return a list of 3 items The first one indicates whether the account found The second is the info of the account The third one indicates whether is the last nav page """ # First set the is_existed to be False is_existed = False # the info of the weibo_id found on weixin.sogou.com existed_account_info = 'NA' # get the url by keywords tmp_nav_url = get_nav_page_url_by_keywords(keywords) nav_url = tmp_nav_url[0] + str(page_num) + tmp_nav_url[1] print "nav_url -> ", nav_url # connect to the website, and build soup # get connect to the website c = get_connect_by_proxyip_ua(nav_url) if (c is None): return None # build soup soup_obj = BeautifulSoup(c.read()) if (soup_obj is None): return None is_last_page = is_last_page_by_soup(soup_obj) # print soup_obj.prettify() # parse the soup, and get the info tag all_div = soup_obj.find_all("div", class_="wx-rb bg-blue wx-rb_v1 _item") if (all_div is None): return None for info_div in all_div: # store all the info by single tag weibo_info = get_info_by_tag(info_div, keywords) # if find the same weibo_id, then set is_existed to be True, and store # the info if (is_existed is False and weibo_info['weibo_id'] == weibo_id): print "The weibo_id has been found, set is_existed to be True" is_existed = True existed_account_info = weibo_info return (is_existed, existed_account_info, is_last_page) return (is_existed, existed_account_info, is_last_page)
def get_weixinsogou_tweet_info_by_openid(open_id):
    """
    Fetch the sogou.com tweet-info page for a single open_id.

    Returns the page content as a string, or None when the
    connection could not be established.
    """
    info_url = get_url_info_from_sogou(open_id)
    # connect through a proxy ip with a faked user-agent
    response = get_connect_by_proxyip_ua(info_url)
    if response is None:
        return None
    return response.read()
def get_weixinsogou_tweet_info_by_openid(open_id):
    """
    Return a string, which is the tweet_info of a single open_id in sogou.com

    NOTE(review): this is a duplicate of the
    get_weixinsogou_tweet_info_by_openid definition above; the later
    definition shadows the earlier one -- delete one of the two copies.
    """
    sogou_tweet_info_content = None
    tweet_info_url = get_url_info_from_sogou(open_id)
    # connect to the website with proxy_ip and faked user-agent
    con_tweet_info = get_connect_by_proxyip_ua(tweet_info_url)
    if con_tweet_info is not None:
        sogou_tweet_info_content = con_tweet_info.read()
    # returns None when the connection failed
    return sogou_tweet_info_content
def get_single_account_info_by_homepage_url(url):
    """
    Fetch a weixin account's homepage and extract its info.

    Returns a 2-tuple:
      - bool: whether the account info was found (always True on success)
      - the account info extracted from the homepage
    Returns None when the page cannot be fetched or parsed.
    """
    c = get_connect_by_proxyip_ua(url)
    # FIX: get_connect_by_proxyip_ua returns None on failure (every sibling
    # function guards for this); without the check c.read() raised
    # AttributeError whenever the connection failed.
    if c is None:
        return None
    soup_obj = BeautifulSoup(c.read())
    if soup_obj is None:
        return None
    account_info = get_info_by_homepage_soup(soup_obj, url)
    is_existed = True
    return (is_existed, account_info)
def get_new_account_info_by_single_nav_page(page_num, keyword): """ Return a tuple of 2 items The first is the info of a account The second is bool indicates whether is the last nav page """ single_nav_info_list = [] # get the url by keywords tmp_nav_url = get_nav_page_url_by_keywords(keyword) nav_url = tmp_nav_url[0] + str(page_num) + tmp_nav_url[1] print "nav_url -> ", nav_url # connect to the website, and build soup # get connect to the website c = get_connect_by_proxyip_ua(nav_url) if (c is None): return None # build soup soup_obj = BeautifulSoup(c.read()) if (soup_obj is None): return None is_last_page = is_last_page_by_soup(soup_obj) # print soup_obj.prettify() # parse the soup, and get the info tag all_divs = soup_obj.find_all("div", class_="wx-rb bg-blue wx-rb_v1 _item") if (all_divs is None): return None for info_div in all_divs: # store all the info by single tag weibo_info = get_info_by_tag(info_div, keyword) if weibo_info is not None: single_nav_info_list.append(weibo_info) return (single_nav_info_list, is_last_page)
def get_new_account_info_by_single_nav_page(page_num, keyword):
    """
    Return a tuple of 2 items
    The first is the info of a account
    The second is bool indicates whether is the last nav page

    NOTE(review): this is a duplicate of the
    get_new_account_info_by_single_nav_page definition above; the later
    definition shadows the earlier one -- delete one of the two copies.
    """
    single_nav_info_list = []
    # get the url by keywords
    tmp_nav_url = get_nav_page_url_by_keywords(keyword)
    nav_url = tmp_nav_url[0] + str(page_num) + tmp_nav_url[1]
    print "nav_url -> ", nav_url
    # connect to the website, and build soup
    # get connect to the website
    c = get_connect_by_proxyip_ua(nav_url)
    if (c is None):
        return None
    # build soup
    soup_obj = BeautifulSoup(c.read())
    if (soup_obj is None):
        return None
    is_last_page = is_last_page_by_soup(soup_obj)
    # parse the soup, and get the info tag
    all_divs = soup_obj.find_all("div", class_="wx-rb bg-blue wx-rb_v1 _item")
    if (all_divs is None):
        return None
    for info_div in all_divs:
        # store all the info by single tag
        weibo_info = get_info_by_tag(info_div, keyword)
        if weibo_info is not None:
            single_nav_info_list.append(weibo_info)
    return (single_nav_info_list, is_last_page)