Example #1
def get_tweets_by_openid(open_id):
    """
    Return a list of string, which is the tweet info
    """

    latest_tweets = []

    sogou_tweet_info_content = get_weixinsogou_tweet_info_by_openid(open_id)
    if sogou_tweet_info_content is None:
        return None

    tweet_urls = get_tweet_urls_by_content(sogou_tweet_info_content)

    # if no tweet URLs were found, just return None
    if tweet_urls is None:
        return None

    # first add sogou_tweet_info_content to latest_tweets
    latest_tweets.append(sogou_tweet_info_content)

    for tweet_url in tweet_urls:
        log('crawl open_id --> %s, the url is %s' % (open_id, tweet_url))
        weixin_tweet_content = None

        # connect to the website with a proxy IP and a faked user-agent
        con_weixin_tweet_content = get_connect_by_proxyip_ua(tweet_url)

        if con_weixin_tweet_content is not None:
            weixin_tweet_content = con_weixin_tweet_content.read()

        latest_tweets.append(weixin_tweet_content)

        sleep_for_a_while_small()

    return latest_tweets
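The helpers get_connect_by_proxyip_ua and sleep_for_a_while_small are not shown in this listing. A minimal sketch of what they might look like, assuming urllib2 and placeholder proxy/user-agent pools (the pool contents and timeouts below are illustrative, not from the source):

import random
import time
import urllib2

# Placeholder pools; a real crawler would load these from a proxy list
# and a user-agent list.
PROXY_POOL = ['127.0.0.1:8080']
UA_POOL = ['Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36']

def get_connect_by_proxyip_ua(url):
    # Open url through a random proxy with a faked User-Agent; return a
    # file-like response (supports .read()) or None on failure.
    opener = urllib2.build_opener(
        urllib2.ProxyHandler({'http': random.choice(PROXY_POOL)}))
    opener.addheaders = [('User-Agent', random.choice(UA_POOL))]
    try:
        return opener.open(url, timeout=10)
    except urllib2.URLError:
        # treat any connection failure as a miss
        return None

def sleep_for_a_while_small(low=1, high=3):
    # Throttle the crawl with a short random pause.
    time.sleep(random.uniform(low, high))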
Example #2
def get_info_by_single_nav_page(page_num, keywords, weibo_id):
    """ Find weixin info in single nav_page on weixin.sogou.com

    Return a list of 3 items
    The first one indicates whether the account found
    The second is the info of the account
    The third one indicates whether is the last nav page
    """

    # start with is_existed set to False
    is_existed = False

    # the info of the weibo_id found on weixin.sogou.com
    existed_account_info = 'NA'

    # get the url by keywords
    tmp_nav_url = get_nav_page_url_by_keywords(keywords)
    nav_url = tmp_nav_url[0] + str(page_num) + tmp_nav_url[1]

    print "nav_url ->  ", nav_url
    # connect to the website
    c = get_connect_by_proxyip_ua(nav_url)

    if c is None:
        return None

    # build soup
    soup_obj = BeautifulSoup(c.read(), "html.parser")

    if soup_obj is None:
        return None

    is_last_page = is_last_page_by_soup(soup_obj)

    # print soup_obj.prettify()

    # parse the soup and get the info tags; find_all returns an empty
    # list (never None) when nothing matches, so an empty page simply
    # falls through to the final return
    all_div = soup_obj.find_all("div", class_="wx-rb bg-blue wx-rb_v1 _item")

    for info_div in all_div:

        # store all the info by single tag
        weibo_info = get_info_by_tag(info_div, keywords)

        # if the matching weibo_id is found, store the info and return
        # early; guard against get_info_by_tag returning None
        if weibo_info is not None and weibo_info['weibo_id'] == weibo_id:
            print "The weibo_id has been found, set is_existed to be True"
            is_existed = True
            existed_account_info = weibo_info

            return (is_existed, existed_account_info, is_last_page)

    return (is_existed, existed_account_info, is_last_page)
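A hedged usage sketch of how this page-level check might be driven across nav pages until the account turns up or the last page is reached; find_account and max_pages are illustrative names, not from the source:

def find_account(keywords, weibo_id, max_pages=10):
    # Walk the nav pages one by one, stopping early on a hit or last page.
    for page_num in range(1, max_pages + 1):
        result = get_info_by_single_nav_page(page_num, keywords, weibo_id)
        if result is None:
            return None          # connection or parse failure
        is_existed, account_info, is_last_page = result
        if is_existed:
            return account_info
        if is_last_page:
            break
    return None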
Example #3
def get_weixinsogou_tweet_info_by_openid(open_id):
    """
    Return a string, which is the tweet_info of a single open_id in sogou.com
    """
    sogou_tweet_info_content = None

    tweet_info_url = get_url_info_from_sogou(open_id)

    # connect to the website with a proxy IP and a faked user-agent
    con_tweet_info = get_connect_by_proxyip_ua(tweet_info_url)

    if con_tweet_info is not None:
        sogou_tweet_info_content = con_tweet_info.read()

    return sogou_tweet_info_content
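Example #1 above builds directly on this helper. A minimal usage sketch; the open_id value is a placeholder, and get_tweet_urls_by_content is the parsing helper used in Example #1:

content = get_weixinsogou_tweet_info_by_openid('EXAMPLE_OPEN_ID')
if content is not None:
    tweet_urls = get_tweet_urls_by_content(content)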
Example #4

def get_single_account_info_by_homepage_url(url):
    """
    Return a tuple of 2 items
    The first one indicates whether the weibo_id was found
    The second is all the info found for the weibo_id
    Return None if the page cannot be fetched or parsed
    """

    c = get_connect_by_proxyip_ua(url)

    if c is None:
        return None

    soup_obj = BeautifulSoup(c.read(), "html.parser")

    if soup_obj is None:
        return None

    account_info = get_info_by_homepage_soup(soup_obj, url)
    is_existed = True

    return (is_existed, account_info)
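A hedged usage sketch; the homepage URL is a placeholder shaped like a Sogou account page, and the unpacking mirrors the tuple documented above:

result = get_single_account_info_by_homepage_url(
    'http://weixin.sogou.com/gzh?openid=EXAMPLE_OPENID')
if result is not None:
    is_existed, account_info = result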
Example #5

def get_new_account_info_by_single_nav_page(page_num, keyword):
    """
    Return a tuple of 2 items
    The first is a list with the info of each account found on the page
    The second is a bool indicating whether this is the last nav page
    """

    single_nav_info_list = []

    # get the url by keywords
    tmp_nav_url = get_nav_page_url_by_keywords(keyword)
    nav_url = tmp_nav_url[0] + str(page_num) + tmp_nav_url[1]

    print "nav_url ->  ", nav_url
    # connect to the website
    c = get_connect_by_proxyip_ua(nav_url)

    if c is None:
        return None

    # build soup
    soup_obj = BeautifulSoup(c.read(), "html.parser")

    if soup_obj is None:
        return None

    is_last_page = is_last_page_by_soup(soup_obj)

    # print soup_obj.prettify()

    # parse the soup and get the info tags; find_all returns an empty
    # list (never None) when nothing matches, so an empty page yields
    # an empty result list
    all_divs = soup_obj.find_all("div", class_="wx-rb bg-blue wx-rb_v1 _item")

    for info_div in all_divs:

        # store all the info by single tag
        weibo_info = get_info_by_tag(info_div, keyword)

        if weibo_info is not None:
            single_nav_info_list.append(weibo_info)

    return (single_nav_info_list, is_last_page)
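A hedged driver sketch that collects every account for a keyword by paging until is_last_page; collect_accounts and max_pages are illustrative names, not from the source, and sleep_for_a_while_small is the throttling helper used in Example #1:

def collect_accounts(keyword, max_pages=20):
    # Accumulate account info across all nav pages for one keyword.
    all_info = []
    for page_num in range(1, max_pages + 1):
        result = get_new_account_info_by_single_nav_page(page_num, keyword)
        if result is None:
            break                # give up on connection/parse failure
        page_info, is_last_page = result
        all_info.extend(page_info)
        if is_last_page:
            break
        sleep_for_a_while_small()
    return all_info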