Beispiel #1
0
def members_html(url):
    """Return the rendered HTML of the page at *url*.

    A browser session is obtained from ``driver_facebook()``, pointed at
    *url*, and handed to ``execute_times(driver, 2000)`` before the page
    source is captured.
    NOTE(review): ``execute_times`` presumably scrolls the page so more
    members are loaded -- confirm against the helper.

    The driver is closed even when navigation or the helper raises, so a
    failed run does not leave a browser window behind.

    :param url: address of the page to load.
    :return: the page source as reported by the driver.
    """
    driver = driver_facebook()
    try:
        driver.get(url)
        execute_times(driver, 2000)
        return driver.page_source
    finally:
        # Always release the browser, including on the error path.
        driver.close()
Beispiel #2
0
def posts_index():
    """Fetch the group's post index page, save it to disk and return it.

    Loads the hard-coded group URL, waits two seconds, runs
    ``execute_times(driver, 2000)`` and captures the page source. The HTML
    is written to ``posts_index.html`` (UTF-8) in the current working
    directory.

    The driver is closed once the page source has been captured (or if
    capturing fails), matching the cleanup done in ``members_html``.

    :return: the page source as reported by the driver.
    """
    driver = driver_facebook()
    try:
        driver.get(
            "https://www.facebook.com/groups/southmongoliasupport//?ref=direct")
        time.sleep(2)  # give the page a moment to render before interacting
        execute_times(driver, 2000)
        posts_html = driver.page_source
    finally:
        # The browser is not needed after the HTML has been captured.
        driver.close()

    with open("posts_index.html", "w", encoding='utf-8') as f:
        f.write(posts_html)
    log('posts_html 写入文件夹')
    return posts_html
Beispiel #3
0
def main():
    """Download a profile's friends page, save it, and parse it.

    Loads the hard-coded friends-list URL, runs
    ``execute_times(driver, 70)``, writes the captured HTML to
    ``friends_all.html`` (UTF-8) and passes it to ``parse_index``.

    The driver is closed as soon as the page source has been captured
    (or if capturing fails) so the browser is not leaked.
    """
    driver = driver_facebook()
    try:
        time.sleep(2)
        driver.get(
            'https://www.facebook.com/profile.php?id=100018160331338&lst=100005036989194%3A100018160331338%3A1529916881&sk=friends&source_ref=pb_friends_tl'
        )
        time.sleep(2)
        execute_times(driver, 70)
        html = driver.page_source
    finally:
        # Nothing below needs the live browser, so release it here.
        driver.close()

    # Keep a copy on disk so the parsing step can be re-run offline.
    with open("friends_all.html", 'w', encoding='utf-8') as f:
        f.write(html)

    all_url = parse_index(html)
Beispiel #4
0
def parse_url(url_dict):
    """Visit each profile link and store the extracted data in ``urun['test']``.

    For every entry the page at ``entry['link']`` is loaded, its HTML is
    handed to ``personal_data`` and the resulting fields are inserted as
    one document. A failure on one entry is logged and the loop moves on
    to the next entry.

    Processing stops after the first successfully stored entry whose
    index is 10 or higher.

    The driver is closed when the loop ends, whether normally, via the
    limit, or because of an unexpected error.

    :param url_dict: iterable of mappings providing ``'link'`` and
        ``'name'`` (despite the name, it is iterated like a sequence).
    """
    driver = driver_facebook()
    try:
        for count, u in enumerate(url_dict):
            try:
                link = u.get('link')
                name = u.get('name')
                log("begin name{}".format(name))
                driver.get(link)
                time.sleep(1)
                index_html = driver.page_source

                post = personal_data(index_html)

                post.account_name = name
                post.home_page = link

                # NOTE(review): if ``urun`` is a PyMongo database,
                # ``Collection.insert`` is deprecated in favour of
                # ``insert_one`` -- confirm before upgrading the driver.
                urun['test'].insert(
                    {
                        "account_name": post.account_name,
                        'home_page': post.home_page,
                        'location': post.location,
                        'come_form': post.come_form,
                        "job": post.job,
                        'followers': post.followers,
                        "degree": post.degree,
                        "sex": post.sex,
                        "is_get": True
                    }
                )

                log("insert {} sucessful".format(post.account_name))
                # Random pause between profiles.
                time.sleep(randint(2, 5))
                if count >= 10:
                    break
            except Exception as e:
                # Best effort: report the failing entry and keep going.
                log(count, e)
    finally:
        # Release the browser on every exit path.
        driver.close()
Beispiel #5
0
def parse_members_url(url_dict):
    """Scrape each member's profile page and store it in ``urun['test']``.

    For every entry the page at ``entry['url']`` is loaded and two things
    are extracted from its HTML with regular expressions:

    * the sex, guessed from whether the add-friend / follow snippet
      contains the pronoun '他' or '她';
    * the intro block (``<div id="intro_container_id">``), whose text
      lines are sorted into degree, location, origin, followers and job
      by keyword.

    Entries whose intro block cannot be found are counted, logged and
    skipped. Any other failure on an entry is logged and the loop moves
    on. The driver is closed when the loop finishes or fails.

    :param url_dict: iterable of mappings providing ``'url'`` and
        ``'name'`` (despite the name, it is iterated like a sequence).
    """
    driver = driver_facebook()
    error_count = 0
    try:
        for count, u in enumerate(url_dict):
            link = u.get('url')
            name = u.get('name')
            log("begin {} : {}".format(count, name))
            try:
                driver.get(link)
                time.sleep(2)
                post = MembersData()
                index_html = driver.page_source

                # Prefer the add-friend text; fall back to the follow link.
                data_sex = re.findall(
                    r'"addFriendText".*?<', index_html) or re.findall(
                        r'<span class="FollowLink">.*?</span>', index_html)
                log(data_sex)
                if data_sex:
                    if '他' in data_sex[0]:
                        post.sex = 'man'
                    if "她" in data_sex[0]:
                        post.sex = "woman"

                profile = re.findall(
                    r'<div id="intro_container_id">.*?</ul></div>', index_html)
                if not profile:
                    # No intro block: record it and skip this member rather
                    # than falling through to an IndexError on profile[0].
                    error_count += 1
                    log("error {} : {} {}".format(error_count, count, link))
                    continue

                intro = pq(profile[0])
                all_profile = intro.text()
                log(all_profile)

                for item in all_profile.split("\n"):
                    # The first matching line wins for degree and job;
                    # the other fields keep the last matching line.
                    if ("曾经" in item or '就读于' in item) and post.degree == '':
                        post.degree = item
                    elif "所在地" in item:
                        post.location = item
                    elif "来自" in item:
                        post.come_form = item
                    elif "粉丝" in item:
                        post.followers = item
                    elif "-" in item and post.job == '' and '曾经' not in item:
                        post.job = item

                post.account_name = name
                post.home_page = link
                log("post", post)
                urun['test'].insert({
                    "account_name": post.account_name,
                    'home_page': post.home_page,
                    'location': post.location,
                    'come_form': post.come_form,
                    "job": post.job,
                    'followers': post.followers,
                    "degree": post.degree,
                    "sex": post.sex,
                    "is_get": True,
                })
                log("insert {} sucessful".format(name))
            except Exception as e:
                # Best effort: report the failing member and keep going.
                log(count, name, e)
    finally:
        # Release the browser on every exit path.
        driver.close()