Exemple #1
0
def get_info(username):
    """Crawl one profile, save it as JSON and return the extracted data.

    A browser is started from the module-level ``chrome_options`` and
    ``capabilities``. If ``Settings.login_username`` is non-empty the session
    logs in first; then ``extract_information`` is called with
    ``Settings.limit_amount`` and the result is handed to
    ``Datasaver.save_profile_json``.

    Args:
        username: Profile name passed to ``extract_information``.

    Returns:
        The first element returned by ``extract_information``, or an empty
        list when the crawl was interrupted with Ctrl-C before it finished.

    Raises:
        SystemExit: If the browser cannot be started, or if login/extraction
            fails with an ordinary exception (exit status 1).
    """
    try:
        browser = init_chromedriver(chrome_options, capabilities)
    except Exception as exc:
        print(exc)
        sys.exit()

    information = []
    try:
        try:
            if Settings.login_username:
                login(browser, Settings.login_username,
                      Settings.login_password)
            # The second element (list of commenting users) is not used here.
            information, _ = extract_information(
                browser, username, Settings.limit_amount)
        except Exception:
            # Deliberately not a bare ``except``: KeyboardInterrupt must reach
            # the outer handler instead of being reported as a crawl error.
            print("Error with user " + username)
            sys.exit(1)

        Datasaver.save_profile_json(username, information)

    except KeyboardInterrupt:
        print('Aborted...')

    finally:
        # Close the browser even if clearing the cookies fails.
        try:
            browser.delete_all_cookies()
        finally:
            browser.close()

    return information
def main():
    """Command-line entry point: crawl a profile's post links into a database.

    Expected ``sys.argv`` layout: ``[1]`` profile name, ``[2]`` database path,
    ``[3]`` (optional) proxy passed to Chrome as ``--proxy-server``.

    The profile is read with ``extract_user_posts_links``; the profile row is
    stored through ``DatabaseAPI.insert_profile`` and every indexed link
    through ``DatabaseAPI.insert_post``.

    Raises:
        SystemExit: With a usage message when fewer than two arguments are
            given, with no status when the browser cannot be started, and
            with status 1 when crawling or storing fails.
    """
    argv = sys.argv
    if len(argv) < 3:
        sys.exit('- Please provide profile to crawl and DB path!\n')
    user_name, db_path = argv[1], argv[2]
    proxy = argv[3] if len(argv) > 3 else None

    options = Options()
    if proxy is not None:
        options.add_argument('--proxy-server=%s' % proxy)
    for flag in ('--dns-prefetch-disable', '--no-sandbox',
                 '--lang=en-US', '--headless'):
        options.add_argument(flag)
    options.add_experimental_option(
        'prefs', {'intl.accept_languages': 'en-US'})

    caps = DesiredCapabilities.CHROME

    # Pause lengths consumed by the crawler helpers through Settings.
    Settings.sleep_time_between_post_scroll = 3
    Settings.sleep_time_between_comment_loading = 3

    try:
        browser = init_chromedriver(options, caps)
    except Exception as exc:
        print(exc)
        sys.exit()

    try:
        print('Extracting posts links from ' + user_name)
        try:
            profile, links_by_index, previews = extract_user_posts_links(
                browser, user_name, Settings.limit_amount)
            db = DatabaseAPI(db_path)
            db.insert_profile(
                user_name,
                profile['bio'],
                profile['bio_url'],
                profile['alias'],
                profile['num_of_posts'],
                int(profile['followers']['count']),
                int(profile['following']['count']),
                1 if profile['isprivate'] else 0,  # privacy flag stored as 1/0
            )
            for link, index in links_by_index.items():
                db.insert_post(user_name, link, index, previews[link],
                               '', 0, 0)
        except Exception as err:
            print("Error with user '{}': {}".format(user_name, err))
            sys.exit(1)
        else:
            print("\nFinished crawling profile links.")

    except KeyboardInterrupt:
        print('Aborted...')

    finally:
        # Cookies are left untouched here; only the browser window is closed.
        browser.close()
Exemple #3
0
def _fans_to_dataframe(fan_list):
    """Tabulate crawled profiles: one row per user, in insertion order."""
    columns = ['alias', 'private', 'num_posts', 'num_followers',
               'num_following']
    rows = []
    for position, (alias, profile) in enumerate(fan_list.items()):
        row = [
            alias,
            profile['isprivate'],
            profile['num_of_posts'],
            profile['followers']['count'],
            profile['following']['count'],
        ]
        print(position, row)
        rows.append(row)
    # Build the frame once: appending row by row is quadratic, and
    # DataFrame.append no longer exists in pandas >= 2.0.
    return pd.DataFrame(rows, columns=columns)


def find_real_fans(target_user='******'):
    """Crawl every follower of ``target_user`` and export a summary table.

    Followers come from ``grab_followers``. After a 30 second pause a browser
    is started from the module-level ``chrome_options`` / ``capabilities``,
    the session logs in with the ``Settings`` credentials, and each follower
    is passed to ``extract_information`` and saved with
    ``Datasaver.save_profile_json``.

    The collected profiles are written tab-separated and UTF-8 encoded to
    ``real_fans_of_<target_user>.csv``. If the crawl is interrupted with
    Ctrl-C, the profiles collected so far are still tabulated and written.

    Args:
        target_user: Account whose followers are crawled.

    Returns:
        A ``pandas.DataFrame`` with the columns ``alias``, ``private``,
        ``num_posts``, ``num_followers`` and ``num_following``.

    Raises:
        SystemExit: If the browser cannot be started, or if extracting a
            follower fails with an ordinary exception (exit status 1).
    """
    followers_list = grab_followers(target_user)
    sleep(30)

    fan_list = {}
    try:
        browser = init_chromedriver(chrome_options, capabilities)
    except Exception as exc:
        print(exc)
        sys.exit()

    try:
        login(
            browser,
            Settings.login_username,
            Settings.login_password)

        for user in followers_list:
            print('Extracting information from ' + user)
            try:
                information = extract_information(browser, user)
            except Exception:
                # Not BaseException: Ctrl-C must reach the handler below
                # instead of being reported as a crawl error.
                print("Error with user " + user)
                sys.exit(1)

            fan_list[user] = information
            Datasaver.save_profile_json(user, information)
        print("\nFinished.\n")

    except KeyboardInterrupt:
        print('Aborted...')

    finally:
        # Close the browser even if clearing the cookies fails.
        try:
            browser.delete_all_cookies()
        finally:
            browser.close()

    df = _fans_to_dataframe(fan_list)
    df.to_csv('real_fans_of_{}.csv'.format(target_user), sep='\t', encoding='utf-8')
    return df
    def _create_proxy_browser(proxy_ip=None):
        """Start a Chrome driver, optionally configured with a proxy server.

        Args:
            proxy_ip: Value for Chrome's ``--proxy-server`` argument; when
                ``None`` no proxy argument is added.

        Returns:
            Whatever ``init_chromedriver`` returns for the assembled options.

        Raises:
            SystemExit: If ``init_chromedriver`` raises; the error is printed
                first.
        """
        options = Options()

        # The proxy argument, when present, is added ahead of the fixed flags.
        flags = ['--dns-prefetch-disable', '--no-sandbox',
                 '--lang=en-US', '--headless']
        if proxy_ip is not None:
            flags.insert(0, '--proxy-server=%s' % proxy_ip)
        for flag in flags:
            options.add_argument(flag)

        options.add_experimental_option(
            'prefs', {'intl.accept_languages': 'en-US'})

        caps = DesiredCapabilities.CHROME

        try:
            browser = init_chromedriver(options, caps)
        except Exception as exc:
            print(exc)
            sys.exit()
        return browser
Exemple #5
0
# Browser preferences for this crawl.
# NOTE(review): the value 2 for the images setting presumably blocks image
# loading, and 'disk-cache-size' presumably caps the disk cache -- confirm
# against the Chrome preference documentation.
prefs = {
    'profile.managed_default_content_settings.images': 2,
    'disk-cache-size': 4096
}
# NOTE(review): this call targets `chromeOptions`, while every other call in
# this snippet uses `chrome_options`; neither is defined in the visible code.
# Confirm whether they are the same object. If they are, the second 'prefs'
# call further down presumably replaces this dict -- verify against Selenium's
# add_experimental_option before relying on these preferences.
chromeOptions.add_experimental_option("prefs", prefs)
# Fixed Chrome command-line flags, including headless mode and an en-US locale.
chrome_options.add_argument('--dns-prefetch-disable')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--lang=en-US')
chrome_options.add_argument('--headless')
# Second 'prefs' option on `chrome_options` (see the review note above).
chrome_options.add_experimental_option('prefs',
                                       {'intl.accept_languages': 'en-US'})

capabilities = DesiredCapabilities.CHROME

# Start the browser; on any failure print the error and terminate the script.
try:
    browser = init_chromedriver(chrome_options, capabilities)
except Exception as exc:
    print(exc)
    sys.exit()

try:
    usernames = get_all_user_names()

    for username in usernames:
        print('Extracting information from ' + username)
        information = []
        user_commented_list = []
        try:
            if len(Settings.login_username) != 0:
                login(browser, Settings.login_username,
                      Settings.login_password)