Example 1
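# NOTE: This excerpt appears to assume imports and globals defined elsewhere
# in the module (not shown here): math, os, shutil, copy, itertools.product,
# a thread pool class such as multiprocessing.pool.ThreadPool, the project's
# main_helper module, and globals like j_directory, export_metadata,
# delete_legacy_metadata, and json_settings populated via assign_vars().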
def prepare_scraper(sessions, site_name, item):
    api_type = item["api_type"]
    api_array = item["api_array"]
    link = api_array["api_link"]
    locations = api_array["media_types"]
    username = api_array["username"]
    directory = api_array["directory"]
    api_count = api_array["post_count"]
    master_set = []
    media_set = []
    metadata_set = []
    pool = ThreadPool()  # was multiprocessing(), which is a module and not callable
    formatted_directories = main_helper.format_directories(
        j_directory, site_name, username, locations, api_type)
    model_directory = formatted_directories["model_directory"]
    metadata_directory = formatted_directories["metadata_directory"]
    api_directory = formatted_directories["api_directory"]
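    # Paginate by rewriting the offset query parameter: one URL per page of
    # 100 items, enough pages to cover api_count posts in total.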
    if api_type in ("Posts", "Archived"):
        ceil = math.ceil(api_count / 100)
        for b in range(ceil):
            offset = b * 100
            master_set.append(link.replace("offset=0", "offset=" + str(offset)))
    if api_type == "Stories":
        master_set.append(link)
    if api_type == "Highlights":
        r = main_helper.json_request(sessions[0], link)
        if not r or "error" in r:
            return
        for item in r["list"]:
            link2 = "https://stars.avn.com/api2/v2/stories/collections/" + \
                str(item["id"])
            master_set.append(link2)
    master_set2 = main_helper.assign_session(master_set, sessions)
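    # assign_session presumably pairs each URL with one of the available
    # sessions so requests are spread across them (name-based assumption).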
    media_set = {"set": [], "found": False}
    count = len(master_set2)
    max_attempts = 100
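    # Retry loop: scrape every page, treat empty results as "faulty", and
    # ask restore_missing_data to re-queue just the failed pages.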
    for attempt in range(max_attempts):
        print(f"Scrape Attempt: {attempt + 1}/{max_attempts}")
        media_set2 = pool.starmap(
            media_scraper,
            product(master_set2, [sessions], [formatted_directories],
                    [username], [api_type]))
        media_set["set"].extend(media_set2)
        faulty = [x for x in media_set2 if not x]
        if not faulty:
            print("Found: " + api_type)
            media_set["found"] = True
            break
        else:
            if count < 2:
                break
            num = len(faulty) * 100
            print("Missing " + str(num) + " Posts... Retrying...")
            master_set2 = main_helper.restore_missing_data(
                master_set2, media_set2)
    if not media_set["found"]:
        print("No " + api_type + " Found.")
    media_set = media_set["set"]
    main_helper.delete_empty_directories(api_directory)
    media_set = main_helper.format_media_set(media_set)

    metadata_set = media_set
    if export_metadata:
        metadata_set = [x for x in metadata_set if x["valid"] or x["invalid"]]
        for item in metadata_set:
            if item["valid"] or item["invalid"]:
                legacy_metadata = formatted_directories["legacy_metadata"]
                if delete_legacy_metadata:
                    if os.path.isdir(legacy_metadata):
                        shutil.rmtree(legacy_metadata)
        if metadata_set:
            os.makedirs(metadata_directory, exist_ok=True)
            archive_directory = os.path.join(metadata_directory, api_type)
            metadata_set_copy = copy.deepcopy(metadata_set)
            metadata_set = main_helper.filter_metadata(metadata_set_copy)
            main_helper.export_archive(metadata_set, archive_directory,
                                       json_settings)
    return [media_set, directory]
Example 2
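# NOTE: Like Example 1, this excerpt appears to assume module-level imports
# and globals (not shown): math, os, copy, itertools.chain and product,
# multiprocessing, a ThreadPool class, main_helper, and globals such as
# app_token, j_directory, export_metadata, and json_settings.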
def prepare_scraper(sessions, site_name, item):
    api_type = item["api_type"]
    api_array = item["api_array"]
    link = api_array["api_link"]
    locations = api_array["media_types"]
    username = api_array["username"]
    directory = api_array["directory"]
    api_count = api_array["post_count"]
    master_set = []
    media_set = []
    metadata_set = []
    pool = ThreadPool()
    formatted_directories = main_helper.format_directories(
        j_directory, site_name, username, locations, api_type)
    model_directory = formatted_directories["model_directory"]
    api_directory = formatted_directories["api_directory"]
    metadata_directory = formatted_directories["metadata_directory"]
    legacy_metadata_directory = os.path.join(api_directory, "Metadata")
    # legacy_metadata = main_helper.legacy_metadata(legacy_metadata_directory)
    if api_type == "Profile":
        profile_scraper(link, sessions[0], directory, username)
        return
    if api_type == "Posts":
        num = 100
        link = link.replace("limit=0", "limit="+str(num))
        original_link = link
        ceil = math.ceil(api_count / num)
        for b in range(ceil):
            b = b * num
            master_set.append(link.replace(
                "offset=0", "offset=" + str(b)))
    if api_type == "Archived":
        ceil = math.ceil(api_count / 100)
        for b in range(ceil):
            b = b * 100
            master_set.append(link.replace(
                "offset=0", "offset=" + str(b)))

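    # xmessages walks the messages endpoint page by page: keep the link if
    # the page has items, and follow "hasMore" by bumping the offset by 100.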
    def xmessages(link):
        f_offset_count = 0
        while True:
            y = main_helper.json_request(sessions[0], link)
            if not y:
                return
            if "list" in y:
                if y["list"]:
                    master_set.append(link)
                    if y["hasMore"]:
                        f_offset_count2 = f_offset_count + 100
                        link = link.replace(
                            "offset=" + str(f_offset_count),
                            "offset=" + str(f_offset_count2))
                        f_offset_count = f_offset_count2
                    else:
                        break
                else:
                    break
            else:
                break

    def process_chats(subscriber):
        fool = subscriber["withUser"]
        fool_id = str(fool["id"])
        link_2 = f"https://onlyfans.com/api2/v2/chats/{fool_id}/messages?limit=100&offset=0&order=desc&app-token={app_token}"
        xmessages(link_2)
    if api_type == "Messages":
        xmessages(link)
    if api_type == "Mass Messages":
        results = []
        max_threads = multiprocessing.cpu_count()
        offset_count = 0
        offset_count2 = max_threads
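        # process_messages fetches one page; the loop below scans outward
        # with exponentially growing offset windows until an entire batch
        # of requests comes back empty.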
        def process_messages(link, session):
            y = main_helper.json_request(session, link)
            if y and "error" not in y:
                return y
            return []

        while True:
            link_list = [link.replace(
                "offset=0", "offset="+str(i*30)) for i in range(offset_count, offset_count2)]
            link_list = pool.starmap(process_messages, product(
                link_list, [sessions[0]]))
            if all(not result for result in link_list):
                break
            link_list2 = list(chain(*link_list))

            results.append(link_list2)
            offset_count = offset_count2
            offset_count2 = offset_count*2
        unsorted_messages = list(chain(*results))
        unsorted_messages.sort(key=lambda x: x["id"])
        messages = unsorted_messages

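        # For each mass message, search the chats endpoint by the message
        # text to recover which subscribers received it; limit is kept as a
        # string because it is substituted straight into the URL.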
        def process_mass_messages(message, limit):
            text = message["textCropped"].replace("&", "")
            link_2 = "https://onlyfans.com/api2/v2/chats?limit="+limit+"&offset=0&filter=&order=activity&query=" + \
                text+"&app-token="+app_token
            y = main_helper.json_request(sessions[0], link_2)
            if y is None or "error" in y:
                return []
            return y
        limit = "10"
        if len(messages) > 99:
            limit = "2"
        subscribers = pool.starmap(process_mass_messages, product(
            messages, [limit]))
        subscribers = filter(None, subscribers)
        subscribers = [
            item for sublist in subscribers for item in sublist["list"]]
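        # Deduplicate subscribers by user id; set.add() returns None, so the
        # second clause is evaluated only for its side effect.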
        seen = set()
        subscribers = [x for x in subscribers if x["withUser"]
                       ["id"] not in seen and not seen.add(x["withUser"]["id"])]
        x = pool.starmap(process_chats, product(
            subscribers))
    if api_type == "Stories":
        master_set.append(link)
    if api_type == "Highlights":
        r = main_helper.json_request(sessions[0], link)
        if not r or "error" in r:
            return
        for item in r:
            link2 = f"https://onlyfans.com/api2/v2/stories/highlights/{item['id']}?app-token={app_token}"
            master_set.append(link2)
    master_set2 = main_helper.assign_session(master_set, sessions)
    media_set = []
    count = len(master_set2)
    max_attempts = 100
    for attempt in range(max_attempts):
        print(f"Scrape Attempt: {attempt + 1}/{max_attempts}")
        media_set2 = pool.starmap(media_scraper, product(
            master_set2, [sessions], [formatted_directories], [username], [api_type]))
        media_set.extend(media_set2)
        if count > 1:
            faulty = [x for x in media_set2 if not x]
            if not faulty:
                print("Found: "+api_type)
                break
            else:
                num = len(faulty)*100
                print("Missing "+str(num)+" Posts... Retrying...")
                master_set2 = main_helper.restore_missing_data(
                    master_set2, media_set2)
        else:
            print("No "+api_type+" Found.")
            break
    main_helper.delete_empty_directories(api_directory)
    media_set = main_helper.format_media_set(media_set)

    metadata_set = media_set
    if export_metadata:
        metadata_set = [x for x in metadata_set if x["valid"] or x["invalid"]]
        for item in metadata_set:
            if item["valid"] or item["invalid"]:
                legacy_metadata = formatted_directories["legacy_metadata"]
        if metadata_set:
            os.makedirs(metadata_directory, exist_ok=True)
            archive_directory = os.path.join(metadata_directory, api_type)
            metadata_set_copy = copy.deepcopy(metadata_set)
            metadata_set = main_helper.filter_metadata(metadata_set_copy)
            main_helper.export_archive(
                metadata_set, archive_directory, json_settings)
    return [media_set, directory]
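Example 3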
def start_datascraper():
    parser = ArgumentParser()
    parser.add_argument("-m",
                        "--metadata",
                        action='store_true',
                        help="only exports metadata")
    parser.add_argument("-n", "--number", default=100000)
    args = parser.parse_args()
    number = int(args.number)
    if args.metadata:
        print("Exporting Metadata Only")
    log_error = main_helper.setup_logger('errors', 'errors.log')
    console = logging.StreamHandler()
    console.setLevel(logging.DEBUG)
    formatter = logging.Formatter(
        '%(asctime)s %(levelname)s %(name)s %(message)s')
    console.setFormatter(formatter)
    logging.getLogger("").addHandler(console)
    # root = os.getcwd()
    config_path = os.path.join('.settings', 'config.json')
    json_config, json_config2 = main_helper.get_config(config_path)
    json_settings = json_config["settings"]
    json_sites = json_config["supported"]
    infinite_loop = json_settings["infinite_loop"]
    global_user_agent = json_settings['global_user_agent']
    domain = json_settings["auto_site_choice"]
    path = os.path.join('.settings', 'extra_auth.json')
    extra_auth_config, extra_auth_config2 = main_helper.get_config(path)
    exit_on_completion = json_settings['exit_on_completion']
    loop_timeout = json_settings['loop_timeout']
    main_helper.assign_vars(json_config)

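    # Build the interactive site menu, skipping blacklisted modules.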
    string = "Site: "
    site_names = []
    bl = ["patreon"]
    if not domain:
        site_count = len(json_sites)
        count = 0
        for x in json_sites:
            if x in bl:
                continue
            string += str(count) + " = " + x
            site_names.append(x)
            if count + 1 != site_count:
                string += " | "

            count += 1
        string += "x = Exit"

    try:
        while True:
            if domain:
                site_name = domain
            else:
                print(string)
                x = input()
                if x == "x":
                    break
                x = int(x)
                site_name = site_names[x]
            site_name_lower = site_name.lower()

            json_auth_array = [json_sites[site_name_lower]["auth"]]

            json_site_settings = json_sites[site_name_lower]["settings"]
            auto_scrape_names = json_site_settings["auto_scrape_names"]
            extra_auth_settings = json_sites[site_name_lower][
                "extra_auth_settings"] if "extra_auth_settings" in json_sites[
                    site_name_lower] else {
                        "extra_auth": False
                    }
            extra_auth = extra_auth_settings["extra_auth"]
            if extra_auth:
                choose_auth = extra_auth_settings["choose_auth"]
                merge_auth = extra_auth_settings["merge_auth"]
                json_auth_array += extra_auth_config["supported"][
                    site_name_lower]["auths"]
                if choose_auth:
                    json_auth_array = main_helper.choose_auth(json_auth_array)
            session_array = []
            x = onlyfans
            subscription_array = []
            legacy = True
            if site_name_lower == "onlyfans":
                legacy = False
                site_name = "OnlyFans"
                subscription_array = []
                auth_count = -1
                for json_auth in json_auth_array:
                    auth_count += 1
                    user_agent = global_user_agent if not json_auth[
                        'user_agent'] else json_auth['user_agent']

                    x = onlyfans
                    x.assign_vars(json_auth, json_config, json_site_settings,
                                  site_name)
                    sessions = x.create_session()
                    if not sessions:
                        print("Unable to create session")
                        continue
                    session = x.create_auth(sessions,
                                            user_agent,
                                            json_auth,
                                            max_auth=1)
                    session_array.append(session)
                    if not session["sessions"]:
                        continue
                    # x.get_paid_posts(session["sessions"][0])
                    cookies = session["sessions"][0].cookies.get_dict()
                    auth_id = cookies["auth_id"]
                    json_auth['auth_id'] = auth_id
                    json_auth['auth_uniq_'] = cookies["auth_uniq_" + auth_id]
                    json_auth['auth_hash'] = cookies["auth_hash"]
                    json_auth['sess'] = cookies["sess"]
                    json_auth['fp'] = cookies["fp"]
                    if json_config != json_config2:
                        main_helper.update_config(json_config)
                    me_api = session["me_api"]
                    array = x.get_subscriptions(session["sessions"][0],
                                                session["subscriber_count"],
                                                me_api, auth_count)
                    subscription_array += array
                subscription_array = x.format_options(subscription_array,
                                                      "usernames")
            elif site_name_lower == "patreon":
                legacy = False
                site_name = "Patreon"
                subscription_array = []
                auth_count = -1
                x = patreon
                x.assign_vars(json_config, json_site_settings, site_name)
                for json_auth in json_auth_array:
                    auth_count += 1
                    user_agent = global_user_agent if not json_auth[
                        'user_agent'] else json_auth['user_agent']

                    session = x.create_session()
                    session = x.create_auth(session, user_agent, json_auth)
                    session_array.append(session)
                    if not session["session"]:
                        continue
                    cookies = session["session"].cookies.get_dict()
                    json_auth['session_id'] = cookies["session_id"]
                    if json_config != json_config2:
                        main_helper.update_config(json_config)
                    me_api = session["me_api"]
                    array = x.get_subscriptions(session["session"], auth_count)
                    subscription_array += array
                subscription_array = x.format_options(subscription_array,
                                                      "usernames")
            elif site_name_lower == "starsavn":
                legacy = False
                site_name = "StarsAVN"
                subscription_array = []
                auth_count = -1
                for json_auth in json_auth_array:
                    auth_count += 1
                    user_agent = global_user_agent if not json_auth[
                        'user_agent'] else json_auth['user_agent']

                    x = starsavn
                    x.assign_vars(json_config, json_site_settings, site_name)
                    sessions = x.create_session()
                    if not sessions:
                        print("Unable to create session")
                        continue
                    session = x.create_auth(sessions,
                                            user_agent,
                                            json_auth,
                                            max_auth=1)
                    session_array.append(session)
                    if not session["sessions"]:
                        continue

                    me_api = session["me_api"]
                    array = x.get_subscriptions(session["sessions"][0],
                                                session["subscriber_count"],
                                                me_api, auth_count)
                    subscription_array += array
                subscription_array = x.format_options(subscription_array,
                                                      "usernames")
            elif site_name_lower == "fourchan":
                x = fourchan
                site_name = "4Chan"
                x.assign_vars(json_config, json_site_settings, site_name)
                session_array = [x.create_session()]
                array = x.get_subscriptions()
                subscription_array = x.format_options(array)
            elif site_name_lower == "bbwchan":
                x = bbwchan
                site_name = "BBWChan"
                x.assign_vars(json_config, json_site_settings, site_name)
                session_array = [x.create_session()]
                array = x.get_subscriptions()
                subscription_array = x.format_options(array)
            names = subscription_array[0]
            if names:
                print("Names: Username = username | " + subscription_array[1])
                length = len(names) - 1
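                # Selection: "0" scrapes everyone, another digit picks one
                # model, and any other text filters usernames by substring.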
                if not auto_scrape_names and number == 100000:
                    value = "2"
                    value = input().strip()
                    if value.isdigit():
                        if value == "0":
                            names = names[1:]
                        else:
                            names = [names[int(value)]]
                    else:
                        names = [name for name in names if value in name[1]]
                elif number != 100000 and number - 1 > length:
                    print("Number out of Range")
                    quit()

                elif number != 100000:
                    value = number
                    names = [names[int(value)]]
                else:
                    value = 0
                    names = names[1:]
            else:
                print("There's nothing to scrape.")
                continue
            archive_time = timeit.default_timer()
            download_list = []
            app_token = ""
            for name in names:
                # Extra Auth Support
                if not legacy:
                    json_auth = json_auth_array[name[0]]
                    app_token = json_auth[
                        "app_token"] if "app_token" in json_auth else ""
                    auth_count = name[0]
                    if "session" in session_array[auth_count]:
                        session = session_array[auth_count]["session"]
                    else:
                        session = session_array[auth_count]["sessions"]
                    name = name[-1]
                else:
                    session = session_array[0]["session"]
                main_helper.assign_vars(json_config)
                username = main_helper.parse_links(site_name_lower, name)
                result = x.start_datascraper(session,
                                             username,
                                             site_name,
                                             app_token,
                                             choice_type=value)
                if result[0]:
                    download_list.append(result)
            for item in download_list:
                result = item[1]
                if not result["subbed"]:
                    continue
                download = result["download"]
                others = download.others
                if not others:
                    continue
                model_directory = os.path.join(others[0][2], others[0][3])
                if not args.metadata:
                    for arg in others:
                        x.download_media(*arg)
                main_helper.delete_empty_directories(model_directory)
                main_helper.send_webhook(download)
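            # Elapsed minutes as a string truncated to four characters.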
            stop_time = str(int(timeit.default_timer() - archive_time) /
                            60)[:4]
            print('Archive Completed in ' + stop_time + ' Minutes')
            if exit_on_completion:
                print("Now exiting.")
                exit(0)
            elif not infinite_loop:
                print("Input anything to continue")
                input()
            elif loop_timeout:
                print('Pausing scraper for ' + str(loop_timeout) + ' seconds.')
                time.sleep(int(loop_timeout))
    except Exception as e:
        log_error.exception(e)
        input()
Example 4
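# NOTE: This later revision replaces the per-site session plumbing with api
# objects (OnlyFans.start / StarsAVN.start) and an api_helper module; it
# appears to assume those imports plus m_onlyfans, m_starsavn, main_helper,
# logging, timeit, time, os, and ArgumentParser are provided by the module.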
def start_datascraper():
    parser = ArgumentParser()
    parser.add_argument("-m",
                        "--metadata",
                        action='store_true',
                        help="only exports metadata")
    args = parser.parse_args()
    if args.metadata:
        print("Exporting Metadata Only")
    log_error = main_helper.setup_logger('errors', 'errors.log')
    console = logging.StreamHandler()
    console.setLevel(logging.DEBUG)
    formatter = logging.Formatter(
        '%(asctime)s %(levelname)s %(name)s %(message)s')
    console.setFormatter(formatter)
    logging.getLogger("").addHandler(console)
    # root = os.getcwd()
    config_path = os.path.join('.settings', 'config.json')
    json_config, json_config2 = main_helper.get_config(config_path)
    json_settings = json_config["settings"]
    json_sites = json_config["supported"]
    infinite_loop = json_settings["infinite_loop"]
    global_user_agent = json_settings['global_user_agent']
    domain = json_settings["auto_site_choice"]
    path = os.path.join('.settings', 'extra_auth.json')
    extra_auth_config, extra_auth_config2 = main_helper.get_config(path)
    exit_on_completion = json_settings['exit_on_completion']
    loop_timeout = json_settings['loop_timeout']
    main_helper.assign_vars(json_config)

    string, site_names = module_chooser(domain, json_sites)
    try:
        while True:
            if domain:
                if site_names:
                    site_name = domain
                else:
                    print(string)
                    continue
            else:
                print(string)
                x = input()
                if x == "x":
                    break
                x = int(x)
                site_name = site_names[x]
            site_name_lower = site_name.lower()

            json_auth_array = [json_sites[site_name_lower]["auth"]]

            json_site_settings = json_sites[site_name_lower]["settings"]
            auto_scrape_names = json_site_settings["auto_scrape_names"]
            extra_auth_settings = json_sites[site_name_lower][
                "extra_auth_settings"] if "extra_auth_settings" in json_sites[
                    site_name_lower] else {
                        "extra_auth": False
                    }
            extra_auth = extra_auth_settings["extra_auth"]
            if extra_auth:
                choose_auth = extra_auth_settings["choose_auth"]
                merge_auth = extra_auth_settings["merge_auth"]
                json_auth_array += extra_auth_config["supported"][
                    site_name_lower]["auths"]
                if choose_auth:
                    json_auth_array = main_helper.choose_auth(json_auth_array)
            apis = []
            module = m_onlyfans
            subscription_array = []
            legacy = True
            original_sessions = api_helper.create_session(
                settings=json_settings)
            if not original_sessions:
                print("Unable to create session")
                continue
            archive_time = timeit.default_timer()
            if site_name_lower == "onlyfans":
                site_name = "OnlyFans"
                subscription_array = []
                auth_count = -1
                for json_auth in json_auth_array:
                    api = OnlyFans.start(original_sessions)
                    auth_count += 1
                    user_agent = global_user_agent if not json_auth[
                        'user_agent'] else json_auth['user_agent']

                    module = m_onlyfans
                    module.assign_vars(json_auth, json_config,
                                       json_site_settings, site_name)
                    api.set_auth_details(**json_auth,
                                         global_user_agent=user_agent)
                    setup = module.account_setup(api)
                    if not setup:
                        continue
                    jobs = json_site_settings["jobs"]
                    if jobs["scrape_names"]:
                        array = module.manage_subscriptions(api, auth_count)
                        subscription_array += array
                    if jobs["scrape_paid_content"]:
                        paid_contents = api.get_paid_content()
                        paid_content = module.paid_content_scraper(api)
                    apis.append(api)
                subscription_list = module.format_options(
                    subscription_array, "usernames")
                x = main_helper.process_names(module, subscription_list,
                                              auto_scrape_names,
                                              json_auth_array, apis,
                                              json_config, site_name_lower,
                                              site_name)
            elif site_name_lower == "starsavn":
                site_name = "StarsAVN"
                subscription_array = []
                auth_count = -1
                for json_auth in json_auth_array:
                    sessions = api_helper.copy_sessions(original_sessions)
                    api = StarsAVN.start(sessions)
                    auth_count += 1
                    user_agent = global_user_agent if not json_auth[
                        'user_agent'] else json_auth['user_agent']

                    module = m_starsavn
                    module.assign_vars(json_auth, json_config,
                                       json_site_settings, site_name)
                    api.set_auth_details(**json_auth,
                                         global_user_agent=user_agent)
                    setup = module.account_setup(api)
                    if not setup:
                        continue
                    jobs = json_site_settings["jobs"]
                    if jobs["scrape_names"]:
                        array = module.manage_subscriptions(api, auth_count)
                        subscription_array += array
                    if jobs["scrape_paid_content"]:
                        paid_content = module.paid_content_scraper(api)
                    apis.append(api)
                subscription_array = module.format_options(
                    subscription_array, "usernames")
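            # Download phase: each authenticated api object reuses its
            # previously fetched subscriptions (refresh=False suggests a
            # cached list), downloads any that produced download_info, then
            # cleans up empty directories and fires the webhook.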
            for api in apis:
                subscriptions = api.get_subscriptions(refresh=False)
                for subscription in subscriptions:
                    download_info = subscription.download_info
                    if download_info:
                        module.download_media(api, subscription)
                        main_helper.delete_empty_directories(
                            download_info["model_directory"])
                        main_helper.send_webhook(subscription)
            stop_time = str(int(timeit.default_timer() - archive_time) /
                            60)[:4]
            print('Archive Completed in ' + stop_time + ' Minutes')
            if exit_on_completion:
                print("Now exiting.")
                exit(0)
            elif not infinite_loop:
                print("Input anything to continue")
                input()
            elif loop_timeout:
                print('Pausing scraper for ' + str(loop_timeout) + ' seconds.')
                time.sleep(int(loop_timeout))
    except Exception as e:
        log_error.exception(e)
        input()