def prepare_scraper(sessions, site_name, item):
    api_type = item["api_type"]
    api_array = item["api_array"]
    link = api_array["api_link"]
    locations = api_array["media_types"]
    username = api_array["username"]
    directory = api_array["directory"]
    api_count = api_array["post_count"]
    master_set = []
    media_set = []
    metadata_set = []
    pool = ThreadPool()  # thread pool used to scrape pages in parallel
    formatted_directories = main_helper.format_directories(
        j_directory, site_name, username, locations, api_type)
    model_directory = formatted_directories["model_directory"]
    metadata_directory = formatted_directories["metadata_directory"]
    api_directory = formatted_directories["api_directory"]
    if api_type == "Posts":
        # Build one API link per page of 100 posts by rewriting the offset.
        ceil = math.ceil(api_count / 100)
        a = list(range(ceil))
        for b in a:
            b = b * 100
            master_set.append(link.replace("offset=0", "offset=" + str(b)))
    if api_type == "Archived":
        ceil = math.ceil(api_count / 100)
        a = list(range(ceil))
        for b in a:
            b = b * 100
            master_set.append(link.replace("offset=0", "offset=" + str(b)))
    if api_type == "Stories":
        master_set.append(link)
    if api_type == "Highlights":
        r = main_helper.json_request(sessions[0], link)
        if "error" in r:
            return
        for item in r["list"]:
            link2 = "https://stars.avn.com/api2/v2/stories/collections/" + \
                str(item["id"])
            master_set.append(link2)
    master_set2 = main_helper.assign_session(master_set, sessions)
    media_set = {}
    media_set["set"] = []
    media_set["found"] = False
    count = len(master_set2)
    max_attempts = 100
    # Retry until every page returns data or the attempt budget runs out.
    for attempt in list(range(max_attempts)):
        print("Scrape Attempt: " + str(attempt + 1) + "/" + str(max_attempts))
        media_set2 = pool.starmap(media_scraper, product(
            master_set2, [sessions], [formatted_directories], [username], [api_type]))
        media_set["set"].extend(media_set2)
        faulty = [x for x in media_set2 if not x]
        if not faulty:
            print("Found: " + api_type)
            media_set["found"] = True
            break
        else:
            if count < 2:
                break
            num = len(faulty) * 100
            print("Missing " + str(num) + " Posts... Retrying...")
            master_set2 = main_helper.restore_missing_data(
                master_set2, media_set2)
    if not media_set["found"]:
        print("No " + api_type + " Found.")
    media_set = media_set["set"]
    main_helper.delete_empty_directories(api_directory)
    media_set = [x for x in media_set]
    media_set = main_helper.format_media_set(media_set)
    metadata_set = media_set
    if export_metadata:
        metadata_set = [x for x in metadata_set if x["valid"] or x["invalid"]]
        for item in metadata_set:
            if item["valid"] or item["invalid"]:
                legacy_metadata = formatted_directories["legacy_metadata"]
                if delete_legacy_metadata:
                    if os.path.isdir(legacy_metadata):
                        shutil.rmtree(legacy_metadata)
        if metadata_set:
            os.makedirs(metadata_directory, exist_ok=True)
            archive_directory = os.path.join(metadata_directory, api_type)
            metadata_set_copy = copy.deepcopy(metadata_set)
            metadata_set = main_helper.filter_metadata(metadata_set_copy)
            main_helper.export_archive(
                metadata_set, archive_directory, json_settings)
    return [media_set, directory]
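# Illustrative only: a minimal sketch of the `item` argument that prepare_scraper
# above consumes. The key names mirror the lookups in the function body; the
# values (link, media types, username, paths, counts) are made-up placeholders,
# not data from the real API.
example_item = {
    "api_type": "Posts",
    "api_array": {
        "api_link": "https://stars.avn.com/api2/v2/users/0/posts?limit=100&offset=0",  # hypothetical link
        "media_types": ["Images", "Videos", "Audios", "Texts"],  # assumed labels
        "username": "example_user",
        "directory": ".sites/StarsAVN/example_user",  # hypothetical path
        "post_count": 250,
    },
}
# result = prepare_scraper(sessions, "StarsAVN", example_item)  # assumes `sessions` already exists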
def prepare_scraper(sessions, site_name, item):
    api_type = item["api_type"]
    api_array = item["api_array"]
    link = api_array["api_link"]
    locations = api_array["media_types"]
    username = api_array["username"]
    directory = api_array["directory"]
    api_count = api_array["post_count"]
    master_set = []
    media_set = []
    metadata_set = []
    pool = ThreadPool()
    formatted_directories = main_helper.format_directories(
        j_directory, site_name, username, locations, api_type)
    model_directory = formatted_directories["model_directory"]
    api_directory = formatted_directories["api_directory"]
    metadata_directory = formatted_directories["metadata_directory"]
    legacy_metadata_directory = os.path.join(api_directory, "Metadata")
    # legacy_metadata = main_helper.legacy_metadata(legacy_metadata_directory)
    if api_type == "Profile":
        profile_scraper(link, sessions[0], directory, username)
        return
    if api_type == "Posts":
        # Build one API link per page of `num` posts by rewriting the offset.
        num = 100
        link = link.replace("limit=0", "limit=" + str(num))
        original_link = link
        ceil = math.ceil(api_count / num)
        a = list(range(ceil))
        for b in a:
            b = b * num
            master_set.append(link.replace(
                "offset=0", "offset=" + str(b)))
    if api_type == "Archived":
        ceil = math.ceil(api_count / 100)
        a = list(range(ceil))
        for b in a:
            b = b * 100
            master_set.append(link.replace(
                "offset=0", "offset=" + str(b)))

    def xmessages(link):
        # Walk the paginated messages endpoint until "hasMore" is false.
        f_offset_count = 0
        while True:
            y = main_helper.json_request(sessions[0], link)
            if not y:
                return
            if "list" in y:
                if y["list"]:
                    master_set.append(link)
                    if y["hasMore"]:
                        f_offset_count2 = f_offset_count + 100
                        f_offset_count = f_offset_count2 - 100
                        link = link.replace(
                            "offset=" + str(f_offset_count),
                            "offset=" + str(f_offset_count2))
                        f_offset_count = f_offset_count2
                    else:
                        break
                else:
                    break
            else:
                break

    def process_chats(subscriber):
        fool = subscriber["withUser"]
        fool_id = str(fool["id"])
        link_2 = f"https://onlyfans.com/api2/v2/chats/{fool_id}/messages?limit=100&offset=0&order=desc&app-token={app_token}"
        xmessages(link_2)
    if api_type == "Messages":
        xmessages(link)
    if api_type == "Mass Messages":
        results = []
        max_threads = multiprocessing.cpu_count()
        offset_count = 0
        offset_count2 = max_threads
        while True:
            def process_messages(link, session):
                y = main_helper.json_request(session, link)
                if y and "error" not in y:
                    return y
                else:
                    return []
            link_list = [link.replace(
                "offset=0", "offset=" + str(i * 30)) for i in range(offset_count, offset_count2)]
            link_list = pool.starmap(process_messages, product(
                link_list, [sessions[0]]))
            if all(not result for result in link_list):
                break
            link_list2 = list(chain(*link_list))
            results.append(link_list2)
            offset_count = offset_count2
            offset_count2 = offset_count * 2
        unsorted_messages = list(chain(*results))
        unsorted_messages.sort(key=lambda x: x["id"])
        messages = unsorted_messages

        def process_mass_messages(message, limit):
            text = message["textCropped"].replace("&", "")
            link_2 = "https://onlyfans.com/api2/v2/chats?limit=" + limit + "&offset=0&filter=&order=activity&query=" + \
                text + "&app-token=" + app_token
            y = main_helper.json_request(sessions[0], link_2)
            if y is None or "error" in y:
                return []
            return y
        limit = "10"
        if len(messages) > 99:
            limit = "2"
        subscribers = pool.starmap(process_mass_messages, product(
            messages, [limit]))
        subscribers = filter(None, subscribers)
        subscribers = [
            item for sublist in subscribers for item in sublist["list"]]
        # Deduplicate subscribers by user id while preserving order.
        seen = set()
        subscribers = [x for x in subscribers if x["withUser"]["id"] not in seen
                       and not seen.add(x["withUser"]["id"])]
        x = pool.starmap(process_chats, product(subscribers))
    if api_type == "Stories":
        master_set.append(link)
    if api_type == "Highlights":
        r = main_helper.json_request(sessions[0], link)
        if "error" in r:
            return
        for item in r:
            link2 = f"https://onlyfans.com/api2/v2/stories/highlights/{item['id']}?app-token={app_token}"
            master_set.append(link2)
    master_set2 = main_helper.assign_session(master_set, sessions)
    media_set = []
    count = len(master_set2)
    max_attempts = 100
    # Retry until every page returns data or the attempt budget runs out.
    for attempt in list(range(max_attempts)):
        print("Scrape Attempt: " + str(attempt + 1) + "/" + str(max_attempts))
        media_set2 = pool.starmap(media_scraper, product(
            master_set2, [sessions], [formatted_directories], [username], [api_type]))
        media_set.extend(media_set2)
        if count > 1:
            faulty = [x for x in media_set2 if not x]
            if not faulty:
                print("Found: " + api_type)
                break
            else:
                num = len(faulty) * 100
                print("Missing " + str(num) + " Posts... Retrying...")
                master_set2 = main_helper.restore_missing_data(
                    master_set2, media_set2)
        else:
            print("No " + api_type + " Found.")
            break
    main_helper.delete_empty_directories(api_directory)
    media_set = [x for x in media_set]
    media_set = main_helper.format_media_set(media_set)
    metadata_set = media_set
    if export_metadata:
        metadata_set = [x for x in metadata_set if x["valid"] or x["invalid"]]
        for item in metadata_set:
            if item["valid"] or item["invalid"]:
                legacy_metadata = formatted_directories["legacy_metadata"]
        if metadata_set:
            os.makedirs(metadata_directory, exist_ok=True)
            archive_directory = os.path.join(metadata_directory, api_type)
            metadata_set_copy = copy.deepcopy(metadata_set)
            metadata_set = main_helper.filter_metadata(metadata_set_copy)
            main_helper.export_archive(
                metadata_set, archive_directory, json_settings)
    return [media_set, directory]
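# A standalone sketch of the offset fan-out both prepare_scraper variants rely on:
# a single paginated API link containing "offset=0" is expanded into one link per
# page of `num` posts, and the thread pool scrapes those pages in parallel. The
# helper name and the example link below are placeholders, not part of the module.
import math

def build_offset_links(link, api_count, num=100):
    # e.g. 250 posts with num=100 -> offsets 0, 100, 200
    return [link.replace("offset=0", "offset=" + str(page * num))
            for page in range(math.ceil(api_count / num))]

# build_offset_links("https://example.com/posts?limit=100&offset=0", 250)
# -> ['...offset=0', '...offset=100', '...offset=200']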
def start_datascraper():
    parser = ArgumentParser()
    parser.add_argument("-m", "--metadata", action='store_true',
                        help="only exports metadata")
    parser.add_argument("-n", "--number", default=100000)
    args = parser.parse_args()
    number = int(args.number)
    if args.metadata:
        print("Exporting Metadata Only")
    log_error = main_helper.setup_logger('errors', 'errors.log')
    console = logging.StreamHandler()
    console.setLevel(logging.DEBUG)
    formatter = logging.Formatter(
        '%(asctime)s %(levelname)s %(name)s %(message)s')
    console.setFormatter(formatter)
    logging.getLogger("").addHandler(console)
    # root = os.getcwd()
    config_path = os.path.join('.settings', 'config.json')
    json_config, json_config2 = main_helper.get_config(config_path)
    json_settings = json_config["settings"]
    json_sites = json_config["supported"]
    infinite_loop = json_settings["infinite_loop"]
    global_user_agent = json_settings['global_user_agent']
    domain = json_settings["auto_site_choice"]
    path = os.path.join('.settings', 'extra_auth.json')
    extra_auth_config, extra_auth_config2 = main_helper.get_config(path)
    exit_on_completion = json_settings['exit_on_completion']
    loop_timeout = json_settings['loop_timeout']
    main_helper.assign_vars(json_config)
    # Build the site-selection prompt from the supported (non-blacklisted) sites.
    string = "Site: "
    site_names = []
    bl = ["patreon"]
    if not domain:
        site_count = len(json_sites)
        count = 0
        for x in json_sites:
            if x in bl:
                continue
            string += str(count) + " = " + x
            site_names.append(x)
            if count + 1 != site_count:
                string += " | "
            count += 1
        string += "x = Exit"
    try:
        while True:
            if domain:
                site_name = domain
            else:
                print(string)
                x = input()
                if x == "x":
                    break
                x = int(x)
                site_name = site_names[x]
            site_name_lower = site_name.lower()
            json_auth_array = [json_sites[site_name_lower]["auth"]]
            json_site_settings = json_sites[site_name_lower]["settings"]
            auto_scrape_names = json_site_settings["auto_scrape_names"]
            extra_auth_settings = json_sites[site_name_lower][
                "extra_auth_settings"] if "extra_auth_settings" in json_sites[
                    site_name_lower] else {
                "extra_auth": False
            }
            extra_auth = extra_auth_settings["extra_auth"]
            if extra_auth:
                choose_auth = extra_auth_settings["choose_auth"]
                merge_auth = extra_auth_settings["merge_auth"]
                json_auth_array += extra_auth_config["supported"][
                    site_name_lower]["auths"]
                if choose_auth:
                    json_auth_array = main_helper.choose_auth(json_auth_array)
            session_array = []
            x = onlyfans
            subscription_array = []
            legacy = True
            if site_name_lower == "onlyfans":
                legacy = False
                site_name = "OnlyFans"
                subscription_array = []
                auth_count = -1
                for json_auth in json_auth_array:
                    auth_count += 1
                    user_agent = global_user_agent if not json_auth[
                        'user_agent'] else json_auth['user_agent']
                    x = onlyfans
                    x.assign_vars(json_auth, json_config,
                                  json_site_settings, site_name)
                    sessions = x.create_session()
                    if not sessions:
                        print("Unable to create session")
                        continue
                    session = x.create_auth(
                        sessions, user_agent, json_auth, max_auth=1)
                    session_array.append(session)
                    if not session["sessions"]:
                        continue
                    # x.get_paid_posts(session["sessions"][0])
                    cookies = session["sessions"][0].cookies.get_dict()
                    auth_id = cookies["auth_id"]
                    json_auth['auth_id'] = auth_id
                    json_auth['auth_uniq_'] = cookies["auth_uniq_" + auth_id]
                    json_auth['auth_hash'] = cookies["auth_hash"]
                    json_auth['sess'] = cookies["sess"]
                    json_auth['fp'] = cookies["fp"]
                    if json_config != json_config2:
                        main_helper.update_config(json_config)
                    me_api = session["me_api"]
                    array = x.get_subscriptions(
                        session["sessions"][0], session["subscriber_count"],
                        me_api, auth_count)
                    subscription_array += array
                subscription_array = x.format_options(
                    subscription_array, "usernames")
            if site_name_lower == "patreon":
                legacy = False
                site_name = "Patreon"
                subscription_array = []
                auth_count = -1
                x = patreon
                x.assign_vars(json_config, json_site_settings, site_name)
                for json_auth in json_auth_array:
                    auth_count += 1
                    user_agent = global_user_agent if not json_auth[
                        'user_agent'] else json_auth['user_agent']
                    session = x.create_session()
                    session = x.create_auth(session, user_agent, json_auth)
                    session_array.append(session)
                    if not session["session"]:
                        continue
                    cookies = session["session"].cookies.get_dict()
                    json_auth['session_id'] = cookies["session_id"]
                    if json_config != json_config2:
                        main_helper.update_config(json_config)
                    me_api = session["me_api"]
                    array = x.get_subscriptions(session["session"], auth_count)
                    subscription_array += array
                subscription_array = x.format_options(
                    subscription_array, "usernames")
            elif site_name_lower == "starsavn":
                legacy = False
                site_name = "StarsAVN"
                subscription_array = []
                auth_count = -1
                for json_auth in json_auth_array:
                    auth_count += 1
                    user_agent = global_user_agent if not json_auth[
                        'user_agent'] else json_auth['user_agent']
                    x = starsavn
                    x.assign_vars(json_config, json_site_settings, site_name)
                    sessions = x.create_session()
                    if not sessions:
                        print("Unable to create session")
                        continue
                    session = x.create_auth(
                        sessions, user_agent, json_auth, max_auth=1)
                    session_array.append(session)
                    if not session["sessions"]:
                        continue
                    me_api = session["me_api"]
                    array = x.get_subscriptions(
                        session["sessions"][0], session["subscriber_count"],
                        me_api, auth_count)
                    subscription_array += array
                subscription_array = x.format_options(
                    subscription_array, "usernames")
            elif site_name == "fourchan":
                x = fourchan
                site_name = "4Chan"
                x.assign_vars(json_config, json_site_settings, site_name)
                session_array = [x.create_session()]
                array = x.get_subscriptions()
                subscription_array = x.format_options(array)
            elif site_name == "bbwchan":
                x = bbwchan
                site_name = "BBWChan"
                x.assign_vars(json_config, json_site_settings, site_name)
                session_array = [x.create_session()]
                array = x.get_subscriptions()
                subscription_array = x.format_options(array)
            names = subscription_array[0]
            if names:
                print("Names: Username = username | " + subscription_array[1])
                length = len(names) - 1
                if not auto_scrape_names and number == 100000:
                    value = "2"
                    value = input().strip()
                    if value.isdigit():
                        if value == "0":
                            names = names[1:]
                        else:
                            names = [names[int(value)]]
                    else:
                        names = [name for name in names if value in name[1]]
                elif number != 100000 and number - 1 > length:
                    print("Number out of Range")
                    quit()
                elif number != 100000:
                    value = number
                    names = [names[int(value)]]
                else:
                    value = 0
                    names = names[1:]
            else:
                print("There's nothing to scrape.")
                continue
            archive_time = timeit.default_timer()
            download_list = []
            app_token = ""
            for name in names:
                # Extra Auth Support
                if not legacy:
                    json_auth = json_auth_array[name[0]]
                    app_token = json_auth[
                        "app_token"] if "app_token" in json_auth else ""
                    auth_count = name[0]
                    if "session" in session_array[auth_count]:
                        session = session_array[auth_count]["session"]
                    else:
                        session = session_array[auth_count]["sessions"]
                    name = name[-1]
                else:
                    session = session_array[0]["session"]
                main_helper.assign_vars(json_config)
                username = main_helper.parse_links(site_name_lower, name)
                result = x.start_datascraper(
                    session, username, site_name, app_token, choice_type=value)
                if result[0]:
                    download_list.append(result)
            for item in download_list:
                result = item[1]
                if not result["subbed"]:
                    continue
                download = result["download"]
                others = download.others
                if not others:
                    continue
                model_directory = os.path.join(others[0][2], others[0][3])
                if not args.metadata:
                    for arg in others:
                        x.download_media(*arg)
                main_helper.delete_empty_directories(model_directory)
                main_helper.send_webhook(download)
            stop_time = str(
                int(timeit.default_timer() - archive_time) / 60)[:4]
            print('Archive Completed in ' + stop_time + ' Minutes')
            if exit_on_completion:
                print("Now exiting.")
                exit(0)
            elif not infinite_loop:
                print("Input anything to continue")
                input()
            elif loop_timeout:
                print('Pausing scraper for ' + loop_timeout + ' seconds.')
                time.sleep(int(loop_timeout))
    except Exception as e:
        log_error.exception(e)
        input()
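# Illustrative only: the parts of .settings/config.json that start_datascraper
# reads above, written as a Python dict. The key names come straight from the
# lookups in the function; every value is a placeholder, and a real config
# carries more fields than shown here.
example_config = {
    "settings": {
        "auto_site_choice": "",                   # "" prompts for a site; a site name skips the menu
        "global_user_agent": "Mozilla/5.0 ...",   # truncated placeholder
        "infinite_loop": True,
        "loop_timeout": "10",                     # stored as a string; the code calls int() on it
        "exit_on_completion": False,
    },
    "supported": {
        "onlyfans": {
            "auth": {"user_agent": "", "app_token": ""},  # fields consumed per json_auth entry
            "settings": {"auto_scrape_names": False},
        },
    },
}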
def start_datascraper():
    parser = ArgumentParser()
    parser.add_argument("-m", "--metadata", action='store_true',
                        help="only exports metadata")
    args = parser.parse_args()
    if args.metadata:
        print("Exporting Metadata Only")
    log_error = main_helper.setup_logger('errors', 'errors.log')
    console = logging.StreamHandler()
    console.setLevel(logging.DEBUG)
    formatter = logging.Formatter(
        '%(asctime)s %(levelname)s %(name)s %(message)s')
    console.setFormatter(formatter)
    logging.getLogger("").addHandler(console)
    # root = os.getcwd()
    config_path = os.path.join('.settings', 'config.json')
    json_config, json_config2 = main_helper.get_config(config_path)
    json_settings = json_config["settings"]
    json_sites = json_config["supported"]
    infinite_loop = json_settings["infinite_loop"]
    global_user_agent = json_settings['global_user_agent']
    domain = json_settings["auto_site_choice"]
    path = os.path.join('.settings', 'extra_auth.json')
    extra_auth_config, extra_auth_config2 = main_helper.get_config(path)
    exit_on_completion = json_settings['exit_on_completion']
    loop_timeout = json_settings['loop_timeout']
    main_helper.assign_vars(json_config)
    string, site_names = module_chooser(domain, json_sites)
    try:
        while True:
            if domain:
                if site_names:
                    site_name = domain
                else:
                    print(string)
                    continue
            else:
                print(string)
                x = input()
                if x == "x":
                    break
                x = int(x)
                site_name = site_names[x]
            site_name_lower = site_name.lower()
            json_auth_array = [json_sites[site_name_lower]["auth"]]
            json_site_settings = json_sites[site_name_lower]["settings"]
            auto_scrape_names = json_site_settings["auto_scrape_names"]
            extra_auth_settings = json_sites[site_name_lower][
                "extra_auth_settings"] if "extra_auth_settings" in json_sites[
                    site_name_lower] else {
                "extra_auth": False
            }
            extra_auth = extra_auth_settings["extra_auth"]
            if extra_auth:
                choose_auth = extra_auth_settings["choose_auth"]
                merge_auth = extra_auth_settings["merge_auth"]
                json_auth_array += extra_auth_config["supported"][
                    site_name_lower]["auths"]
                if choose_auth:
                    json_auth_array = main_helper.choose_auth(json_auth_array)
            apis = []
            module = m_onlyfans
            subscription_array = []
            legacy = True
            original_sessions = api_helper.create_session(
                settings=json_settings)
            if not original_sessions:
                print("Unable to create session")
                continue
            archive_time = timeit.default_timer()
            if site_name_lower == "onlyfans":
                site_name = "OnlyFans"
                subscription_array = []
                auth_count = -1
                for json_auth in json_auth_array:
                    api = OnlyFans.start(original_sessions)
                    auth_count += 1
                    user_agent = global_user_agent if not json_auth[
                        'user_agent'] else json_auth['user_agent']
                    module = m_onlyfans
                    module.assign_vars(json_auth, json_config,
                                       json_site_settings, site_name)
                    api.set_auth_details(
                        **json_auth, global_user_agent=user_agent)
                    setup = module.account_setup(api)
                    if not setup:
                        continue
                    jobs = json_site_settings["jobs"]
                    if jobs["scrape_names"]:
                        array = module.manage_subscriptions(api, auth_count)
                        subscription_array += array
                    if jobs["scrape_paid_content"]:
                        paid_contents = api.get_paid_content()
                        paid_content = module.paid_content_scraper(api)
                    apis.append(api)
                subscription_list = module.format_options(
                    subscription_array, "usernames")
                x = main_helper.process_names(
                    module, subscription_list, auto_scrape_names,
                    json_auth_array, apis, json_config, site_name_lower,
                    site_name)
            elif site_name_lower == "starsavn":
                site_name = "StarsAVN"
                subscription_array = []
                auth_count = -1
                for json_auth in json_auth_array:
                    sessions = api_helper.copy_sessions(original_sessions)
                    api = StarsAVN.start(sessions)
                    auth_count += 1
                    user_agent = global_user_agent if not json_auth[
                        'user_agent'] else json_auth['user_agent']
                    module = m_starsavn
                    module.assign_vars(json_auth, json_config,
                                       json_site_settings, site_name)
                    api.set_auth_details(
                        **json_auth, global_user_agent=user_agent)
                    setup = module.account_setup(api)
                    if not setup:
                        continue
                    jobs = json_site_settings["jobs"]
                    if jobs["scrape_names"]:
                        array = module.manage_subscriptions(api, auth_count)
                        subscription_array += array
                    if jobs["scrape_paid_content"]:
                        paid_content = module.paid_content_scraper(api)
                    apis.append(api)
                subscription_array = module.format_options(
                    subscription_array, "usernames")
            # Download everything that was queued on each authenticated API.
            for api in apis:
                subscriptions = api.get_subscriptions(refresh=False)
                for subscription in subscriptions:
                    download_info = subscription.download_info
                    if download_info:
                        module.download_media(api, subscription)
                        main_helper.delete_empty_directories(
                            download_info["model_directory"])
                        main_helper.send_webhook(subscription)
            stop_time = str(
                int(timeit.default_timer() - archive_time) / 60)[:4]
            print('Archive Completed in ' + stop_time + ' Minutes')
            if exit_on_completion:
                print("Now exiting.")
                exit(0)
            elif not infinite_loop:
                print("Input anything to continue")
                input()
            elif loop_timeout:
                print('Pausing scraper for ' + loop_timeout + ' seconds.')
                time.sleep(int(loop_timeout))
    except Exception as e:
        log_error.exception(e)
        input()
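# Illustrative only: the newer start_datascraper expects a per-site "jobs" block
# inside the site settings (json_site_settings["jobs"] above). A minimal sketch
# with placeholder values; real settings include more fields:
example_site_settings = {
    "auto_scrape_names": False,
    "jobs": {
        "scrape_names": True,         # gates module.manage_subscriptions()
        "scrape_paid_content": True,  # gates module.paid_content_scraper()
    },
}
# The function itself is driven from the CLI; the script name below is hypothetical:
#   python start_ofd.py --metadata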