def start_datascraper(session, board_name, site_name, link_type, choice_type=None):
    print("Scrape Processing")
    info = link_check(session, board_name)
    if not info["exists"]:
        return [False, info]
    print("Board: " + board_name)
    array = scrape_choice(board_name)
    pool = multiprocessing()
    # Merge the live board threads with the archived ones.
    threads = board_scraper(session, array[0], "")
    archive_threads = board_scraper(session, array[1], "archive")
    threads = threads + archive_threads
    print("Original Count: "+str(len(threads)))
    formatted_directories = main_helper.format_directories(
        j_directory, site_name, board_name)
    model_directory = formatted_directories["model_directory"]
    metadata_directory = formatted_directories["metadata_directory"]
    api_directory = formatted_directories["api_directory"]
    directory = model_directory
    print("Scraping Threads")
    threads = pool.starmap(thread_scraper,
                           product(threads, [board_name], [session], [directory]))
    # Count failed threads before stripping them, otherwise the invalid
    # count is always zero.
    invalid_count = len([x for x in threads if x is None])
    threads = [x for x in threads if x is not None]
    post_count = len(threads)
    print("Valid Count: "+str(post_count))
    print("Invalid Count: "+str(invalid_count))
    print("Downloading Media")
    num = random.randrange(0, 200)
    avatar = f"https://s.4cdn.org/image/title/{num}.png"
    link = info["link"]
    info["download"] = prepare_download.start(
        username=board_name, link=link, image_url=avatar,
        post_count=post_count, webhook=webhook)
    info["download"].others.append([threads, session, directory, board_name])
    # When profile is done scraping, this function will return True
    return [True, info]
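# Sketch: the pool.starmap(..., product(...)) pattern used above fans one
# varying argument out across workers while repeating the constant ones.
# This is a minimal, self-contained illustration; the worker function and
# its arguments are hypothetical, not part of the scraper.
from itertools import product
from multiprocessing.dummy import Pool as ThreadPool


def describe_thread(thread_id, board_name, directory):
    # Each call receives one thread_id plus the shared board_name/directory.
    return f"{board_name}/{thread_id} -> {directory}"


if __name__ == "__main__":
    pool = ThreadPool()
    thread_ids = [101, 102, 103]
    # product(thread_ids, ["g"], ["downloads"]) yields
    # (101, "g", "downloads"), (102, "g", "downloads"), ...
    results = pool.starmap(describe_thread,
                           product(thread_ids, ["g"], ["downloads"]))
    print(results)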
def start_datascraper(api, identifier, site_name, choice_type=None):
    print("Scrape Processing")
    subscription = api.get_subscription(identifier)
    if not subscription:
        return [False, subscription]
    post_count = subscription.postsCount
    user_id = str(subscription.id)
    avatar = subscription.avatar
    username = subscription.username
    link = subscription.link
    formatted_directories = main_helper.format_directories(
        j_directory, site_name, username)
    metadata_directory = formatted_directories["metadata_directory"]
    archive_path = os.path.join(metadata_directory, "Mass Messages.json")
    if subscription.is_me:
        # Resume mass-message collection from the previously exported archive.
        imported = import_archive(archive_path)
        mass_messages = api.get_mass_messages(resume=imported)
        export_archive(mass_messages, archive_path,
                       json_settings, rename=False)
    info = {}
    info["download"] = prepare_download.start(
        username=username, link=link, image_url=avatar,
        post_count=post_count, webhook=webhook)
    print("Name: "+username)
    api_array = scrape_choice(api, subscription)
    api_array = format_options(api_array, "apis")
    apis = api_array[0]
    api_string = api_array[1]
    if not json_settings["auto_scrape_apis"]:
        print("Apis: "+api_string)
        value = int(input().strip())
    else:
        value = 0
    # A choice of 0 keeps every API (the first menu entry is dropped);
    # any other choice keeps only that single entry.
    if value:
        apis = [apis[value]]
    else:
        apis.pop(0)
    for item in apis:
        print("Type: "+item["api_type"])
        only_links = item["api_array"]["only_links"]
        post_count = str(item["api_array"]["post_count"])
        item["api_array"]["username"] = username
        item["api_array"]["subscription"] = subscription
        api_type = item["api_type"]
        results = prepare_scraper(
            api, site_name, item)
    if any(x for x in subscription.scraped):
        subscription.download_info["directory"] = j_directory
        subscription.download_info["model_directory"] = os.path.join(
            j_directory, username)
        subscription.download_info["webhook"] = webhook
    print("Scrape Completed"+"\n")
    return [True, info]
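# Sketch of the menu selection above: a choice of 0 drops the first entry
# and keeps every remaining API, while a non-zero choice keeps only that
# single entry. The option names below are hypothetical.
def select_apis(apis, value):
    apis = list(apis)
    if value:
        return [apis[value]]
    apis.pop(0)
    return apis


if __name__ == "__main__":
    options = ["All", "Posts", "Messages", "Stories"]
    print(select_apis(options, 0))   # ['Posts', 'Messages', 'Stories']
    print(select_apis(options, 2))   # ['Messages']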
def paid_content_scraper(api):
    paid_contents = api.get_paid_content(refresh=False)
    results = []
    for paid_content in paid_contents:
        author = paid_content.get("author")
        author = paid_content.get("fromUser", author)
        subscription = create_subscription(author)
        subscription.sessions = api.sessions
        subscription.download_info["directory"] = j_directory
        username = subscription.username
        model_directory = os.path.join(j_directory, username)
        metadata_folder = os.path.join(model_directory, "Metadata")
        api_type = paid_content["responseType"].capitalize()+"s"
        metadata_path = os.path.join(
            metadata_folder, api_type+".json")
        site_name = "OnlyFans"
        media_type = format_media_types()
        formatted_directories = main_helper.format_directories(
            j_directory, site_name, username, media_type, api_type)
        new_item = media_scraper([paid_content], api,
                                 formatted_directories, username, api_type)
        for directory in new_item["directories"]:
            os.makedirs(directory, exist_ok=True)
        download_metadata = prepare_metadata(new_item).metadata
        subscription.set_scraped(api_type, download_metadata)
        # Round-trip through jsonpickle to get plain, comparable dicts.
        metadata = prepare_metadata(new_item, export=True).metadata
        metadata = jsonpickle.encode(
            metadata, unpicklable=False)
        new_metadata = jsonpickle.decode(metadata)
        old_metadata = import_archive(metadata_path)
        if old_metadata:
            old_metadata = metadata_fixer(directory=metadata_path.replace(
                ".json", ""), metadata_types=old_metadata)
            unrefined = compare_metadata(
                new_metadata, old_metadata, new_chain=True)
            unrefined = prepare_metadata(unrefined, export=True).metadata
            new_metadata = jsonpickle.encode(
                unrefined, unpicklable=False)
            new_metadata = jsonpickle.decode(new_metadata)
        results.append(new_metadata)
        os.makedirs(model_directory, exist_ok=True)
        export_archive(new_metadata, metadata_path, json_settings)
        download_media(api, subscription)
    return results
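# Sketch of the jsonpickle round-trip used above: encoding with
# unpicklable=False serialises an object tree to plain JSON (no py/object
# tags), and decoding that string back yields ordinary dicts/lists that are
# easy to diff and archive. The Media class here is hypothetical.
import jsonpickle


class Media:
    def __init__(self, post_id, filename):
        self.post_id = post_id
        self.filename = filename


if __name__ == "__main__":
    items = [Media(1, "a.jpg"), Media(2, "b.mp4")]
    encoded = jsonpickle.encode(items, unpicklable=False)
    plain = jsonpickle.decode(encoded)
    print(plain)  # [{'post_id': 1, 'filename': 'a.jpg'}, {'post_id': 2, ...}]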
def prepare_scraper(sessions, site_name, item):
    api_type = item["api_type"]
    api_array = item["api_array"]
    link = api_array["api_link"]
    locations = api_array["media_types"]
    username = api_array["username"]
    directory = api_array["directory"]
    api_count = api_array["post_count"]
    master_set = []
    media_set = []
    metadata_set = []
    pool = multiprocessing()
    formatted_directories = main_helper.format_directories(
        j_directory, site_name, username, locations, api_type)
    model_directory = formatted_directories["model_directory"]
    metadata_directory = formatted_directories["metadata_directory"]
    api_directory = formatted_directories["api_directory"]
    if api_type == "Posts":
        ceil = math.ceil(api_count / 100)
        a = list(range(ceil))
        for b in a:
            b = b * 100
            master_set.append(link.replace("offset=0", "offset=" + str(b)))
    if api_type == "Archived":
        ceil = math.ceil(api_count / 100)
        a = list(range(ceil))
        for b in a:
            b = b * 100
            master_set.append(link.replace("offset=0", "offset=" + str(b)))
    if api_type == "Stories":
        master_set.append(link)
    if api_type == "Highlights":
        r = main_helper.json_request(sessions[0], link)
        if "error" in r:
            return
        for item in r["list"]:
            link2 = "https://stars.avn.com/api2/v2/stories/collections/" + \
                str(item["id"])
            master_set.append(link2)
    master_set2 = main_helper.assign_session(master_set, sessions)
    media_set = {}
    media_set["set"] = []
    media_set["found"] = False
    count = len(master_set2)
    max_attempts = 100
    for attempt in list(range(max_attempts)):
        print("Scrape Attempt: " + str(attempt + 1) + "/" + str(max_attempts))
        media_set2 = pool.starmap(
            media_scraper,
            product(master_set2, [sessions], [formatted_directories],
                    [username], [api_type]))
        media_set["set"].extend(media_set2)
        faulty = [x for x in media_set2 if not x]
        if not faulty:
            print("Found: " + api_type)
            media_set["found"] = True
            break
        else:
            if count < 2:
                break
            num = len(faulty) * 100
            print("Missing " + str(num) + " Posts... Retrying...")
            master_set2 = main_helper.restore_missing_data(
                master_set2, media_set2)
    if not media_set["found"]:
        print("No " + api_type + " Found.")
    media_set = media_set["set"]
    main_helper.delete_empty_directories(api_directory)
    media_set = [x for x in media_set]
    media_set = main_helper.format_media_set(media_set)
    metadata_set = media_set
    if export_metadata:
        metadata_set = [x for x in metadata_set if x["valid"] or x["invalid"]]
        for item in metadata_set:
            if item["valid"] or item["invalid"]:
                legacy_metadata = formatted_directories["legacy_metadata"]
                if delete_legacy_metadata:
                    if os.path.isdir(legacy_metadata):
                        shutil.rmtree(legacy_metadata)
        if metadata_set:
            os.makedirs(metadata_directory, exist_ok=True)
            archive_directory = os.path.join(metadata_directory, api_type)
            metadata_set_copy = copy.deepcopy(metadata_set)
            metadata_set = main_helper.filter_metadata(metadata_set_copy)
            main_helper.export_archive(metadata_set, archive_directory,
                                       json_settings)
    return [media_set, directory]
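# Sketch of the offset pagination above: with a page size of 100, the API
# link (which starts at offset=0) is duplicated once per page by rewriting
# the offset query parameter. The base link below is illustrative only.
import math


def build_offset_links(link, api_count, page_size=100):
    pages = math.ceil(api_count / page_size)
    return [link.replace("offset=0", "offset=" + str(page * page_size))
            for page in range(pages)]


if __name__ == "__main__":
    base = "https://example.com/api/posts?limit=100&offset=0"
    for url in build_offset_links(base, 250):
        print(url)  # offset=0, offset=100, offset=200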
def prepare_scraper(sessions, site_name, item):
    api_type = item["api_type"]
    api_array = item["api_array"]
    link = api_array["api_link"]
    locations = api_array["media_types"]
    username = api_array["username"]
    directory = api_array["directory"]
    api_count = api_array["post_count"]
    master_set = []
    media_set = []
    metadata_set = []
    pool = ThreadPool()
    formatted_directories = main_helper.format_directories(
        j_directory, site_name, username, locations, api_type)
    model_directory = formatted_directories["model_directory"]
    api_directory = formatted_directories["api_directory"]
    metadata_directory = formatted_directories["metadata_directory"]
    legacy_metadata_directory = os.path.join(api_directory, "Metadata")
    # legacy_metadata = main_helper.legacy_metadata(legacy_metadata_directory)
    if api_type == "Profile":
        profile_scraper(link, sessions[0], directory, username)
        return
    if api_type == "Posts":
        num = 100
        link = link.replace("limit=0", "limit="+str(num))
        original_link = link
        ceil = math.ceil(api_count / num)
        a = list(range(ceil))
        for b in a:
            b = b * num
            master_set.append(link.replace(
                "offset=0", "offset=" + str(b)))
    if api_type == "Archived":
        ceil = math.ceil(api_count / 100)
        a = list(range(ceil))
        for b in a:
            b = b * 100
            master_set.append(link.replace(
                "offset=0", "offset=" + str(b)))

    def xmessages(link):
        # Walk message pages until the API reports no more results.
        f_offset_count = 0
        while True:
            y = main_helper.json_request(sessions[0], link)
            if not y:
                return
            if "list" in y:
                if y["list"]:
                    master_set.append(link)
                    if y["hasMore"]:
                        f_offset_count2 = f_offset_count+100
                        f_offset_count = f_offset_count2-100
                        link = link.replace(
                            "offset=" + str(f_offset_count),
                            "offset=" + str(f_offset_count2))
                        f_offset_count = f_offset_count2
                    else:
                        break
                else:
                    break
            else:
                break

    def process_chats(subscriber):
        fool = subscriber["withUser"]
        fool_id = str(fool["id"])
        link_2 = f"https://onlyfans.com/api2/v2/chats/{fool_id}/messages?limit=100&offset=0&order=desc&app-token={app_token}"
        xmessages(link_2)

    if api_type == "Messages":
        xmessages(link)
    if api_type == "Mass Messages":
        results = []
        max_threads = multiprocessing.cpu_count()
        offset_count = 0
        offset_count2 = max_threads
        while True:
            def process_messages(link, session):
                y = main_helper.json_request(session, link)
                if y and "error" not in y:
                    return y
                else:
                    return []
            link_list = [link.replace(
                "offset=0", "offset="+str(i*30))
                for i in range(offset_count, offset_count2)]
            link_list = pool.starmap(process_messages, product(
                link_list, [sessions[0]]))
            if all(not result for result in link_list):
                break
            link_list2 = list(chain(*link_list))
            results.append(link_list2)
            # Double the offset window on each pass.
            offset_count = offset_count2
            offset_count2 = offset_count*2
        unsorted_messages = list(chain(*results))
        unsorted_messages.sort(key=lambda x: x["id"])
        messages = unsorted_messages

        def process_mass_messages(message, limit):
            text = message["textCropped"].replace("&", "")
            link_2 = "https://onlyfans.com/api2/v2/chats?limit="+limit+"&offset=0&filter=&order=activity&query=" + \
                text+"&app-token="+app_token
            y = main_helper.json_request(sessions[0], link_2)
            if y is None or "error" in y:
                return []
            return y
        limit = "10"
        if len(messages) > 99:
            limit = "2"
        subscribers = pool.starmap(process_mass_messages, product(
            messages, [limit]))
        subscribers = filter(None, subscribers)
        subscribers = [
            item for sublist in subscribers for item in sublist["list"]]
        # set.add() returns None; the call is only there for its side effect.
        seen = set()
        subscribers = [x for x in subscribers if x["withUser"]
                       ["id"] not in seen and not seen.add(x["withUser"]["id"])]
        pool.starmap(process_chats, product(subscribers))
    if api_type == "Stories":
        master_set.append(link)
    if api_type == "Highlights":
        r = main_helper.json_request(sessions[0], link)
        if "error" in r:
            return
        for item in r:
            link2 = f"https://onlyfans.com/api2/v2/stories/highlights/{item['id']}?app-token={app_token}"
            master_set.append(link2)
    master_set2 = main_helper.assign_session(master_set, sessions)
    media_set = []
    count = len(master_set2)
    max_attempts = 100
    for attempt in list(range(max_attempts)):
        print("Scrape Attempt: "+str(attempt+1)+"/"+str(max_attempts))
        media_set2 = pool.starmap(media_scraper, product(
            master_set2, [sessions], [formatted_directories],
            [username], [api_type]))
        media_set.extend(media_set2)
        if count > 1:
            faulty = [x for x in media_set2 if not x]
            if not faulty:
                print("Found: "+api_type)
                break
            else:
                num = len(faulty)*100
                print("Missing "+str(num)+" Posts... Retrying...")
                master_set2 = main_helper.restore_missing_data(
                    master_set2, media_set2)
        else:
            print("No "+api_type+" Found.")
            break
    main_helper.delete_empty_directories(api_directory)
    media_set = [x for x in media_set]
    media_set = main_helper.format_media_set(media_set)
    metadata_set = media_set
    if export_metadata:
        metadata_set = [x for x in metadata_set if x["valid"] or x["invalid"]]
        for item in metadata_set:
            if item["valid"] or item["invalid"]:
                legacy_metadata = formatted_directories["legacy_metadata"]
        if metadata_set:
            os.makedirs(metadata_directory, exist_ok=True)
            archive_directory = os.path.join(metadata_directory, api_type)
            metadata_set_copy = copy.deepcopy(metadata_set)
            metadata_set = main_helper.filter_metadata(metadata_set_copy)
            main_helper.export_archive(
                metadata_set, archive_directory, json_settings)
    return [media_set, directory]
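# Sketch of the de-duplication idiom above: set.add() returns None, so
# "not seen.add(key)" is always True and exists only for its side effect of
# recording the key; the membership test before it does the filtering.
# The sample records are hypothetical.
if __name__ == "__main__":
    subscribers = [
        {"withUser": {"id": 1}},
        {"withUser": {"id": 2}},
        {"withUser": {"id": 1}},
    ]
    seen = set()
    unique = [x for x in subscribers
              if x["withUser"]["id"] not in seen
              and not seen.add(x["withUser"]["id"])]
    print(unique)  # only the first occurrence of each id is kept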
def prepare_scraper(api, site_name, item):
    authed = api.auth
    sessions = api.sessions
    api_type = item["api_type"]
    api_array = item["api_array"]
    link = api_array["api_link"]
    subscription = api_array["subscription"]
    locations = api_array["media_types"]
    username = api_array["username"]
    directory = api_array["directory"]
    api_count = api_array["post_count"]
    master_set = []
    media_set = []
    metadata_set = []
    pool = multiprocessing()
    formatted_directories = main_helper.format_directories(
        j_directory, site_name, username, locations, api_type)
    model_directory = formatted_directories["model_directory"]
    api_directory = formatted_directories["api_directory"]
    metadata_directory = formatted_directories["metadata_directory"]
    archive_directory = os.path.join(metadata_directory, api_type)
    archive_path = archive_directory+".json"
    imported = import_archive(archive_path)
    legacy_metadata_directory = os.path.join(api_directory, "Metadata")
    if api_type == "Profile":
        profile_scraper(api, directory, username)
        return
    if api_type == "Stories":
        master_set = subscription.get_stories()
        highlights = subscription.get_highlights()
        valid_highlights = []
        for highlight in highlights:
            highlight = subscription.get_highlights(
                hightlight_id=highlight["id"])
            valid_highlights.append(highlight)
        master_set.extend(valid_highlights)
    if api_type == "Posts":
        master_set = subscription.get_posts()
    if api_type == "Archived":
        master_set = subscription.get_archived(api)
    if api_type == "Messages":
        unrefined_set = subscription.get_messages()
        if "list" in unrefined_set:
            unrefined_set = unrefined_set["list"]
        if subscription.is_me:
            mass_messages = authed["mass_messages"]
            unrefined_set2 = process_mass_message(
                api, subscription, metadata_directory, mass_messages)
            unrefined_set += unrefined_set2
        master_set = [unrefined_set]
    master_set2 = master_set
    parent_type = ""
    if "Archived" == api_type:
        # Archived content arrives grouped; scrape each group with its
        # parent type, then flatten the per-group results.
        unrefined_set = []
        for master_set3 in master_set2:
            parent_type = master_set3["type"]
            results = master_set3["results"]
            unrefined_result = pool.starmap(media_scraper, product(
                results, [api], [formatted_directories],
                [username], [api_type], [parent_type]))
            unrefined_set.append(unrefined_result)
        unrefined_set = list(chain(*unrefined_set))
    else:
        unrefined_set = pool.starmap(media_scraper, product(
            master_set2, [api], [formatted_directories],
            [username], [api_type], [parent_type]))
    unrefined_set = [x for x in unrefined_set]
    metadata_set = main_helper.format_media_set(unrefined_set)
    if not metadata_set:
        print("No "+api_type+" Found.")
        delattr(subscription.scraped, api_type)
    if metadata_set:
        if export_metadata:
            os.makedirs(metadata_directory, exist_ok=True)
            old_metadata = metadata_fixer(archive_directory)
            old_metadata_set = prepare_metadata(old_metadata).metadata
            old_metadata_set2 = jsonpickle.encode(
                old_metadata_set, unpicklable=False)
            old_metadata_set2 = jsonpickle.decode(old_metadata_set2)
            metadata_set = compare_metadata(metadata_set, old_metadata_set2)
            metadata_set = prepare_metadata(metadata_set).metadata
            metadata_set2 = jsonpickle.encode(metadata_set, unpicklable=False)
            metadata_set2 = jsonpickle.decode(metadata_set2)
            metadata_set2 = main_helper.filter_metadata(metadata_set2)
            metadata_set2 = legacy_metadata_fixer(
                legacy_metadata_directory, metadata_set2)
            main_helper.export_archive(
                metadata_set2, archive_directory, json_settings,
                legacy_directory=legacy_metadata_directory)
        else:
            metadata_set = prepare_metadata(metadata_set).metadata
        # Re-fetch the subscription before recording what was scraped.
        subscription = api.get_subscription(username)
        subscription.set_scraped(api_type, metadata_set)
    return [subscription.scraped]
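# Sketch of the "Archived" handling above: each archived group carries a
# parent type plus its results; every group is processed separately and the
# per-group outputs are flattened with itertools.chain. The scrape function
# and sample data below are hypothetical.
from itertools import chain


def scrape(result, parent_type):
    return f"{parent_type}:{result}"


if __name__ == "__main__":
    master_set = [
        {"type": "Posts", "results": ["p1", "p2"]},
        {"type": "Stories", "results": ["s1"]},
    ]
    grouped = [[scrape(r, group["type"]) for r in group["results"]]
               for group in master_set]
    flat = list(chain(*grouped))
    print(flat)  # ['Posts:p1', 'Posts:p2', 'Stories:s1']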