import copy
import hashlib
import html
import json
import os
from datetime import datetime, timedelta
from itertools import chain, groupby, product

import jsonpickle
from deepdiff import DeepHash
from mergedeep import Strategy, merge

# Project-local names (main_helper, import_archive, export_archive,
# prepare_metadata, compare_metadata, create_metadata, create_auth,
# create_subscription, media_types, format_media_types, media_scraper,
# metadata_fixer, ofrenamer, prepare_download, download_media,
# profile_scraper, scrape_choice, format_options, manage_subscriptions,
# multiprocessing, j_directory, webhook, json_settings,
# json_global_settings, export_metadata, start) are assumed to be
# importable from the surrounding package.


def legacy_metadata_fixer(formatted_directories: dict, api: object) -> media_types:
    legacy_metadatas = formatted_directories["legacy_metadatas"]
    new_metadata_directory = formatted_directories["metadata_directory"]
    old_metadata_directory = os.path.dirname(
        legacy_metadatas["legacy_metadata"])
    metadata_name = os.path.basename(f"{old_metadata_directory}.json")
    q = []
    for key, legacy_directory in legacy_metadatas.items():
        if legacy_directory == formatted_directories["metadata_directory"]:
            continue
        if os.path.exists(legacy_directory):
            folders = os.listdir(legacy_directory)
            metadata_names = [f"{k}.json" for k, v in media_types()]
            type_one_files = main_helper.remove_mandatory_files(
                folders, keep=metadata_names)
            new_format = []
            for type_one_file in type_one_files:
                legacy_metadata_path = os.path.join(
                    legacy_directory, type_one_file)
                legacy_metadata = import_archive(legacy_metadata_path)
                if "type" not in legacy_metadata:
                    legacy_metadata["type"] = type_one_file.removesuffix(".json")
                for key2, status in legacy_metadata.items():
                    if key2 == "type":
                        continue
                    # groupby only merges adjacent keys, so sort first
                    status.sort(key=lambda x: x["post_id"], reverse=False)
                    legacy_metadata[key2] = [
                        list(g)
                        for k, g in groupby(status, key=lambda x: x["post_id"])
                    ]
                new_format.append(legacy_metadata)
            old_metadata_object = prepare_metadata(new_format, api=api).metadata
            if legacy_directory != new_metadata_directory:
                import_path = os.path.join(legacy_directory, metadata_name)
                new_metadata_set = import_archive(import_path)
                if new_metadata_set:
                    new_metadata_object2 = prepare_metadata(
                        new_metadata_set, api=api).metadata
                    old_metadata_object = compare_metadata(
                        new_metadata_object2, old_metadata_object)
            q.append(old_metadata_object)
    results = media_types()
    for merge_into in q:
        results = compare_metadata(results, merge_into)
    return results

def legacy_metadata_fixer(
        formatted_directories: dict, api: object) -> tuple[create_metadata, list]:
    delete_legacy_metadatas = []
    legacy_metadatas = formatted_directories["legacy_metadatas"]
    new_metadata_directory = formatted_directories["metadata_directory"]
    old_metadata_directory = os.path.dirname(
        legacy_metadatas["legacy_metadata"])
    metadata_name = os.path.basename(f"{old_metadata_directory}.json")
    q = []
    for key, legacy_directory in legacy_metadatas.items():
        if legacy_directory == formatted_directories["metadata_directory"]:
            continue
        if os.path.exists(legacy_directory):
            folders = os.listdir(legacy_directory)
            metadata_names = [f"{k}.json" for k, v in media_types()]
            type_one_files = main_helper.remove_mandatory_files(
                folders, keep=metadata_names)
            new_format = []
            for type_one_file in type_one_files:
                api_type = type_one_file.removesuffix(".json")
                legacy_metadata_path = os.path.join(
                    legacy_directory, type_one_file)
                legacy_metadata = import_archive(legacy_metadata_path)
                if legacy_metadata:
                    delete_legacy_metadatas.append(legacy_metadata_path)
                    legacy_metadata = create_metadata(
                        api, legacy_metadata, api_type=api_type).convert()
                    new_format.append(legacy_metadata)
            # Additively merge the per-api_type dicts (list values concatenate)
            new_format = dict(merge({}, *new_format, strategy=Strategy.ADDITIVE))
            old_metadata_object = create_metadata(api, new_format)
            if legacy_directory != new_metadata_directory:
                import_path = os.path.join(legacy_directory, metadata_name)
                new_metadata_set = import_archive(import_path)
                if new_metadata_set:
                    new_metadata_object2 = create_metadata(api, new_metadata_set)
                    old_metadata_object = compare_metadata(
                        new_metadata_object2, old_metadata_object)
            q.append(old_metadata_object)
    results = create_metadata()
    for merge_into in q:
        results = compare_metadata(results, merge_into)
    return results, delete_legacy_metadatas

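
def _demo_additive_merge():
    """A minimal sketch, not project code: shows how mergedeep's
    Strategy.ADDITIVE (used above to combine the per-api_type legacy
    dicts) concatenates list values instead of replacing them. The keys
    and values here are made up for illustration."""
    a = {"Posts": {"valid": [1, 2]}}
    b = {"Posts": {"valid": [3]}, "Messages": {"valid": [4]}}
    # merge() mutates and returns its first argument; pass {} to keep
    # the sources untouched.
    merged = dict(merge({}, a, b, strategy=Strategy.ADDITIVE))
    assert merged == {"Posts": {"valid": [1, 2, 3]}, "Messages": {"valid": [4]}}
    return merged
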
def process_metadata(api: start, new_metadata, formatted_directories,
                     subscription, api_type, api_path, archive_path, site_name):
    print("Processing metadata.")
    legacy_metadata_object = legacy_metadata_fixer(formatted_directories, api)
    new_metadata_object = create_metadata(
        api, new_metadata, standard_format=True)
    if legacy_metadata_object:
        print("Merging new metadata with legacy metadata.")
        new_metadata_object = compare_metadata(
            new_metadata_object, legacy_metadata_object)
    old_metadata_set = import_archive(archive_path)
    old_metadata_object = create_metadata(
        api, old_metadata_set, api_type=api_type)
    if old_metadata_object:
        print("Merging new metadata with old metadata.")
        new_metadata_object = compare_metadata(
            new_metadata_object, old_metadata_object)
    if not subscription.download_info:
        subscription.download_info["metadata_locations"] = {}
    subscription.download_info["directory"] = j_directory
    subscription.download_info["webhook"] = webhook
    subscription.download_info["metadata_locations"][api_type] = archive_path
    subscription.set_scraped(api_type, new_metadata_object)
    print("Renaming files.")
    new_metadata_object = ofrenamer.start(
        subscription, api_type, api_path, site_name, json_settings)
    subscription.set_scraped(api_type, new_metadata_object)
    print("Finished processing metadata.")
    return new_metadata_object

def start_datascraper(api, identifier, site_name, choice_type=None):
    print("Scrape Processing")
    subscription = api.get_subscription(identifier)
    if not subscription:
        return [False, subscription]
    post_count = subscription.postsCount
    user_id = str(subscription.id)
    avatar = subscription.avatar
    username = subscription.username
    link = subscription.link
    formatted_directories = main_helper.format_directories(
        j_directory, site_name, username)
    metadata_directory = formatted_directories["metadata_directory"]
    archive_path = os.path.join(metadata_directory, "Mass Messages.json")
    if subscription.is_me:
        imported = import_archive(archive_path)
        mass_messages = api.get_mass_messages(resume=imported)
        export_archive(mass_messages, archive_path, json_settings, rename=False)
    info = {}
    info["download"] = prepare_download.start(
        username=username, link=link, image_url=avatar,
        post_count=post_count, webhook=webhook)
    print("Name: " + username)
    api_array = scrape_choice(api, subscription)
    api_array = format_options(api_array, "apis")
    apis = api_array[0]
    api_string = api_array[1]
    if not json_settings["auto_scrape_apis"]:
        print("Apis: " + api_string)
        value = int(input().strip())
    else:
        value = 0
    if value:
        apis = [apis[value]]
    else:
        apis.pop(0)
    for item in apis:
        print("Type: " + item["api_type"])
        only_links = item["api_array"]["only_links"]
        post_count = str(item["api_array"]["post_count"])
        item["api_array"]["username"] = username
        item["api_array"]["subscription"] = subscription
        api_type = item["api_type"]
        results = prepare_scraper(api, site_name, item)
    if any(x for x in subscription.scraped):
        subscription.download_info["directory"] = j_directory
        subscription.download_info["model_directory"] = os.path.join(
            j_directory, username)
        subscription.download_info["webhook"] = webhook
    print("Scrape Completed" + "\n")
    return [True, info]

def paid_content_scraper(api):
    paid_contents = api.get_paid_content(refresh=False)
    results = []
    for paid_content in paid_contents:
        author = paid_content.get("author")
        author = paid_content.get("fromUser", author)
        subscription = create_subscription(author)
        subscription.sessions = api.sessions
        subscription.download_info["directory"] = j_directory
        username = subscription.username
        model_directory = os.path.join(j_directory, username)
        metadata_folder = os.path.join(model_directory, "Metadata")
        api_type = paid_content["responseType"].capitalize() + "s"
        metadata_path = os.path.join(metadata_folder, api_type + ".json")
        site_name = "OnlyFans"
        media_type = format_media_types()
        formatted_directories = main_helper.format_directories(
            j_directory, site_name, username, media_type, api_type)
        new_item = media_scraper([paid_content], api,
                                 formatted_directories, username, api_type)
        for directory in new_item["directories"]:
            os.makedirs(directory, exist_ok=True)
        download_metadata = prepare_metadata(new_item).metadata
        subscription.set_scraped(api_type, download_metadata)
        metadata = prepare_metadata(new_item, export=True).metadata
        # Round-trip through jsonpickle to flatten objects into plain dicts
        metadata = jsonpickle.encode(metadata, unpicklable=False)
        new_metadata = jsonpickle.decode(metadata)
        old_metadata = import_archive(metadata_path)
        if old_metadata:
            old_metadata = metadata_fixer(
                directory=metadata_path.replace(".json", ""),
                metadata_types=old_metadata)
            unrefined = compare_metadata(
                new_metadata, old_metadata, new_chain=True)
            unrefined = prepare_metadata(unrefined, export=True).metadata
            new_metadata = jsonpickle.encode(unrefined, unpicklable=False)
            new_metadata = jsonpickle.decode(new_metadata)
        results.append(new_metadata)
        os.makedirs(model_directory, exist_ok=True)
        export_archive(new_metadata, metadata_path, json_settings)
        download_media(api, subscription)
    return results

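
def _demo_jsonpickle_roundtrip():
    """A minimal sketch, not project code: the encode/decode pairs above
    use jsonpickle to flatten metadata objects into plain structures.
    With unpicklable=False the output drops py/object markers, so
    decode() yields ordinary dicts rather than reconstructed instances.
    The Media class and its fields are hypothetical."""
    class Media:
        def __init__(self):
            self.post_id = 1
            self.links = ["https://example.com/a.jpg"]

    encoded = jsonpickle.encode(Media(), unpicklable=False)
    decoded = jsonpickle.decode(encoded)
    assert decoded == {"post_id": 1, "links": ["https://example.com/a.jpg"]}
    return decoded
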
def account_setup(api):
    status = False
    auth = api.login()
    if auth:
        profile_directory = json_global_settings["profile_directories"][0]
        profile_directory = os.path.abspath(profile_directory)
        profile_directory = os.path.join(profile_directory, auth["username"])
        profile_metadata_directory = os.path.join(profile_directory, "Metadata")
        metadata_filepath = os.path.join(
            profile_metadata_directory, "Mass Messages.json")
        if auth["isPerformer"]:
            imported = import_archive(metadata_filepath)
            mass_messages = api.get_mass_messages(resume=imported)
            export_archive(mass_messages, metadata_filepath, json_settings)
        # chats = api.get_chats()
        subscriptions = api.get_subscriptions()
        status = True
    return status

def process_metadata(api: start, new_metadata, formatted_directories,
                     subscription, api_type, api_path, archive_path, site_name):
    legacy_metadata_object = legacy_metadata_fixer(formatted_directories, api)
    new_metadata_object = prepare_metadata(new_metadata, api=api).metadata
    new_metadata_object = compare_metadata(
        new_metadata_object, legacy_metadata_object)
    old_metadata_set = import_archive(archive_path)
    old_metadata_object = prepare_metadata(old_metadata_set, api=api).metadata
    new_metadata_object = compare_metadata(
        new_metadata_object, old_metadata_object)
    if not subscription.download_info:
        subscription.download_info["metadata_locations"] = {}
    subscription.download_info["directory"] = j_directory
    subscription.download_info["webhook"] = webhook
    subscription.download_info["metadata_locations"][api_type] = archive_path
    subscription.set_scraped(api_type, new_metadata_object)
    new_metadata_object = ofrenamer.start(
        subscription, api_type, api_path, site_name, json_settings)
    subscription.set_scraped(api_type, new_metadata_object)
    return new_metadata_object

def account_setup(api: start, identifiers: list = [], jobs: dict = {}):
    status = False
    subscriptions = []
    authed = api.login()
    if isinstance(authed, create_auth):
        profile_directory = json_global_settings["profile_directories"][0]
        profile_directory = os.path.abspath(profile_directory)
        profile_directory = os.path.join(profile_directory, authed.username)
        profile_metadata_directory = os.path.join(profile_directory, "Metadata")
        metadata_filepath = os.path.join(
            profile_metadata_directory, "Mass Messages.json")
        if authed.isPerformer:
            imported = import_archive(metadata_filepath)
            # Guard against an empty or missing archive before unwrapping
            if imported and "auth" in imported:
                imported = imported["auth"]
            mass_messages = api.get_mass_messages(resume=imported)
            if mass_messages:
                main_helper.export_data(mass_messages, metadata_filepath)
        # chats = api.get_chats()
        if identifiers or jobs["scrape_names"]:
            subscriptions += manage_subscriptions(
                api, -1, identifiers=identifiers)
        # collection = []
        # for subscription in subscriptions:
        #     delattr(subscription, "download_info")
        #     delattr(subscription, "sessions")
        #     delattr(subscription, "scraped")
        #     delattr(subscription, "is_me")
        #     delattr(subscription, "links")
        #     collection.append(subscription)
        # collection = jsonpickle.encode(collection, unpicklable=False)
        # collection = jsonpickle.decode(collection)
        # export_archive(collection, metadata_filepath, json_settings)
        status = True
    return status, subscriptions

def account_setup(api: start, identifier=""): status = False authed = api.login() if isinstance(authed, create_auth): jobs = json_settings["jobs"] profile_directory = json_global_settings["profile_directories"][0] profile_directory = os.path.abspath(profile_directory) profile_directory = os.path.join(profile_directory, authed.username) profile_metadata_directory = os.path.join( profile_directory, "Metadata") metadata_filepath = os.path.join( profile_metadata_directory, "Mass Messages.json") print if authed.isPerformer: imported = import_archive(metadata_filepath) mass_messages = api.get_mass_messages(resume=imported) export_archive(mass_messages, metadata_filepath, json_settings) # chats = api.get_chats() if not identifier and jobs["scrape_names"]: # metadata_filepath = os.path.join( # profile_metadata_directory, "Subscriptions.json") # imported = import_archive(metadata_filepath) subscriptions = api.get_subscriptions() # collection = [] # for subscription in subscriptions: # delattr(subscription,"download_info") # delattr(subscription,"sessions") # delattr(subscription,"scraped") # delattr(subscription,"is_me") # delattr(subscription,"links") # collection.append(subscription) # collection = jsonpickle.encode( # collection, unpicklable=False) # collection = jsonpickle.decode(collection) # export_archive(collection, metadata_filepath, # json_settings) status = True return status
def metadata_fixer(directory="", metadata_types=[], export=True):
    metadata_path = directory + ".json"
    if not metadata_types:
        metadata_types = import_archive(metadata_path)
    new_format = {}
    if isinstance(metadata_types, list):
        force = True
        for metadata_type in metadata_types:
            new_format[metadata_type["type"]] = metadata_type
            metadata_type.pop("type")
    else:
        force = False
        new_format = metadata_types
    new_format_copied = copy.deepcopy(new_format)
    for key, value in new_format.items():
        for key2, posts in value.items():
            if key2 != "valid":
                continue
            for post in posts:
                for media in post:
                    media["media_id"] = media.get("media_id", None)
                    if "link" in media:
                        media["links"] = [media["link"]]
                        media.pop("link")
                    media_directory = media["directory"]
                    if media_directory:
                        media["directory"] = os.path.realpath(media_directory)
    # Only rewrite the archive when normalization actually changed something
    hashed = DeepHash(new_format)[new_format]
    hashed2 = DeepHash(new_format_copied)[new_format_copied]
    if (force or hashed != hashed2) and export:
        with open(metadata_path, "w") as outfile:
            json.dump(new_format, outfile)
    return new_format

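
def _demo_deephash_change_detection():
    """A minimal sketch, not project code: metadata_fixer decides whether
    to rewrite the .json archive by comparing DeepHash digests of the
    structure before and after normalization. DeepHash (from deepdiff)
    hashes nested dicts/lists by content, so an in-place mutation such as
    the link -> links rewrite changes the digest. Data is made up."""
    original = {"Images": {"valid": [[{"post_id": 1, "link": "x"}]]}}
    snapshot = copy.deepcopy(original)
    media = original["Images"]["valid"][0][0]
    media["links"] = [media.pop("link")]
    changed = DeepHash(original)[original] != DeepHash(snapshot)[snapshot]
    assert changed
    return changed
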
def legacy_metadata_fixer(legacy_directory, new_metadata):
    if os.path.exists(legacy_directory):
        folders = os.listdir(legacy_directory)
        new_format = []
        # Skip the Windows desktop.ini entry itself
        for folder in (x for x in folders if x != "desktop.ini"):
            legacy_metadata_path = os.path.join(legacy_directory, folder)
            metadata_type = import_archive(legacy_metadata_path)
            valid = metadata_type["valid"]
            # groupby only merges adjacent keys, so sort by post_id first
            valid.sort(key=lambda x: x["post_id"], reverse=False)
            metadata_type["valid"] = [
                list(g) for k, g in groupby(valid, key=lambda x: x["post_id"])
            ]
            new_format.append(metadata_type)
        old_metadata = metadata_fixer(metadata_types=new_format, export=False)
        old_metadata = prepare_metadata(old_metadata).metadata
        old_metadata = jsonpickle.encode(old_metadata, unpicklable=False)
        old_metadata = jsonpickle.decode(old_metadata)
        new_metadata = compare_metadata(
            new_metadata, old_metadata, new_chain=True)
        new_metadata = prepare_metadata(new_metadata).metadata
        new_metadata = jsonpickle.encode(new_metadata, unpicklable=False)
        new_metadata = jsonpickle.decode(new_metadata)
    return new_metadata

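
def _demo_groupby_posts():
    """A minimal sketch, not project code: itertools.groupby only groups
    *adjacent* equal keys, which is why the fixers above sort by post_id
    before grouping the medias that belong to the same post. Data is
    made up."""
    valid = [{"post_id": 2}, {"post_id": 1}, {"post_id": 2}]
    valid.sort(key=lambda x: x["post_id"])
    grouped = [list(g) for _, g in groupby(valid, key=lambda x: x["post_id"])]
    # One group for post 1, one two-element group for post 2
    assert [len(g) for g in grouped] == [1, 2]
    return grouped
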
def process_legacy_metadata(api: start, new_metadata_set, formatted_directories,
                            subscription, api_type, api_path, archive_path,
                            site_name):
    print("Processing metadata.")
    delete_metadatas = []
    archive_path = archive_path.replace("db", "json")
    legacy_metadata_object, delete_legacy_metadatas = legacy_metadata_fixer(
        formatted_directories, api)
    if delete_legacy_metadatas:
        print("Merging new metadata with legacy metadata.")
    old_metadata_set = import_archive(archive_path)
    old_metadata_object = create_metadata(
        api, old_metadata_set, api_type=api_type)
    if old_metadata_set:
        print("Merging new metadata with old metadata.")
        old_metadata_object = compare_metadata(
            old_metadata_object, legacy_metadata_object)
    old_metadata_set = []
    for media_type, value in old_metadata_object.content:
        for status, value2 in value:
            for value3 in value2:
                item = value3.convert(keep_empty_items=True)
                old_metadata_set.append(item)
    subscription.set_scraped(api_type, old_metadata_object)
    if old_metadata_set:
        delete_metadatas.append(archive_path)
    final_set = []
    for item in old_metadata_set:
        # Keep only legacy posts that the fresh scrape did not return
        matches = [x for x in new_metadata_set
                   if x["post_id"] == item["post_id"]]
        if not matches:
            final_set.append(item)
    print("Finished processing metadata.")
    return final_set, delete_metadatas

def process_mass_messages(api: start, subscription, metadata_directory,
                          mass_messages) -> list:
    def compare_message(queue_id, remote_messages):
        for message in remote_messages:
            if "isFromQueue" in message and message["isFromQueue"]:
                if queue_id == message["queueId"]:
                    return message

    global_found = []
    chats = []
    session = api.sessions[0]
    salt = json_global_settings["random_string"]
    encoded = f"{session.ip}{salt}".encode("utf-8")
    session_hash = hashlib.md5(encoded).hexdigest()
    profile_directory = json_global_settings["profile_directories"][0]
    profile_directory = os.path.abspath(profile_directory)
    profile_directory = os.path.join(profile_directory, subscription.username)
    profile_metadata_directory = os.path.join(profile_directory, "Metadata")
    mass_message_path = os.path.join(
        profile_metadata_directory, "Mass Messages.json")
    chats_path = os.path.join(profile_metadata_directory, "Chats.json")
    if os.path.exists(chats_path):
        chats = import_archive(chats_path)
    date_object = datetime.today()
    date_string = date_object.strftime("%d-%m-%Y %H:%M:%S")
    for mass_message in mass_messages:
        mass_message.setdefault("status", "")
        mass_message.setdefault("found", {})
        mass_message.setdefault("hashed_ip", "")
        mass_message["date_hashed"] = mass_message.get(
            "date_hashed", date_string)
        if mass_message["isCanceled"]:
            continue
        queue_id = mass_message["id"]
        text = html.unescape(mass_message["textCropped"])
        mass_found = mass_message["found"]
        media_type = mass_message.get("mediaType")
        media_types = mass_message.get("mediaTypes")
        if mass_found or (not media_type and not media_types):
            continue
        identifier = None
        if chats:
            for chat in chats:
                identifier = chat["identifier"]
                messages = chat["messages"]["list"]
                mass_found = compare_message(queue_id, messages)
                if mass_found:
                    mass_message["found"] = mass_found
                    mass_message["status"] = True
                    break
        if not mass_found:
            list_chats = subscription.search_messages(text=text, limit=2)
            if not list_chats:
                continue
            for item in list_chats["list"]:
                user = item["withUser"]
                identifier = user["id"]
                messages = []
                print("Getting Messages")
                keep = ["id", "username"]
                list_chats2 = [
                    x for x in chats if x["identifier"] == identifier]
                if list_chats2:
                    chat2 = list_chats2[0]
                    messages = chat2["messages"]["list"]
                    messages = subscription.get_messages(
                        identifier=identifier, resume=messages)
                    for message in messages:
                        message["withUser"] = {
                            k: item["withUser"][k] for k in keep}
                        message["fromUser"] = {
                            k: message["fromUser"][k] for k in keep}
                    mass_found = compare_message(queue_id, messages)
                    if mass_found:
                        mass_message["found"] = mass_found
                        mass_message["status"] = True
                        break
                else:
                    item2 = {}
                    item2["identifier"] = identifier
                    item2["messages"] = subscription.get_messages(
                        identifier=identifier)
                    chats.append(item2)
                    messages = item2["messages"]["list"]
                    for message in messages:
                        message["withUser"] = {
                            k: item["withUser"][k] for k in keep}
                        message["fromUser"] = {
                            k: message["fromUser"][k] for k in keep}
                    mass_found = compare_message(queue_id, messages)
                    if mass_found:
                        mass_message["found"] = mass_found
                        mass_message["status"] = True
                        break
        if not mass_found:
            mass_message["status"] = False
    main_helper.export_data(chats, chats_path)
    for mass_message in mass_messages:
        found = mass_message["found"]
        if found and found["media"]:
            user = found["withUser"]
            identifier = user["id"]
            date_hashed_object = datetime.strptime(
                mass_message["date_hashed"], "%d-%m-%Y %H:%M:%S")
            next_date_object = date_hashed_object + timedelta(days=1)
            # Re-fetch if hashed under another session IP or older than a day
            if (mass_message["hashed_ip"] != session_hash
                    or date_object > next_date_object):
                print("Getting Message By ID")
                x = subscription.get_message_by_id(
                    identifier=identifier, identifier2=found["id"], limit=1)
                new_found = x["result"]["list"][0]
                new_found["withUser"] = found["withUser"]
                mass_message["found"] = new_found
                mass_message["hashed_ip"] = session_hash
                mass_message["date_hashed"] = date_string
            global_found.append(found)
    main_helper.export_data(mass_messages, mass_message_path)
    return global_found

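
def _demo_refresh_window():
    """A minimal sketch, not project code: a found mass message is
    re-fetched when it was hashed under a different session IP or when
    more than a day has passed since it was hashed, mirroring the check
    in process_mass_messages. The salt and IP stand in for
    json_global_settings["random_string"] and the session's IP."""
    salt = "random_string"
    session_hash = hashlib.md5(f"127.0.0.1{salt}".encode("utf-8")).hexdigest()
    date_hashed = datetime.strptime("01-01-2024 00:00:00", "%d-%m-%Y %H:%M:%S")
    stale = datetime.today() > date_hashed + timedelta(days=1)
    needs_refresh = session_hash != "some-other-hash" or stale
    assert needs_refresh
    return needs_refresh
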
def prepare_scraper(api, site_name, item):
    authed = api.auth
    sessions = api.sessions
    api_type = item["api_type"]
    api_array = item["api_array"]
    link = api_array["api_link"]
    subscription = api_array["subscription"]
    locations = api_array["media_types"]
    username = api_array["username"]
    directory = api_array["directory"]
    api_count = api_array["post_count"]
    master_set = []
    media_set = []
    metadata_set = []
    pool = multiprocessing()  # project helper that returns a worker pool
    formatted_directories = main_helper.format_directories(
        j_directory, site_name, username, locations, api_type)
    model_directory = formatted_directories["model_directory"]
    api_directory = formatted_directories["api_directory"]
    metadata_directory = formatted_directories["metadata_directory"]
    archive_directory = os.path.join(metadata_directory, api_type)
    archive_path = archive_directory + ".json"
    imported = import_archive(archive_path)
    legacy_metadata_directory = os.path.join(api_directory, "Metadata")
    if api_type == "Profile":
        profile_scraper(api, directory, username)
        return
    if api_type == "Stories":
        master_set = subscription.get_stories()
        highlights = subscription.get_highlights()
        valid_highlights = []
        for highlight in highlights:
            highlight = subscription.get_highlights(
                hightlight_id=highlight["id"])
            valid_highlights.append(highlight)
        master_set.extend(valid_highlights)
    if api_type == "Posts":
        master_set = subscription.get_posts()
    if api_type == "Archived":
        master_set = subscription.get_archived(api)
    if api_type == "Messages":
        unrefined_set = subscription.get_messages()
        if "list" in unrefined_set:
            unrefined_set = unrefined_set["list"]
        if subscription.is_me:
            mass_messages = authed["mass_messages"]
            unrefined_set2 = process_mass_messages(
                api, subscription, metadata_directory, mass_messages)
            unrefined_set += unrefined_set2
        master_set = [unrefined_set]
    master_set2 = master_set
    parent_type = ""
    if api_type == "Archived":
        unrefined_set = []
        for master_set3 in master_set2:
            parent_type = master_set3["type"]
            results = master_set3["results"]
            unrefined_result = pool.starmap(media_scraper, product(
                results, [api], [formatted_directories],
                [username], [api_type], [parent_type]))
            unrefined_set.append(unrefined_result)
        unrefined_set = list(chain(*unrefined_set))
    else:
        unrefined_set = pool.starmap(media_scraper, product(
            master_set2, [api], [formatted_directories],
            [username], [api_type], [parent_type]))
        unrefined_set = list(unrefined_set)
    metadata_set = main_helper.format_media_set(unrefined_set)
    if not metadata_set:
        print("No " + api_type + " Found.")
        delattr(subscription.scraped, api_type)
    if metadata_set:
        if export_metadata:
            os.makedirs(metadata_directory, exist_ok=True)
            old_metadata = metadata_fixer(archive_directory)
            old_metadata_set = prepare_metadata(old_metadata).metadata
            old_metadata_set2 = jsonpickle.encode(
                old_metadata_set, unpicklable=False)
            old_metadata_set2 = jsonpickle.decode(old_metadata_set2)
            metadata_set = compare_metadata(metadata_set, old_metadata_set2)
            metadata_set = prepare_metadata(metadata_set).metadata
            metadata_set2 = jsonpickle.encode(metadata_set, unpicklable=False)
            metadata_set2 = jsonpickle.decode(metadata_set2)
            metadata_set2 = main_helper.filter_metadata(metadata_set2)
            metadata_set2 = legacy_metadata_fixer(
                legacy_metadata_directory, metadata_set2)
            main_helper.export_archive(
                metadata_set2, archive_directory, json_settings,
                legacy_directory=legacy_metadata_directory)
        else:
            metadata_set = prepare_metadata(metadata_set).metadata
        subscription = api.get_subscription(username)
        subscription.set_scraped(api_type, metadata_set)
    return [subscription.scraped]

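
def _demo_starmap_fanout():
    """A minimal sketch, not project code: prepare_scraper fans
    media_scraper out over a worker pool by pairing each scraped item
    with the constant arguments via itertools.product (each constant
    wrapped in a one-element list). fake_scraper is a hypothetical
    stand-in; a ThreadPool is used here so the locally defined function
    needs no pickling."""
    from multiprocessing.pool import ThreadPool

    def fake_scraper(item, api, directories):
        return (item, api, directories)

    with ThreadPool(2) as pool:
        results = pool.starmap(
            fake_scraper, product([1, 2, 3], ["api"], ["dirs"]))
    assert results == [(1, "api", "dirs"), (2, "api", "dirs"),
                       (3, "api", "dirs")]
    return results
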