async def profile_scraper(self, subscription: user_types):
    authed = subscription.get_authed()
    site_settings = authed.api.get_site_settings()
    if not (subscription.directory_manager and site_settings):
        return
    subscription_directory_manager = subscription.directory_manager
    subscription_username = subscription.username
    site_name = authed.api.site_name
    # Avatar and header are the only profile-level medias to download.
    override_media_types: list[list[Any]] = []
    avatar = subscription.avatar
    header = subscription.header
    override_media_types.extend([["Avatars", avatar], ["Headers", header]])
    session = await authed.session_manager.create_client_session()
    progress_bar = None
    p_r = prepare_reformat()
    p_r.site_name = site_name
    p_r.model_username = subscription_username
    p_r.api_type = "Profile"
    p_r.text_length = site_settings.text_length
    p_r.directory = subscription_directory_manager.root_download_directory
    directory = await p_r.remove_non_unique(
        subscription_directory_manager, "file_directory_format"
    )
    if not isinstance(directory, Path):
        return
    directory = directory.joinpath(p_r.api_type)
    for media_type, media_link in override_media_types:
        if not media_link:
            continue
        directory2 = directory.joinpath(media_type)
        directory2.mkdir(parents=True, exist_ok=True)
        download_path = directory2.joinpath(f"{media_link.split('/')[-2]}.jpg")
        if download_path.is_file():
            continue
        # HEAD request first so the progress bar knows the total size.
        response = await authed.session_manager.json_request(
            media_link, method="HEAD"
        )
        if not response:
            continue
        if not progress_bar:
            progress_bar = main_helper.download_session()
            progress_bar.start(unit="B", unit_scale=True, miniters=1)
        progress_bar.update_total_size(response.content_length)
        response = await authed.session_manager.json_request(
            media_link,
            session,
            stream=True,
            json_format=False,
        )
        await main_helper.write_data(response, download_path, progress_bar)
    await session.close()
    if progress_bar:
        progress_bar.close()  # type: ignore
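# A minimal standalone sketch of the HEAD-then-stream pattern used above,
# written with plain aiohttp rather than the project's session_manager
# (function and variable names here are illustrative, not part of the
# codebase):
import aiohttp
from pathlib import Path

async def download_with_progress(url: str, download_path: Path) -> None:
    async with aiohttp.ClientSession() as session:
        # HEAD first: lets a progress bar learn the total size up front.
        async with session.head(url) as head:
            total = head.content_length or 0
        async with session.get(url) as response:
            received = 0
            with download_path.open("wb") as file:
                async for chunk in response.content.iter_chunked(64 * 1024):
                    file.write(chunk)
                    received += len(chunk)
            # `received` can be compared against `total` to detect truncation.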
def start(subscription, api_type, api_path, site_name, json_settings):
    metadata = getattr(subscription.scraped, api_type)
    download_info = subscription.download_info
    root_directory = download_info["directory"]
    date_format = json_settings["date_format"]
    text_length = json_settings["text_length"]
    reformats = {}
    reformats["metadata_directory_format"] = json_settings[
        "metadata_directory_format"]
    reformats["file_directory_format"] = json_settings["file_directory_format"]
    reformats["filename_format"] = json_settings["filename_format"]
    username = subscription.username
    option = {}
    option["site_name"] = site_name
    option["api_type"] = api_type
    option["username"] = username
    option["date_format"] = date_format
    option["maximum_length"] = text_length
    option["directory"] = root_directory
    # Trim each format string down to its first unique variable so the base
    # directory can be resolved without per-post values.
    formatted = format_types(reformats).check_unique()
    unique = formatted["unique"]
    for key, value in reformats.items():
        key2 = getattr(unique, key)[0]
        reformats[key] = value.split(key2, 1)[0] + key2
    _, base_directory, _ = prepare_reformat(
        option, keep_vars=True).reformat(reformats)
    download_info["base_directory"] = base_directory
    # Inventory of every file already on disk under the base directory.
    all_files = []
    for root, subdirs, files in os.walk(base_directory):
        all_files.extend(os.path.join(root, x) for x in files)
    for media_type, value in metadata.content:
        if media_type == "Texts":
            continue
        for status, value2 in value:
            fixed, new_directories = fix_directories(
                value2, root_directory, site_name, api_path, media_type,
                username, all_files, json_settings)
            for new_directory in new_directories:
                directory = os.path.abspath(new_directory)
                os.makedirs(directory, exist_ok=True)
            fixed2 = fix_metadata(fixed)
            setattr(value, status, fixed2)
        setattr(metadata.content, media_type, value)
    return metadata
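# The `all_files` inventory above drives the later rename pass. A pathlib
# equivalent of the same walk, for reference (illustrative only):
from pathlib import Path

def collect_files(base_directory: str) -> list[Path]:
    # rglob("*") yields directories too, so filter down to regular files.
    return [path for path in Path(base_directory).rglob("*") if path.is_file()]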
def profile_scraper(api: start, site_name, api_type, username, text_length,
                    base_directory):
    # json_settings, date_format and overwrite_files are module-level globals.
    reformats = {}
    reformats["metadata_directory_format"] = json_settings[
        "metadata_directory_format"]
    reformats["file_directory_format"] = json_settings["file_directory_format"]
    reformats["file_directory_format"] = reformats[
        "file_directory_format"].replace("{value}", "")
    reformats["filename_format"] = json_settings["filename_format"]
    option = {}
    option["site_name"] = site_name
    option["api_type"] = api_type
    option["username"] = username
    option["date_format"] = date_format
    option["maximum_length"] = text_length
    option["directory"] = base_directory
    _, profile_directory, _ = prepare_reformat(
        option, keep_vars=True).reformat(reformats)
    y = api.get_subscription(identifier=username)
    override_media_types = []
    avatar = y.avatar
    header = y.header
    if avatar:
        override_media_types.append(["Avatars", avatar])
    if header:
        override_media_types.append(["Headers", header])
    for media_type, media_link in override_media_types:
        directory2 = os.path.join(profile_directory, media_type)
        os.makedirs(directory2, exist_ok=True)
        download_path = os.path.join(directory2,
                                     media_link.split("/")[-2] + ".jpg")
        if not overwrite_files and os.path.isfile(download_path):
            continue
        session = api.sessions[0]
        r = api.json_request(media_link, session, stream=True,
                             json_format=False, sleep=False)
        if not isinstance(r, requests.Response):
            continue
        # Bounded retry instead of spinning forever on a failed write.
        for _ in range(3):
            if main_helper.downloader(r, download_path):
                break
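# A standalone sketch of the streamed-download-with-retry idea above, using
# requests directly (function and parameter names are illustrative):
import requests

def download_file(url: str, download_path: str, attempts: int = 3) -> bool:
    for _ in range(attempts):
        try:
            # Re-issue the request on each attempt; a consumed stream cannot
            # be replayed.
            with requests.get(url, stream=True, timeout=30) as r:
                r.raise_for_status()
                with open(download_path, "wb") as file:
                    for chunk in r.iter_content(chunk_size=64 * 1024):
                        file.write(chunk)
            return True
        except requests.RequestException:
            continue
    return False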
def fix_directories(post_item, base_directory, site_name, api_type,
                    media_type, username, all_files, json_settings):
    new_directories = []
    for posts in post_item:
        for media in posts:
            # Prefer the URL path; fall back to the stored filename.
            if media.links:
                path = urlparse.urlparse(media.links[0]).path
            else:
                path = media.filename
            new_filename = os.path.basename(path)
            filename, ext = os.path.splitext(new_filename)
            ext = ext.replace(".", "")
            file_directory_format = json_settings["file_directory_format"]
            filename_format = json_settings["filename_format"]
            date_format = json_settings["date_format"]
            text_length = json_settings["text_length"]
            download_path = base_directory
            new_dict = media.convert(keep_empty_items=True)
            option = dict(new_dict)
            option["site_name"] = site_name
            option["filename"] = filename
            option["api_type"] = api_type
            option["media_type"] = media_type
            option["ext"] = ext
            option["username"] = username
            option["date_format"] = date_format
            option["maximum_length"] = text_length
            option["directory"] = download_path
            prepared_format = prepare_reformat(option)
            file_directory = main_helper.reformat(prepared_format,
                                                  file_directory_format)
            prepared_format.directory = file_directory
            # Find where the file currently lives on disk, if anywhere.
            old_filepath = ""
            found = [x for x in all_files if media.filename in x]
            if found:
                old_filepath = os.path.abspath(found[0])
            new_filepath = main_helper.reformat(prepared_format,
                                                filename_format)
            media.old_filepath = old_filepath
            media.new_filepath = new_filepath
            new_directories.append(os.path.dirname(new_filepath))
    new_directories = list(set(new_directories))
    return post_item, new_directories
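# The filename/extension split above follows a common pattern: take the URL's
# path component, strip the directories, then separate stem and suffix. A
# self-contained sketch (names illustrative):
import os
from urllib.parse import urlparse

def split_media_name(link: str) -> tuple[str, str]:
    path = urlparse(link).path           # drops query strings and fragments
    basename = os.path.basename(path)    # e.g. "photo.jpg"
    stem, ext = os.path.splitext(basename)
    return stem, ext.lstrip(".")         # ("photo", "jpg")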
def start(Session, parent_type, api_type, api_path, site_name, subscription,
          folder, json_settings):
    api_table = folder.api_table
    database_session = Session()
    result = database_session.query(api_table).all()
    metadata = getattr(subscription.scraped, api_type)
    download_info = subscription.download_info
    root_directory = download_info["directory"]
    date_format = json_settings["date_format"]
    text_length = json_settings["text_length"]
    reformats = {}
    reformats["metadata_directory_format"] = json_settings[
        "metadata_directory_format"]
    reformats["file_directory_format"] = json_settings["file_directory_format"]
    reformats["filename_format"] = json_settings["filename_format"]
    username = subscription.username
    option = {}
    option["site_name"] = site_name
    option["api_type"] = api_type
    option["username"] = username
    option["date_format"] = date_format
    option["maximum_length"] = text_length
    option["directory"] = root_directory
    # Trim each format string down to its first unique variable so the base
    # directory can be resolved without per-post values.
    formatted = format_types(reformats).check_unique()
    unique = formatted["unique"]
    for key, value in reformats.items():
        key2 = getattr(unique, key)[0]
        reformats[key] = value.split(key2, 1)[0] + key2
    _, base_directory, _ = prepare_reformat(
        option, keep_vars=True).reformat(reformats)
    download_info["base_directory"] = base_directory
    # Inventory of every file already on disk under the base directory.
    all_files = []
    for root, subdirs, files in os.walk(base_directory):
        all_files.extend(os.path.join(root, x) for x in files)
    fix_directories(result, all_files, database_session, folder, site_name,
                    parent_type, api_type, username, root_directory,
                    json_settings)
    database_session.close()
    return metadata
def format_directories(directory, site_name, username, unformatted,
                       locations=None, api_type="") -> dict:
    # date_format and maximum_length are module-level globals. A mutable
    # default argument is avoided by defaulting locations to None.
    locations = locations or []
    x = {}
    option = {}
    option["site_name"] = site_name
    option["username"] = username
    option["directory"] = directory
    option["postedAt"] = datetime.today()
    option["date_format"] = date_format
    option["maximum_length"] = maximum_length
    prepared_format = prepare_reformat(option)
    legacy_model_directory = x["legacy_model_directory"] = os.path.join(
        directory, site_name, username)
    x["legacy_metadatas"] = {}
    x["legacy_metadatas"]["legacy_metadata"] = os.path.join(
        legacy_model_directory, api_type, "Metadata")
    x["legacy_metadatas"]["legacy_metadata2"] = os.path.join(
        legacy_model_directory, "Metadata")
    x["metadata_directory"] = main_helper.reformat(prepared_format,
                                                   unformatted)
    x["download_directory"] = directory
    x["locations"] = []
    for location in locations:
        # Each media type gets Unsorted/Free/Paid buckets; "Unsorted" maps to
        # the bare api_type directory.
        directories = {}
        for cat in ["Unsorted", "Free", "Paid"]:
            cat2 = "" if cat == "Unsorted" else cat
            directories[cat.lower()] = os.path.join(api_type, cat2,
                                                    location[0])
        y = {}
        y["sorted_directories"] = directories
        y["media_type"] = location[0]
        y["alt_media_type"] = location[1]
        x["locations"].append(y)
    return x
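# The Unsorted/Free/Paid bucket map above, reduced to a tiny standalone helper
# (illustrative; os.path.join skips the empty "unsorted" component):
import os

def sorted_directories(api_type: str, media_type: str) -> dict[str, str]:
    cats = {"unsorted": "", "free": "Free", "paid": "Paid"}
    return {key: os.path.join(api_type, cat, media_type)
            for key, cat in cats.items()}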
async def start(
    subscription: user_types,
    api_type: str,
    Session: scoped_session,
    site_settings: SiteSettings,
):
    authed = subscription.get_authed()
    directory_manager = subscription.directory_manager
    api_table_ = user_database.table_picker(api_type)
    database_session: scoped_session = Session()  # Slow
    authed_username = authed.username
    subscription_username = subscription.username
    site_name = authed.api.site_name
    p_r = prepare_reformat()
    p_r = await p_r.standard(
        site_name=site_name,
        profile_username=authed_username,
        user_username=subscription_username,
        date=datetime.today(),
        date_format=site_settings.date_format,
        text_length=site_settings.text_length,
        directory=directory_manager.root_metadata_directory,
    )
    p_r.api_type = api_type
    result: list[api_table] = database_session.query(api_table_).all()
    metadata = getattr(subscription.temp_scraped, api_type)
    await fix_directories(
        result,
        subscription,
        database_session,
        api_type,
    )
    database_session.close()
    return metadata
def media_scraper(results, api, formatted_directories, username, api_type,
                  parent_type=""):
    # json_settings, date_format, text_length, filename_format and
    # ignored_keywords are module-level globals.
    new_set = {}
    new_set["content"] = []
    directories = []
    session = api.sessions[0]
    if api_type == "Stories":
        if "stories" in results:
            items = results["stories"]
            for item in items:
                item["text"] = results["title"]
            results = results["stories"]
    if not results or "error" in results:
        return new_set
    if "result" in results:
        session = results["session"]
        results = results["result"]
        if "error" in results:
            return new_set
    download_path = formatted_directories["download_directory"]
    for location in formatted_directories["locations"]:
        sorted_directories = copy.copy(location["sorted_directories"])
        master_date = "01-01-0001 00:00:00"
        media_type = location["media_type"]
        alt_media_type = location["alt_media_type"]
        file_directory_format = json_settings["file_directory_format"]
        if api_type == "Archived":
            # Nest archived content under its parent type in the directory
            # format.
            for y in file_directory_format.split(os.sep):
                substr = "{api_type}"
                if substr == y:
                    new_path = os.path.join(substr, parent_type)
                    file_directory_format = file_directory_format.replace(
                        substr, new_path)
                    break
        separator = " | "
        print(
            f"Scraping [{separator.join(alt_media_type)}]. Should take less than a minute.")
        for media_api in results:
            new_post = {}
            new_post["medias"] = []
            rawText = media_api.get("rawText", "")
            text = media_api.get("text", "")
            final_text = rawText if rawText else text
            if api_type == "Messages":
                media_api["rawText"] = media_api["text"]
            if api_type == "Mass Messages":
                media_user = media_api["fromUser"]
                media_username = media_user["username"]
                if media_username != username:
                    continue
            date = media_api["postedAt"] if "postedAt" in media_api else media_api["createdAt"]
            if date == "-001-11-30T00:00:00+00:00":
                # Sentinel date from the API; reuse the last good timestamp.
                date_string = master_date
                date_object = datetime.strptime(master_date,
                                                "%d-%m-%Y %H:%M:%S")
            else:
                date_object = datetime.fromisoformat(date)
                date_string = date_object.replace(tzinfo=None).strftime(
                    "%d-%m-%Y %H:%M:%S")
                master_date = date_string
            new_post["post_id"] = media_api["id"]
            new_post["text"] = final_text
            new_post["postedAt"] = date_string
            new_post["paid"] = False
            price = new_post["price"] = media_api["price"] if "price" in media_api else None
            if price is None:
                price = 0
            canPurchase = media_api.get("canPurchase", None)
            canViewMedia = media_api.get("canViewMedia", None)
            if price and (not canPurchase or canViewMedia):
                new_post["paid"] = True
            for media in media_api["media"]:
                media_id = media["id"]
                date = "-001-11-30T00:00:00+00:00"
                size = 0
                link = ""
                preview_link = ""
                if "source" in media:
                    quality_key = "source"
                    source = media[quality_key]
                    link = source[quality_key]
                    if link and media["type"] == "video":
                        # Pick the configured video quality if available.
                        qualities = dict(sorted(media["videoSources"].items()))
                        qualities[quality_key] = source[quality_key]
                        video_quality_json = json_settings[
                            "video_quality"].removesuffix("p")
                        for quality, quality_link in qualities.items():
                            if quality == video_quality_json and link:
                                link = quality_link
                                break
                    size = media["info"]["preview"]["size"] if "info" in media_api else 1
                if "src" in media:
                    link = media["src"]
                    size = media["info"]["preview"]["size"] if "info" in media_api else 1
                    date = media_api["createdAt"]
                if not link:
                    continue
                # CDN hostnames start with a region code; skip upload hosts
                # and substitute the preview link for still-converting media.
                matches = ["us", "uk", "ca", "ca2", "de"]
                url = urlparse(link)
                subdomain = url.hostname.split('.')[0]
                preview_link = media["preview"]
                if any(subdomain in nm for nm in matches):
                    subdomain = url.hostname.split('.')[1]
                if "upload" in subdomain:
                    continue
                if "convert" in subdomain:
                    link = preview_link
                if not link and not preview_link:
                    continue
                new_media = dict()
                new_media["media_id"] = media_id
                new_media["links"] = []
                new_media["media_type"] = media_type
                for xlink in link, preview_link:
                    if xlink:
                        new_media["links"].append(xlink)
                        break
                session.links.extend(new_media["links"])
                if media["type"] not in alt_media_type:
                    continue
                matches = [s for s in ignored_keywords if s in final_text]
                if matches:
                    print("Matches: ", matches)
                    continue
                filename = link.rsplit('/', 1)[-1]
                filename, ext = os.path.splitext(filename)
                ext = ext.replace(".", "").split('?')[0]
                option = dict(new_post)
                option["site_name"] = "OnlyFans"
                option["media_id"] = media_id
                option["filename"] = filename
                option["api_type"] = api_type
                option["media_type"] = media_type
                option["ext"] = ext
                option["username"] = username
                option["date_format"] = date_format
                option["text_length"] = text_length
                option["directory"] = download_path
                prepared_format = prepare_reformat(option)
                file_directory = main_helper.reformat(prepared_format,
                                                      file_directory_format)
                prepared_format.directory = file_directory
                file_path = main_helper.reformat(prepared_format,
                                                 filename_format)
                new_media["directory"] = os.path.join(file_directory)
                new_media["filename"] = os.path.basename(file_path)
                if file_directory not in directories:
                    directories.append(file_directory)
                new_post["medias"].append(new_media)
            new_set["content"].append(new_post)
    new_set["directories"] = directories
    return new_set
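# The "-001-11-30T00:00:00+00:00" timestamp is a sentinel the API returns for
# unknown dates; the scraper falls back to the last good timestamp it saw. A
# compact standalone sketch of that pattern (names illustrative):
from datetime import datetime

SENTINEL = "-001-11-30T00:00:00+00:00"

def resolve_date(raw: str, last_good: str) -> str:
    if raw == SENTINEL:
        return last_good  # reuse the previous post's timestamp
    parsed = datetime.fromisoformat(raw)
    return parsed.replace(tzinfo=None).strftime("%d-%m-%Y %H:%M:%S")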
async def fix_directories2(
    post: api_table, media_db: list[template_media_table], all_files: list[Path]
):
    # api_type, site_settings, subscription, api, authed and new_directories
    # are captured from the enclosing scope.
    delete_rows = []
    final_api_type = (
        os.path.join("Archived", api_type) if post.archived else api_type
    )
    post_id = post.post_id
    media_db = [x for x in media_db if x.post_id == post_id]
    for media in media_db:
        media_id = media.media_id
        if media.link:
            url_path = Path(urlparse.urlparse(media.link).path)
        else:
            url_path = Path(media.filename)
        new_filename = url_path.name
        original_filename, ext = url_path.stem, url_path.suffix
        ext = ext.replace(".", "")
        file_directory_format = site_settings.file_directory_format
        filename_format = site_settings.filename_format
        date_format = site_settings.date_format
        text_length = site_settings.text_length
        download_path = subscription.directory_manager.root_download_directory
        option = {}
        option["site_name"] = api.site_name
        option["post_id"] = post_id
        option["media_id"] = media_id
        option["profile_username"] = authed.username
        option["model_username"] = subscription.username
        option["api_type"] = final_api_type
        option["media_type"] = media.media_type
        option["filename"] = original_filename
        option["ext"] = ext
        option["text"] = post.text
        option["postedAt"] = media.created_at
        option["price"] = post.price
        option["date_format"] = date_format
        option["text_length"] = text_length
        option["directory"] = download_path
        option["preview"] = media.preview
        option["archived"] = post.archived
        prepared_format = prepare_reformat(option)
        file_directory = await prepared_format.reformat_2(file_directory_format)
        prepared_format.directory = file_directory
        old_filepath = ""
        if media.linked:
            filename_format = filename_format.with_name(
                f"linked_{filename_format}")
        new_filepath = await prepared_format.reformat_2(filename_format)
        # Match by filename first, then by media id as a fallback.
        old_filepaths = [
            x for x in all_files
            if original_filename in x.name and x.parts != new_filepath.parts
        ]
        if not old_filepaths:
            old_filepaths = [x for x in all_files if str(media_id) in x.name]
        if not media.linked:
            old_filepaths = [x for x in old_filepaths if "linked_" not in x.name]
        if old_filepaths:
            old_filepath = old_filepaths[0]
        if old_filepath and old_filepath != new_filepath:
            moved = None
            while not moved:
                try:
                    if old_filepath.exists():
                        if old_filepath.suffix == ".part":
                            # Stale partial download; delete instead of
                            # renaming it into place.
                            old_filepath.unlink()
                            continue
                        if media.size:
                            media.downloaded = True
                        found_dupes = [
                            x for x in media_db
                            if x.filename == new_filename and x.id != media.id
                        ]
                        delete_rows.extend(found_dupes)
                        os.makedirs(os.path.dirname(new_filepath),
                                    exist_ok=True)
                        if media.linked:
                            # Linked media is copied unless it already lives
                            # in the destination directory.
                            if os.path.dirname(old_filepath) == os.path.dirname(
                                new_filepath
                            ):
                                moved = shutil.move(old_filepath, new_filepath)
                            else:
                                moved = shutil.copy(old_filepath, new_filepath)
                        else:
                            moved = shutil.move(old_filepath, new_filepath)
                    else:
                        break
                except OSError:
                    print(traceback.format_exc())
        if os.path.exists(new_filepath):
            if media.size:
                media.downloaded = True
        media.directory = file_directory.as_posix()
        media.filename = os.path.basename(new_filepath)
        new_directories.append(os.path.dirname(new_filepath))
    return delete_rows
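# The move-vs-copy rule above keeps one physical copy of "linked" media per
# destination directory. A minimal standalone sketch of that rule, assuming
# plain paths (names illustrative):
import shutil
from pathlib import Path

def relocate(old_path: Path, new_path: Path, linked: bool) -> None:
    new_path.parent.mkdir(parents=True, exist_ok=True)
    if linked and old_path.parent != new_path.parent:
        # A linked file also belongs to another post; copy so the original
        # stays where it is.
        shutil.copy(old_path, new_path)
    else:
        shutil.move(old_path, new_path)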
async def media_scraper(
    self,
    post_result: Union[create_story, create_post, create_product,
                       create_message],
    subscription: create_user,
    formatted_directory: Path,
    api_type: str,
):
    authed = subscription.get_authed()
    api = authed.api
    site_settings = api.get_site_settings()
    if not site_settings:
        return
    new_set: dict[str, Any] = {}
    new_set["content"] = []
    directories: list[Path] = []
    download_path = formatted_directory
    model_username = subscription.username
    date_format = site_settings.date_format
    locations = self.media_types
    for media_type, alt_media_types in locations.__dict__.items():
        date_today = datetime.now()
        master_date = datetime.strftime(date_today, "%d-%m-%Y %H:%M:%S")
        file_directory_format = site_settings.file_directory_format
        post_id = post_result.id
        new_post = {}
        new_post["medias"] = []
        new_post["archived"] = False
        rawText = ""
        text = ""
        previews = []
        date = None
        price = None
        if isinstance(post_result, create_story):
            date = post_result.createdAt
        if isinstance(post_result, create_post):
            if post_result.isReportedByMe:
                continue
            rawText = post_result.rawText
            text = post_result.text
            previews = post_result.preview
            date = post_result.postedAt
            price = post_result.price
            new_post["archived"] = post_result.isArchived
        if isinstance(post_result, create_product):
            if post_result.isReportedByMe:
                continue
            rawText = post_result.rawText
            text = post_result.text
            previews = post_result.preview
            date = post_result.postedAt
            price = post_result.price
            new_post["title"] = post_result.title
            new_post["archived"] = post_result.isArchived
        if isinstance(post_result, create_message):
            if post_result.isReportedByMe:
                continue
            text = post_result.text
            previews = post_result.previews
            date = post_result.createdAt
            price = post_result.price
            if api_type == "Mass Messages":
                media_user = post_result.fromUser
                if media_user.username != model_username:
                    continue
        final_text = rawText if rawText else text
        if date == "-001-11-30T00:00:00+00:00":
            # Sentinel date from the API; reuse the last good timestamp.
            date_string = master_date
            date_object = datetime.strptime(master_date, "%d-%m-%Y %H:%M:%S")
        else:
            if not date:
                date = master_date
            if "T" in date:
                date_object = datetime.fromisoformat(date)
            else:
                date_object = datetime.strptime(date, "%d-%m-%Y %H:%M:%S")
            date_string = date_object.replace(tzinfo=None).strftime(
                "%d-%m-%Y %H:%M:%S")
            master_date = date_string
        new_post["post_id"] = post_id
        new_post["user_id"] = subscription.id
        if isinstance(post_result, create_message):
            new_post["user_id"] = post_result.fromUser.id
        new_post["text"] = final_text
        new_post["postedAt"] = date_string
        new_post["paid"] = False
        new_post["preview_media_ids"] = previews
        new_post["api_type"] = api_type
        new_post["price"] = 0
        if price is None:
            price = 0
        if price:
            if all(media["canView"] for media in post_result.media):
                new_post["paid"] = True
            else:
                new_post["price"] = price
        for media in post_result.media:
            media_id = media["id"]
            preview_link = ""
            link = await post_result.link_picker(media,
                                                 site_settings.video_quality)
            if not link:
                continue
            # CDN hostnames start with a region code; skip upload hosts and
            # substitute the preview link for still-converting media.
            matches = ["us", "uk", "ca", "ca2", "de"]
            url = urlparse(link)
            if not url.hostname:
                continue
            subdomain = url.hostname.split(".")[0]
            preview_link = media["preview"]
            if any(subdomain in nm for nm in matches):
                subdomain = url.hostname.split(".")[1]
            if "upload" in subdomain:
                continue
            if "convert" in subdomain:
                link = preview_link
            if not link and not preview_link:
                continue
            new_media: dict[str, Any] = dict()
            new_media["media_id"] = media_id
            new_media["links"] = []
            new_media["media_type"] = media_type
            new_media["preview"] = False
            new_media["created_at"] = new_post["postedAt"]
            if isinstance(post_result, create_story):
                date_object = datetime.fromisoformat(media["createdAt"])
                new_media["created_at"] = date_object.replace(
                    tzinfo=None).strftime("%d-%m-%Y %H:%M:%S")
            if int(media_id) in new_post["preview_media_ids"]:
                new_media["preview"] = True
            for xlink in link, preview_link:
                if xlink:
                    new_media["links"].append(xlink)
                    break
            if media["type"] not in alt_media_types:
                continue
            matches = [
                s for s in site_settings.ignored_keywords if s in final_text
            ]
            if matches:
                print("Ignoring - ", f"PostID: {post_id}")
                continue
            filename = link.rsplit("/", 1)[-1]
            filename, ext = os.path.splitext(filename)
            ext = ext.replace(".", "").split("?")[0]
            final_api_type = (os.path.join("Archived", api_type)
                              if new_post["archived"] else api_type)
            option: dict[str, Any] = dict(new_post)
            option["site_name"] = api.site_name
            option["media_id"] = media_id
            option["filename"] = filename
            option["api_type"] = final_api_type
            option["media_type"] = media_type
            option["ext"] = ext
            option["profile_username"] = authed.username
            option["model_username"] = model_username
            option["date_format"] = date_format
            option["postedAt"] = new_media["created_at"]
            option["text_length"] = site_settings.text_length
            option["directory"] = download_path
            option["preview"] = new_media["preview"]
            option["archived"] = new_post["archived"]
            prepared_format = prepare_reformat(option)
            file_directory = await prepared_format.reformat_2(
                file_directory_format)
            prepared_format.directory = file_directory
            file_path = await prepared_format.reformat_2(
                site_settings.filename_format)
            new_media["directory"] = os.path.join(file_directory)
            new_media["filename"] = os.path.basename(file_path)
            if file_directory not in directories:
                directories.append(file_directory)
            # Cross-reference other scraped api_types: if this filename
            # already exists there, mark this copy as linked.
            new_media["linked"] = None
            for k, v in subscription.temp_scraped:
                if k == api_type:
                    continue
                if k == "Archived":
                    v = getattr(v, api_type, [])
                if not v:
                    continue
                for post in v:
                    medias = post.media
                    if not medias:
                        continue
                    found_medias = [
                        x for x in medias
                        if x.get("filename") == new_media["filename"]
                    ]
                    if not found_medias:
                        continue
                    for found_media in found_medias:
                        found_media["linked"] = api_type
                    new_media["linked"] = post["api_type"]
                    new_media["filename"] = f"linked_{new_media['filename']}"
            new_post["medias"].append(new_media)
        found_post = [
            x for x in new_set["content"] if x["post_id"] == post_id
        ]
        if found_post:
            # The same post can surface once per media_type; merge its medias.
            found_post[0]["medias"] += new_post["medias"]
        else:
            new_set["content"].append(new_post)
    new_set["directories"] = directories
    return new_set
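# A reduced sketch of the linked-media pass above: the first api_type to claim
# a filename keeps it, and later occurrences are renamed with a "linked_"
# prefix (the data shapes here are illustrative):
def link_duplicates(scraped: dict[str, list[dict]]) -> None:
    seen: dict[str, str] = {}  # filename -> api_type that first claimed it
    for api_type, medias in scraped.items():
        for media in medias:
            owner = seen.setdefault(media["filename"], api_type)
            if owner != api_type:
                media["linked"] = owner
                media["filename"] = f"linked_{media['filename']}"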
def fix_directories(post: api_table, media_db: list[media_table]):
    # parent_type, api_type, site_name, username, base_directory, all_files,
    # json_settings and new_directories are captured from the enclosing scope.
    delete_rows = []
    final_type = (f"{api_type}{os.path.sep}{parent_type}"
                  if parent_type else api_type)
    post_id = post.post_id
    media_db = [x for x in media_db if x.post_id == post_id]
    for media in media_db:
        media_id = media.media_id
        if media.link:
            path = urlparse.urlparse(media.link).path
        else:
            path = media.filename
        new_filename = os.path.basename(path)
        original_filename, ext = os.path.splitext(new_filename)
        ext = ext.replace(".", "")
        file_directory_format = json_settings["file_directory_format"]
        filename_format = json_settings["filename_format"]
        date_format = json_settings["date_format"]
        text_length = json_settings["text_length"]
        download_path = base_directory
        option = {}
        option["site_name"] = site_name
        option["post_id"] = post_id
        option["media_id"] = media_id
        option["username"] = username
        option["api_type"] = final_type
        option["media_type"] = media.media_type
        option["filename"] = original_filename
        option["ext"] = ext
        option["text"] = post.text
        option["postedAt"] = media.created_at
        option["price"] = post.price
        option["date_format"] = date_format
        option["text_length"] = text_length
        option["directory"] = download_path
        option["preview"] = media.preview
        prepared_format = prepare_reformat(option)
        file_directory = main_helper.reformat(prepared_format,
                                              file_directory_format)
        prepared_format.directory = file_directory
        old_filepath = ""
        if media.linked:
            filename_format = f"linked_{filename_format}"
        # Match by filename first, then by media id as a fallback.
        old_filepaths = [
            x for x in all_files if original_filename in os.path.basename(x)]
        if not old_filepaths:
            old_filepaths = [
                x for x in all_files if str(media_id) in os.path.basename(x)]
        if not media.linked:
            old_filepaths = [x for x in old_filepaths if "linked_" not in x]
        if old_filepaths:
            old_filepath = old_filepaths[0]
        new_filepath = main_helper.reformat(prepared_format, filename_format)
        if old_filepath and old_filepath != new_filepath:
            if os.path.exists(new_filepath):
                os.remove(new_filepath)
            moved = None
            while not moved:
                try:
                    if os.path.exists(old_filepath):
                        if media.size:
                            media.downloaded = True
                        found_dupes = [
                            x for x in media_db
                            if x.filename == new_filename and x.id != media.id]
                        delete_rows.extend(found_dupes)
                        os.makedirs(os.path.dirname(new_filepath),
                                    exist_ok=True)
                        if media.linked:
                            # Linked media is copied unless it already lives
                            # in the destination directory.
                            if os.path.dirname(old_filepath) == os.path.dirname(new_filepath):
                                moved = shutil.move(old_filepath, new_filepath)
                            else:
                                moved = shutil.copy(old_filepath, new_filepath)
                        else:
                            moved = shutil.move(old_filepath, new_filepath)
                    else:
                        break
                except OSError:
                    print(traceback.format_exc())
        if os.path.exists(new_filepath):
            if media.size:
                media.downloaded = True
        media.directory = file_directory
        media.filename = os.path.basename(new_filepath)
        new_directories.append(os.path.dirname(new_filepath))
    return delete_rows
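# The dupe sweep above collects database rows whose filename collides with the
# one being kept. A standalone sketch of that selection (the row shape is
# assumed, not the project's actual model):
from dataclasses import dataclass

@dataclass
class Row:
    id: int
    filename: str

def find_dupes(rows: list[Row], keep: Row) -> list[Row]:
    # Every other row with the same filename is scheduled for deletion.
    return [r for r in rows if r.filename == keep.filename and r.id != keep.id]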
def media_scraper(results, api, formatted_directories, username, api_type,
                  parent_type=""):
    # json_settings, date_format, maximum_length, filename_format and
    # ignored_keywords are module-level globals.
    media_set = {}
    directories = []
    session = api.sessions[0]
    if api_type == "Stories":
        if "stories" in results:
            items = results["stories"]
            for item in items:
                item["text"] = results["title"]
            results = results["stories"]
    if not results or "error" in results:
        return media_set
    if "result" in results:
        session = results["session"]
        results = results["result"]
        if "error" in results:
            return media_set
    download_path = formatted_directories["download_directory"]
    for location in formatted_directories["locations"]:
        sorted_directories = copy.copy(location["sorted_directories"])
        master_date = "01-01-0001 00:00:00"
        media_type = location["media_type"]
        alt_media_type = location["alt_media_type"]
        file_directory_format = json_settings["file_directory_format"]
        if api_type == "Archived":
            # Nest archived content under its parent type in the directory
            # format.
            for y in file_directory_format.split(os.sep):
                substr = "{api_type}"
                if substr == y:
                    new_path = os.path.join(substr, parent_type)
                    file_directory_format = file_directory_format.replace(
                        substr, new_path)
                    break
        separator = " | "
        print(
            f"Scraping [{separator.join(alt_media_type)}]. Should take less than a minute.")
        media_set2 = {}
        media_set2["valid"] = []
        media_set2["invalid"] = []
        for media_api in results:
            if api_type == "Messages":
                media_api["rawText"] = media_api["text"]
            if api_type == "Mass Messages":
                media_user = media_api["fromUser"]
                media_username = media_user["username"]
                if media_username != username:
                    continue
            date = media_api["postedAt"] if "postedAt" in media_api else media_api["createdAt"]
            if date == "-001-11-30T00:00:00+00:00":
                # Sentinel date from the API; reuse the last good timestamp.
                date_string = master_date
                date_object = datetime.strptime(master_date,
                                                "%d-%m-%Y %H:%M:%S")
            else:
                date_object = datetime.fromisoformat(date)
                date_string = date_object.replace(tzinfo=None).strftime(
                    "%d-%m-%Y %H:%M:%S")
                master_date = date_string
            if not media_api["media"] and "rawText" in media_api:
                # Text-only posts are stored under the "Texts" media type.
                if media_type == "Texts":
                    new_dict = dict()
                    new_dict["post_id"] = media_api["id"]
                    new_dict["text"] = media_api["rawText"]
                    new_dict["postedAt"] = date_string
                    media_set2["valid"].append(new_dict)
            for media in media_api["media"]:
                date = "-001-11-30T00:00:00+00:00"
                size = 0
                link = ""
                if "source" in media:
                    source = media["source"]
                    link = source["source"]
                    size = media["info"]["preview"]["size"] if "info" in media_api else 1
                if "src" in media:
                    link = media["src"]
                    size = media["info"]["preview"]["size"] if "info" in media_api else 1
                    date = media_api["createdAt"]
                if not link:
                    continue
                # CDN hostnames start with a region code; skip upload hosts
                # and substitute the preview link for still-converting media.
                matches = ["us", "uk", "ca", "ca2", "de"]
                url = urlparse(link)
                subdomain = url.hostname.split('.')[0]
                preview_link = media["preview"]
                if any(subdomain in nm for nm in matches):
                    subdomain = url.hostname.split('.')[1]
                if "upload" in subdomain:
                    continue
                if "convert" in subdomain:
                    link = preview_link
                if not link and not preview_link:
                    continue
                new_dict = dict()
                new_dict["post_id"] = media_api["id"]
                new_dict["media_id"] = media["id"]
                new_dict["links"] = []
                for xlink in link, preview_link:
                    if xlink:
                        new_dict["links"].append(xlink)
                        break
                new_dict["price"] = media_api["price"] if "price" in media_api else None
                if media["type"] not in alt_media_type:
                    continue
                text = media_api.get("rawText") or ""
                media_api["rawText"] = text
                matches = [s for s in ignored_keywords if s in text]
                if matches:
                    print("Matches: ", matches)
                    continue
                new_dict["postedAt"] = date_string
                filename = link.rsplit('/', 1)[-1]
                filename, ext = os.path.splitext(filename)
                ext = ext.replace(".", "").split('?')[0]
                new_dict["text"] = text
                option = dict(new_dict)
                option["site_name"] = "OnlyFans"
                option["filename"] = filename
                option["api_type"] = api_type
                option["media_type"] = media_type
                option["ext"] = ext
                option["username"] = username
                option["date_format"] = date_format
                option["maximum_length"] = maximum_length
                option["directory"] = download_path
                prepared_format = prepare_reformat(option)
                file_directory = main_helper.reformat(prepared_format,
                                                      file_directory_format)
                prepared_format.directory = file_directory
                file_path = main_helper.reformat(prepared_format,
                                                 filename_format)
                new_dict["directory"] = os.path.join(file_directory)
                new_dict["filename"] = os.path.basename(file_path)
                new_dict["session"] = session
                if size == 0:
                    # Zero-byte media can't be downloaded; keep it for auditing.
                    media_set2["invalid"].append(new_dict)
                    continue
                if file_directory not in directories:
                    directories.append(file_directory)
                media_set2["valid"].append(new_dict)
        if media_set2["valid"] or media_set2["invalid"]:
            media_set[media_type] = media_set2
    media_set["directories"] = directories
    return media_set
async def fix_directories2(post: api_table,
                           media_db: list[template_media_table]):
    # api_type, site_name, subscription, base_directory, all_files,
    # json_settings and new_directories are captured from the enclosing scope.
    delete_rows = []
    final_api_type = (os.path.join("Archived", api_type)
                      if post.archived else api_type)
    post_id = post.post_id
    media_db = [x for x in media_db if x.post_id == post_id]
    for media in media_db:
        media_id = media.media_id
        if media.link:
            path = urlparse.urlparse(media.link).path
        else:
            path = media.filename
        new_filename = os.path.basename(path)
        original_filename, ext = os.path.splitext(new_filename)
        ext = ext.replace(".", "")
        file_directory_format = json_settings["file_directory_format"]
        filename_format = json_settings["filename_format"]
        date_format = json_settings["date_format"]
        text_length = json_settings["text_length"]
        download_path = base_directory
        option = {}
        option["site_name"] = site_name
        option["post_id"] = post_id
        option["media_id"] = media_id
        option["profile_username"] = subscription.subscriber.username
        option["model_username"] = subscription.username
        option["api_type"] = final_api_type
        option["media_type"] = media.media_type
        option["filename"] = original_filename
        option["ext"] = ext
        option["text"] = post.text
        option["postedAt"] = media.created_at
        option["price"] = post.price
        option["date_format"] = date_format
        option["text_length"] = text_length
        option["directory"] = download_path
        option["preview"] = media.preview
        option["archived"] = post.archived
        prepared_format = prepare_reformat(option)
        file_directory = await main_helper.reformat(prepared_format,
                                                    file_directory_format)
        prepared_format.directory = file_directory
        old_filepath = ""
        if media.linked:
            filename_format = f"linked_{filename_format}"
        # Match by filename first, then by media id as a fallback.
        old_filepaths = [
            x for x in all_files if original_filename in os.path.basename(x)
        ]
        if not old_filepaths:
            old_filepaths = [
                x for x in all_files if str(media_id) in os.path.basename(x)
            ]
        if not media.linked:
            old_filepaths = [x for x in old_filepaths if "linked_" not in x]
        if old_filepaths:
            old_filepath = old_filepaths[0]
        new_filepath = await main_helper.reformat(prepared_format,
                                                  filename_format)
        if old_filepath and old_filepath != new_filepath:
            if os.path.exists(new_filepath):
                os.remove(new_filepath)
            moved = None
            while not moved:
                try:
                    if os.path.exists(old_filepath):
                        if media.size:
                            media.downloaded = True
                        found_dupes = [
                            x for x in media_db
                            if x.filename == new_filename and x.id != media.id
                        ]
                        delete_rows.extend(found_dupes)
                        os.makedirs(os.path.dirname(new_filepath),
                                    exist_ok=True)
                        if media.linked:
                            # Linked media is copied unless it already lives
                            # in the destination directory.
                            if os.path.dirname(old_filepath) == os.path.dirname(new_filepath):
                                moved = shutil.move(old_filepath, new_filepath)
                            else:
                                moved = shutil.copy(old_filepath, new_filepath)
                        else:
                            moved = shutil.move(old_filepath, new_filepath)
                    else:
                        break
                except OSError:
                    print(traceback.format_exc())
        if os.path.exists(new_filepath):
            if media.size:
                media.downloaded = True
        media.directory = file_directory
        media.filename = os.path.basename(new_filepath)
        new_directories.append(os.path.dirname(new_filepath))
    return delete_rows
def fix_directories(post):
    # parent_type, api_type, Session, folder, site_name, username,
    # base_directory, all_files, json_settings and new_directories are
    # captured from the enclosing scope.
    final_type = (f"{api_type}{os.path.sep}{parent_type}"
                  if parent_type else api_type)
    database_session = Session()
    post_id = post.id
    result = database_session.query(folder.media_table)
    media_db = result.filter_by(post_id=post_id).all()
    for media in media_db:
        if media.link:
            path = urlparse.urlparse(media.link).path
        else:
            path = media.filename
        new_filename = os.path.basename(path)
        filename, ext = os.path.splitext(new_filename)
        ext = ext.replace(".", "")
        file_directory_format = json_settings["file_directory_format"]
        filename_format = json_settings["filename_format"]
        date_format = json_settings["date_format"]
        text_length = json_settings["text_length"]
        download_path = base_directory
        option = {}
        option["site_name"] = site_name
        option["post_id"] = post_id
        option["media_id"] = media.id
        option["username"] = username
        option["api_type"] = final_type
        option["media_type"] = media.media_type
        option["filename"] = filename
        option["ext"] = ext
        option["text"] = post.text
        option["postedAt"] = media.created_at
        option["price"] = post.price
        option["date_format"] = date_format
        option["text_length"] = text_length
        option["directory"] = download_path
        prepared_format = prepare_reformat(option)
        file_directory = main_helper.reformat(prepared_format,
                                              file_directory_format)
        prepared_format.directory = file_directory
        # Match by filename first, then by media id as a fallback.
        old_filepath = ""
        old_filepaths = [
            x for x in all_files if media.filename in os.path.basename(x)
        ]
        if not old_filepaths:
            old_filepaths = [
                x for x in all_files if str(media.id) in os.path.basename(x)
            ]
        if old_filepaths:
            old_filepath = old_filepaths[0]
        new_filepath = main_helper.reformat(prepared_format, filename_format)
        if old_filepath and old_filepath != new_filepath:
            if os.path.exists(new_filepath):
                os.remove(new_filepath)
            if os.path.exists(old_filepath):
                if media.size:
                    media.downloaded = True
                moved = None
                while not moved:
                    try:
                        moved = shutil.move(old_filepath, new_filepath)
                    except OSError:
                        print(traceback.format_exc())
        media.directory = file_directory
        media.filename = os.path.basename(new_filepath)
        database_session.commit()
        new_directories.append(os.path.dirname(new_filepath))
    database_session.close()