def import_posts(import_id, key, contributor_id, allowed_to_auto_import, key_id):
    setthreadtitle(f'KI{import_id}')
    jar = requests.cookies.RequestsCookieJar()
    jar.set('_session_id', key)

    try:
        mode_switched = enable_adult_mode(import_id, jar)
        fanclub_ids = get_paid_fanclubs(import_id, jar)
    except Exception:
        log(import_id, "Error occurred during preflight. Stopping import.", 'exception')
        if key_id:
            kill_key(key_id)
        return

    if allowed_to_auto_import:
        try:
            encrypt_and_save_session_for_auto_import('fantia', jar['_session_id'], contributor_id=contributor_id)
            log(import_id, "Your key was successfully enrolled in auto-import!", to_client=True)
        except Exception:
            log(import_id, "An error occurred while saving your key for auto-import.", 'exception')

    if len(fanclub_ids) > 0:
        for fanclub_id in fanclub_ids:
            log(import_id, f'Importing fanclub {fanclub_id}', to_client=True)
            import_fanclub(fanclub_id, import_id, jar)
    else:
        log(import_id, "No paid subscriptions found. No posts will be imported.", to_client=True)

    if mode_switched:
        disable_adult_mode(import_id, jar)

    log(import_id, "Finished scanning for posts.")
def import_posts(import_id, key, allowed_to_scrape_dms, contributor_id, allowed_to_auto_import, key_id):
    setthreadtitle(f'KI{import_id}')

    if allowed_to_scrape_dms:
        log(import_id, "Importing DMs...", to_client=True)
        import_dms(key, import_id, contributor_id)
        log(import_id, "Done importing DMs.", to_client=True)

    campaign_ids = get_campaign_ids(key, import_id)
    current_user_campaign_id = get_current_user_campaign(key, import_id)
    if current_user_campaign_id:
        campaign_ids.append(current_user_campaign_id)

    if len(campaign_ids) > 0:
        for campaign_id in campaign_ids:
            log(import_id, f"Importing campaign {campaign_id}", to_client=True)
            import_campaign_page(
                posts_url + str(campaign_id),
                key,
                import_id,
                contributor_id=contributor_id,
                allowed_to_auto_import=allowed_to_auto_import,
                key_id=key_id
            )
        log(import_id, "Finished scanning for posts.")
    else:
        log(import_id, "No active subscriptions or invalid key. No posts will be imported.", to_client=True)
def import_posts(import_id, key, channel_ids_str, contributor_id, allowed_to_auto_import, key_id):
    setthreadtitle(f'KI{import_id}')
    test_key_for_auto_import(import_id, key, channel_ids_str, contributor_id, allowed_to_auto_import, key_id)

    channel_ids = channel_ids_str.split(',')
    if len(channel_ids) > 0:
        for channel_id in channel_ids:
            log(import_id, f"Importing channel {channel_id}", to_client=True)
            import_channel(channel_id, import_id, key)
    else:
        log(import_id, "No channels have been supplied. No posts will be imported.", to_client=True)
def run(threads: List[Thread], limit=10):
    setthreadtitle('KTMASTER')
    pos = 0
    threads_to_run = []
    while pos < len(threads):
        # Reap finished threads; iterate over a copy so removal is safe.
        for thread in threads_to_run[:]:
            if not thread.is_alive():
                threads_to_run.remove(thread)
        # Start and add more threads until the slot limit is reached,
        # without running past the end of the thread list.
        while len(threads_to_run) < limit and pos < len(threads):
            thread = threads[pos]
            thread.start()
            threads_to_run.append(thread)
            pos += 1
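# A minimal usage sketch for the scheduler above, assuming plain threading.Thread
# workers. The worker function and counts here are illustrative, not from the
# original code.
import time
from threading import Thread
from typing import List

def _demo_worker(n: int):
    time.sleep(0.1)  # stand-in for an import job
    print(f'job {n} done')

demo_threads: List[Thread] = [Thread(target=_demo_worker, args=(i,)) for i in range(25)]
run(demo_threads, limit=10)  # at most 10 tracked threads run at a time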
def get_subscribed_ids(
        import_id,
        key,
        contributor_id=None,
        allowed_to_auto_import=None,
        key_id=None,
        url='https://api.fanbox.cc/post.listSupporting?limit=50'):
    setthreadtitle(f'KI{import_id}')
    try:
        scraper = create_scrapper_session(useCloudscraper=False).get(
            url,
            cookies={'FANBOXSESSID': key},
            headers={'origin': 'https://fanbox.cc'},
            proxies=get_proxy())
        # Check the HTTP status before parsing the body, so an error page
        # surfaces as an HTTPError rather than a JSON decoding failure.
        scraper.raise_for_status()
        scraper_data = scraper.json()
    except requests.HTTPError as e:
        log(import_id, f'HTTP error when contacting Fanbox API ({url}). Stopping import.', 'exception')
        if e.response.status_code == 401:
            if key_id:
                kill_key(key_id)
        return set()
    except Exception:
        log(import_id, 'Error connecting to cloudscraper. Please try again.', 'exception')
        return set()

    user_ids = []
    if scraper_data.get('body'):
        for post in scraper_data['body']['items']:
            user_ids.append(post['user']['userId'])

    # Deduplicate the supported users into a set of campaign ids.
    campaign_ids = set(user_ids)

    log(import_id, 'Successfully retrieved subscriptions')
    return campaign_ids
def init(ctx, repo, cmd, app_key, logplex_token, erlang_cookie, ip):
    try:
        setproctitle(repo)
        setthreadtitle(repo)
        if app_key is None:
            raise Exception("APP_KEY not found.")

        # get host index
        os.environ['HOST_INDEX'] = get_host_index(ctx.obj['host'], repo, app_key, os.environ['HOSTNAME'])
        start_ssh(repo, app_key)
        release = current_release(ctx.obj['host'], repo, app_key)
        slug_url = release["slug_url"]
        customer_app_name = release["customer_app_name"]
        persist_env(repo, customer_app_name, app_key, logplex_token, erlang_cookie, ip)
        download_file(slug_url, "/app/%s.tar.gz" % customer_app_name)
        extract_file('/app', '%s.tar.gz' % customer_app_name)
        maybe_start_epmd()

        def exec_fn(logplex_token, customer_app_name, repo, hostname):
            log_start_and_stop_web(logplex_token, repo, hostname)
            # Should we load_profile for all commands even though .bashrc loads it
            # already? It's needed here because we don't run init inside of bash,
            # but does it hurt to just load it everywhere?
            load_profile()
            if is_distillery(customer_app_name):
                maybe_use_default_vm_args()
            ps = foreman_start(customer_app_name, cmd)
            pipe_to_log_shuttle(ps, cmd, logplex_token, repo, hostname)
            ps.wait()

        launch(ctx, exec_fn, repo, app_key, ip=ip, release=release)
    except Exception as e:
        log(logplex_token, repo, "-", str(e))
        raise
def new_thread_init(self, *args, **kwargs):
    # pylint: disable=protected-access, disable=c-extension-no-member, disable=missing-function-docstring
    old_thread_init(self, *args, **kwargs)
    setproctitle.setthreadtitle(self._name)
#!/usr/bin/env python3.9
import os
import sys

import setproctitle

#=============================
# Some General Definitions
#=============================
NPB_VERSION = 'NPB3.3.1'
PROGRAM_PATH = os.path.dirname(os.path.abspath(__file__))

setproctitle.setthreadtitle(os.path.basename(__file__))

VENV_PATH = os.path.normpath(f'{PROGRAM_PATH}/../../../venv/bin/activate')
ENV_EXEC_PATH = os.path.normpath(f'{PROGRAM_PATH}/../../environment_scripts/environment.py')

#=============================
# Including Project Libs
#=============================
# Needed for the purpose of running this script; after that, this SHOULD BE MODIFIED.
sys.path.append(f'{PROGRAM_PATH}/../../libs')

import helper
from helper import RemoteCommand
from helper import FileType

#=============================
# Local Functions
#=============================
def run():
    setthreadtitle('KINDEXER')
    print('Indexer is starting!')
    while True:
        index_artists()
        time.sleep(300)
def new_thread_init(self, *args, **kwargs):
    old_thread_init(self, *args, **kwargs)
    setproctitle.setthreadtitle(self._name)
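# A hedged sketch of how a wrapper like new_thread_init is commonly installed as a
# monkey-patch. The patch point (threading.Thread.__init__) is an assumption inferred
# from the old_thread_init/self._name usage above, not confirmed by this snippet.
import threading

import setproctitle

old_thread_init = threading.Thread.__init__  # keep the original so the wrapper can delegate

def new_thread_init(self, *args, **kwargs):
    old_thread_init(self, *args, **kwargs)
    setproctitle.setthreadtitle(self._name)

threading.Thread.__init__ = new_thread_init  # every Thread constructed afterwards passes through the wrapper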
def import_posts(import_id, key, contributor_id, allowed_to_auto_import, key_id):
    setthreadtitle(f'KI{import_id}')
    jar = requests.cookies.RequestsCookieJar()
    jar.set('auth_token', key)

    try:
        scraper = create_scrapper_session(useCloudscraper=True).get(
            "https://subscribestar.adult/phd14517a.json",
            cookies=jar,
            proxies=get_proxy())
        # Check the HTTP status before parsing, so error pages raise HTTPError.
        scraper.raise_for_status()
        scraper_data = scraper.json()['html']
    except requests.HTTPError as exc:
        log(import_id, f'Status code {exc.response.status_code} when contacting SubscribeStar API.', 'exception')
        return

    if scraper_data == "":
        # Break early, as there's nothing to import anyway.
        log(import_id, "No active subscriptions or invalid key. No posts will be imported.")
        return

    first_run = True
    dnp = get_all_dnp()
    post_ids_of_users = {}
    flagged_post_ids_of_users = {}

    while True:
        soup = BeautifulSoup(scraper_data, 'html.parser')
        posts = soup.find_all("div", {"class": "post"})

        # On the first page only: a key with zero posts is dead, so kill it;
        # otherwise optionally enroll the key for auto-import (once, rather
        # than re-enrolling on every page).
        if first_run:
            if len(posts) == 0:
                if key_id:
                    kill_key(key_id)
            elif allowed_to_auto_import:
                try:
                    encrypt_and_save_session_for_auto_import('subscribestar', key, contributor_id=contributor_id)
                    log(import_id, "Your key was successfully enrolled in auto-import!", to_client=True)
                except Exception:
                    log(import_id, "An error occurred while saving your key for auto-import.", 'exception')
        first_run = False

        for post in posts:
            # Predeclare so the error log in the except block below cannot hit a NameError.
            post_id = None
            user_id = None
            try:
                post_id = post['data-id']
                user_id = post.find("a", {"class": "post-avatar"})['href'].replace('/', '')

                if "is-locked" in post.find("div", {"class": "post-body"})['class']:
                    log(import_id, f"Skipping post {post_id} from user {user_id} as tier is too high")
                    continue

                if len(list(filter(lambda artist: artist['id'] == user_id and artist['service'] == 'subscribestar', dnp))) > 0:
                    log(import_id, f"Skipping post {post_id} from user {user_id}: user is in do not post list")
                    continue

                # existence checking
                if not post_ids_of_users.get(user_id):
                    post_ids_of_users[user_id] = get_all_artist_post_ids('subscribestar', user_id)
                if not flagged_post_ids_of_users.get(user_id):
                    flagged_post_ids_of_users[user_id] = get_all_artist_flagged_post_ids('subscribestar', user_id)
                if (len(list(filter(lambda post: post['id'] == post_id, post_ids_of_users[user_id]))) > 0
                        and len(list(filter(lambda flag: flag['id'] == post_id, flagged_post_ids_of_users[user_id]))) == 0):
                    log(import_id, f'Skipping post {post_id} from user {user_id} because it already exists', to_client=True)
                    continue

                log(import_id, f"Starting import: {post_id}")

                # post_data = post.find("div", {"class": "trix-content"})
                post_data = post.find("div", {"class": "post-content"})
                # content = ""
                # for elem in post_data.recursiveChildGenerator():
                #     if isinstance(elem, str):
                #         content += elem.strip()
                #     elif elem.name == 'br':
                #         content += '\n'
                stripped_content = strip_tags(post_data.text)
                date = post.find("div", {"class": "post-date"}).a.get_text()
                # Workaround for the Czech language.
                parsed_date = dateparser.parse(date.replace("DOPOLEDNE", "AM").replace("ODPOLEDNE", "PM"))

                post_model = {
                    'id': str(post_id),
                    '"user"': user_id,
                    'service': 'subscribestar',
                    'title': (stripped_content[:60] + '..') if len(stripped_content) > 60 else stripped_content,
                    'content': str(post_data),
                    'embed': {},
                    'shared_file': False,
                    'added': datetime.datetime.now(),
                    'published': parsed_date,
                    'edited': None,
                    'file': {},
                    'attachments': []
                }

                post_attachment_field = post.find("div", {"class": "uploads"})
                if post_attachment_field:  # the post has attachments of some kind
                    image_attachments = post_attachment_field.find("div", {"class": "uploads-images"})
                    docs_attachments = post_attachment_field.find("div", {"class": "uploads-docs"})

                    if image_attachments:
                        for attachment in json.loads(image_attachments['data-gallery']):
                            # Get the filename from the URL, download the file, and record it.
                            name = os.path.basename(urlparse(attachment['url']).path)
                            reported_filename, hash_filename, _ = download_file(
                                attachment['url'],
                                'subscribestar',
                                user_id,
                                str(post_id),
                                name=name)
                            post_model['attachments'].append({
                                'name': reported_filename,
                                'path': hash_filename
                            })

                    if docs_attachments:
                        for attachment in docs_attachments.children:
                            # Get the filename from the URL, download the file, and record it.
                            name = os.path.basename(urlparse(attachment.div.a['href']).path)
                            reported_filename, hash_filename, _ = download_file(
                                attachment.div.a['href'],
                                'subscribestar',
                                user_id,
                                str(post_id),
                                name=name)
                            post_model['attachments'].append({
                                'name': reported_filename,
                                'path': hash_filename
                            })

                post_model['attachments'] = [json.dumps(attach) for attach in post_model['attachments']]
                post_model['embed'] = json.dumps(post_model['embed'])
                post_model['file'] = json.dumps(post_model['file'])

                # Add the post to the DB.
                columns = post_model.keys()
                data = ['%s'] * len(post_model)
                data[-1] = '%s::jsonb[]'  # attachments
                query = "INSERT INTO posts ({fields}) VALUES ({values}) ON CONFLICT (id, service) DO UPDATE SET {updates}".format(
                    fields=','.join(columns),
                    values=','.join(data),
                    updates=','.join([f'{column}=EXCLUDED.{column}' for column in columns]))
                conn = get_raw_conn()
                try:
                    cursor = conn.cursor()
                    cursor.execute(query, list(post_model.values()))
                    conn.commit()
                finally:
                    return_conn(conn)

                update_artist('subscribestar', user_id)
                delete_post_flags('subscribestar', user_id, str(post_id))

                if config.ban_url:
                    requests.request('BAN', f"{config.ban_url}/{post_model['service']}/user/" + post_model['"user"'])
                delete_artist_cache_keys('subscribestar', user_id)

                log(import_id, f"Finished importing {post_id} from user {user_id}", to_client=False)
            except Exception:
                log(import_id, f"Error while importing {post_id} from user {user_id}", 'exception')
                continue

        more = soup.find("div", {"class": "posts-more"})
        if more:
            # Fetch the next page's HTML; the loop will process it.
            try:
                scraper = create_scrapper_session(useCloudscraper=True).get(
                    "https://www.subscribestar.com" + more['href'],  # the next page
                    cookies=jar,
                    proxies=get_proxy())
                scraper.raise_for_status()
                scraper_data = scraper.json()['html']
            except requests.HTTPError as exc:
                log(import_id, f'Status code {exc.response.status_code} when contacting SubscribeStar API.', 'exception')
                return
        else:
            # We got all the posts; exit.
            log(import_id, "Finished scanning for posts.")
            return
def import_posts(import_id, key, contributor_id=None, allowed_to_auto_import=None, key_id=None, offset=1):  # noqa: C901
    setthreadtitle(f'KI{import_id}')
    try:
        scraper = create_scrapper_session().get(
            "https://app.gumroad.com/library",
            cookies={'_gumroad_app_session': key},
            proxies=get_proxy())
        scraper_data = scraper.text
        scraper.raise_for_status()
    except requests.HTTPError:
        # scraper_data is plain text here; the status code lives on the response.
        log(import_id, f'Status code {scraper.status_code} when contacting Gumroad API.', 'exception')
        return

    soup = BeautifulSoup(scraper_data, 'html.parser')
    gumroad_data = soup.select_one('[data-react-class=LibraryPage]')
    if not gumroad_data:
        log(import_id, "Can't log in; is your session key correct?")
        if key_id:
            kill_key(key_id)
        return
    library_data = json.loads(gumroad_data['data-react-props'])

    if allowed_to_auto_import:
        try:
            encrypt_and_save_session_for_auto_import('gumroad', key, contributor_id=contributor_id)
            log(import_id, "Your key was successfully enrolled in auto-import!", to_client=True)
        except Exception:
            log(import_id, "An error occurred while saving your key for auto-import.", 'exception')

    # users = {}
    # for user_info_list in scraper_data['creator_counts'].keys():
    #     parsed_user_info_list = json.loads(user_info_list)
    #     # (username, display name, ID); username can be null
    #     users[parsed_user_info_list[1]] = parsed_user_info_list[2]

    dnp = get_all_dnp()
    post_ids_of_users = {}
    flagged_post_ids_of_users = {}
    for product in library_data['results']:
        try:
            post_id = None  # taken from data-permalink on the download page
            user_id = product['product']['creator_id']
            cover_url = None
            purchase_download_url = None

            # properties_element = product.find('div', {'data-react-class':'Product/LibraryCard'})
            # react_props = json.loads(properties_element['data-react-props'])
            if not product.get('purchase'):
                log(import_id, f"Skipping post from user {user_id} because it has no purchase data")
                continue
            elif product['purchase']['is_archived']:
                # Archived products may contain sensitive data, such as a watermark with an e-mail on it.
                log(import_id, f"Skipping post from user {user_id} because it is archived")
                continue

            # react_props_product = react_props['product']
            title = product['product']['name']
            # creator_name = product['product']['creator']['name']
            purchase_download_url = product['purchase']['download_url']
            scraper = create_scrapper_session().get(
                purchase_download_url,
                cookies={'_gumroad_app_session': key},
                proxies=get_proxy())
            scraper_data = scraper.text
            scraper_soup = BeautifulSoup(scraper_data, 'html.parser')
            post_id = scraper_soup.select_one('[id=download-landing-page]')['data-permalink']

            if len(list(filter(lambda artist: artist['id'] == user_id and artist['service'] == 'gumroad', dnp))) > 0:
                log(import_id, f"Skipping post {post_id} from user {user_id}: user is in do not post list")
                continue

            # existence checking
            if not post_ids_of_users.get(user_id):
                post_ids_of_users[user_id] = get_all_artist_post_ids('gumroad', user_id)
            if not flagged_post_ids_of_users.get(user_id):
                flagged_post_ids_of_users[user_id] = get_all_artist_flagged_post_ids('gumroad', user_id)
            if (len(list(filter(lambda post: post['id'] == post_id, post_ids_of_users[user_id]))) > 0
                    and len(list(filter(lambda flag: flag['id'] == post_id, flagged_post_ids_of_users[user_id]))) == 0):
                log(import_id, f'Skipping post {post_id} from user {user_id} because it already exists', to_client=True)
                continue

            log(import_id, f"Starting import: {post_id} from user {user_id}")

            post_model = {
                'id': post_id,
                '"user"': user_id,
                'service': 'gumroad',
                'title': title,
                'content': '',
                'embed': {},
                'shared_file': False,
                'added': datetime.datetime.now(),
                'published': None,
                'edited': None,
                'file': {},
                'attachments': []
            }

            if 'main_cover_id' in product:
                main_cover_id = product['main_cover_id']
                for cover in product['covers']:
                    if cover['id'] == main_cover_id:
                        cover_url = get_value(cover, 'original_url') or cover['url']

            try:
                download_data = json.loads(
                    scraper_soup.select_one('div[data-react-class="DownloadPage/FileList"]')['data-react-props'])
            except Exception:
                download_data = {"content_items": []}

            if cover_url:
                reported_filename, hash_filename, _ = download_file(
                    cover_url,
                    'gumroad',
                    user_id,
                    post_id,
                )
                post_model['file']['name'] = reported_filename
                post_model['file']['path'] = hash_filename

            for _file in download_data['content_items']:
                if _file['type'] == 'file':
                    reported_filename, hash_filename, _ = download_file(
                        'https://gumroad.com' + _file['download_url'],
                        'gumroad',
                        user_id,
                        post_id,
                        name=f'{_file["file_name"]}.{_file["extension"].lower()}',
                        cookies={'_gumroad_app_session': key})
                    post_model['attachments'].append({
                        'name': reported_filename,
                        'path': hash_filename
                    })
                else:
                    log(import_id, f"Unsupported content found in product {post_id}. You should tell Shino about this.", to_client=True)
                    log(import_id, json.dumps(_file), to_client=False)
                    continue

            handle_post_import(post_model)
            update_artist('gumroad', user_id)
            delete_post_flags('gumroad', user_id, post_id)

            if config.ban_url:
                requests.request('BAN', f"{config.ban_url}/{post_model['service']}/user/" + post_model['"user"'])
            delete_artist_cache_keys('gumroad', user_id)

            log(import_id, f"Finished importing post {post_id} from user {user_id}", to_client=False)
        except Exception:
            log(import_id, f"Error while importing {post_id} from user {user_id}", 'exception')
            continue
def test_setthreadtitle():
    title = "setproctitle_test"
    # This is currently a no-op on Windows. Let's make sure
    # that at least it doesn't error out.
    setproctitle.setthreadtitle(title)
def run_paysite_import(import_id: str, key: str, contributor_id: str, random: Extended_Random = dev_random):
    """Runs the importer."""
    setthreadtitle(f'Kitsune Import|{import_id}')
    dataset = generate_dataset(random)
    dms: List[DM] = []
    users: List[User] = []
    posts: List[Post] = []
    comments: List[Comment] = []

    if dataset['dms']:
        for dm in dataset['dms']:
            dm_model = DM(
                import_id=import_id,
                contributor_id=contributor_id,
                id=dm['id'],
                user=dm['user'],
                service=service_name,
                file={},
                published=dm['published'],
                content=dm['content'])
            dms.append(dm_model)

    if dataset['users']:
        for user in dataset['users']:
            user_model = User(id=user['id'], name=user['name'], service=service_name)
            users.append(user_model)

            if user['posts']:
                for post in user['posts']:
                    files: List[File] = []
                    file_item: File = None
                    attachments: List[File] = []

                    if post['files']:
                        for file in post['files']:
                            # file_model = download_file(
                            #     file_path=file['path'],
                            #     service=service_name,
                            #     user=user['id'],
                            #     post=post['id'],
                            #     file_name=file['name']
                            # )
                            # files.append(file_model)
                            files.append(file)

                    # The first file becomes the primary file; the rest are attachments.
                    if files:
                        file_item = files[0]
                    else:
                        file_item = {}
                    if len(files) > 1:
                        attachments.extend(files[1:])

                    post_model = Post(
                        id=post['id'],
                        user=post['user'],
                        service=service_name,
                        file=file_item,
                        attachments=attachments,
                        published=post['published'],
                        edited=post['edited'],
                        shared_file=False,
                        added=datetime.now(),
                        title=post['title'],
                        content=post['content'],
                        embed={},
                    )
                    posts.append(post_model)

                    if post['comments']:
                        for comment in post['comments']:
                            comment_model = Comment(
                                id=comment['id'],
                                post_id=post['id'],
                                commenter=comment['commenter_id'],
                                content=comment['content'],
                                service=service_name,
                                published=comment['published'],
                                parent_id=comment['parent_id'])
                            comments.append(comment_model)

    log(import_id, f'{len(dms)} DMs are going to be "imported"')
    import_dms(import_id, dms)
    log(import_id, f'{len(users)} artists are going to be "imported"')
    import_users(import_id, users)
    log(import_id, f'{len(posts)} posts are going to be "imported"')
    import_posts(import_id, posts)
    log(import_id, f'{len(comments)} comments are going to be "imported"')
    import_comments(import_id, comments)
    log(import_id, f'Finished the import "{import_id}" of service "{service_name}".')
    delete_keys([f'imports:{import_id}'])
def watch(queue_limit=config.pubsub_queue_limit):
    archiver_id = ''.join(random.choice(string.ascii_letters + string.digits) for x in range(16))
    delete_keys_pattern(["running_imports:*"])
    setthreadtitle('KWATCHER')
    print(f'Key watcher ({archiver_id}) is starting!')
    redis = get_redis()
    threads_to_run = []
    while True:
        # Reap finished threads; iterate over a copy so removal is safe.
        for thread in threads_to_run[:]:
            if not thread.is_alive():
                threads_to_run.remove(thread)

        for key in scan_keys('imports:*'):
            key_data = redis.get(key)
            if key_data:
                import_id = key.split(':')[1]
                try:
                    key_data = json.loads(key_data)
                except json.decoder.JSONDecodeError:
                    print(f'A decoding error occurred while processing import request {key.decode("utf-8")}; are you sending malformed JSON?')
                    delete_keys([key])
                    continue
                if redis.get(f"running_imports:{archiver_id}:{import_id}"):
                    continue
                if len(threads_to_run) < queue_limit:
                    try:
                        target = None
                        args = None
                        # data = {
                        #     'key': key,
                        #     'key_id': key_id,
                        #     'service': service,
                        #     'allowed_to_auto_import': allowed_to_auto_import,
                        #     'allowed_to_save_session': allowed_to_save_session,
                        #     'allowed_to_scrape_dms': allowed_to_scrape_dms,
                        #     'channel_ids': channel_ids,
                        #     'contributor_id': contributor_id
                        # }
                        service_key = key_data['key']
                        key_id = key_data.get('key_id', None)
                        service = key_data['service']
                        allowed_to_auto_import = key_data.get('auto_import', False)
                        allowed_to_save_session = key_data.get('save_session_key', False)
                        allowed_to_scrape_dms = key_data.get('save_dms', False)
                        channel_ids = key_data.get('channel_ids')
                        contributor_id = key_data.get('contributor_id')

                        if service_key and service and allowed_to_save_session:
                            try:
                                encrypt_and_log_session(import_id, service, service_key)
                            except Exception:
                                logger.log(import_id, 'Exception occurred while logging session.', 'exception', to_client=False)

                        if service == 'patreon':
                            target = patreon.import_posts
                            args = (service_key, allowed_to_scrape_dms, contributor_id, allowed_to_auto_import, key_id)
                        elif service == 'fanbox':
                            target = fanbox.import_posts
                            args = (service_key, contributor_id, allowed_to_auto_import, key_id)
                        elif service == 'subscribestar':
                            target = subscribestar.import_posts
                            args = (service_key, contributor_id, allowed_to_auto_import, key_id)
                        elif service == 'gumroad':
                            target = gumroad.import_posts
                            args = (service_key, contributor_id, allowed_to_auto_import, key_id)
                        elif service == 'fantia':
                            target = fantia.import_posts
                            args = (service_key, contributor_id, allowed_to_auto_import, key_id)
                        elif service == 'discord':
                            target = discord.import_posts
                            if channel_ids is None:
                                channel_ids = ''
                            args = (service_key, channel_ids.strip().replace(" ", ""), contributor_id, allowed_to_auto_import, key_id)
                        else:
                            logger.log(import_id, f'Service "{service}" unsupported.')
                            delete_keys([key])
                            continue

                        if target is not None and args is not None:
                            logger.log(import_id, f'Starting import. Your import id is {import_id}.')
                            thread = Thread(target=import_posts, args=(import_id, target, args))
                            thread.start()
                            threads_to_run.append(thread)
                            redis.set(f"running_imports:{archiver_id}:{import_id}", '1')
                        else:
                            logger.log(import_id, f'Error starting import. Your import id is {import_id}.')
                    except KeyError:
                        logger.log(import_id, 'Exception occurred while starting import due to missing data in payload.', 'exception', to_client=True)
                    delete_keys([key])
        time.sleep(1)
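# A hedged sketch of enqueuing an import request for watch() to pick up, mirroring the
# fields read out of key_data above. The redis client construction and the example
# values are assumptions; only the 'imports:{import_id}' key shape and the field names
# come from the snippet itself.
import json
import redis

r = redis.Redis(decode_responses=True)
import_id = 'abc123'  # illustrative id; watch() recovers it from the key name
r.set(f'imports:{import_id}', json.dumps({
    'key': '<service session key>',
    'key_id': None,
    'service': 'patreon',
    'auto_import': False,
    'save_session_key': False,
    'save_dms': False,
    'channel_ids': None,
    'contributor_id': None,
}))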