# Imports reconstructed from usage in this file; the modules.* and helpers.*
# paths are assumptions based on the project layout used elsewhere in this repo.
import json
import logging
import os
import time
import timeit
from argparse import ArgumentParser

import helpers.main_helper as main_helper
from helpers.main_helper import update_config
import modules.bbwchan as bbwchan
import modules.fourchan as fourchan
import modules.onlyfans as onlyfans
import modules.patreon as patreon
import modules.starsavn as starsavn


def start_datascraper():
    parser = ArgumentParser()
    parser.add_argument("-m", "--metadata", action='store_true',
                        help="only exports metadata")
    args = parser.parse_args()
    if args.metadata:
        print("Exporting Metadata Only")

    log_error = main_helper.setup_logger('errors', 'errors.log')
    console = logging.StreamHandler()
    console.setLevel(logging.DEBUG)
    formatter = logging.Formatter(
        '%(asctime)s %(levelname)s %(name)s %(message)s')
    console.setFormatter(formatter)
    logging.getLogger("").addHandler(console)

    # root = os.getcwd()
    config_path = os.path.join('.settings', 'config.json')
    json_config, json_config2 = main_helper.get_config(config_path)
    json_settings = json_config["settings"]
    json_sites = json_config["supported"]
    infinite_loop = json_settings["infinite_loop"]
    global_user_agent = json_settings['global_user_agent']
    domain = json_settings["auto_site_choice"]
    path = os.path.join('.settings', 'extra_auth.json')
    extra_auth_config = json.load(open(path))
    exit_on_completion = json_settings['exit_on_completion']
    loop_timeout = json_settings['loop_timeout']

    # Build the site menu, skipping blacklisted sites.
    string = "Site: "
    site_names = []
    bl = ["patreon"]
    if not domain:
        site_count = len(json_sites)
        count = 0
        for x in json_sites:
            if x in bl:
                continue
            string += str(count) + " = " + x
            site_names.append(x)
            if count + 1 != site_count:
                string += " | "
            count += 1
    string += "x = Exit"

    try:
        while True:
            if domain:
                site_name = domain
            else:
                print(string)
                x = input()
                if x == "x":
                    break
                x = int(x)
                site_name = site_names[x]
            site_name_lower = site_name.lower()
            json_auth_array = [json_sites[site_name_lower]["auth"]]
            json_site_settings = json_sites[site_name_lower]["settings"]
            auto_scrape_names = json_site_settings["auto_scrape_names"]
            extra_auth_settings = json_sites[site_name_lower][
                "extra_auth_settings"] if "extra_auth_settings" in json_sites[
                    site_name_lower] else {"extra_auth": False}
            extra_auth = extra_auth_settings["extra_auth"]
            if extra_auth:
                choose_auth = extra_auth_settings["choose_auth"]
                merge_auth = extra_auth_settings["merge_auth"]
                json_auth_array += extra_auth_config[site_name_lower][
                    "extra_auth"]
                if choose_auth:
                    json_auth_array = main_helper.choose_auth(json_auth_array)
            session_array = []
            x = onlyfans
            app_token = ""
            subscription_array = []
            legacy = True
            if site_name_lower == "onlyfans":
                legacy = False
                site_name = "OnlyFans"
                subscription_array = []
                auth_count = -1
                x.assign_vars(json_config, json_site_settings, site_name)
                for json_auth in json_auth_array:
                    auth_count += 1
                    app_token = json_auth['app_token']
                    user_agent = global_user_agent if not json_auth[
                        'user_agent'] else json_auth['user_agent']
                    x = onlyfans
                    session = x.create_session()
                    if not session:
                        print("Unable to create session")
                        continue
                    session = x.create_auth(session, user_agent, app_token,
                                            json_auth)
                    session_array.append(session)
                    if not session["session"]:
                        continue
                    # x.get_paid_posts(session["session"], app_token)
                    # Persist the refreshed auth cookies back into the config.
                    cookies = session["session"].cookies.get_dict()
                    auth_id = cookies["auth_id"]
                    json_auth['auth_id'] = auth_id
                    json_auth['auth_uniq_'] = cookies["auth_uniq_" + auth_id]
                    json_auth['auth_hash'] = cookies["auth_hash"]
                    json_auth['sess'] = cookies["sess"]
                    json_auth['fp'] = cookies["fp"]
                    if json_config != json_config2:
                        update_config(json_config)
                    me_api = session["me_api"]
                    array = x.get_subscriptions(session["session"], app_token,
                                                session["subscriber_count"],
                                                me_api, auth_count)
                    subscription_array += array
                subscription_array = x.format_options(subscription_array,
                                                      "usernames")
            elif site_name_lower == "patreon":
                legacy = False
                site_name = "Patreon"
                subscription_array = []
                auth_count = -1
                x = patreon
                x.assign_vars(json_config, json_site_settings, site_name)
                for json_auth in json_auth_array:
                    auth_count += 1
                    user_agent = global_user_agent if not json_auth[
                        'user_agent'] else json_auth['user_agent']
                    session = x.create_session()
                    session = x.create_auth(session, user_agent, json_auth)
                    session_array.append(session)
                    if not session["session"]:
                        continue
                    cookies = session["session"].cookies.get_dict()
                    json_auth['session_id'] = cookies["session_id"]
                    if json_config != json_config2:
                        update_config(json_config)
                    me_api = session["me_api"]
                    array = x.get_subscriptions(session["session"],
                                                auth_count)
                    subscription_array += array
                subscription_array = x.format_options(subscription_array,
                                                      "usernames")
            elif site_name_lower == "starsavn":
                legacy = False
                site_name = "StarsAVN"
                subscription_array = []
                auth_count = -1
                x = starsavn
                x.assign_vars(json_config, json_site_settings, site_name)
                for json_auth in json_auth_array:
                    auth_count += 1
                    user_agent = global_user_agent if not json_auth[
                        'user_agent'] else json_auth['user_agent']
                    sess = json_auth['sess']
                    auth_array = dict()
                    auth_array["sess"] = sess
                    session = x.create_session()
                    session = x.create_auth(session, user_agent, app_token,
                                            json_auth)
                    session_array.append(session)
                    if not session["session"]:
                        continue
                    me_api = session["me_api"]
                    array = x.get_subscriptions(session["session"], app_token,
                                                session["subscriber_count"],
                                                me_api, auth_count)
                    subscription_array += array
                subscription_array = x.format_options(subscription_array,
                                                      "usernames")
            elif site_name_lower == "fourchan":
                x = fourchan
                site_name = "4Chan"
                x.assign_vars(json_config, json_site_settings, site_name)
                session_array = [x.create_session()]
                array = x.get_subscriptions()
                subscription_array = x.format_options(array)
            elif site_name_lower == "bbwchan":
                x = bbwchan
                site_name = "BBWChan"
                x.assign_vars(json_config, json_site_settings, site_name)
                session_array = [x.create_session()]
                array = x.get_subscriptions()
                subscription_array = x.format_options(array)
            names = subscription_array[0]
            if names:
                print("Names: Username = username | " + subscription_array[1])
                if not auto_scrape_names:
                    value = input().strip()
                    if value.isdigit():
                        if value == "0":
                            names = names[1:]
                        else:
                            names = [names[int(value)]]
                    else:
                        names = [name for name in names if value in name[1]]
                else:
                    value = 0
                    names = names[1:]
            else:
                print("There's nothing to scrape.")
                continue
            start_time = timeit.default_timer()
            download_list = []
            for name in names:
                # Extra Auth Support
                if not legacy:
                    json_auth = json_auth_array[name[0]]
                    auth_count = name[0]
                    session = session_array[auth_count]["session"]
                    name = name[-1]
                else:
                    session = session_array[0]["session"]
                main_helper.assign_vars(json_config)
                username = main_helper.parse_links(site_name_lower, name)
                result = x.start_datascraper(session, username, site_name,
                                             app_token, choice_type=value)
                if not args.metadata:
                    download_list.append(result)
            for y in download_list:
                for arg in y[1]:
                    x.download_media(*arg)
            stop_time = str(int(timeit.default_timer() - start_time) / 60)
            print('Task Completed in ' + stop_time + ' Minutes')
            if exit_on_completion:
                print("Now exiting.")
                exit(0)
            elif not infinite_loop:
                print("Input anything to continue")
                input()
            elif loop_timeout:
                print('Pausing scraper for ' + loop_timeout + ' seconds.')
                time.sleep(int(loop_timeout))
    except Exception as e:
        log_error.exception(e)
        input()
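# Example: the site-menu construction above, extracted into a small testable
# function. build_site_menu is a hypothetical name; later snapshots of this
# project factor the same logic into main_helper.module_chooser().
def build_site_menu(json_sites, blacklist=("patreon",)):
    # Builds "Site: 0 = onlyfans | 1 = starsavn | x = Exit" plus the
    # parallel list that maps the numeric choice back to a site key.
    site_names = [site for site in json_sites if site not in blacklist]
    parts = [str(i) + " = " + name for i, name in enumerate(site_names)]
    return "Site: " + " | ".join(parts + ["x = Exit"]), site_names

# Usage:
# build_site_menu({"onlyfans": {}, "patreon": {}, "starsavn": {}})
# -> ("Site: 0 = onlyfans | 1 = starsavn | x = Exit", ["onlyfans", "starsavn"])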
from datetime import datetime
from itertools import chain, groupby, product
from multiprocessing.dummy import Pool as ThreadPool
from urllib.parse import urlparse

import requests
from requests.adapters import HTTPAdapter

import extras.OFSorter.ofsorter as ofsorter
from helpers.main_helper import (check_for_dupe_file, clean_text, create_sign,
                                 export_archive, format_directory,
                                 format_image, format_media_set,
                                 get_directory, json_request, log_error,
                                 reformat, setup_logger)

log_download = setup_logger('downloads', 'downloads.log')

# Module-level settings; these stay None until assign_vars() fills them in
# from config.json at runtime.
json_config = None
multithreading = None
json_settings = None
auto_choice = None
j_directory = None
format_path = None
overwrite_files = None
proxy = None
cert = None
date_format = None
ignored_keywords = None
ignore_type = None
export_metadata = None
delete_legacy_metadata = None
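# Example: a minimal sketch of the assign_vars() pattern that fills in the
# None globals above (the real helper presumably lives further down in this
# module and is not shown in this excerpt). The body below is an assumption;
# it illustrates only the global-rebinding mechanism, matching the
# three-argument call sites seen in main_datascraper.
def assign_vars_sketch(config, site_settings, site_name):
    global json_config, json_settings, auto_choice, overwrite_files
    json_config = config
    json_settings = site_settings
    auto_choice = site_settings["auto_choice"]
    overwrite_files = site_settings["overwrite_files"]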
import copy
import json
from types import SimpleNamespace
from urllib.parse import urlparse

import jsonpickle
import requests
from deepdiff import DeepHash

import classes.prepare_download as prepare_download
import helpers.main_helper as main_helper
from helpers.main_helper import import_archive, export_archive

multiprocessing = main_helper.multiprocessing
log_download = main_helper.setup_logger('downloads', 'downloads.log')
json_config = None
json_global_settings = None
max_threads = -1
json_settings = None
auto_choice = None
j_directory = ""
metadata_directory_format = ""
file_directory_format = None
file_name_format = None
overwrite_files = None
date_format = None
ignored_keywords = None
ignore_type = None
export_metadata = None
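# Example: the DeepHash import above enables content-based comparison of
# metadata records. A small sketch of that idea (dedupe_by_content is a
# hypothetical helper, not this project's API): two dicts hash identically
# regardless of key order, so re-scraped records can be deduplicated by value.
from deepdiff import DeepHash

def dedupe_by_content(items):
    seen = set()
    unique = []
    for item in items:
        digest = DeepHash(item)[item]  # content hash of the whole object
        if digest not in seen:
            seen.add(digest)
            unique.append(item)
    return unique

# dedupe_by_content([{"id": 1, "url": "a"}, {"url": "a", "id": 1}]) keeps one.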
# Imports reconstructed from usage in this file; the helpers.* and modules.*
# paths are assumptions based on the project layout, and the location of
# make_config is assumed.
import copy
import json
import logging
import os
import time
import timeit
from argparse import ArgumentParser

import helpers.main_helper as main_helper
import helpers.make_config as make_config
from helpers.main_helper import update_config
import modules.bbwchan as bbwchan
import modules.fourchan as fourchan
import modules.onlyfans as onlyfans
import modules.starsavn as starsavn


def start_datascraper():
    parser = ArgumentParser()
    parser.add_argument("-m", "--metadata", action='store_true',
                        help="only exports metadata")
    args = parser.parse_args()
    if args.metadata:
        print("Exporting Metadata Only")

    log_error = main_helper.setup_logger('errors', 'errors.log')
    console = logging.StreamHandler()
    console.setLevel(logging.DEBUG)
    formatter = logging.Formatter(
        '%(asctime)s %(levelname)s %(name)s %(message)s')
    console.setFormatter(formatter)
    logging.getLogger("").addHandler(console)

    # Open config.json and fill in MANDATORY information for the script to work
    path = os.path.join('.settings', 'config.json')
    if os.path.isfile(path):
        json_config = json.load(open(path))
    else:
        json_config = {}
    json_config2 = json.loads(
        json.dumps(make_config.start(**json_config),
                   default=lambda o: o.__dict__))
    if json_config != json_config2:
        update_config(json_config2)
    if not json_config:
        input(
            "The .settings\\config.json file has been created. Fill in whatever you need to fill in and then press enter when done.\n"
        )
        json_config2 = json.load(open(path))
    json_config = copy.deepcopy(json_config2)
    json_settings = json_config["settings"]
    json_sites = json_config["supported"]
    infinite_loop = json_settings["infinite_loop"]
    global_user_agent = json_settings['global_user_agent']
    domain = json_settings["auto_site_choice"]
    path = os.path.join('.settings', 'extra_auth.json')
    extra_auth_config = json.load(open(path))
    exit_on_completion = json_settings['exit_on_completion']
    loop_timeout = json_settings['loop_timeout']
    string = ""
    site_names = []
    if not domain:
        site_count = len(json_sites)
        count = 0
        for x in json_sites:
            string += str(count) + " = " + x
            site_names.append(x)
            if count + 1 != site_count:
                string += " | "
            count += 1
    try:
        while True:
            if domain:
                site_name = domain
            else:
                print("Site: " + string)
                x = int(input())
                site_name = site_names[x]
            site_name_lower = site_name.lower()
            json_auth_array = [json_sites[site_name_lower]["auth"]]
            json_site_settings = json_sites[site_name_lower]["settings"]
            auto_scrape_names = json_site_settings["auto_scrape_names"]
            extra_auth_settings = json_sites[site_name_lower][
                "extra_auth_settings"] if "extra_auth_settings" in json_sites[
                    site_name_lower] else {"extra_auth": False}
            extra_auth = extra_auth_settings["extra_auth"]
            if extra_auth:
                choose_auth = extra_auth_settings["choose_auth"]
                merge_auth = extra_auth_settings["merge_auth"]
                json_auth_array += extra_auth_config[site_name_lower][
                    "extra_auth"]
                if choose_auth:
                    json_auth_array = main_helper.choose_auth(json_auth_array)
            session_array = []
            x = onlyfans
            app_token = ""
            subscription_array = []
            legacy = True
            if site_name_lower == "onlyfans":
                legacy = False
                site_name = "OnlyFans"
                subscription_array = []
                auth_count = -1
                x.assign_vars(json_config, json_site_settings)
                for json_auth in json_auth_array:
                    auth_count += 1
                    app_token = json_auth['app_token']
                    user_agent = global_user_agent if not json_auth[
                        'user_agent'] else json_auth['user_agent']
                    x = onlyfans
                    session = x.create_session()
                    session = x.create_auth(session, user_agent, app_token,
                                            json_auth)
                    session_array.append(session)
                    if not session["session"]:
                        continue
                    cookies = session["session"].cookies.get_dict()
                    json_auth['auth_id'] = cookies["auth_id"]
                    json_auth['auth_hash'] = cookies["auth_hash"]
                    json_auth['sess'] = cookies["sess"]
                    json_auth['fp'] = cookies["fp"]
                    if json_config != json_config2:
                        update_config(json_config)
                    me_api = session["me_api"]
                    array = x.get_subscriptions(session["session"], app_token,
                                                session["subscriber_count"],
                                                me_api, auth_count)
                    subscription_array += array
                subscription_array = x.format_options(subscription_array,
                                                      "usernames")
            elif site_name_lower == "starsavn":
                legacy = False
                site_name = "StarsAVN"
                subscription_array = []
                auth_count = -1
                x = starsavn
                x.assign_vars(json_config, json_site_settings)
                for json_auth in json_auth_array:
                    auth_count += 1
                    user_agent = global_user_agent if not json_auth[
                        'user_agent'] else json_auth['user_agent']
                    sess = json_auth['sess']
                    auth_array = dict()
                    auth_array["sess"] = sess
                    session = x.create_session(user_agent, app_token,
                                               auth_array)
                    session_array.append(session)
                    if not session["session"]:
                        continue
                    me_api = session["me_api"]
                    array = x.get_subscriptions(session["session"], app_token,
                                                session["subscriber_count"],
                                                me_api, auth_count)
                    subscription_array += array
                subscription_array = x.format_options(subscription_array,
                                                      "usernames")
            elif site_name_lower == "fourchan":
                x = fourchan
                site_name = "4Chan"
                x.assign_vars(json_config, json_site_settings)
                session_array = [x.create_session()]
                array = x.get_subscriptions()
                subscription_array = x.format_options(array)
            elif site_name_lower == "bbwchan":
                x = bbwchan
                site_name = "BBWChan"
                x.assign_vars(json_config, json_site_settings)
                session_array = [x.create_session()]
                array = x.get_subscriptions()
                subscription_array = x.format_options(array)
            names = subscription_array[0]
            if names:
                print("Names: " + subscription_array[1])
                if not auto_scrape_names:
                    value = int(input().strip())
                else:
                    value = 0
                if value:
                    names = [names[value]]
                else:
                    names.pop(0)
            else:
                print("There's nothing to scrape.")
                continue
            start_time = timeit.default_timer()
            download_list = []
            for name in names:
                # Extra Auth Support
                if not legacy:
                    json_auth = json_auth_array[name[0]]
                    auth_count = name[0]
                    session = session_array[auth_count]["session"]
                    name = name[1]
                else:
                    session = session_array[0]["session"]
                main_helper.assign_vars(json_config)
                username = main_helper.parse_links(site_name_lower, name)
                result = x.start_datascraper(session, username, site_name,
                                             app_token, choice_type=value)
                if not args.metadata:
                    download_list.append(result)
            for y in download_list:
                for arg in y[1]:
                    x.download_media(*arg)
            stop_time = str(int(timeit.default_timer() - start_time) / 60)
            print('Task Completed in ' + stop_time + ' Minutes')
            if exit_on_completion:
                print("Now exiting.")
                exit(0)
            elif not infinite_loop:
                print("Input anything to continue")
                input()
            elif loop_timeout:
                print('Pausing scraper for ' + loop_timeout + ' seconds.')
                time.sleep(int(loop_timeout))
    except Exception as e:
        log_error.exception(e)
        input()
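# Example: the "persist config only when it changed" pattern used above,
# shown in isolation. update_config_if_changed is a hypothetical helper; the
# project performs the same inequality test before calling its own
# update_config().
import copy
import json

def update_config_if_changed(config, snapshot, path='.settings/config.json'):
    # The auth loop mutates the json_auth dicts nested inside `config`
    # (fresh cookies), so comparing against a deep-copied snapshot is enough
    # to detect any change before touching the file.
    if config != snapshot:
        with open(path, 'w') as f:
            json.dump(config, f, indent=2)
        return True
    return False

# Usage:
# snapshot = copy.deepcopy(json_config)
# json_config["supported"]["onlyfans"]["auth"]["sess"] = "refreshed-cookie"
# update_config_if_changed(json_config, snapshot)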
# Imports reconstructed from usage in this file; the apis.* and modules.*
# paths are assumptions based on the project layout.
import logging
import os
import time
import timeit
from argparse import ArgumentParser

import apis.api_helper as api_helper
import helpers.main_helper as main_helper
import modules.onlyfans as m_onlyfans
import modules.starsavn as m_starsavn
from apis.onlyfans import onlyfans as OnlyFans
from apis.starsavn import starsavn as StarsAVN
from helpers.main_helper import module_chooser


def start_datascraper():
    parser = ArgumentParser()
    parser.add_argument("-m", "--metadata", action='store_true',
                        help="only exports metadata")
    args = parser.parse_args()
    if args.metadata:
        print("Exporting Metadata Only")

    log_error = main_helper.setup_logger('errors', 'errors.log')
    console = logging.StreamHandler()
    console.setLevel(logging.DEBUG)
    formatter = logging.Formatter(
        '%(asctime)s %(levelname)s %(name)s %(message)s')
    console.setFormatter(formatter)
    logging.getLogger("").addHandler(console)

    # root = os.getcwd()
    config_path = os.path.join('.settings', 'config.json')
    json_config, json_config2 = main_helper.get_config(config_path)
    json_settings = json_config["settings"]
    json_sites = json_config["supported"]
    infinite_loop = json_settings["infinite_loop"]
    global_user_agent = json_settings['global_user_agent']
    domain = json_settings["auto_site_choice"]
    path = os.path.join('.settings', 'extra_auth.json')
    # extra_auth_config, extra_auth_config2 = main_helper.get_config(path)
    extra_auth_config = {}
    exit_on_completion = json_settings['exit_on_completion']
    loop_timeout = json_settings['loop_timeout']
    main_helper.assign_vars(json_config)
    string, site_names = module_chooser(domain, json_sites)
    try:
        while True:
            if domain:
                if site_names:
                    site_name = domain
                else:
                    print(string)
                    continue
            else:
                print(string)
                x = input()
                if x == "x":
                    break
                x = int(x)
                site_name = site_names[x]
            site_name_lower = site_name.lower()
            json_auth_array = [json_sites[site_name_lower]["auth"]]
            json_site_settings = json_sites[site_name_lower]["settings"]
            auto_scrape_names = json_site_settings["auto_scrape_names"]
            extra_auth_settings = json_sites[site_name_lower][
                "extra_auth_settings"] if "extra_auth_settings" in json_sites[
                    site_name_lower] else {"extra_auth": False}
            extra_auth = extra_auth_settings["extra_auth"]
            if extra_auth:
                choose_auth = extra_auth_settings["choose_auth"]
                merge_auth = extra_auth_settings["merge_auth"]
                json_auth_array += extra_auth_config["supported"][
                    site_name_lower]["auths"]
                if choose_auth:
                    json_auth_array = main_helper.choose_auth(json_auth_array)
            apis = []
            module = m_onlyfans
            subscription_array = []
            legacy = True
            original_sessions = api_helper.create_session(
                settings=json_settings)
            if not original_sessions:
                print("Unable to create session")
                continue
            archive_time = timeit.default_timer()
            if site_name_lower == "onlyfans":
                site_name = "OnlyFans"
                subscription_array = []
                auth_count = -1
                jobs = json_site_settings["jobs"]
                for json_auth in json_auth_array:
                    api = OnlyFans.start(original_sessions)
                    auth_count += 1
                    user_agent = global_user_agent if not json_auth[
                        'user_agent'] else json_auth['user_agent']
                    module = m_onlyfans
                    module.assign_vars(json_auth, json_config,
                                       json_site_settings, site_name)
                    api.set_auth_details(**json_auth,
                                         global_user_agent=user_agent)
                    identifier = ""
                    setup = module.account_setup(api, identifier=identifier)
                    if not setup:
                        continue
                    if jobs["scrape_names"]:
                        array = module.manage_subscriptions(
                            api, auth_count, identifier=identifier)
                        subscription_array += array
                    apis.append(api)
                subscription_list = module.format_options(
                    subscription_array, "usernames")
                if jobs["scrape_paid_content"]:
                    print("Scraping Paid Content")
                    paid_content = module.paid_content_scraper(apis)
                if jobs["scrape_names"]:
                    print("Scraping Subscriptions")
                    x = main_helper.process_names(module, subscription_list,
                                                  auto_scrape_names,
                                                  json_auth_array, apis,
                                                  json_config,
                                                  site_name_lower, site_name)
                x = main_helper.process_downloads(apis, module)
            elif site_name_lower == "starsavn":
                site_name = "StarsAVN"
                subscription_array = []
                auth_count = -1
                for json_auth in json_auth_array:
                    sessions = api_helper.copy_sessions(original_sessions)
                    api = StarsAVN.start(sessions)
                    auth_count += 1
                    user_agent = global_user_agent if not json_auth[
                        'user_agent'] else json_auth['user_agent']
                    module = m_starsavn
                    module.assign_vars(json_auth, json_config,
                                       json_site_settings, site_name)
                    api.set_auth_details(**json_auth,
                                         global_user_agent=user_agent)
                    setup = module.account_setup(api)
                    if not setup:
                        continue
                    jobs = json_site_settings["jobs"]
                    if jobs["scrape_names"]:
                        array = module.manage_subscriptions(api, auth_count)
                        subscription_array += array
                    if jobs["scrape_paid_content"]:
                        paid_contents = api.get_paid_content()
                        paid_content = module.paid_content_scraper(api)
                    apis.append(api)
                subscription_array = module.format_options(
                    subscription_array, "usernames")
            stop_time = str(
                int(timeit.default_timer() - archive_time) / 60)[:4]
            print('Archive Completed in ' + stop_time + ' Minutes')
            if exit_on_completion:
                print("Now exiting.")
                exit(0)
            elif not infinite_loop:
                print("Input anything to continue")
                input()
            elif loop_timeout:
                print('Pausing scraper for ' + loop_timeout + ' seconds.')
                time.sleep(int(loop_timeout))
    except Exception as e:
        log_error.exception(e)
        input()
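# Example: why the loop above copies sessions per authenticated account.
# copy_session is a hypothetical stand-in for api_helper.copy_sessions(); it
# shows the isolation property: shared headers/proxies, separate cookie jars.
import requests

def copy_session(template: requests.Session) -> requests.Session:
    fresh = requests.Session()
    fresh.headers.update(template.headers)
    fresh.proxies = dict(template.proxies)
    # Cookies are deliberately not copied: each auth sets its own, so one
    # account's login can never leak into another account's requests.
    return fresh

# Usage:
# template = requests.Session()
# template.headers["User-Agent"] = "shared-agent"
# sessions = [copy_session(template) for _ in json_auth_array]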
#!/usr/bin/env python3
import logging
import os
import time

import tests.main_test as main_test

main_test.version_check()
main_test.check_config()
main_test.check_profiles()

if __name__ == "__main__":
    import datascraper.main_datascraper as main_datascraper
    import helpers.main_helper as main_helper

    log_error = main_helper.setup_logger('errors', 'errors.log')
    console = logging.StreamHandler()
    console.setLevel(logging.DEBUG)
    formatter = logging.Formatter(
        '%(asctime)s %(levelname)s %(name)s %(message)s')
    console.setFormatter(formatter)
    logging.getLogger("").addHandler(console)

    config_path = os.path.join('.settings', 'config.json')
    json_config, json_config2 = main_helper.get_config(config_path)
    json_settings = json_config["settings"]
    exit_on_completion = json_settings['exit_on_completion']
    infinite_loop = json_settings["infinite_loop"]
    loop_timeout = json_settings['loop_timeout']
    json_sites = json_config["supported"]
    domain = json_settings["auto_site_choice"]
    string, site_names = main_helper.module_chooser(domain, json_sites)
    while True:
        try:
            # The source breaks off after "try:"; the loop body below is a
            # hedged reconstruction. The site selection mirrors the
            # module_chooser handling used by start_datascraper() in
            # datascraper/main_datascraper.py, and the argument list passed
            # to start_datascraper() here is an assumption.
            if domain:
                if site_names:
                    site_name = domain
                else:
                    print(string)
                    continue
            else:
                print(string)
                x = input()
                if x == "x":
                    break
                site_name = site_names[int(x)]
            api = main_datascraper.start_datascraper(
                json_config, site_name.lower())  # assumed signature
            if exit_on_completion:
                print("Now exiting.")
                exit(0)
            elif not infinite_loop:
                print("Input anything to continue")
                input()
            elif loop_timeout:
                print('Pausing scraper for ' + loop_timeout + ' seconds.')
                time.sleep(int(loop_timeout))
        except Exception as e:
            log_error.exception(e)
            input()
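# Example: a plausible minimal implementation of the setup_logger() helper
# used above. The real one lives in helpers.main_helper and this body is an
# assumption; it shows a named logger writing formatted records to a file,
# with the same record format the console handlers above use.
import logging

def setup_logger_sketch(name, log_file, level=logging.INFO):
    handler = logging.FileHandler(log_file)
    handler.setFormatter(logging.Formatter(
        '%(asctime)s %(levelname)s %(name)s %(message)s'))
    logger = logging.getLogger(name)
    logger.setLevel(level)
    logger.addHandler(handler)
    return logger

# Usage: log_error = setup_logger_sketch('errors', 'errors.log')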
import json
import logging
import math
import multiprocessing
import os
from datetime import datetime
from itertools import chain, count, product
from multiprocessing.dummy import Pool as ThreadPool
from random import randrange

import requests

from helpers.main_helper import (check_for_dupe_file, export_archive,
                                 format_directory, format_image,
                                 format_media_set, get_directory,
                                 json_request, reformat, setup_logger)

log_download = setup_logger('downloads', 'downloads.log')
log_error = setup_logger('errors', 'errors.log')

# Open config.json and fill in OPTIONAL information
path = os.path.join('.settings', 'config.json')
json_config = json.load(open(path))
json_global_settings = json_config["settings"]
multithreading = json_global_settings["multithreading"]
json_settings = json_config["supported"]["stars_avn"]["settings"]
auto_choice = json_settings["auto_choice"]
j_directory = get_directory(json_settings['directory'])
format_path = json_settings['file_name_format']
overwrite_files = json_settings["overwrite_files"]
date_format = json_settings["date_format"]
ignored_keywords = json_settings["ignored_keywords"]
ignore_unfollowed_accounts = json_settings["ignore_unfollowed_accounts"]
export_metadata = json_settings["export_metadata"]
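# Example: the module-level loads above raise KeyError as soon as any key is
# missing from config.json. A hedged defensive variant (the function name and
# default values are assumptions, not this project's API) that degrades
# gracefully when the file is incomplete:
import json

def load_site_settings(config_path, site_key="stars_avn"):
    with open(config_path) as f:
        config = json.load(f)
    settings = config.get("supported", {}).get(site_key, {}).get("settings", {})
    return {
        "auto_choice": settings.get("auto_choice", ""),
        "overwrite_files": settings.get("overwrite_files", False),
        "date_format": settings.get("date_format", "%d-%m-%Y"),
        "export_metadata": settings.get("export_metadata", True),
    }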