def get_imageboard(search):
    """
    Pick the imageboard name to use for a search.

    Args:
        search: mapping describing a search; its optional 'imageboard'
                entry names the imageboard to use.

    Returns:
        The value stored under 'imageboard' when that key is present,
        otherwise the default "4chan".
    """
    # No explicit imageboard requested: fall back to the default
    if 'imageboard' not in search:
        return "4chan"

    requested = search["imageboard"]
    # Called only for validation: this is expected to raise an error
    # if the imageboard is not supported. The result is discarded.
    imageboard_info.imageboard_info(requested)
    return requested
def get_imageboard(self, search):
    """
    Pick the imageboard name to use for a search.

    Args:
        search: mapping describing a search; its optional 'imageboard'
                entry names the imageboard to use.

    Returns:
        The value stored under 'imageboard' when that key is present,
        otherwise the default "4chan".
    """
    is_explicit = 'imageboard' in search
    chan = search["imageboard"] if is_explicit else "4chan"

    if is_explicit:
        # Validation only: expected to raise an error if the imageboard
        # is not supported. The default is not validated.
        imageboard_info.imageboard_info(chan)

    return chan
def get_imageboard(self, search: dict):
    """
    Get the imageboard name from a search.

    Args:
        search: search description; its optional 'imageboard' entry names
                the imageboard to use.

    Returns:
        The imageboard name (the value of search['imageboard'], or
        "4chan" when the search does not specify one). Note that the name
        itself is returned, not an imageboard_info object.
    """
    if 'imageboard' not in search:
        # default
        return "4chan"

    name = search["imageboard"]
    # Instantiated purely as a check: this is expected to raise an error
    # if the imageboard is not supported. The instance is thrown away.
    imageboard_info.imageboard_info(name)
    return name
def get_catalog_json(board, chan):
    """
    Download the catalog of a board and decode it from JSON.

    Args:
        board: board name appended to the imageboard base URL. Ex: 'g'
        chan: imageboard name, used to look up the base URL through
              imageboard_info.imageboard_info

    Returns:
        The object produced by json.loads on the UTF-8 decoded body of
        "<base_url><board>/catalog.json".

    Errors raised by urlopen, the UTF-8 decoding or json.loads are not
    caught here and propagate to the caller.
    """
    chan_base_url = imageboard_info.imageboard_info(chan).base_url
    catalog_url = "{0}{1}/catalog.json".format(chan_base_url, board)

    # The context manager guarantees the HTTP response is closed, even if
    # reading it fails with something other than IncompleteRead.
    with urllib.request.urlopen(catalog_url) as catalog:
        try:
            catalog_data = catalog.read()
        except http.client.IncompleteRead as err:
            # The connection ended early: keep the bytes that did arrive
            catalog_data = err.partial

    return json.loads(catalog_data.decode("utf8"))
def __init__(self, thread_nb: int, board: str, imageboard: str,
             output_folder: str, folder: str, is_quiet: bool,
             condition: dict, check_duplicate: bool, tags: list,
             throttle: int, logger, single_run=False):
    """
    Set up the state needed to download a thread.

    The download itself can be started after initialization by calling
    the download() function of the class.

    Args:
        thread_nb: the thread number of an imageboard thread. Ex: 809293
        board: the board where the thread exists. Ex: 'g' for the 4chan
               technology board (http://boards.4channel.org/g/)
        imageboard: the imageboard where the thread exists. Ex: 4chan
        output_folder: directory where the pictures will be downloaded.
                       Ex: /tmp/4scanner_img
        folder: an optional directory name for sorting images in the
                output_folder. Ex: pictures_of_computers
                NOTE(review): this constructor does not read it.
        is_quiet: suppresses all logging.
        condition: dict used when deciding which pictures to download.
                   Ex: {"width": "=1920", "height": "=1080"}
        check_duplicate: avoid downloading duplicates that were already
                         downloaded.
        tags: list of tags that will be added to a file called
              $PICTURE_NAME.txt for every picture, to help importing
              pictures to hydrus network
        throttle: time to wait, in seconds, between image downloads
                  (converted with int())
        logger: the logger to use with the class
        single_run: run the download loop only once, use if you don't
                    want to wait for a thread to 404 before exiting.
    """
    # URL layout of the selected imageboard
    ib_info = imageboard_info.imageboard_info(imageboard)

    # URL of the thread (as JSON) and base URL where the images of the
    # board are stored
    self.thread_url = "{0}{1}{2}{3}.json".format(
        ib_info.base_url, board, ib_info.thread_subfolder, thread_nb)
    self.image_url = "{0}{1}{2}".format(
        ib_info.image_base_url, board, ib_info.image_subfolder)

    # The temporary directory and the log of downloaded files are keyed
    # on the process id, the start time and the current thread name
    pid = os.getpid()
    self.tmp_dir = "/tmp/{0}/".format(pid)
    self.curr_time = time.strftime('%d%m%Y-%H%M%S')
    self.pid = pid
    self.thread = threading.current_thread().name
    self.downloaded_log = "{0}/{1}4scanner_dld-{2}-{3}".format(
        self.tmp_dir, self.curr_time, self.pid, self.thread)

    self.out_dir = os.path.join(output_folder, 'downloads')

    # Settings kept as given by the caller
    self.thread_nb = thread_nb
    self.imageboard = imageboard
    self.board = board
    self.condition = condition
    self.check_duplicate = check_duplicate
    self.is_quiet = is_quiet
    self.tags = tags
    self.throttle = int(throttle)

    # Make sure the tmp and output directories exist
    for directory in (self.tmp_dir, self.out_dir):
        os.makedirs(directory, exist_ok=True)

    self.single_run = single_run
    self.logger = logger
def get_catalog_json(self, board, chan):
    """
    Download the catalog of a board and decode it from JSON.

    Args:
        board: board name appended to the imageboard base URL. Ex: 'g'
        chan: imageboard name, used to look up the base URL through
              imageboard_info.imageboard_info

    Returns:
        The object produced by json.loads on the UTF-8 decoded body of
        "<base_url><board>/catalog.json".

    Errors raised by urlopen, the UTF-8 decoding or json.loads are not
    caught here and propagate to the caller.
    """
    chan_base_url = imageboard_info.imageboard_info(chan).base_url
    catalog_url = "{0}{1}/catalog.json".format(chan_base_url, board)

    # Using the response as a context manager closes the connection once
    # the body has been read, or as soon as an unexpected error occurs.
    with urllib.request.urlopen(catalog_url) as catalog:
        try:
            catalog_data = catalog.read()
        except http.client.IncompleteRead as err:
            # The connection ended early: keep the bytes that did arrive
            catalog_data = err.partial

    return json.loads(catalog_data.decode("utf8"))
def get_catalog_json(self, board: str, chan: str):
    """
    Get the catalog of a given imageboard's board, decoded from JSON.

    Args:
        board: board name appended to the imageboard base URL. Ex: 'g'
        chan: imageboard name, used to look up the base URL through
              imageboard_info.imageboard_info

    Return:
        catalog info as decoded by json.loads from the UTF-8 body of
        "<base_url><board>/catalog.json"

    Errors raised by urlopen, the UTF-8 decoding or json.loads are not
    caught here and propagate to the caller.
    """
    chan_base_url = imageboard_info.imageboard_info(chan).base_url
    catalog_url = "{0}{1}/catalog.json".format(chan_base_url, board)

    # The response holds an open connection; the with block releases it
    # whether or not the read succeeds.
    with urllib.request.urlopen(catalog_url) as catalog:
        try:
            catalog_data = catalog.read()
        except http.client.IncompleteRead as err:
            # The connection ended early: keep the bytes that did arrive
            catalog_data = err.partial

    return json.loads(catalog_data.decode("utf8"))
def __init__(self, thread_nb, board, imageboard, output_folder, folder,
             is_quiet, condition, check_duplicate, tag_list, logger):
    """
    Set up the state needed to download a thread.

    Args:
        thread_nb: number of the thread; used in the thread URL and as
                   the last component of the output directory
        board: board of the thread; used in both URLs and in the output
               directory
        imageboard: imageboard name, looked up through
                    imageboard_info.imageboard_info to get its URL layout
        output_folder: root directory; files go under
                       <output_folder>/downloads/<imageboard>/<board>/
                       <folder>/<thread_nb>
        folder: directory name inserted in the output path (see above)
        is_quiet: stored as given on the instance
        condition: stored as given on the instance
        check_duplicate: stored as given on the instance
        tag_list: stored as given on the instance
        logger: logger stored on the instance
    """
    # URL layout of the selected imageboard
    ib_info = imageboard_info.imageboard_info(imageboard)

    # URL of the thread (as JSON) and base URL where the images of the
    # board are stored
    self.thread_url = "{0}{1}{2}{3}.json".format(
        ib_info.base_url, board, ib_info.thread_subfolder, thread_nb)
    self.image_url = "{0}{1}{2}".format(
        ib_info.image_base_url, board, ib_info.image_subfolder)

    # The temporary directory and the log of downloaded files are keyed
    # on the process id, the start time and the current thread name
    pid = os.getpid()
    self.tmp_dir = "/tmp/{0}/".format(pid)
    self.curr_time = time.strftime('%d%m%Y-%H%M%S')
    self.pid = pid
    self.thread = threading.current_thread().name
    self.downloaded_log = "{0}/{1}4scanner_dld-{2}-{3}".format(
        self.tmp_dir, self.curr_time, self.pid, self.thread)

    path_parts = ('downloads', imageboard, board, folder, str(thread_nb))
    self.out_dir = os.path.join(output_folder, *path_parts)

    # Settings kept as given by the caller
    self.thread_nb = thread_nb
    self.imageboard = imageboard
    self.board = board
    self.condition = condition
    self.check_duplicate = check_duplicate
    self.is_quiet = is_quiet
    self.tag_list = tag_list

    # Create the tmp directory first, then the output directory
    for directory in (self.tmp_dir, self.out_dir):
        self.create_dir(directory)

    self.logger = logger
def __init__(self, thread_nb, board, imageboard, output_folder, folder,
             is_quiet, condition, check_duplicate, tag_list, logger):
    """
    Prepare a downloader for a single imageboard thread.

    Args:
        thread_nb: number of the thread; used in the thread URL and as
                   the last component of the output directory
        board: board of the thread; used in both URLs and in the output
               directory
        imageboard: imageboard name, looked up through
                    imageboard_info.imageboard_info to get its URL layout
        output_folder: root directory; files go under
                       <output_folder>/downloads/<imageboard>/<board>/
                       <folder>/<thread_nb>
        folder: directory name inserted in the output path (see above)
        is_quiet: stored as given on the instance
        condition: stored as given on the instance
        check_duplicate: stored as given on the instance
        tag_list: stored as given on the instance
        logger: logger stored on the instance
    """
    # Getting info about the imageboard URL
    site = imageboard_info.imageboard_info(imageboard)
    thread_url_parts = (site.base_url, board, site.thread_subfolder, thread_nb)
    image_url_parts = (site.image_base_url, board, site.image_subfolder)

    # URL of the thread, and base URL where images are stored on the
    # imageboard
    self.thread_url = "{0}{1}{2}{3}.json".format(*thread_url_parts)
    self.image_url = "{0}{1}{2}".format(*image_url_parts)

    # Per-process temporary directory; the log of downloaded files inside
    # it is also named after the start time and the current thread
    current_pid = os.getpid()
    self.tmp_dir = "/tmp/{0}/".format(current_pid)
    self.curr_time = time.strftime('%d%m%Y-%H%M%S')
    self.pid = current_pid
    self.thread = threading.current_thread().name
    self.downloaded_log = "{0}/{1}4scanner_dld-{2}-{3}".format(
        self.tmp_dir, self.curr_time, self.pid, self.thread)

    self.out_dir = os.path.join(
        output_folder, 'downloads', imageboard, board, folder,
        str(thread_nb))

    # Caller-provided settings, stored untouched
    self.thread_nb = thread_nb
    self.imageboard = imageboard
    self.board = board
    self.condition = condition
    self.check_duplicate = check_duplicate
    self.is_quiet = is_quiet
    self.tag_list = tag_list

    # Creating the tmp and output directory
    self.create_dir(self.tmp_dir)
    self.create_dir(self.out_dir)

    self.logger = logger
# dupecheck: these two functions only get a "not tested" banner for now
print("!!! dupecheck.add_to_db not tested yet !!! -")
print("--------------------------------------------------------")

print("--------------------------------------------------------")
print("!!! dupecheck.is_duplicate not tested yet !!! -")
print("--------------------------------------------------------")

# Success message wrapped in ANSI escape codes, reset afterwards
print('\x1b[6;30;42m' + 'All test OK for dupecheck.py' + '\x1b[0m')

print("Testing imageboard_info.py")
print("--------------------------------------------------------")
print("Testing: imageboard_info.get_imageboard_info            -")
print("--------------------------------------------------------")

# Check every URL component known for 4chan; the first mismatch prints a
# message and stops the script with exit status 1.
# SystemExit is raised directly because the exit() helper is installed by
# the site module for interactive use and is not guaranteed to exist.
info_4chan = imageboard_info.imageboard_info("4chan")

if info_4chan.base_url != "http://a.4cdn.org/":
    print("chan_base_url wrong for 4chan")
    raise SystemExit(1)

if info_4chan.thread_subfolder != "/thread/":
    print("chan_thread_subfolder wrong for 4chan")
    raise SystemExit(1)

if info_4chan.image_subfolder != "/":
    print("chan_image_subfolder wrong for 4chan")
    raise SystemExit(1)

if info_4chan.image_base_url != "http://i.4cdn.org/":
    print("chan_image_base_url wrong for 4chan")
    raise SystemExit(1)