Exemple #1
0
def get_imageboard(search):
    """Return the imageboard name selected by a search entry.

    Args:
        search: mapping describing a search; may contain an 'imageboard' key.

    Returns:
        The value stored under 'imageboard', or "4chan" when the key is absent.

    The requested name is passed to imageboard_info.imageboard_info() only as
    a validity check; the object it builds is discarded.
    """
    if 'imageboard' not in search:
        # Nothing requested explicitly: fall back to the default board.
        return "4chan"

    requested = search["imageboard"]
    # Expected to raise when the imageboard is not supported.
    imageboard_info.imageboard_info(requested)
    return requested
Exemple #2
0
    def get_imageboard(self, search):
        """Return the imageboard name for a search, defaulting to "4chan".

        When the search carries an 'imageboard' key its value is handed to
        imageboard_info.imageboard_info() as a validity check (the result is
        not kept) and then returned unchanged.
        """
        if 'imageboard' not in search:
            # No imageboard given: use the default.
            return "4chan"

        name = search["imageboard"]
        # Expected to raise when the imageboard is not supported.
        imageboard_info.imageboard_info(name)
        return name
Exemple #3
0
    def get_imageboard(self, search: dict):
        """
        Get the imageboard name from a search

        Args:
            search: search description; may contain an 'imageboard' key

        Returns:
            the imageboard name found in the search, or "4chan" when the
            search does not specify one
        """

        if 'imageboard' not in search:
            # default
            return "4chan"

        selected = search["imageboard"]
        # Validation only: this is expected to raise for an unsupported
        # imageboard; the object it returns is not used.
        imageboard_info.imageboard_info(selected)
        return selected
Exemple #4
0
def get_catalog_json(board, chan):
    """Download and decode the catalog of one board.

    Args:
        board: board identifier, inserted into the catalog URL.
        chan: imageboard name, resolved through imageboard_info to a base URL.

    Returns:
        The decoded JSON document served at <base_url><board>/catalog.json.

    Raises:
        Whatever urllib.request.urlopen raises for network/HTTP failures, and
        ValueError (json.JSONDecodeError) if the payload is not valid JSON,
        which can happen when only a partial body was received.
    """
    chan_base_url = imageboard_info.imageboard_info(chan).base_url
    catalog_url = "{0}{1}/catalog.json".format(chan_base_url, board)

    # The context manager guarantees the HTTP response is closed, even when
    # read() fails; previously the connection was left open.
    with urllib.request.urlopen(catalog_url) as catalog:
        try:
            catalog_data = catalog.read()
        except http.client.IncompleteRead as err:
            # Best effort: keep the bytes received before the stream ended.
            catalog_data = err.partial

    return json.loads(catalog_data.decode("utf8"))
Exemple #5
0
    def __init__(self, thread_nb:int, board:str, imageboard:str, output_folder:str, folder:str, is_quiet:bool, condition:dict, check_duplicate:bool, tags:list, throttle:int, logger, single_run=False):
        """
        Prepare everything needed to download one imageboard thread.

        Building the object resolves the thread/image URLs, derives the temporary
        and output paths, and creates both directories on disk.

        Args:
            thread_nb: number of the thread to download. Ex: 809293
            board: board the thread lives on. Ex: 'g'
            imageboard: name of the imageboard, looked up through imageboard_info. Ex: 4chan
            output_folder: base directory; pictures go to its 'downloads' subdirectory. Ex: /tmp/4scanner_img
            folder: accepted for sorting pictures, but not referenced by this constructor
            is_quiet: stored on the instance as given
            condition: dict of download conditions, stored as given. Ex: {"width": "=1920", "height": "=1080"}
            check_duplicate: stored on the instance as given
            tags: list of tags, stored as given
            throttle: wait time between downloads; converted with int()
            logger: logger object kept for later use
            single_run: stored on the instance; defaults to False
        """

        # Resolve the URL layout of the imageboard (read in a fixed order)
        site = imageboard_info.imageboard_info(imageboard)
        site_root, picture_root, thread_dir, picture_dir = (
            site.base_url,
            site.image_base_url,
            site.thread_subfolder,
            site.image_subfolder,
        )

        # URL of the thread JSON, and base URL under which its images are served
        self.thread_url = "{0}{1}{2}{3}.json".format(site_root, board, thread_dir, thread_nb)
        self.image_url = "{0}{1}{2}".format(picture_root, board, picture_dir)

        # Per-process scratch directory and a log name unique to this pid/thread/time
        self.pid = os.getpid()
        self.tmp_dir = "/tmp/{0}/".format(self.pid)
        self.curr_time = time.strftime('%d%m%Y-%H%M%S')
        self.thread = threading.current_thread().name
        self.downloaded_log = "{0}/{1}4scanner_dld-{2}-{3}".format(self.tmp_dir, self.curr_time, self.pid, self.thread)

        self.out_dir = os.path.join(output_folder, 'downloads')

        # Plain configuration carried by the instance
        self.thread_nb = thread_nb
        self.imageboard = imageboard
        self.board = board
        self.condition = condition
        self.check_duplicate = check_duplicate
        self.is_quiet = is_quiet
        self.tags = tags
        self.throttle = int(throttle)

        # Make sure both working directories exist before anything is written
        for needed_dir in (self.tmp_dir, self.out_dir):
            os.makedirs(needed_dir, exist_ok=True)

        self.single_run = single_run
        self.logger = logger
Exemple #6
0
 def get_catalog_json(self, board, chan):
     """Download and decode the catalog of one board.

     The catalog is fetched from <base_url><board>/catalog.json, where the
     base URL comes from imageboard_info for the given chan. Returns the
     decoded JSON document. json.loads raises ValueError if the payload
     (possibly partial) is not valid JSON.
     """
     chan_base_url = imageboard_info.imageboard_info(chan).base_url
     catalog_url = "{0}{1}/catalog.json".format(chan_base_url, board)

     # Close the HTTP response deterministically; it used to be left open.
     with urllib.request.urlopen(catalog_url) as catalog:
         try:
             catalog_data = catalog.read()
         except http.client.IncompleteRead as err:
             # Best effort: keep the bytes received before the stream ended.
             catalog_data = err.partial

     return json.loads(catalog_data.decode("utf8"))
Exemple #7
0
    def get_catalog_json(self, board: str, chan: str):
        """
        Get the catalog of a given imageboard's board as decoded JSON

        Args:
            board: board identifier, inserted into the catalog URL
            chan: imageboard name, resolved through imageboard_info to a base URL

        Return:
            the decoded JSON document served at <base_url><board>/catalog.json

        Raises:
            ValueError (json.JSONDecodeError) when the payload, possibly
            partial, is not valid JSON; urlopen errors propagate unchanged
        """

        chan_base_url = imageboard_info.imageboard_info(chan).base_url
        catalog_url = "{0}{1}/catalog.json".format(chan_base_url, board)

        # Use the response as a context manager so the connection is always
        # closed, including when read() raises; it was previously leaked.
        with urllib.request.urlopen(catalog_url) as catalog:
            try:
                catalog_data = catalog.read()
            except http.client.IncompleteRead as err:
                # Best effort: keep the bytes received before the stream ended.
                catalog_data = err.partial

        return json.loads(catalog_data.decode("utf8"))
Exemple #8
0
    def __init__(self, thread_nb, board, imageboard, output_folder, folder,
                 is_quiet, condition, check_duplicate, tag_list, logger):
        """
        Set up the state needed to download one imageboard thread.

        Resolves the thread and image URLs from imageboard_info, derives the
        temporary and output paths, stores the options on the instance and
        creates both directories through self.create_dir().

        Args:
            thread_nb: number of the thread to download
            board: board the thread lives on
            imageboard: imageboard name, looked up through imageboard_info
            output_folder: base directory of the download tree
            folder: sub-directory name used inside the download tree
            is_quiet: stored on the instance as given
            condition: stored on the instance as given
            check_duplicate: stored on the instance as given
            tag_list: stored on the instance as given
            logger: logger object kept for later use
        """
        # URL layout of the imageboard, read in a fixed order
        site = imageboard_info.imageboard_info(imageboard)
        site_root, picture_root, thread_dir, picture_dir = (
            site.base_url,
            site.image_base_url,
            site.thread_subfolder,
            site.image_subfolder,
        )

        # URL of the thread JSON and base URL where its images are stored
        self.thread_url = "{0}{1}{2}{3}.json".format(
            site_root, board, thread_dir, thread_nb)
        self.image_url = "{0}{1}{2}".format(picture_root, board, picture_dir)

        # Per-process scratch directory and a log name unique to this
        # pid/thread/time combination
        self.pid = os.getpid()
        self.tmp_dir = "/tmp/{0}/".format(self.pid)
        self.curr_time = time.strftime('%d%m%Y-%H%M%S')
        self.thread = threading.current_thread().name
        self.downloaded_log = "{0}/{1}4scanner_dld-{2}-{3}".format(
            self.tmp_dir, self.curr_time, self.pid, self.thread)

        # <output_folder>/downloads/<imageboard>/<board>/<folder>/<thread_nb>
        path_parts = ('downloads', imageboard, board, folder, str(thread_nb))
        self.out_dir = os.path.join(output_folder, *path_parts)

        # Options carried by the instance
        self.thread_nb = thread_nb
        self.imageboard = imageboard
        self.board = board
        self.condition = condition
        self.check_duplicate = check_duplicate
        self.is_quiet = is_quiet
        self.tag_list = tag_list

        # Create the tmp directory first, then the output directory
        for needed_dir in (self.tmp_dir, self.out_dir):
            self.create_dir(needed_dir)

        self.logger = logger
Exemple #9
0
    def __init__(self, thread_nb, board, imageboard, output_folder, folder, is_quiet, condition, check_duplicate, tag_list, logger):
        """
        Set up the state needed to download one imageboard thread.

        Resolves the thread and image URLs from imageboard_info, derives the
        temporary and output paths, stores the options on the instance and
        creates both directories through self.create_dir().
        """
        # URL layout of the imageboard, read in a fixed order
        ib = imageboard_info.imageboard_info(imageboard)
        root_url, img_root_url, thread_part, img_part = (
            ib.base_url,
            ib.image_base_url,
            ib.thread_subfolder,
            ib.image_subfolder,
        )

        # Thread JSON URL, and the base URL under which images are stored
        self.thread_url = "{0}{1}{2}{3}.json".format(root_url, board, thread_part, thread_nb)
        self.image_url = "{0}{1}{2}".format(img_root_url, board, img_part)

        # Scratch directory and download log are keyed by pid, thread and time
        self.pid = os.getpid()
        self.tmp_dir = "/tmp/{0}/".format(self.pid)
        self.curr_time = time.strftime('%d%m%Y-%H%M%S')
        self.thread = threading.current_thread().name
        self.downloaded_log = "{0}/{1}4scanner_dld-{2}-{3}".format(self.tmp_dir, self.curr_time, self.pid, self.thread)

        # <output_folder>/downloads/<imageboard>/<board>/<folder>/<thread_nb>
        subdirs = ['downloads', imageboard, board, folder, str(thread_nb)]
        self.out_dir = os.path.join(output_folder, *subdirs)

        # Options carried by the instance
        self.thread_nb = thread_nb
        self.imageboard = imageboard
        self.board = board
        self.condition = condition
        self.check_duplicate = check_duplicate
        self.is_quiet = is_quiet
        self.tag_list = tag_list

        # Create the tmp directory first, then the output directory
        for directory in (self.tmp_dir, self.out_dir):
            self.create_dir(directory)

        self.logger = logger
Exemple #10
0
# Horizontal rule that frames every banner printed by this script.
banner_rule = "--------------------------------------------------------"

# dupecheck: these two functions have no tests yet, only a notice.
print("!!! dupecheck.add_to_db not tested yet !!!             -")
print(banner_rule)

print(banner_rule)
print("!!! dupecheck.is_duplicate not tested yet !!!          -")
print(banner_rule)

# Green-highlighted success line (ANSI escape codes).
print('\x1b[6;30;42m' + 'All test OK for dupecheck.py' + '\x1b[0m')

print("Testing imageboard_info.py")

print(banner_rule)
print("Testing: imageboard_info.get_imageboard_info           -")
print(banner_rule)

info_4chan = imageboard_info.imageboard_info("4chan")

# (attribute, expected value, failure message), checked in this order.
# The first mismatch prints its message and aborts with exit status 1.
expected_4chan = (
    ("base_url", "http://a.4cdn.org/", "chan_base_url wrong for 4chan"),
    ("thread_subfolder", "/thread/", "chan_thread_subfolder wrong for 4chan"),
    ("image_subfolder", "/", "chan_image_subfolder wrong for 4chan"),
    ("image_base_url", "http://i.4cdn.org/", "chan_image_base_url wrong for 4chan"),
)
for attr_name, expected_value, failure_msg in expected_4chan:
    if getattr(info_4chan, attr_name) != expected_value:
        print(failure_msg)
        exit(1)
Exemple #11
0
# Separator line reused by all banners below.
separator = "--------------------------------------------------------"

# dupecheck: these two functions have no tests yet, only a notice.
print("!!! dupecheck.add_to_db not tested yet !!!             -")
print(separator)

print(separator)
print("!!! dupecheck.is_duplicate not tested yet !!!          -")
print(separator)

# Green-highlighted success line (ANSI escape codes).
print('\x1b[6;30;42m' + 'All test OK for dupecheck.py' + '\x1b[0m')

print("Testing imageboard_info.py")

print(separator)
print("Testing: imageboard_info.get_imageboard_info           -")
print(separator)

info_4chan = imageboard_info.imageboard_info("4chan")

# Each entry: attribute to read, value it must equal, message on mismatch.
# Entries are verified in order; the first failure exits with status 1.
checks_4chan = [
    ("base_url", "http://a.4cdn.org/", "chan_base_url wrong for 4chan"),
    ("thread_subfolder", "/thread/", "chan_thread_subfolder wrong for 4chan"),
    ("image_subfolder", "/", "chan_image_subfolder wrong for 4chan"),
    ("image_base_url", "http://i.4cdn.org/", "chan_image_base_url wrong for 4chan"),
]
for field, wanted, complaint in checks_4chan:
    if getattr(info_4chan, field) != wanted:
        print(complaint)
        exit(1)