Esempio n. 1
0
def thread_scraper(thread_id, board_name, session, directory):
    """Fetch a 4chan thread and assign a download path to each attached file.

    Relies on names defined outside this function: ``ignored_keywords``,
    ``maximum_length``, ``main_helper``, ``file_directory_format``,
    ``file_name_format`` and ``date_format``.

    Args:
        thread_id: Thread number; converted to ``str`` before use.
        board_name: Board short name inserted into the API URL.
        session: Object with a ``get(url)`` method whose response exposes
            ``status_code`` and ``text``.
        directory: Base directory under which the thread folder is placed.

    Returns:
        The decoded thread dict, with ``download_path`` set on every post
        that carries a file and ``directory`` set on the thread itself.
        ``None`` when the request returns 404, when the subject or comment
        of the opening post contains an ignored keyword, or when no post
        has a file.
    """
    thread_id = str(thread_id)
    link = "http://a.4cdn.org/" + board_name + "/thread/" + thread_id + ".json"
    r = session.get(link)
    if r.status_code == 404:
        return
    thread = json.loads(r.text)
    thread_master = thread["posts"][0]
    location = "Archive" if "archived" in thread_master else "Catalog"

    # Subject first, then comment: drop the thread if either one contains
    # an ignored keyword.
    for field in ("sub", "com"):
        if field not in thread_master:
            continue
        title = thread_master[field].lower()
        if any(ignored_keyword in title for ignored_keyword in ignored_keywords):
            print("Removed From "+location+": ", title)
            return

    # Both "sub" and "com" can be absent on the opening post (e.g. an
    # image-only post), so fall back to an empty string instead of raising
    # KeyError.
    if "sub" in thread_master:
        text = thread_master["sub"][:maximum_length]
    else:
        text = thread_master.get("com", "")[:maximum_length]

    # Cleaned once, so every file of the thread lands in the same folder.
    text = main_helper.clean_text(text)
    # Build the folder name directly rather than stripping " - " from the
    # finished path, which would also alter a base directory containing it.
    folder_name = text + " - " + thread_id if text else thread_id
    new_directory = directory + "/" + folder_name + "/"

    found = False
    seen = set()
    for post in thread["posts"]:
        if "name" not in post:
            post["name"] = "Anonymous"
        if "filename" not in post:
            continue
        ext = post["ext"].replace(".", "")
        filename = main_helper.clean_text(post["filename"])
        if not filename:
            # Nothing usable left after cleaning: use the post number.
            filename = str(post["no"])
        result = main_helper.rename_duplicates(seen, filename)
        seen = result[0]
        file_name = result[1]
        date_object = datetime.fromtimestamp(post["time"])
        file_path = main_helper.reformat(
            new_directory, None, None, file_name, text, ext, date_object,
            post["name"], file_directory_format, file_name_format,
            date_format, maximum_length)
        post["download_path"] = file_path
        found = True
    if found:
        thread["directory"] = new_directory
        return thread
Esempio n. 2
0
def thread_scraper(thread_id, board_name, session, directory):
    """Fetch a bbw-chan thread and assign a download path to each file.

    The JSON document holds the opening post's fields at the top level and
    the replies under ``"posts"``. The opening post is moved to the front of
    ``thread["posts"]`` so that all posts can be handled by one loop.

    Relies on names defined outside this function: ``ignored_keywords``,
    ``maximum_length``, ``main_helper``, ``file_directory_format``,
    ``file_name_format`` and ``date_format``.

    Returns:
        The restructured thread dict (keys ``posts``, ``download_path`` and
        ``directory``) when at least one file was found. ``None`` when the
        request returns 404, when the subject or message contains an
        ignored keyword, or when no post has a file.
    """
    thread_id = str(thread_id)
    response = session.get(
        "https://bbw-chan.nl/" + board_name + "/res/" + thread_id + ".json")
    if response.status_code == 404:
        return
    thread = json.loads(response.text)
    location = "Archive" if "archived" in thread else "Catalog"

    text = ""
    subject = thread["subject"]
    if subject:
        title = subject.lower()
        if any(ignored_keyword in title for ignored_keyword in ignored_keywords):
            print("Removed From "+location+": ", title)
            return
        text = subject[:maximum_length]

    message = thread["message"]
    if message:
        title = message.lower()
        if any(ignored_keyword in title for ignored_keyword in ignored_keywords):
            print("Removed From "+location+": ", title)
            return
        if not text:
            # The message only names the folder when there is no subject.
            text = message[:maximum_length]

    # Split the document into the opening post (every key except "posts")
    # and the replies, then rebuild `thread` in place with the opening post
    # leading the list.
    opening_post = {key: value for key, value in thread.items()
                    if key != "posts"}
    replies = thread["posts"]
    thread.clear()
    thread["posts"] = [opening_post] + replies
    thread["download_path"] = ""

    found = False
    new_directory = ""
    for post in thread["posts"]:
        date_object = datetime.strptime(
            post["creation"], "%Y-%m-%dT%H:%M:%S.%fZ")
        # The ISO string is replaced by a numeric timestamp on the post.
        post["creation"] = date_object.timestamp()
        for media in post["files"]:
            # The extension is taken from the MIME subtype.
            media["ext"] = ext = media["mime"].split("/")[1]
            stem, _ = os.path.splitext(media["originalName"])
            file_name = stem.strip()
            text = main_helper.clean_text(text)
            new_directory = directory+"/"+text+" - "+thread_id+"/"
            if not text:
                new_directory = new_directory.replace(" - ", "")
            media["download_path"] = main_helper.reformat(
                new_directory, None, None, file_name, text, ext, date_object,
                post["name"], file_directory_format, file_name_format,
                date_format, maximum_length)
            found = True
    if not found:
        return
    thread["directory"] = new_directory
    return thread
Esempio n. 3
0
 def __init__(self, option):
     """Populate the formatting attributes from ``option``.

     ``option`` must provide ``get(key[, default])``. The names
     ``filename``, ``ext``, ``today``, ``username``, ``format_path``,
     ``date_format`` and ``text_length`` are not defined in this method;
     they are resolved from an enclosing scope that is not visible here.
     """
     lookup = option.get

     # Values read from `option`, with fallbacks where one is given.
     self.directory = lookup('directory')
     self.post_id = lookup('post_id', "")
     self.media_id = lookup('media_id', "")
     self.filename = filename
     raw_text = lookup('text', "")
     self.text = main_helper.clean_text(raw_text)
     self.ext = lookup('ext', ext)
     self.date = lookup('postedAt', today)
     self.username = lookup('username', username)

     # Settings copied straight from the enclosing scope.
     self.format_path = format_path
     self.date_format = date_format
     self.maximum_length = int(text_length)
Esempio n. 4
0
def media_scraper(link, session, directory, username, api_type):
    """Build download entries for every media item on one API page.

    Relies on names defined outside this function: ``json_request``,
    ``clean_text``, ``reformat``, ``ignored_keywords``, ``format_path``,
    ``date_format``, ``maximum_length`` and ``sort_free_paid_posts``.

    Args:
        link: URL of the API page to request.
        session: Session object handed to ``json_request``.
        directory: Indexable collection. ``directory[-1]`` holds the
            accepted media types, and ``directory[0][1]``,
            ``directory[1][1]`` and ``directory[2][1]`` are used as the
            unsorted, free and paid target directories respectively.
        username: Username passed to ``reformat``; for "Mass Messages" it
            also filters entries by sender.
        api_type: One of the API section names, e.g. "Highlights",
            "Messages" or "Mass Messages".

    Returns:
        ``[valid, invalid]``: two lists of entry dicts. An entry goes to the
        second list when its recorded size is 0.
    """
    media_set = [[], []]
    media_type = directory[-1]
    y = json_request(session, link)
    if not y or "error" in y:
        return media_set
    if api_type == "Highlights":
        y = y["stories"]
    elif api_type in ("Messages", "Mass Messages"):
        y = y["list"]

    # Date value treated as "no usable date" (presumably the API's
    # placeholder for a missing date -- TODO confirm).
    missing_date = "-001-11-30T00:00:00+00:00"
    # Leading host labels after which the second label is inspected.
    region_prefixes = ("us", "uk", "ca", "ca2", "de")
    # Last valid date seen; reused for media whose date is missing.
    master_date = "01-01-0001 00:00:00"

    for media_api in y:
        if api_type == "Mass Messages":
            if media_api["fromUser"]["username"] != username:
                continue
        for media in media_api["media"]:
            date = missing_date
            size = 0
            # Reset for every media item. Reusing the `link` parameter here
            # would let an item without "source"/"src" inherit the API URL
            # or the previous item's link.
            media_link = ""
            if "source" in media:
                media_link = media["source"]["source"]
                # NOTE(review): the guard tests `media_api` while the lookup
                # reads `media["info"]` -- confirm which was intended.
                size = media["info"]["preview"][
                    "size"] if "info" in media_api else 1
                date = media_api[
                    "postedAt"] if "postedAt" in media_api else media_api[
                        "createdAt"]
            if "src" in media:
                media_link = media["src"]
                size = media["info"]["preview"][
                    "size"] if "info" in media_api else 1
                date = media_api["createdAt"]
            if not media_link:
                continue

            # `hostname` is None for links without a network location.
            labels = (urlparse(media_link).hostname or "").split('.')
            preview_link = media["preview"]
            # Exact membership: a substring test would also match partial
            # labels such as "c" or "a2".
            if labels[0] in region_prefixes and len(labels) > 1:
                subdomain = labels[1]
                if "upload" in subdomain:
                    continue
                if "convert" in subdomain:
                    media_link = preview_link
            if not media_link and not preview_link:
                continue

            new_dict = dict()
            new_dict["post_id"] = media_api["id"]
            new_dict["media_id"] = media["id"]
            new_dict["links"] = [
                xlink for xlink in (media_link, preview_link) if xlink]
            new_dict["price"] = media_api.get("price")
            if date == missing_date:
                date_string = master_date
                date_object = datetime.strptime(master_date,
                                                "%d-%m-%Y %H:%M:%S")
            else:
                date_object = datetime.fromisoformat(date)
                date_string = date_object.replace(
                    tzinfo=None).strftime("%d-%m-%Y %H:%M:%S")
                master_date = date_string

            if media["type"] not in media_type:
                continue
            text = media_api.get("rawText") or ""
            keyword_hits = [s for s in ignored_keywords if s in text]
            if keyword_hits:
                print("Matches: ", keyword_hits)
                continue
            text = clean_text(text)
            new_dict["postedAt"] = date_string
            post_id = new_dict["post_id"]
            media_id = new_dict["media_id"]
            file_name = media_link.rsplit('/', 1)[-1]
            file_name, ext = os.path.splitext(file_name)
            # Drop the dot and any query string that followed the extension.
            ext = ext.replace(".", "").split('?')[0]
            file_path = reformat(directory[0][1], post_id, media_id, file_name,
                                 text, ext, date_object, username, format_path,
                                 date_format, maximum_length)
            new_dict["text"] = text
            new_dict["paid"] = False
            if new_dict["price"]:
                if api_type in ["Messages", "Mass Messages"]:
                    new_dict["paid"] = True
                else:
                    if media["id"] not in media_api["preview"] and media[
                            "canView"]:
                        new_dict["paid"] = True
            new_dict["directory"] = os.path.join(directory[0][1])
            if sort_free_paid_posts:
                new_dict["directory"] = os.path.join(directory[1][1])
                if new_dict["paid"]:
                    new_dict["directory"] = os.path.join(directory[2][1])
            new_dict["filename"] = file_path.rsplit('/', 1)[-1]
            new_dict["size"] = size
            if size == 0:
                media_set[1].append(new_dict)
                continue
            media_set[0].append(new_dict)
    return media_set
Esempio n. 5
0
    async def reformat_2(self, unformatted: Path):
        """Fill the placeholders of ``unformatted`` and join it to ``self.directory``.

        Replaces ``{site_name}``, ``{first_letter}``, ``{post_id}``,
        ``{media_id}``, ``{profile_username}``, ``{model_username}``,
        ``{api_type}``, ``{media_type}``, ``{filename}``, ``{ext}``,
        ``{value}``, ``{date}`` and ``{text}``. The text is truncated to
        whatever byte budget remains out of ``self.maximum_length`` after
        the directory and the rest of the path, and never exceeds
        ``self.text_length``.

        Args:
            unformatted: Path template containing the placeholders.

        Returns:
            ``self.directory.joinpath(<formatted path>)``.

        Raises:
            Exception: If ``self.directory`` is falsy.
        """
        post_id = "" if self.post_id is None else str(self.post_id)
        media_id = "" if self.media_id is None else str(self.media_id)
        unformatted_string = unformatted.as_posix()

        date = self.date
        if isinstance(date, str):
            format_variables2 = format_attributes()
            # A string equal to the default attribute value, or an empty
            # string, is inserted unchanged; anything else is re-formatted.
            if date != format_variables2.date and date != "":
                date = datetime.strptime(date, "%d-%m-%Y %H:%M:%S")
                date = date.strftime(self.date_format)
        elif isinstance(date, datetime):
            date = date.strftime(self.date_format)

        text = self.text
        has_text = "{text}" in unformatted_string
        extra_count = 0
        if has_text:
            text = main_helper.clean_text(text)
            # The placeholder itself is still part of `path` when lengths
            # are measured below, so its length is given back to the budget.
            extra_count = len("{text}")

        value = "Free"
        if "{value}" in unformatted_string:
            if self.price and not self.preview:
                value = "Paid"

        directory = self.directory
        if not directory:
            raise Exception("Directory not found")

        path = unformatted_string.replace("{site_name}", self.site_name)
        path = path.replace("{first_letter}", self.model_username[0].capitalize())
        path = path.replace("{post_id}", post_id)
        path = path.replace("{media_id}", media_id)
        path = path.replace("{profile_username}", self.profile_username)
        path = path.replace("{model_username}", self.model_username)
        path = path.replace("{api_type}", self.api_type)
        path = path.replace("{media_type}", self.media_type)
        path = path.replace("{filename}", self.filename)
        path = path.replace("{ext}", self.ext)
        path = path.replace("{value}", value)
        path = path.replace("{date}", date)

        remaining = self.maximum_length - (
            len(str(directory)) + len(path) - extra_count)
        # Clamp at zero: when directory and path already exceed the limit the
        # budget is negative, and a negative slice bound would cut bytes off
        # the END of the text (keeping most of it, possibly splitting a
        # multi-byte character) instead of dropping the text.
        text_length = max(0, min(self.text_length, remaining))

        if has_text:
            # Truncate on a byte budget; errors="ignore" discards the
            # incomplete trailing character if the cut lands inside one.
            filtered_text = text.encode("utf8")[:text_length].decode(
                "utf8", errors="ignore")
            path = path.replace("{text}", filtered_text)
        else:
            path = path.replace("{text}", "")
        return directory.joinpath(path)
Esempio n. 6
0
def media_scraper(result, sessions, formatted_directories, username, api_type):
    """Build download entries, grouped per location, for one API page.

    Relies on names defined outside this function: ``main_helper``,
    ``ignored_keywords``, ``sort_free_paid_posts``,
    ``file_directory_format``, ``file_name_format``, ``date_format`` and
    ``maximum_length``.

    Args:
        result: Dict with ``"link"`` (the API page URL) and ``"count"``
            (index into ``sessions``; progress is printed only when it
            is 0).
        sessions: Indexable collection of session objects.
        formatted_directories: Dict with ``"model_directory"`` and
            ``"locations"``; each location provides
            ``"sorted_directories"`` (keys ``unsorted``/``free``/``paid``),
            ``"media_type"`` and ``"alt_media_type"``.
        username: Username passed to ``reformat``; for "Mass Messages" it
            also filters entries by sender.
        api_type: One of the API section names, e.g. "Highlights",
            "Messages" or "Mass Messages".

    Returns:
        A list with one dict per location, each holding ``"type"``,
        ``"valid"`` and ``"invalid"``. An entry is "invalid" when its
        recorded size is 0. Empty list when the request fails or the
        response contains ``"error"``.
    """
    link = result["link"]
    session = sessions[result["count"]]
    media_set = []
    y = main_helper.json_request(session, link)
    if not y or "error" in y:
        return media_set
    if api_type == "Highlights":
        y = y["stories"]
    elif api_type in ("Messages", "Mass Messages"):
        y = y["list"]

    # Date value treated as "no usable date" (presumably the API's
    # placeholder for a missing date -- TODO confirm).
    missing_date = "-001-11-30T00:00:00+00:00"
    # Leading host labels after which the second label is inspected.
    region_prefixes = ("us", "uk", "ca", "ca2", "de")

    model_directory = formatted_directories["model_directory"]
    for location in formatted_directories["locations"]:
        sorted_directories = location["sorted_directories"]
        # Last valid date seen; reused for media whose date is missing.
        master_date = "01-01-0001 00:00:00"
        media_type = location["media_type"]
        alt_media_type = location["alt_media_type"]
        if result["count"] == 0:
            separator = " | "
            print("Scraping ["+str(separator.join(alt_media_type)) +
                  "]. Should take less than a minute.")
        media_set2 = {}
        media_set2["type"] = media_type
        media_set2["valid"] = []
        media_set2["invalid"] = []
        for media_api in y:
            if api_type == "Messages":
                media_api["rawText"] = media_api["text"]
            if api_type == "Mass Messages":
                if media_api["fromUser"]["username"] != username:
                    continue
            for media in media_api["media"]:
                date = missing_date
                size = 0
                # Reset for every media item. Reusing the page `link` here
                # would let an item without "source"/"src" inherit the API
                # URL or the previous item's link, and that stale URL would
                # become the single link stored below.
                media_link = ""
                if "source" in media:
                    media_link = media["source"]["source"]
                    # NOTE(review): the guard tests `media_api` while the
                    # lookup reads `media["info"]` -- confirm the intent.
                    size = media["info"]["preview"]["size"] if "info" in media_api else 1
                    date = media_api["postedAt"] if "postedAt" in media_api else media_api["createdAt"]
                if "src" in media:
                    media_link = media["src"]
                    size = media["info"]["preview"]["size"] if "info" in media_api else 1
                    date = media_api["createdAt"]
                if not media_link:
                    continue

                # `hostname` is None for links without a network location.
                labels = (urlparse(media_link).hostname or "").split('.')
                preview_link = media["preview"]
                # Exact membership: a substring test would also match
                # partial labels such as "c" or "a2".
                if labels[0] in region_prefixes and len(labels) > 1:
                    subdomain = labels[1]
                    if "upload" in subdomain:
                        continue
                    if "convert" in subdomain:
                        media_link = preview_link
                if not media_link and not preview_link:
                    continue

                new_dict = dict()
                new_dict["post_id"] = media_api["id"]
                new_dict["media_id"] = media["id"]
                # Only the first non-empty link is kept.
                new_dict["links"] = []
                for xlink in media_link, preview_link:
                    if xlink:
                        new_dict["links"].append(xlink)
                        break
                new_dict["price"] = media_api.get("price")
                if date == missing_date:
                    date_string = master_date
                    date_object = datetime.strptime(
                        master_date, "%d-%m-%Y %H:%M:%S")
                else:
                    date_object = datetime.fromisoformat(date)
                    date_string = date_object.replace(tzinfo=None).strftime(
                        "%d-%m-%Y %H:%M:%S")
                    master_date = date_string

                if media["type"] not in alt_media_type:
                    continue
                text = media_api.get("rawText") or ""
                keyword_hits = [s for s in ignored_keywords if s in text]
                if keyword_hits:
                    print("Matches: ", keyword_hits)
                    continue
                text = main_helper.clean_text(text)
                new_dict["postedAt"] = date_string
                post_id = new_dict["post_id"]
                media_id = new_dict["media_id"]
                file_name = media_link.rsplit('/', 1)[-1]
                file_name, ext = os.path.splitext(file_name)
                # Drop the dot and any query string after the extension.
                ext = ext.replace(".", "").split('?')[0]
                media_directory = os.path.join(
                    model_directory, sorted_directories["unsorted"])
                new_dict["paid"] = False
                if new_dict["price"]:
                    if api_type in ["Messages", "Mass Messages"]:
                        new_dict["paid"] = True
                    else:
                        if media["id"] not in media_api["preview"] and media["canView"]:
                            new_dict["paid"] = True
                if sort_free_paid_posts:
                    media_directory = os.path.join(
                        model_directory, sorted_directories["free"])
                    if new_dict["paid"]:
                        media_directory = os.path.join(
                            model_directory, sorted_directories["paid"])
                file_path = main_helper.reformat(
                    media_directory, post_id, media_id, file_name, text, ext,
                    date_object, username, file_directory_format,
                    file_name_format, date_format, maximum_length)
                new_dict["text"] = text
                file_directory = os.path.dirname(file_path)
                new_dict["directory"] = os.path.join(file_directory)
                new_dict["filename"] = os.path.basename(file_path)
                new_dict["size"] = size
                if size == 0:
                    media_set2["invalid"].append(new_dict)
                    continue
                new_dict["session"] = session
                media_set2["valid"].append(new_dict)
        media_set.append(media_set2)
    return media_set