Example #1
 def index(self, config) -> LocalElementsIndex:
     # Fail the indexing step only when the config explicitly asks for it
     error = config.get("error", "")
     if error == "index":
         raise SelectorIndexError("test")
     else:
         elements = ["skip", "retry3", "retryN", "pass"]
         return LocalElementsIndex(rows=scaffold_elementmap(elements))
Example #2
 def index(self, _) -> LocalElementsIndex:
     results = self._run()
     if len(results) > 0:
         out = []
         # Header row from the first result's keys, then one row of values per result
         out.append(list(results[0].keys()))
         out.extend([list(x.values()) for x in results])
         return LocalElementsIndex(out)
     return None
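For context, a minimal standalone sketch of the same pattern (header row built from the first result's keys, then one row of values per result); the sample dicts below are hypothetical stand-ins for whatever self._run() returns, and LocalElementsIndex is omitted:

    # Hypothetical results, standing in for self._run()
    results = [
        {"id": 1, "url": "https://example.com/a"},
        {"id": 2, "url": "https://example.com/b"},
    ]

    # First row holds the column names, then one row of values per dict
    out = [list(results[0].keys())]
    out.extend(list(r.values()) for r in results)
    # out == [["id", "url"], [1, "https://example.com/a"], [2, "https://example.com/b"]]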
Example #3
    def index(self, config):
        if not os.path.exists(self.disk.read_query(self.name)):
            df = scaffold_elementmap(["el1", "el2", "el3"])

            # Row 0 is the header row and gets a "path" column name;
            # every data row gets the stub image path for that column.
            df = [
                (x + [STUB_PATHS.imagejpg]) if idx > 0 else (x + ["path"])
                for idx, x in enumerate(df)
            ]
            return LocalElementsIndex(rows=df)
        else:
            return None
Example #4
    def index(self, config):
        c = twint.Config()
        c.Search = config["search_term"]
        c.Since = config["uploaded_after"]
        c.Until = config["uploaded_before"]
        c.Show_hashtags = True
        # Keep tweet objects in memory so they can be read back from
        # twint.output.tweets_list once the search completes.
        c.Store_object = True

        twint.run.Search(c)

        tweets = to_serializable(twint.output.tweets_list, as_list=True)
        return LocalElementsIndex(tweets)
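A hypothetical config for this selector; the keys are the ones the code above reads, while the values are made up for illustration:

    config = {
        "search_term": "some query",
        "uploaded_after": "2020-01-01",
        "uploaded_before": "2020-02-01",
    }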
Example #5
    def read_elements_index(self, q: str) -> LocalElementsIndex:
        dest = self.read_query(q)

        def get_rows():
            with open(dest / self.ELEMENTS_INDEX_FILE, "r",
                      encoding="utf-8") as f:
                reader = csv.reader(f)
                for idx, row in enumerate(reader):
                    if idx == 0:
                        self.headers = row
                        continue
                    # Map the CSV row onto the header names and expose the
                    # fields via attribute access on a namespace object.
                    allvls = dict(zip(self.headers, row))
                    yield Ns(**allvls)

        return LocalElementsIndex(rows=get_rows())
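A standalone sketch of the same CSV-to-namespace pattern, assuming Ns is types.SimpleNamespace (an assumption; the alias is not shown in the snippet):

    import csv
    from types import SimpleNamespace as Ns  # assumed alias

    def iter_rows(path):
        # Treat the first CSV row as the header; yield every later row as a
        # namespace so its fields can be read as attributes (row.column_name).
        with open(path, "r", encoding="utf-8") as f:
            reader = csv.reader(f)
            headers = next(reader)
            for row in reader:
                yield Ns(**dict(zip(headers, row)))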
Example #6
    def index(self, config):
        viable_boards = [
            "a",
            "aco",
            "adv",
            "an",
            "asp",
            "b",
            "bant",
            "biz",
            "c",
            "cgl",
            "ck",
            "cm",
            "co",
            "d",
            "diy",
            "e",
            "f",
            "fa",
            "fit",
            "g",
            "gd",
            "gif",
            "h",
            "hc",
            "his",
            "hm",
            "hr",
            "i",
            "ic",
            "int",
            "jp",
            "k",
            "lgbt",
            "lit",
            "m",
            "mlp",
            "mu",
            "n",
            "news",
            "o",
            "out",
            "p",
            "po",
            "pol",
            "qa",
            "qst",
            "r",
            "r9k",
            "s",
            "s4s",
            "sci",
            "soc",
            "sp",
            "t",
            "tg",
            "toy",
            "trash",
            "trv",
            "tv",
            "u",
            "v",
            "vg",
            "vip",
            "vp",
            "vr",
            "w",
            "wg",
            "wsg",
            "wsr",
            "x",
            "y",
        ]
        results = []
        board = config["board"]
        if board not in viable_boards:
            self.error_logger("Your chosen board does not exist on 4chan!")
            quit()
        # Create a HTML parser for parsing comments
        h = html2text.HTML2Text()
        h.ignore_links = False

        req = f"https://a.4cdn.org/{board}/threads.json"

        content = json.loads(requests.get(req).content)
        max_pages = max(1, min(len(content), int(config["max_pages"])))

        for page_index in range(max_pages):
            page = content[page_index]
            self.logger(f"Scraping page number: {page_index+1}")
            for thread_index, threads in enumerate(page["threads"]):
                self.logger(
                    f"Extracting posts from thread number: {thread_index+1}")
                thread_id = threads["no"]
                req = f"https://a.4cdn.org/{board}/thread/{thread_id}.json"
                thread_content = json.loads(requests.get(req).content)[
                    "posts"]  # thread content is a list of posts
                for post_index, post in enumerate(thread_content):
                    self.logger(
                        f"Extracting media and comments from post number: {post_index+1}"
                    )
                    post_row = []
                    post_row.append(post["no"])
                    post_row.append(thread_id)
                    post_row.append(post["time"])

                    try:
                        comment = post["com"]
                    except KeyError:
                        comment = "..."
                    else:
                        comment = h.handle(comment)
                    post_row.append(comment)

                    # Filename
                    try:
                        filename = post["filename"]
                    except KeyError:
                        filename = ""

                    if filename != "":
                        time_id = post["tim"]
                        extension = post["ext"]
                        full_file = f"{filename}{extension}"
                        file_url = f"https://i.4cdn.org/{board}/{time_id}{extension}"
                        post_row.append(full_file)
                        post_row.append(extension)
                        post_row.append(file_url)
                    else:
                        # No attachment: keep the three media columns empty
                        post_row.extend(["", "", ""])

                    results.append(post_row)

        self.logger("Scraping metadata complete")
        results.insert(0, [
            "id", "thread_id", "datetime", "comment", "filename", "ext", "url"
        ])
        return LocalElementsIndex(results)
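The two endpoints this scraper relies on are 4chan's read-only JSON API: a.4cdn.org/{board}/threads.json returns the catalog as a list of pages, each holding thread stubs with their "no" numbers, and a.4cdn.org/{board}/thread/{thread_id}.json returns the full "posts" list for one thread; media files are served from i.4cdn.org/{board}/{tim}{ext}, which is how the file_url above is built.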
Example #7
    def index(self, config):
        results = []
        board = config["board"]
        # viable_boards is the whitelist of valid board codes (see Example #6)
        if board not in viable_boards:
            self.error_logger("Your chosen board does not exist on 4chan!")
            quit()
        # Create a HTML parser for parsing comments
        h = html2text.HTML2Text()
        h.ignore_links = False

        req = f"https://a.4cdn.org/{board}/threads.json"

        content = json.loads(requests.get(req).content)
        for page_index, page in enumerate(content):
            self.logger(f"Scraping page number: {page_index+1}")
            for thread_index, threads in enumerate(page["threads"]):
                self.logger(
                    f"Extracting posts from thread number: {thread_index+1}")
                thread_id = threads["no"]
                req = f"https://a.4cdn.org/{board}/thread/{thread_id}.json"
                thread_content = json.loads(requests.get(req).content)[
                    "posts"]  # thread content is a list of posts
                for post_index, post in enumerate(thread_content):
                    self.logger(
                        f"Extracting media and comments from post number: {post_index+1}"
                    )
                    post_row = []
                    post_row.append(post["no"])
                    post_row.append(thread_id)
                    post_row.append(post["time"])

                    try:
                        comment = post["com"]
                    except KeyError:
                        comment = "..."
                    else:
                        comment = h.handle(comment)
                    post_row.append(comment)

                    # Filename
                    try:
                        filename = post["filename"]
                    except KeyError:
                        filename = ""

                    if filename != "":
                        time_id = post["tim"]
                        extension = post["ext"]
                        full_file = f"{filename}{extension}"
                        file_url = f"https://i.4cdn.org/{board}/{time_id}{extension}"
                        post_row.append(full_file)
                        post_row.append(extension)
                        post_row.append(file_url)
                    else:
                        # No attachment: keep the three media columns empty
                        post_row.extend(["", "", ""])
                    results.append(post_row)
        self.logger("Scraping metadata complete")
        results.insert(0, [
            "id", "thread_id", "datetime", "comment", "filename", "ext", "url"
        ])
        return LocalElementsIndex(results)