Example #1
    async def monitor_messages(self):
        print("Connecting...")
        await self._driver.connect()

        # get qr code
        print("Checking for login...")
        login_status = await self._driver.wait_for_login()
        if not login_status:
            filepath = await self._driver.get_qr()
            print("The QR is at", filepath.replace("/tmp", "qrs"))

        # wait for user to login
        tries = 0
        while not login_status:
            print("Wait for login...")
            login_status = await self._driver.wait_for_login()
            if tries > 30:
                raise Exception("Couldn't login")
            else:
                tries += 1

        await self._driver.save_firefox_profile(remove_old=True)
        self._db.add_json()

        while True:
            try:
                print("Checking for more messages, status", await
                      self._driver.get_status())

                for cnt in await self.get_unread_messages():
                    if self.is_cancelled:
                        break

                    for message in cnt.messages:
                        if isinstance(message, Message):
                            # Read chat metadata from the underlying JS object
                            js_obj = message.get_js_obj()
                            name = message.sender.push_name
                            if name is None:
                                name = message.sender.get_safe_name()
                            chat = js_obj['chat']['contact']['formattedName']

                            # Forward the message only if it contains links
                            # and is not caught by the content filter
                            newline = "\n"
                            links = get_links(message.content)
                            if links and not check_filter(message.content):
                                try:
                                    self._tg.log_link(
                                        chat, name,
                                        f"{newline.join(links)}\n\n---\n\n{message.content}"
                                    )
                                except Exception as e:
                                    self._tg.log_message(
                                        f"New invite link failed to deliver!, Check phone asap | error log_message = {e}"
                                    )

            except Exception as e:
                # Log and keep polling; fall through to the sleep below so
                # repeated failures don't busy-loop
                print(e)

            await self.sleep(3)
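Example #1 relies on two helpers that are not shown, get_links and check_filter. Their behavior is an assumption here; a minimal sketch, assuming get_links pulls http(s) URLs out of a message body with a regex and check_filter matches a small blocklist of phrases:

import re
from typing import Set

# Hypothetical blocklist; the real filter terms are not part of the example
FILTER_TERMS = ("unsubscribe", "promotion")

def get_links(text: str) -> Set[str]:
    """Return the set of http(s) URLs found in a message body."""
    if not text:
        return set()
    return set(re.findall(r"https?://\S+", text))

def check_filter(text: str) -> bool:
    """Return True if the text contains any blocklisted term."""
    lowered = (text or "").lower()
    return any(term in lowered for term in FILTER_TERMS)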
Example #2
    def __init__(self) -> None:
        print_app_info()

        self.browser: object = make_browser(headless=False)

        self.browser.get(get_defaults("url", "login"))

        self.scrape_data_amount: int = get_inputs(self.browser)

        self.filters: dict = get_filters(self.browser)

        self.result_links: list = get_links(
            self.browser, self.scrape_data_amount)

        self.scraped_details: dict = get_data(
            self.browser, self.result_links, self.scrape_data_amount)

        self.scraped_details["filters"] = self.filters

        quit_browser(self.browser)

        file_name = "./ScrapedData/ScrapedData_{}".format(
            time.strftime("%d_%b_%Y_%H_%M", time.localtime()))

        make_json(self.scraped_details, f"{file_name}.json")

        make_json_to_csv(
            f"{file_name}.json",
            f"{file_name}.csv"
        )

        print_app_end()
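The constructor above writes its results through make_json and make_json_to_csv, neither of which is defined in the snippet. A minimal sketch of make_json, assuming it simply serializes the scraped dictionary to disk and creates the output folder if needed:

import json
import os

def make_json(data: dict, path: str) -> None:
    """Write the scraped details to a JSON file, creating the folder if needed."""
    os.makedirs(os.path.dirname(path) or ".", exist_ok=True)
    with open(path, "w", encoding="utf-8") as fh:
        json.dump(data, fh, ensure_ascii=False, indent=2)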
Example #3
    def __init__(self, mail_bytes: bytes):
        """
        Initialize the object
        :param mail_bytes: bytes object representing the mail
        """
        msg = message_from_bytes(mail_bytes)

        # get sender
        self.sender, _ = decode_header(msg.get('From'))[0]
        if isinstance(self.sender, bytes):
            self.sender = self.sender.decode('utf-8')

        # get subject
        self.subject = decode_header(msg['Subject'])[0][0]
        if isinstance(self.subject, bytes):
            self.subject = self.subject.decode('utf-8')

        # get message body
        if msg.is_multipart():
            body = ""
            for part in msg.walk():
                if "text" in part.get_content_type():
                    try:
                        body += part.get_payload(decode=True).decode('utf-8')
                    except Exception:
                        continue
        else:
            body = msg.get_payload(decode=True).decode('utf-8')

        # check if message is to be filtered
        if check_filter(body) or check_filter(self.subject):
            self.links = set()
        else:
            # remove all quoted messages and footers to get just the message content
            body = sub(
                r"On (Mon|Tue|Wed|Thu|Fri|Sat|Sun), (Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) \d+, \d+ at \d+:\d+ [AP]M [\w\s]+ <.+@.+> wrote:.*",
                "",
                body,
                flags=DOTALL,
            )
            body = body.split("--")[0]

            # get all needed links from body
            self.links: Set[str] = get_links(body)
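Only __init__ is shown, so the class name is not visible here. The usage sketch below assumes a hypothetical name ParsedMail and shows how the raw mail_bytes it expects are typically fetched over IMAP:

import imaplib

# Hypothetical server, credentials, and class name, for illustration only
with imaplib.IMAP4_SSL("imap.example.com") as imap:
    imap.login("user@example.com", "app-password")
    imap.select("INBOX")
    _, data = imap.search(None, "UNSEEN")
    for num in data[0].split():
        _, msg_data = imap.fetch(num, "(RFC822)")
        mail = ParsedMail(msg_data[0][1])  # msg_data[0][1] holds the raw bytes
        print(mail.sender, mail.subject, mail.links)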
Example #4
def links(url):
    html_tree = helpers.open_url(url)
    found_links = helpers.get_links(html_tree)
    return jsonify({'links': found_links})
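The route depends on a helpers module that is not shown. A minimal sketch of what it might contain, assuming requests and lxml as the underlying libraries (both choices are assumptions):

import requests
from lxml import html

def open_url(url):
    """Fetch a page and return its parsed lxml tree."""
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    return html.fromstring(response.content)

def get_links(html_tree):
    """Return all href values found in the document."""
    return html_tree.xpath("//a/@href")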
Example #5
filename = "graph.txt"

graph = DiGraph()

if os.path.isfile(filename):
    graph.read_from_filename(filename)

if graph.nodes:
    to_traverse = graph.get_to_traverse()
else:
    root_pagename = "Kevin_Bacon"
    to_traverse = {root_pagename}
count = len(graph.nodes)

# traverse whole graph
with open(filename, "a+", encoding="utf-8") as file:
    while to_traverse:
        pagename = to_traverse.pop()
        graph.add_node(pagename)
        count += 1

        print(f"Traversing {pagename}")
        print(count)

        for link in get_links(pagename):
            graph.add_link(pagename, link)
            if link in graph.nodes:
                continue
            to_traverse.add(link)  # queue unvisited pages for later traversal

        graph.write_node_to_file(file, pagename)
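The crawl assumes a get_links(pagename) helper that yields the titles linked from a Wikipedia article. A sketch using the public MediaWiki API (the endpoint and parameters are real; treating this as the original helper is an assumption):

import requests

API_URL = "https://en.wikipedia.org/w/api.php"

def get_links(pagename):
    """Yield titles of article-namespace pages linked from the given page."""
    params = {
        "action": "query",
        "titles": pagename,
        "prop": "links",
        "plnamespace": 0,
        "pllimit": "max",
        "format": "json",
    }
    while True:
        data = requests.get(API_URL, params=params, timeout=10).json()
        for page in data["query"]["pages"].values():
            for link in page.get("links", []):
                yield link["title"].replace(" ", "_")
        if "continue" not in data:
            break
        params.update(data["continue"])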
Example #6
def link_crawler(start_url,
                 link_regex,
                 robots_url=None,
                 user_agent='statista',
                 max_depth=-1,
                 delay=3,
                 proxies=None,
                 num_retries=2,
                 cache=None,
                 scraper_callback=None):

    #: Initialize a crawl queue with a seed url to start the crawl from
    crawl_queue = [start_url]

    #: keep track of seen urls and the depth at which each was found
    seen = {}

    robots = {}

    throttle = Throttle(delay)

    #: start the crawl
    while crawl_queue:
        url = crawl_queue.pop()

        #: skip anything that is not an http(s) url
        if 'http' not in url:
            continue

        #: Get the domain
        domain = '{}://{}'.format(urlparse(url).scheme, urlparse(url).netloc)

        #: Fetch and cache a robots.txt parser the first time a domain is seen
        if domain not in robots:
            robots_url = '{}/robots.txt'.format(domain)
            # associate each domain with a corresponding parser, whether
            # one could be fetched or not
            robots[domain] = get_robots_parser(robots_url)
        robot_parser = robots[domain]

        #: continue to crawl even if there were problems finding robots.txt
        robots_file_missing = robot_parser is None

        #: crawl only when the url passes robots.txt restrictions
        if robots_file_missing or robot_parser.can_fetch(user_agent, url):
            depth = seen.get(url, 0)
            if depth == max_depth:
                #: Skip this link once the maximum crawl depth has been reached
                print('Skipping %s due to depth' % url)
                continue
            throttle.wait(url)
            html = download(url, num_retries=num_retries)
            if not html:
                continue
            if scraper_callback:
                scraper_callback(url, html)

            #: Get all links from page and filter only those matching given pattern
            for link in get_links(html):
                if re.search(link_regex, link):
                    if 'http' not in link:
                        # check if link is well formed and correct
                        if link.startswith('//'):
                            link = '{}:{}'.format(urlparse(url).scheme, link)
                        elif link.startswith('://'):
                            link = '{}{}'.format(urlparse(url).scheme, link)
                        else:
                            link = urljoin(domain, link)

                    if link not in seen:
                        seen[link] = depth + 1
                        crawl_queue.append(link)
        else:
            print('Blocked by robots.txt:', url)
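get_robots_parser, Throttle, and download are used above but not defined in the snippet. Sketches of the first two, assuming the standard-library robot parser and a simple per-domain delay (download is omitted):

import time
from urllib import robotparser
from urllib.parse import urlparse

def get_robots_parser(robots_url):
    """Return a parser for the given robots.txt URL, or None if it can't be fetched."""
    try:
        parser = robotparser.RobotFileParser()
        parser.set_url(robots_url)
        parser.read()
        return parser
    except Exception:
        return None

class Throttle:
    """Delay successive requests to the same domain."""

    def __init__(self, delay):
        self.delay = delay
        self.domains = {}

    def wait(self, url):
        domain = urlparse(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (time.time() - last_accessed)
            if sleep_secs > 0:
                time.sleep(sleep_secs)
        self.domains[domain] = time.time()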