async def monitor_messages(self):
    print("Connecting...")
    await self._driver.connect()

    # Get the QR code if we are not already logged in
    print("Checking for login...")
    login_status = await self._driver.wait_for_login()
    if not login_status:
        filepath = await self._driver.get_qr()
        print("The QR is at", filepath.replace("/tmp", "qrs"))

        # Wait for the user to scan the QR code and log in
        tries = 0
        while not login_status:
            print("Waiting for login...")
            login_status = await self._driver.wait_for_login()
            if tries > 30:
                raise Exception("Couldn't log in")
            tries += 1

    await self._driver.save_firefox_profile(remove_old=True)
    self._db.add_json()

    while True:
        try:
            print("Checking for more messages, status", await self._driver.get_status())
            for chat_obj in await self.get_unread_messages():
                if self.is_cancelled:
                    break
                for message in chat_obj.messages:
                    if not isinstance(message, Message):
                        continue
                    js_obj = message.get_js_obj()
                    name = message.sender.push_name
                    if name is None:
                        name = message.sender.get_safe_name()
                    chat = js_obj['chat']['contact']['formattedName']

                    # By default, the message should be sent on
                    newline = "\n"
                    if (links := get_links(message.content)) and not check_filter(message.content):
                        try:
                            self._tg.log_link(
                                chat,
                                name,
                                f"{newline.join(links)}\n\n---\n\n{message.content}",
                            )
                        except Exception as e:
                            self._tg.log_message(
                                f"New invite link failed to deliver! Check phone ASAP | error log_message = {e}"
                            )
        except Exception as e:
            print(e)
            continue
        await self.sleep(3)
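# Hedged usage sketch (not part of the original code): one way this coroutine
# could be driven from a synchronous entry point. The class name
# `MessageMonitor` and its no-argument constructor are assumptions for
# illustration only; the real owning class and its setup may differ.
import asyncio


async def main():
    monitor = MessageMonitor()          # hypothetical owning class
    await monitor.monitor_messages()    # runs until cancelled or an unhandled error


if __name__ == "__main__":
    asyncio.run(main())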
def __init__(self) -> None:
    print_app_info()

    self.browser: object = make_browser(headless=False)
    self.browser.get(get_defaults("url", "login"))

    self.scrape_data_amount: int = get_inputs(self.browser)
    self.filters: dict = get_filters(self.browser)
    self.result_links: list = get_links(
        self.browser, self.scrape_data_amount)
    self.scraped_details: dict = get_data(
        self.browser, self.result_links, self.scrape_data_amount)
    self.scraped_details["filters"] = self.filters

    quit_browser(self.browser)

    file_name = "./ScrapedData/ScrapedData_{}".format(
        time.strftime("%d_%b_%Y_%H_%M", time.localtime()))
    make_json(self.scraped_details, f"{file_name}.json")
    make_json_to_csv(f"{file_name}.json", f"{file_name}.csv")

    print_app_end()
def __init__(self, mail_bytes: bytes): """ Initialize the object :param mail_bytes: bytes object representing the mail """ email = message_from_bytes(mail_bytes) # get sender self.sender, _ = decode_header(email.get('From'))[0] if isinstance(self.sender, bytes): self.sender = self.sender.decode('utf-8') # get subject self.subject = decode_header(email['Subject'])[0][0] if isinstance(self.subject, bytes): self.subject = self.subject.decode('utf-8') # get message body if email.is_multipart(): body = "" for part in email.walk(): if "text" in part.get_content_type(): try: body += part.get_payload(decode=True).decode('utf-8') except: continue else: body = email.get_payload(decode=True).decode('utf-8') # check if message is to be filtered if check_filter(body) or check_filter(self.subject): self.links = set() else: # remove all quoted messages and footers to get just the message content body = sub( r"On (Mon|Tue|Wed|Thu|Fri|Sat|Sun), (Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) \d+, \d+ at \d+:\d+ [AP]M [\w\s]+ <.+@.+> wrote:.*", "", body, flags=DOTALL, ) body = body.split("--")[0] # get all needed links from body self.links: Set[str] = get_links(body)
def links(url):
    html_tree = helpers.open_url(url)
    links = helpers.get_links(html_tree)
    return jsonify({'links': links})
filename = "graph.txt" graph = DiGraph() if os.path.isfile(filename): graph.read_from_filename(filename) if graph.nodes: to_traverse = graph.get_to_traverse() else: root_pagename = "Kevin_Bacon" to_traverse = {root_pagename} count = len(graph.nodes) # traverse whole graph with open(filename, "a+", encoding="utf-8") as file: while to_traverse: pagename = to_traverse.pop() graph.add_node(pagename) count += 1 print(f"Traversing {pagename}") print(count) for link in get_links(pagename): graph.add_link(pagename, link) if link in graph.nodes: continue to_traverse.add(link) # BFS graph.write_node_to_file(file, pagename)
def link_crawler(start_url, link_regex, robots_url=None, user_agent='statista',
                 max_depth=-1, delay=3, proxies=None, num_retries=2, cache=None,
                 scraper_callback=None):
    #: Initialize a crawl queue with a seed URL to start the crawl from
    crawl_queue = [start_url]
    #: Keep track of seen URLs and their crawl depth
    seen = {}
    robots = {}
    throttle = Throttle(delay)

    #: Start the crawl
    while crawl_queue:
        url = crawl_queue.pop()
        #: robots.txt
        robots_file_present = False
        if 'http' not in url:
            continue

        #: Get the domain
        domain = '{}://{}'.format(urlparse(url).scheme, urlparse(url).netloc)

        #: Get the robots parser for this domain from the robots dictionary
        robot_parser = robots.get(domain)

        #: Set a default robots URL and a parser for it if there isn't one
        if not robot_parser and domain not in robots:
            robots_url = '{}/robots.txt'.format(domain)
            robot_parser = get_robots_parser(robots_url)
            if not robot_parser:
                #: Continue to crawl even if there are problems finding the
                #: robots.txt file
                robots_file_present = True
            #: Associate each domain with a corresponding parser, whether
            #: present or not
            robots[domain] = robot_parser
        elif domain in robots and not robot_parser:
            #: Parser lookup failed before for this domain; keep crawling
            #: without restrictions
            robots_file_present = True

        #: Crawl only when the URL passes robots.txt restrictions
        if robots_file_present or robot_parser.can_fetch(user_agent, url):
            depth = seen.get(url, 0)
            if depth == max_depth:
                #: Skip the link once the maximum crawl depth is reached
                print('Skipping %s due to depth' % url)
                continue
            throttle.wait(url)
            html = download(url, num_retries=num_retries)
            if not html:
                continue
            if scraper_callback:
                scraper_callback(url, html)
            #: Get all links from the page and keep only those matching the given pattern
            for link in get_links(html):
                if re.search(link_regex, link):
                    if 'http' not in link:
                        #: Fix protocol-relative or otherwise malformed links
                        if link.startswith('//'):
                            link = '{}:{}'.format(urlparse(url).scheme, link)
                        elif link.startswith('://'):
                            link = '{}{}'.format(urlparse(url).scheme, link)
                        else:
                            link = urljoin(domain, link)
                    if link not in seen:
                        seen[link] = depth + 1
                        crawl_queue.append(link)
        else:
            print('Blocked by robots.txt:', url)
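# Hedged usage sketch: the seed URL and regex below are placeholders, and
# `Throttle`, `download`, `get_robots_parser`, and `get_links` are assumed to
# be provided by the surrounding module, as in the function above.
if __name__ == '__main__':
    link_crawler(
        'http://example.webscraping.com/index',   # hypothetical seed URL
        r'/(index|view)/',                        # follow only matching links
        max_depth=2,                              # stop expanding after two levels
        delay=3,                                  # seconds between requests per domain
    )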