def __init__(self, hostname=None, logger=None):
    """Prepare the logger and the JSON/HTTP helpers for a Call Connect host.

    :param hostname: Base URL of the service. When ``None`` the local
        default ``http://127.0.0.1:5000`` is used.
    :param logger: Forwarded to ``setup_console_logger`` together with the
        logger name ``"CallConnectHandler"``.
    """
    # 127.0.0.0 is the network address of the loopback block, not a host
    # address; the loopback host is 127.0.0.1.
    self.__hostname = hostname if hostname is not None else "http://127.0.0.1:5000"
    self.__log = setup_console_logger(logger, "CallConnectHandler")
    # Both helpers share the same base URL and logger.
    self.__jh = JsonHandler(hostname=self.__hostname, logger=self.__log)
    self.__hh = HttpHandler(hostname=self.__hostname, logger=self.__log)
    self.__log.debug("CallConnect handler initialized for: %s", self.__hostname)
class FlaskrilioHandler:
    """A simple wrapper for the local Flaskrilio service.

    Requests are delegated to a ``JsonHandler`` and an ``HttpHandler`` that
    are both bound to the same base URL and logger.
    """

    def __init__(self, hostname=None, logger=None):
        """Prepare the logger and the JSON/HTTP helpers.

        :param hostname: Base URL of the service. When ``None`` the local
            default ``http://127.0.0.1:5000`` is used.
        :param logger: Forwarded to ``setup_console_logger`` together with
            the logger name ``"FlaskrilioHandler"``.
        """
        # 127.0.0.0 is the network address of the loopback block, not a
        # host address; the loopback host is 127.0.0.1.
        self.__hostname = hostname if hostname is not None else "http://127.0.0.1:5000"
        self.__log = setup_console_logger(logger, "FlaskrilioHandler")
        self.__jh = JsonHandler(hostname=self.__hostname, logger=self.__log)
        self.__hh = HttpHandler(hostname=self.__hostname, logger=self.__log)
        self.__log.debug("Flaskrilio handler initialized for: %s", self.__hostname)

    def get_home(self):
        """Return whatever the JSON handler yields for a GET on ``/``."""
        return self.__jh.get(endpoint="/")

    def get_twiml(self, ctx):
        """Return the HTTP handler's result for a GET on endpoint ``ctx``.

        :param ctx: Endpoint path handed to the HTTP handler unchanged.
        """
        self.__log.debug("Getting TwilML for an endpoint: %s", ctx)
        return self.__hh.get(endpoint=ctx)

    def get_calls(self):
        """Return the result of a GET on ``/calls``.

        :return: The JSON handler's result, or an empty list when the
            handler returned ``None``.
        """
        calls = self.__jh.get(endpoint="/calls")
        self.__log.debug("Got calls: %s", calls)
        return [] if calls is None else calls
def crawl_worker(self, url):
    """
    Fetch a single url and extract the links and assets found on it.

    This is the task executed by the thread pool executor for every url it
    receives.

    :param url: Url to process; it is resolved against ``self.start_url``
    :return: tuple ``(site_map_entry, links_with_issues_entry)``.
        ``site_map_entry`` is ``{url: {'links': ..., 'assets': ...}}`` on
        success and ``None`` when the url was skipped or could not be
        fetched. ``links_with_issues_entry`` is a set that contains ``url``
        when no usable absolute link could be built from it, otherwise it
        is empty.
    :raises ValueError: if ``url`` is None
    """
    if url is None:
        raise ValueError("Url=%s has a None value" % url)

    site_map_entry = dict()
    links_with_issues_entry = set()
    module_logger.info("Working on url=%s", url)

    try:
        # Make a relative url into an absolute url
        access_link = LinkHandler.reconstruct_link(self.start_url, url)
        if access_link is None:
            raise ValueError("Access link value: %s" % access_link)
        if not DomainRule.apply(self.start_url, access_link):
            # Foreign domains are skipped, not reported as an issue.
            module_logger.info("url=%s is not in the same domain as %s",
                               access_link, self.start_url)
            module_logger.debug("Start url=%s site_map_entry=%s links_with_issues_entry=%s",
                                self.start_url, site_map_entry, links_with_issues_entry)
            return None, links_with_issues_entry
        module_logger.debug("Going to open access_link=%s", access_link)
    except Exception as err:
        # Link construction failed: report the url to the caller, the same
        # way the single threaded ``crawl`` records such links.
        module_logger.warning(err)
        links_with_issues_entry.add(url)
        return None, links_with_issues_entry

    try:
        content = HttpHandler.fetch_url_content(access_link)
        if content is None:
            raise ValueError("Content of the url=%s is None" % url)
    except Exception as err:
        # Any fetch problem (including empty content) just skips the url.
        module_logger.warning(err)
        return None, links_with_issues_entry

    links, assets = PageParser.parse_page_get_links(content)
    site_map_entry[url] = {'links': links, 'assets': assets}

    module_logger.info("Completed working on url=%s", url)
    module_logger.info("SiteMap=%s", site_map_entry)
    module_logger.info("Links with issues=%s", links_with_issues_entry)
    return site_map_entry, links_with_issues_entry
class CallConnectHandler:
    """A simple wrapper for Call Connect endpoints.

    Requests are delegated to a ``JsonHandler`` and an ``HttpHandler`` that
    are both bound to the same base URL and logger.
    """

    def __init__(self, hostname=None, logger=None):
        """Prepare the logger and the JSON/HTTP helpers.

        :param hostname: Base URL of the service. When ``None`` the local
            default ``http://127.0.0.1:5000`` is used.
        :param logger: Forwarded to ``setup_console_logger`` together with
            the logger name ``"CallConnectHandler"``.
        """
        # 127.0.0.0 is the network address of the loopback block, not a
        # host address; the loopback host is 127.0.0.1.
        self.__hostname = hostname if hostname is not None else "http://127.0.0.1:5000"
        self.__log = setup_console_logger(logger, "CallConnectHandler")
        self.__jh = JsonHandler(hostname=self.__hostname, logger=self.__log)
        self.__hh = HttpHandler(hostname=self.__hostname, logger=self.__log)
        self.__log.debug("CallConnect handler initialized for: %s", self.__hostname)

    def get_new_caller_id(self):
        """POST an empty JSON object to ``/api/id`` and return the ``id`` field."""
        caller_id = self.__jh.post(endpoint="/api/id", data="{}").json()['id']
        self.__log.debug("Received new Caller ID: %s", caller_id)
        return caller_id

    def get_redirect_to(self, caller_id, number):
        """POST a caller id / redirect number pair to ``/api/callers``.

        :param caller_id: Value sent as ``id``.
        :param number: Value sent as ``redirectTo``.
        :return: The object returned by the JSON handler's ``post``.
        """
        payload = {"id": caller_id, "redirectTo": number}
        # Log before sending so the payload is visible even if the request fails.
        self.__log.debug("POST %s/api/callers payload: %s", self.__hostname, payload)
        # json.dumps serializes the payload dict into a JSON string
        redir = self.__jh.post(endpoint="/api/callers", data=json.dumps(payload))
        self.__log.debug("Received new redirect_to: %s", redir.json())
        return redir

    def delete_caller_id(self, callerId):
        """Send a DELETE for ``/api/callers/<callerId>`` via the HTTP handler."""
        self.__log.debug("Deleting callerId: %s", callerId)
        return self.__hh.delete(endpoint="/api/callers/%s" % callerId)

    def get_callers_details(self, callerId):
        """Return the JSON handler's result for ``/api/callers/<callerId>``."""
        self.__log.debug("Getting details for callerId: %s", callerId)
        return self.__jh.get(endpoint="/api/callers/%s" % callerId)

    def get_number_pool(self):
        """GET ``/api/pool`` and return the handler's result unchanged.

        The result's ``json()`` is only called to build the debug message.
        """
        pool = self.__jh.get(endpoint="/api/pool")
        self.__log.debug("Got Number pool: %s", pool.json())
        return pool
class CallConnectHandler:
    """A simple wrapper for Call Connect endpoints.

    Requests are delegated to a ``JsonHandler`` and an ``HttpHandler`` that
    are both bound to the same base URL and logger.
    """

    def __init__(self, hostname=None, logger=None):
        """Prepare the logger and the JSON/HTTP helpers.

        :param hostname: Base URL of the service. When ``None`` the local
            default ``http://127.0.0.1:5000`` is used.
        :param logger: Forwarded to ``setup_console_logger`` together with
            the logger name ``"CallConnectHandler"``.
        """
        # 127.0.0.0 is the network address of the loopback block, not a
        # host address; the loopback host is 127.0.0.1.
        self.__hostname = hostname if hostname is not None else "http://127.0.0.1:5000"
        self.__log = setup_console_logger(logger, "CallConnectHandler")
        self.__jh = JsonHandler(hostname=self.__hostname, logger=self.__log)
        self.__hh = HttpHandler(hostname=self.__hostname, logger=self.__log)
        self.__log.debug("CallConnect handler initialized for: %s", self.__hostname)

    def get_new_caller_id(self):
        """POST an empty JSON object to ``/api/id`` and return the ``id`` field."""
        caller_id = self.__jh.post(endpoint="/api/id", data="{}").json()['id']
        self.__log.debug("Received new Caller ID: %s", caller_id)
        return caller_id

    def get_redirect_to(self, caller_id, number):
        """POST a caller id / redirect number pair to ``/api/callers``.

        :param caller_id: Value sent as ``id``.
        :param number: Value sent as ``redirectTo``.
        :return: The object returned by the JSON handler's ``post``.
        """
        payload = {"id": caller_id, "redirectTo": number}
        # Log before sending so the payload is visible even if the request fails.
        self.__log.debug("POST %s/api/callers payload: %s", self.__hostname, payload)
        # json.dumps serializes the payload dict into a JSON string
        redir = self.__jh.post(endpoint="/api/callers", data=json.dumps(payload))
        self.__log.debug("Received new redirect_to: %s", redir.json())
        return redir

    def delete_caller_id(self, callerId):
        """Send a DELETE for ``/api/callers/<callerId>`` via the HTTP handler."""
        self.__log.debug("Deleting callerId: %s", callerId)
        return self.__hh.delete(endpoint="/api/callers/%s" % callerId)

    def get_callers_details(self, callerId):
        """Return the JSON handler's result for ``/api/callers/<callerId>``."""
        self.__log.debug("Getting details for callerId: %s", callerId)
        return self.__jh.get(endpoint="/api/callers/%s" % callerId)

    def get_number_pool(self):
        """GET ``/api/pool`` and return the handler's result unchanged.

        The result's ``json()`` is only called to build the debug message.
        """
        pool = self.__jh.get(endpoint="/api/pool")
        self.__log.debug("Got Number pool: %s", pool.json())
        return pool
def __init__(self, hostname=None, logger=None):
    """Prepare the logger and the JSON/HTTP helpers for a Flaskrilio host.

    :param hostname: Base URL of the service. When ``None`` the local
        default ``http://127.0.0.1:5000`` is used.
    :param logger: Forwarded to ``setup_console_logger`` together with the
        logger name ``"FlaskrilioHandler"``.
    """
    # 127.0.0.0 is the network address of the loopback block, not a host
    # address; the loopback host is 127.0.0.1.
    self.__hostname = hostname if hostname is not None else "http://127.0.0.1:5000"
    self.__log = setup_console_logger(logger, "FlaskrilioHandler")
    # Both helpers share the same base URL and logger.
    self.__jh = JsonHandler(hostname=self.__hostname, logger=self.__log)
    self.__hh = HttpHandler(hostname=self.__hostname, logger=self.__log)
    self.__log.debug("Flaskrilio handler initialized for: %s", self.__hostname)
def crawl(self, start_url=None):
    """
    Single threaded webcrawler.

    Urls are taken from a LIFO queue seeded with the start url; every page
    that can be fetched contributes one entry to the site map and its links
    are queued in turn.

    :param start_url: Starting url; when given it replaces ``self.start_url``
    :return: starting url (str), sitemap (dict), links with issues (set)
    :raises ValueError: if neither ``start_url`` nor ``self.start_url`` is set
    """
    if self.start_url is None and start_url is None:
        raise ValueError("Start url cannot be None")
    if start_url is not None:
        self.start_url = start_url

    site_map = dict()
    visited = set()
    links_with_issues = set()
    queue = LifoQueue()
    queue.put(self.start_url)

    while not queue.empty():
        next_link = queue.get()
        module_logger.info("Retrieved url=%s from queue", next_link)

        try:
            if FileExtensionRule.apply(next_link):
                module_logger.info("Url=%s is a file asset", next_link)
                continue
        except ValueError as err:
            # The rule could not be evaluated; the url is still processed below.
            module_logger.warning(err)

        try:
            # Create an absolute url from a relative url
            access_link = LinkHandler.reconstruct_link(self.start_url, next_link)
            if access_link is None:
                module_logger.warning(
                    "Currently working on next_link=%s - But access link value is None,"
                    "Something went wrong during the link construction", next_link)
                links_with_issues.add(next_link)
                continue
            if access_link in visited:
                module_logger.info("Already visited url=%s, skipping", access_link)
                continue
            if not DomainRule.apply(self.start_url, access_link):
                module_logger.info("url=%s is not in the same domain as %s",
                                   access_link, self.start_url)
                continue
            module_logger.info("Going to access url=%s constructed from %s",
                               access_link, next_link)
        except ValueError as err:
            module_logger.warning(err)
            links_with_issues.add(next_link)
            continue
        except Exception as err:
            # The error is passed as a real format argument; a stray extra
            # argument would make the logging call itself fail to format.
            module_logger.error(
                "An unexpected error during the link construction of url=%s: %s",
                next_link, err)
            links_with_issues.add(next_link)
            continue

        try:
            content = HttpHandler.fetch_url_content(access_link)
            if content is None:
                module_logger.warning("Unable to get content from link=%s", access_link)
                continue
        except ValueError as err:
            # ``content`` is not referenced here: when the fetch raises it is
            # either unbound or left over from a previous iteration.
            module_logger.warning("Link=%s has a value issue", access_link)
            module_logger.exception(err)
            continue
        except Exception as err:
            module_logger.warning(
                "Something unexpected happened while fetching content of the url=%s",
                access_link)
            module_logger.exception(err)
            continue

        # Get links and assets from HTML page
        links, assets = PageParser.parse_page_get_links(content)

        module_logger.debug("Add link=%s into already visited list", next_link)
        # Remember both spellings: queued links are compared in their raw
        # form, while the skip check above compares the absolute form.
        visited.add(next_link)
        visited.add(access_link)

        module_logger.info("Extracted from url=%s - links=%s assets=%s",
                           access_link, links, assets)
        for link in links:
            if link not in visited:
                queue.put(link)
        module_logger.debug("Current link queue=%s", str(queue))

        site_map_record = {next_link: {'links': links, 'assets': assets}}
        module_logger.info("Adding record into site map=%s", site_map_record)
        site_map.update(site_map_record)

    module_logger.info("Crawling completed.")
    module_logger.info("SiteMap=%s", site_map)
    module_logger.info("Links with issues=%s", links_with_issues)
    return self.start_url, site_map, links_with_issues