Example #1
def main():
    index_dict = {}

    # Load a previously pickled index if one exists.
    try:
        pkl_file = open(indexer.Indexer.filename, 'rb')
        index_dict = pickle.load(pkl_file)
        pkl_file.close()
    except IOError:
        print("Pickle file not found.")

    indx = indexer.Indexer(index_dict)
    db_manager = dbmanager.dbmanager(DB_NAME)
    logging.basicConfig(filename=LOG_NAME,
                        format='%(asctime)s:%(levelname)s:%(message)s',
                        filemode='w', level=logging.WARN)
    frontier = ['http://www.theonion.com',
                'http://www.reddit.com',
                'https://en.wikipedia.org/wiki/Satire']
    visited = {}
    domains = {}
    db_visited = db_manager.get_visited()
    db_frontier = db_manager.get_frontier()

    frontier += db_frontier
    #shuffle(frontier)

    for url in db_visited:
        print("Already visited: " + url)
        visited[url] = 1

    current_threads = 0
    threads = []
    data = []
    t_urls = []

    for url in frontier:
        if visited.get(url, None):
            logging.info("Not requesting " + url + " because it has already been visited.")
            continue

        if domains.get(get_domain(url), 0) >= MAX_REQ_PER_DOMAIN:
            logging.info("Not requesting " + url + " because max requests per domain has been exceeded.")
            continue

        if is_blacklisted(url):
            logging.info("Not requesting " + url + " because it is blacklisted.")
            continue

        if current_threads < MAX_THREADS:
            logging.info("Requesting " + url)
            print("Requesting " + url + " as t=" + str(current_threads))
            visited[url] = 1

            urldom = get_domain(url)
            if urldom in domains:
                domains[urldom] += 1
            else:
                domains[urldom] = 1

            # Each Requester thread appends the HTML it fetches into its own list.
            d = []
            data.append(d)
            t_urls.append(url)
            t = Requester(url, TIME_LIMIT, d, MAX_SIZE_BYTES)
            t.start()
            threads.append(t)
            current_threads += 1

        # Once a full batch of threads is running (or the frontier is exhausted),
        # wait for all of them, index the results, and persist the state.
        if (current_threads >= MAX_THREADS) or (url == frontier[-1]):
            current_threads = 0
            for t in threads:
                t.join()

            for i in range(len(t_urls)):
                htmldata = ""
                if data[i]:
                    htmldata = data[i][0]
                db_manager.insert_visited(t_urls[i], len(htmldata))

                page_urls = list(set(get_urls(t_urls[i], htmldata)))
                indx.index_page(t_urls[i], htmldata)
                db_manager.insert_frontier(page_urls, t_urls[i])
                frontier += page_urls

            output_pkl = open(indexer.Indexer.filename, 'wb')
            pickle.dump(indx.index, output_pkl)
            output_pkl.close()

            threads = []
            data = []
            t_urls = []

    db_manager.close()
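
For readers unfamiliar with the project, the Requester used above is a worker thread that writes whatever HTML it fetched into the list passed to it (the loop reads data[i][0] after join()). The following is a minimal, hypothetical sketch of that interface, not the project's actual implementation:

import threading
import requests

class Requester(threading.Thread):
    # Hypothetical sketch: fetch a URL within a time limit and append the
    # HTML to the shared output list, skipping oversized responses.
    def __init__(self, url, time_limit, out, max_size_bytes):
        super(Requester, self).__init__()
        self.url = url
        self.time_limit = time_limit
        self.out = out
        self.max_size_bytes = max_size_bytes

    def run(self):
        try:
            resp = requests.get(self.url, timeout=self.time_limit)
            if len(resp.content) <= self.max_size_bytes:
                self.out.append(resp.text)
        except requests.RequestException:
            pass  # leave the list empty; the caller treats that as "no data"
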
Example #2
def _run_requesters(self):
    # Spawn one Requester thread per configured service and keep a reference
    # to each so it can be managed later.
    for service in self.urls_config['urls']:
        t = Requester(service['url'], service['delay'], self.dao)
        self.threads.append(t)
        t.start()
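
The loop above implies a configuration with a top-level 'urls' list whose entries carry a 'url' and a 'delay'. A hypothetical example of that shape (the real structure depends on the project's config file):

urls_config = {
    'urls': [
        {'url': 'http://example.com/health', 'delay': 5},
        {'url': 'http://example.com/metrics', 'delay': 30},
    ]
}
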
Example #3
class Worker():
    def __init__(self, host: str, port: int, config: dict, tbb_path):
        """
        Stores the work queue URL and various configuration options
        """
        # Create the work url
        self.work_url = "http://{}:{}".format(host, port)
        # Store number of times work is completed per type
        self.work_type_counts = {'normal': 0, 'tor': 0}
        # Create a requests session
        self.session = requests.Session()
        # Disable keepalive
        self.session.keep_alive = False
        # Get a logger
        self.logger = logging.getLogger()
        # Store given config
        self.config = config
        self.tbb_path = tbb_path
        # Initialize members that will be created later
        self.client_id = None
        self.tcpdump = None
        self.proxy = None
        self.requester = None

    def __enter__(self):
        """
        Ensures server is available
        Requests and stores a client ID from the server
        Gets a connection to the tcpdump daemon
        :throws Exception: if the client ID request fails
        """
        # Send requests to the URLs service until the status
        # page returns a response
        waiting = True
        while waiting:
            try:
                self.logger.info("Attempting to contact work queue")
                self.session.get("{}/status".format(self.work_url))
                waiting = False
            except Exception:
                self.logger.info(
                    "Attempt to contact work queue failed. Retrying")
                # Brief pause so the retry loop does not hammer the server
                time.sleep(1)
        # Request a client ID
        # TODO: look into renaming this "register"
        self.logger.info("Registering client with server")
        # TODO: work types as part of config
        response = self.session.post(
            "{}/client/add".format(self.work_url),
            json={'work_types': ['tor', 'normal']})
        # Parse response as json
        response = response.json()
        # Extract client id from response
        if response['success']:
            self.client_id = response['client_id']
        else:
            raise Exception(response['error'])
        # Start up a connection to the tcpdump daemon
        # TODO: parameterize socket path
        self.tcpdump = TcpDump('/tmp/tcpdump.socket')
        # Instantiate proxy object
        self.proxy = Proxy(self.tbb_path, self.config["tor"])
        # Instantiate requester object
        self.requester = Requester(self.config["firefox"],
                                   self.config["tor"]["port"])
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """
        Informs the server that the client has stopped
        :param exc_type:
        :param exc_value:
        :param traceback:
        """
        # If the program completed without error
        if exc_type is None:
            self.logger.info("Worker program finished without error")
        else:
            # Log the error
            self.logger.error("%s %s %s", exc_type, exc_value, traceback)
        # Indicate to the server that the client has stopped
        self.logger.info("Deregistering client from server")
        self.session.post(
            "{}/client/remove".format(self.work_url),
            json={'client_id': self.client_id})
        # Stop the tcpdump daemon
        self.tcpdump.shutdown()

    def request_work(self):
        """
        Requests a piece of work from the server
        """
        # Make a request to the server to get a URL to navigate to
        try:
            # Make a request for work
            response = self.session.post(
                "{}/work/get".format(self.work_url),
                json={'client_id': self.client_id})
            # 204 means no more URLs
            if response.status_code == 204:
                self.logger.info("No more URLs")
                return None
            # This will throw an exception if it fails, which is handled below
            work = response.json()
            return work
        except Exception as exc:
            self.logger.error("Failed to request work: %s", exc)
            return None

    def perform_work(self, work: dict):
        """
        Performs a piece of work given by the server
        :param work: work as received from the server
        """
        # Extract required variables from work
        mode = work["work_type"]
        # Once type is extracted, limit the scope of work
        work = work["work"]
        filename = work["filename"]
        url = "https://{}".format(work["url"])
        global_index = work["index"]
        # Set work type counter
        self.work_type_counts[mode] += 1
        # Scoped variables set inside try block
        error = None
        fatal = False
        # Store timestamp
        start_time = int(time.time() * 1e9)
        try:
            # Start packet capture
            self.tcpdump.start(filename)
            # Start proxy
            self.proxy.start(mode)
            # Start requester
            self.requester.start(mode)

            # Perform request in requester
            self.logger.info(
                "Navigating to %s in %s mode (local: %d) (global: %d)", url,
                mode, self.work_type_counts[mode], global_index)
            self.requester.request(url)

            # End requester
            self.requester.stop()
            # End proxy
            self.proxy.stop()
            # End packet capture
            self.tcpdump.stop()
        except TcpDumpError as err:
            self.logger.error(str(err))
            error = err
            fatal = True
        except Exception as err:
            self.logger.error(str(err))
            error = err
        # Store ending timestamp
        finish_time = int(time.time() * 1e9)
        # Create report
        report = {
            'success': error is None,
            'work_type': mode,
            'work': work,
            'type_index': self.work_type_counts[mode],
            'start_time': start_time,
            'finish_time': finish_time,
            # This will be stripped
            'fatal': fatal
        }
        # Store the error if given
        if error is not None:
            report['error'] = str(error)
        # Return report
        return report

    def send_report(self, report: dict):
        # Stringify error
        if 'error' in report:
            report['error'] = str(report['error'])
        # Send the report
        self.session.post("{}/work/report".format(self.work_url), json=report)
        # FIXME: Make a dummy request to the server to enforce the shutdown.
        # Allow this to fail.
        try:
            self.session.post("{}/status".format(self.work_url))
        except Exception:
            pass
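
Putting the pieces together, a driver for this Worker would register via the context manager and then loop over request_work, perform_work and send_report until the queue is drained. A minimal sketch, assuming host, port, config and tbb_path are supplied by the caller:

def run_worker(host, port, config, tbb_path):
    # Register with the work queue, pull URLs until the queue is empty
    # (request_work returns None), and stop early on a fatal error.
    with Worker(host, port, config, tbb_path) as worker:
        while True:
            work = worker.request_work()
            if work is None:
                break
            report = worker.perform_work(work)
            worker.send_report(report)
            if report.get('fatal'):
                break
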