Ejemplo n.º 1
0
 def __init__(self, *sc_queues, **kwargs):
     """Set up one spider process and its crawl settings.

     Positional arguments are stored unchanged in ``self.scqs`` (an empty
     list when none are given).  The keyword arguments below are removed
     from ``kwargs`` as they are read:

     spider_spoof_id    -- value for the 'User-Agent' entry of self.headers;
                           a Mozilla string is used when missing or None.
     kill_evt           -- event stored as self.kill_evt; a fresh, unset
                           multiprocessing.Event is created when missing
                           or None.
     seed               -- stored as the single element of self.urls_l
                           (that element is None when no seed is given).
     xtrees_q           -- queue stored as self.xtrees_q; a new Queue() when
                           missing.  Pass an external queue only for
                           inter-process communication.
     ext_due_q          -- queue of URL links for an external DUE unit
                           (default None).
     base_url_drop_none -- stored as-is (default True).
     urls_number_stop   -- stored as self.urls_number (default 1000).
     webpg_vect_tu      -- stored as-is (default None).
     save_path          -- directory created, parents included, when it is
                           given and does not exist yet.

     Raises OSError when save_path cannot be created.
     """
     Process.__init__(self)
     # Class-wide counter: each new spider takes the next number.
     SCSpider.Num += 1
     self.pnum = SCSpider.Num
     if sc_queues:
         self.scqs = sc_queues
     else:
         self.scqs = list()
     self.due = DUEUnit()
     self.link_extro = LinkExtractorTPool(feed=False)
     # self.headers keeps the HTTP User-Agent information used for masking
     # the crawler.
     spoof_id = kwargs.pop("spider_spoof_id", None)
     if spoof_id is None:
         spoof_id = 'Mozilla/5.0 (X11; U; Linux x86_64; en-GB; rv:1.9.1.9)'
     self.headers = { 'User-Agent' : spoof_id }
     # Event.clear() returns None, so using its result as the default left
     # self.kill_evt set to None.  A newly created Event is already unset.
     kill_evt = kwargs.pop("kill_evt", None)
     if kill_evt is None:
         kill_evt = multiprocessing.Event()
     self.kill_evt = kill_evt
     self.urls_l = [ kwargs.pop("seed", None) ]
     # Use an external Queue only for inter-process communication, if any.
     self.xtrees_q = kwargs.pop("xtrees_q", Queue())
     # ext_due_q is a Queue of URL links for an external DUE unit.
     self.ext_url_q = kwargs.pop("ext_due_q", None)
     self.base_url_drop_none = kwargs.pop("base_url_drop_none", True)
     # urls_number_stop: stop at a default value (if none is given by the
     # user) for politeness, and because more samples of the same site add
     # little.
     self.urls_number = kwargs.pop("urls_number_stop", 1000)
     self.webpg_vect_tu = kwargs.pop("webpg_vect_tu", None)
     self.save_path = kwargs.pop("save_path", None)
     if self.save_path and not os.path.isdir(self.save_path):
         # makedirs also creates missing parent directories, which
         # os.mkdir cannot do.
         os.makedirs(self.save_path)
     self.file_counter = 0
Ejemplo n.º 2
0
    def __init__(self):
        """Create the empty lookup tables, make sure the delegation-token
        directory exists, call self._parse_configs() and schedule
        self._handle_periodic through eventlet.
        """
        # Keyed by (username, shell_id) tuples; every user has a separate
        # set of shell ids.
        self._shells = {}
        # Short name -> command, e.g. ["pig", "-l", "/dev/null"].
        self._command_by_short_name = {}
        # Short name -> dictionary holding the environment for shells of
        # that type.
        self._env_by_short_name = {}
        # Username -> utils.UserMetadata object.
        self._meta = {}
        # Hue Instance ID (HID) -> greenlet currently fetching output for
        # that HID.
        self._greenlets_by_hid = {}
        # Process ID (PID) -> HID whose greenlet is currently doing a
        # "select" on the process's output fd.
        self._hids_by_pid = {}
        # PID -> set of greenlets that also want output from that process
        # but are not the one doing the select.
        self._greenlets_to_notify = {}
        # File descriptor -> Shell instance whose output it represents.
        self._shells_by_fds = {}
        # Greenlet -> whether it can be safely interrupted.
        self._greenlet_interruptable = {}

        token_dir = shell.conf.SHELL_DELEGATION_TOKEN_DIR.get()
        self._delegation_token_dir = token_dir
        if not os.path.exists(token_dir):
            os.mkdir(token_dir)

        self._parse_configs()
        eventlet.spawn_after(1, self._handle_periodic)
Ejemplo n.º 3
0
 def __init__(self, path=None):
     """Initialise the in-memory URL tables and the on-disk directory.

     path -- directory stored as self.filespath.  When it is None or
             empty, the hard-coded default below is used instead.

     The directory is created, including any missing parent directories,
     when it does not exist yet.  Raises OSError when it cannot be created.
     """
     self.id = None
     self.base_url = dict() # Keeps the hash and the base URL
     self.seen = dict() # Keeps the URLs with or without the base part
     self.filelist = list()
     # Attribute name keeps its original spelling because code outside this
     # method may refer to it.
     self.conditonal_var = threading.Condition()
     self.green_pool = GreenPool(100)
     self.filespath = path if path else "/home/dimitrios/Documents/Synergy-Crawler/seen_urls/"
     # self.filespath is never empty at this point, so only the directory
     # test is needed.  makedirs is used because the default path is nested
     # and os.mkdir fails when a parent directory is missing.
     if not os.path.isdir(self.filespath):
         os.makedirs(self.filespath)
Ejemplo n.º 4
0
  def __init__(self):
    """Start with empty bookkeeping tables and kick off periodic handling.

    Attributes created here (all start as empty dictionaries):

    _shells                  keys are (username, shell_id) tuples; each user
                             has their own set of shell ids.
    _command_by_short_name   short name -> command,
                             e.g. ["pig", "-l", "/dev/null"].
    _meta                    username -> utils.UserMetadata object.
    _greenlets_by_hid        Hue Instance ID (HID) -> greenlet currently
                             fetching output for that HID.
    _hids_by_pid             process ID (PID) -> HID whose greenlet is
                             currently doing a "select" on the process's
                             output fd.
    _greenlets_to_notify     PID -> set of greenlets also interested in that
                             process's output but not doing the select.
    _shells_by_fds           file descriptor -> Shell instance whose output
                             it represents.
    _greenlet_interruptable  greenlet -> whether it can be safely
                             interrupted.
    _env_by_short_name       short name -> dictionary with the environment
                             for shells of that type.

    Also stores the configured delegation-token directory in
    _delegation_token_dir and creates it when the path does not exist.
    """
    self._shells = dict()
    self._command_by_short_name = dict()
    self._meta = dict()
    self._greenlets_by_hid = dict()
    self._hids_by_pid = dict()
    self._greenlets_to_notify = dict()
    self._shells_by_fds = dict()
    self._greenlet_interruptable = dict()
    self._env_by_short_name = dict()

    self._delegation_token_dir = shell.conf.SHELL_DELEGATION_TOKEN_DIR.get()
    token_dir_missing = not os.path.exists(self._delegation_token_dir)
    if token_dir_missing:
      os.mkdir(self._delegation_token_dir)

    self._parse_configs()
    # Hand self._handle_periodic to eventlet with a delay argument of 1.
    eventlet.spawn_after(1, self._handle_periodic)
Ejemplo n.º 5
0
  def __init__(self):
    """Create the empty lookup tables, ensure the delegation-token directory
    exists, record every configured shell type and schedule
    self._handle_periodic through eventlet.

    self.shell_types ends up as a list with one dictionary per configured
    shell type, holding its nice name (e.g. "Python Shell"), its short name
    (e.g. "python") and whether utils.executable_exists() reported its
    command as present.  Only shell types whose executable exists are added
    to self._command_by_short_name.
    """
    # Keyed by (username, shell_id) tuples; each user has their own set of
    # shell ids.
    self._shells = {}
    # Short name -> command, e.g. ["pig", "-l", "/dev/null"].
    self._command_by_short_name = {}
    # Username -> utils.UserMetadata object.
    self._meta = {}
    # Hue Instance ID (HID) -> greenlet currently fetching output for it.
    self._greenlets_by_hid = {}
    # Process ID (PID) -> HID whose greenlet is currently doing a "select"
    # on the process's output fd.
    self._hids_by_pid = {}
    # PID -> set of greenlets also interested in that process's output but
    # not doing the select.
    self._greenlets_to_notify = {}
    # File descriptor -> Shell instance whose output it represents.
    self._shells_by_fds = {}
    # Greenlet -> whether it can be safely interrupted.
    self._greenlet_interruptable = {}

    token_dir = shell.conf.SHELL_DELEGATION_TOKEN_DIR.get()
    self._delegation_token_dir = token_dir
    if not os.path.exists(token_dir):
      os.mkdir(token_dir)

    available_types = []
    for short_name in shell.conf.SHELL_TYPES.keys():
      argv = shell.conf.SHELL_TYPES[short_name].command.get().strip().split()
      display_name = shell.conf.SHELL_TYPES[short_name].nice_name.get().strip()
      found = utils.executable_exists(argv)
      if found:
        self._command_by_short_name[short_name] = argv
      available_types.append({
        constants.NICE_NAME: display_name,
        constants.KEY_NAME: short_name,
        constants.EXISTS: found,
      })
    self.shell_types = available_types
    eventlet.spawn_after(1, self._handle_periodic)