Example #1
    def get(self):

        # Sanitize and truncate the query parameters
        flush_cache = urllib.unquote(
            cgi.escape(self.request.get('flushcache')).lower()[:50])
        reset_pos = urllib.unquote(
            cgi.escape(self.request.get('resetpos')).lower()[:50])
        get_stats = urllib.unquote(
            cgi.escape(self.request.get('getstats')).lower()[:50])

        # Flush all of memcache on request
        if flush_cache:
            if not memcache.flush_all():
                logging.error("Error flushing memcache")
        # Reset the crawl position in both memcache and the datastore
        if reset_pos:
            memcache.set("index", 1, 86400)  # cache for 24 hours
            SearchPosition(key_name="index", position=1).put()

        # Return memcache stats plus the crawl position as JSON
        if get_stats:
            d = memcache.get_stats()
            #index = memcache.get("index")
            #if index:
            #    d['indexmc'] = index
            #else:
            #    d['indexmc'] = -1
            index_from_ds = SearchPosition.get_by_key_name("index")
            if index_from_ds:
                d['indexds'] = index_from_ds.position
            else:
                d['indexds'] = -1
            s = json.dumps(d)
            self.response.out.write(s)
        else:
            # No action requested: render the admin page
            template_values = {}
            template = jinja_environment.get_template('html/admin_page.html')
            self.response.out.write(template.render(template_values))
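
These handlers read and write a SearchPosition entity whose definition is not shown anywhere in the examples. A minimal sketch of a compatible db.Model, assuming the crawl position is a single integer stored under key_name="index" (Examples 4 and 5, which use get_by_id and id=, suggest an ndb-style variant instead):

from google.appengine.ext import db

class SearchPosition(db.Model):
    # Single entity addressed by key_name="index"; holds the
    # crawler's current position in the external database.
    position = db.IntegerProperty(required=True, default=1)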
Example #2
  def get(self):
    index = memcache.get("index")

    # Fall back to the datastore when the position is not cached
    if not index:
      index_from_ds = SearchPosition.get_by_key_name("index")
      if not index_from_ds:
        index = 1
        # Seed the datastore with the starting position
        SearchPosition(key_name="index", position=index).put()
      else:
        index = index_from_ds.position
      memcache.add("index", index, 86400)  # cache for 24 hours
  
    # Spawn NUM_THREADS crawl tasks starting at the current index
    for i in range(index, index + NUM_THREADS):
      taskqueue.add(url='/crawl/worker', params={'index': i}) #, target='backend'
      # Advance the shared position in memcache for each task spawned
      if not memcache.incr("index"):
        logging.error("Memcache increment failed")
    #crawlPerson(index)

    # Update the datastore copy of the position
    index_from_ds = SearchPosition.get_by_key_name("index")
    if index_from_ds:
      index_from_ds.position = (index + NUM_THREADS)
    else:
      # Keep the datastore in step with memcache if the entity is missing
      index_from_ds = SearchPosition(key_name="index", position=(index + NUM_THREADS))
    index_from_ds.put()
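
The /crawl/worker endpoint these tasks target is not shown. A minimal sketch of a compatible handler, assuming it reads the index parameter set by taskqueue.add above and delegates to the crawlPerson function from Examples 4 and 5 (push-queue tasks are delivered as POST requests):

import webapp2

class CrawlWorker(webapp2.RequestHandler):
    def post(self):
        # 'index' comes from the params dict passed to taskqueue.add
        index = int(self.request.get('index'))
        crawlPerson(index)

app = webapp2.WSGIApplication([('/crawl/worker', CrawlWorker)])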
Example #3
  def get(self):

    # Sanitize and truncate the query parameters
    flush_cache = urllib.unquote(cgi.escape(self.request.get('flushcache')).lower()[:50])
    reset_pos = urllib.unquote(cgi.escape(self.request.get('resetpos')).lower()[:50])
    get_stats = urllib.unquote(cgi.escape(self.request.get('getstats')).lower()[:50])

    if flush_cache:
      if not memcache.flush_all():
        logging.error("Error flushing memcache")
    if reset_pos:
      memcache.set("index", 1, 86400)
      SearchPosition(key_name="index", position=1).put()

    # Return memcache stats plus the crawl position as JSON
    if get_stats:
      d = memcache.get_stats()
      #index = memcache.get("index")
      #if index:
      #  d['indexmc'] = index
      #else:
      #  d['indexmc'] = -1
      index_from_ds = SearchPosition.get_by_key_name("index")
      if index_from_ds:
        d['indexds'] = index_from_ds.position
      else:
        d['indexds'] = -1
      s = json.dumps(d)
      self.response.out.write(s)
    else:
      # No action requested: render the admin page with the legacy
      # webapp template module
      template_values = {}
      path = os.path.join(os.path.dirname(__file__), 'admin_page.html')
      self.response.out.write(template.render(path, template_values))
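
Unlike Example 1, which renders with Jinja2, the render(path, template_values) call here matches the legacy google.appengine.ext.webapp template module, so this variant presumably carries imports along the lines of:

import os
from google.appengine.ext.webapp import template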
Example #4
def crawlPerson(index):
    logging.info("In CrawlPerson")

    # If an index was supplied (e.g. by a task), crawl it directly
    # without touching the shared position.
    if index:
        result = Crawler().getMap(index)
        logging.info(str(result))
        putResult(result)
        return
    mutex = Mutex('mutex lock')
    try:
        mutex.lock()
        index_from_ds = SearchPosition.get_by_id("index")
        if index_from_ds:
            index = index_from_ds.position
        else:
            index_from_ds = SearchPosition(id='index', position=1)
            index_from_ds.put()
            index = 1
            
        result = Crawler().getMap(index)
        logging.info(str(result))
        
        if 'error' in result:
            logging.warn("error at index " + str(index) + ", error is " + result['error'])
            if result['error'] == 'page_not_found':
                logging.warn("Invalid index: " + str(index))
                raise Exception("page_not_found at index " + str(index))
            if result['error'] == 'end of database':
                logging.warn("Index out of range: " + str(index))
                index_from_ds.position = 1
                index_from_ds.put()
        else:
            logging.info("putting results")
    
            putResult(result)
    
            index_from_ds.position = (int(index) + 1)
            logging.info("INCREMENT " + str(index))
            index_from_ds.put()
    # The finally clause releases the lock on success and error paths alike
    finally:
        mutex.unlock()
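
The Mutex class is not shown in these examples. A plausible sketch of a memcache-backed lock that fits the lock()/unlock() calls above (an assumption, not the original implementation), built on memcache.add, which is atomic and succeeds only when the key is absent:

import time
from google.appengine.api import memcache

class Mutex(object):
    def __init__(self, name, timeout=30):
        self.key = "mutex:" + name
        self.timeout = timeout  # expiry guards against a crashed holder

    def lock(self):
        # add() succeeds for exactly one caller while the key is absent,
        # so spin until we win the race.
        while not memcache.add(self.key, 1, self.timeout):
            time.sleep(0.1)

    def unlock(self):
        memcache.delete(self.key)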
Example #5
def crawlPerson(index):
    logging.info("In CrawlPerson")
    
    mutex = Mutex('mutex lock')
    try:
        mutex.lock()
        index_from_ds = SearchPosition.get_by_id("index")
        if index_from_ds:
            index = index_from_ds.position
        else:
            index_from_ds = SearchPosition(id='index', position=1)
            index_from_ds.put()
            index = 1
            
        result = Crawler().getMap(index)
        logging.info(str(result))
        
        if 'error' in result:
            logging.warn("error at index " + str(index) + ", error is " + result['error'])
            if result['error'] == 'page_not_found':
                logging.warn("Invalid index: " + str(index))
                raise Exception("page_not_found at index " + str(index))
            if result['error'] == 'end of database':
                logging.warn("Index out of range: " + str(index))
                index_from_ds.position = 1
                index_from_ds.put()
        else:
            logging.info("putting results")
    
            putResult(result)
    
            index_from_ds.position = (int(index) + 1)
            logging.info("INCREMENT " + str(index))
            index_from_ds.put()
    # The finally clause releases the lock on success and error paths alike
    finally:
        mutex.unlock()
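
Since the mutex only guards a read-modify-write of a single entity, a datastore transaction would give the same atomicity without memcache. A sketch using db.run_in_transaction and the db-style model from Examples 1-3 (an alternative approach, not what the original code does):

from google.appengine.ext import db

def advance_position():
    def txn():
        # Read, increment, and write the position atomically; the
        # transaction retries on contention instead of spinning on a lock.
        pos = SearchPosition.get_by_key_name("index")
        if pos is None:
            pos = SearchPosition(key_name="index", position=1)
        index = pos.position
        pos.position = index + 1
        pos.put()
        return index
    return db.run_in_transaction(txn)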