Example #1
0
 def seed(self):
     """Seed the crawl queue with one entry per site definition.

     Iterates every document in ``self.db.sitedefs``, skipping malformed
     definitions (missing ``allowed_links``/``syntax``/``start_url`` keys,
     or an empty ``start_url``).  For each valid definition a new queue
     document is inserted into ``self.collection``.  If a document for the
     same key already exists (``DuplicateKeyError``), the existing document
     is updated instead: the start URL is added to its ``urls`` set and its
     timestamp is refreshed.
     """
     required_keys = (u'allowed_links', u'syntax', u'start_url')
     for sitedef in self.db.sitedefs.find():
         hostname = None
         try:
             # Reject definitions missing a required key or with an
             # empty start_url.
             if any(key not in sitedef for key in required_keys) or \
                not sitedef[u'start_url']:
                 logger.log_error('Bad site definition: ' + str(sitedef))
                 continue
             hostname = urlparse(sitedef[u'start_url']).netloc
             self.collection.insert({
                 'timestamp' : datetime.datetime.utcnow(),
                 'urls' : [sitedef[u'start_url']],
                 'hostname' : hostname,
                 'sitedef_id' : sitedef[u'_id']
             })
         except pymongo.errors.DuplicateKeyError:
             logger.log_warning('Duplicate key error : %s' % (str(sitedef)))
             # hostname may be falsy (e.g. start_url had no netloc);
             # nothing sensible to update in that case.
             if not hostname:
                 continue
             self.collection.update({
                 'hostname' : hostname
             }, {
                 '$addToSet' : {
                     'urls' : sitedef[u'start_url']
                 },
                 '$set' : {
                     'timestamp' : datetime.datetime.utcnow()
                 }
             })
Example #2
0
 def release(self, id, next_start=None, start_offset=None):
     """Re-schedule the queue entry *id* to run at a new start time.

     Args:
         id: the queue document's ``_id``; no-op (returns None) when None.
         next_start: absolute datetime to use as the base start time;
             when falsy, ``utcnow()`` is used instead.
         start_offset: seconds added on top of the base start time.

     At least one of ``next_start`` / ``start_offset`` must be provided,
     otherwise the call logs a warning and does nothing.
     """
     if id is None:
         return None
     if next_start is None and start_offset is None:
         logger.log_warning('Release invoked without valid start time, noop')
         return None
     # Base time: an explicit (truthy) next_start wins over "now".
     start_time = next_start if next_start else datetime.datetime.utcnow()
     if start_offset:
         start_time = start_time + datetime.timedelta(seconds=start_offset)
     with MongoScopeGuard():
         self.collection.update({
             '_id' : id
         },{
             '$set' : {
                 'timestamp' : start_time
             }
         })
Example #3
0
def parse_syntax(body, syntax, url, container=None, soup=None):
    if not syntax:
        return None
    if not soup:
        soup = BeautifulSoup(body, 'lxml')
    result = {}
    for key in syntax:
        #keyword is reserved
        if key == 'container':
            pass
        elif (type(syntax[key]) == str or type(syntax[key]) == unicode) and len(syntax[key].strip()):
            selector = '%s' % (syntax[key])
            try:
                nodes = soup.select(selector)
            except IndexError, e:
                logger.log_error('[%s] IndexError with selector %s %s' % (url, selector, e))
                continue
            if len(nodes) == 0:
                logger.log_warning('[%s] Could not find selector %s' % (url, str(selector)))
                continue
            node = nodes[0]
            val = node.get_text(' ', strip=True)
            if not val or val == '':
                #fall back
                if 'value' in node:
                    val = val[u'value']
                #elif ...:

            result[key] = {
                'text' : val
            }
            if node.has_attr('href'):
                if url:
                    result[key]['href'] = urlparse.urljoin(url, node['href'])
                else:
                    result[key]['href'] = node['href']
        elif type(syntax[key]) == dict:
            result[key] = parse_syntax(body, syntax[key], url, soup=soup, container=container)