Example #1
def create_importer(page):
    importer = Importer(page=page, style='')
    resp = urlfetch.fetch(page.url, deadline=10)
    if resp.status_code == 200:
        soup = BeautifulSoup(resp.content)
        parser = CSSParser()
        for tag in soup.findAll(re.compile(r'^(link|style)$')):
            if tag.name == 'link':
                if tag.get('href', None) and tag.get('rel', 'stylesheet').lower() == 'stylesheet':
                    url = urljoin(page.url, tag['href'])
                    if urlparse(url).netloc != urlparse(request.url).netloc:
                        importer.urls.append(url)
            elif tag.name == 'style':
                media = tag.get('media', None)
                # Resolve relative URLs in inline styles against the page itself
                sheet = parser.parseString(''.join(tag.contents).strip('\n'), href=page.url)
                style = sheet.cssText
                if media:
                    style = '@media %s {\n%s\n}' % (media, style)
                style = '/* Imported directly from %s */\n%s\n' % (page.url, style)
                importer.style += style
        # Patch around AppEngine's frame inspection
        del parser

        importer.put()
        queue_import(page)
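
queue_import(page) is referenced above but not shown. A minimal sketch of what it might look like, assuming App Engine's task queue and a hypothetical /_import handler that runs do_import (Example #2); neither the URL nor the queue setup comes from the original example:

from google.appengine.api import taskqueue

def queue_import(page):
    # Hypothetical: enqueue the next do_import run; 'page_key' matches what
    # do_import reads from request.form in Example #2.
    taskqueue.add(url='/_import', params={'page_key': str(page.key())})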
Example #2
def do_import():
    page = Page.get(request.form.get('page_key', ''))
    if not page or page.import_state != IMPORTING:
        return 'NO_IMPORTER' # We're done
    importer = Importer.gql('WHERE page=:1', page.key()).get()
    if not importer:
        # This requires a request to fetch the page and parse the URLs.
        # It also enqueues the next run.
        create_importer(page)
        return 'CREATED'
    if importer.urls:
        url = importer.urls.pop(0)
        parser = None
        try:
            resp = urlfetch.fetch(url, deadline=10)
            if resp.status_code == 200:
                parser = CSSParser()
                sheet = parser.parseString(resp.content, href=url)
                style = sheet.cssText
                importer.style += '\n\n/* Imported from %s */\n%s' % (url, style)
            else:
                raise Exception('Error fetching %s' % url)
        except Exception as e:
            import traceback
            importer.errors.append('Error importing %s' % url)
            logging.error('Error importing for Page %s from %s:\n%s\n%s', page.key().id(), url, e, traceback.format_exc())
        finally:
            # Patch around AppEngine's frame inspection (see Example #1)
            del parser
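
Because do_import reads page_key from request.form, it is presumably exposed as a POST handler that the queued task hits. A minimal wiring sketch, assuming Flask (app being the application's Flask instance) and the same hypothetical /_import URL as above:

@app.route('/_import', methods=['POST'])
def import_task():
    # Hypothetical route; do_import returns a short status string.
    return do_import()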
Example #3
    def run(self, initialized):
        importer_model = Importer.get(id=self.importer_name, ignore=404)
        if importer_model is None:
            importer_model = Importer(_id=self.importer_name, last_updated=datetime(1900, 1, 1))
            importer_model.save()

        try:
            for doc_type, doc in self.fetch(importer_model.last_updated):
                if doc_type == 'case':
                    model = Case.get(id=doc['case_id'], ignore=404)
                    if model is None:
                        model = Case(_id=doc['case_id'], **doc)
                        model.save()
                    elif model.last_updated < dateutil.parser.parse(doc['last_updated']):
                        model.update(**doc)
                elif doc_type == 'notice':
                    if dateutil.parser.parse(doc['date_published']) > importer_model.last_updated or not initialized:
                        case_id = doc['case_num'] + doc['case_code']
                        existing_case = Case.get(id=case_id, ignore=404)
                        if existing_case is None:
                            # No parent case yet; create a placeholder so the notice can attach to it.
                            case_doc = {
                                'case_id': case_id,
                                'last_updated': doc['last_updated'],
                                'address': doc['address'],
                                'name': 'NEWCASE'
                            }
                            new_case = Case(_id=case_id, **case_doc)
                            new_case.save()
                        model = Notice(**doc)
                        model.meta.parent = case_id
                        model.save()
                else:
                    continue

                importer_model.last_updated = doc['last_updated']
        finally:
            importer_model.save()
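
run() relies on self.fetch(last_updated) yielding (doc_type, doc) pairs whose keys match the lookups above. An illustrative subclass sketch; the base class name and the field values are assumptions, only the dict keys mirror what run() actually reads:

class DemoImporter(BaseImporter):  # BaseImporter: hypothetical name for the class defining run()
    importer_name = 'demo'

    def fetch(self, last_updated):
        # Yield (doc_type, doc) pairs; every doc carries 'last_updated'
        # so run() can advance the Importer checkpoint after each one.
        yield 'case', {
            'case_id': 'A0001',
            'last_updated': '2020-01-01T00:00:00',
            'address': '123 Main St',
            'name': 'Demo case',
        }
        yield 'notice', {
            'case_num': 'A',
            'case_code': '0001',
            'date_published': '2020-01-02',
            'last_updated': '2020-01-02T00:00:00',
            'address': '123 Main St',
        }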
Example #4
    def __init__(self, *args, **kwargs):
        method_path = kwargs['method_path']
        self.raw_methods = self.load_raw_data(method_path)
        self.importer = Importer(db_name=kwargs['db_name'])
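
load_raw_data is not shown in this snippet. A minimal sketch, written as a standalone function and assuming the raw methods are stored as a JSON file at method_path (an assumption the original does not state):

import json

def load_raw_data(method_path):
    # Hypothetical helper; the JSON-on-disk layout is an assumption.
    with open(method_path) as f:
        return json.load(f)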