import re
from urlparse import urljoin, urlparse

from BeautifulSoup import BeautifulSoup
from cssutils import CSSParser
from flask import request
from google.appengine.api import urlfetch


def create_importer(page):
    importer = Importer(page=page, style='')
    resp = urlfetch.fetch(page.url, deadline=10)
    if resp.status_code == 200:
        soup = BeautifulSoup(resp.content)
        parser = CSSParser()
        for tag in soup.findAll(re.compile(r'^(link|style)$')):
            if tag.name == 'link':
                # Queue external stylesheet URLs for the follow-up import runs.
                if tag.get('href', None) and tag.get('rel', 'stylesheet').lower() == 'stylesheet':
                    url = urljoin(page.url, tag['href'])
                    if urlparse(url).netloc != urlparse(request.url).netloc:
                        importer.urls.append(url)
            elif tag.name == 'style':
                # Inline <style> blocks are parsed immediately; the page URL is
                # the base href for resolving relative references.
                media = tag.get('media', None)
                sheet = parser.parseString(''.join(tag.contents).strip('\n'), href=page.url)
                style = sheet.cssText
                if media:
                    style = '@media %s {\n%s\n}' % (media, style)
                style = '/* Imported directly from %s */\n%s\n' % (page.url, style)
                importer.style += style
        # Patch around AppEngine's frame inspection
        del parser
    importer.put()
    queue_import(page)
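# create_importer defers the next step through queue_import, which is not
# shown in this snippet. A minimal sketch, assuming the standard App Engine
# task queue API; the handler path '/tasks/import' and the countdown value
# are illustrative assumptions, not from the original:
from google.appengine.api import taskqueue

def queue_import(page):
    # Enqueue a task so do_import picks up the next stylesheet URL.
    taskqueue.add(url='/tasks/import',  # assumed handler path
                  params={'page_key': str(page.key())},
                  countdown=1)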
import logging
import traceback

from cssutils import CSSParser
from flask import request
from google.appengine.api import urlfetch


def do_import():
    page = Page.get(request.form.get('page_key', ''))
    if not page or page.import_state != IMPORTING:
        return 'NO_IMPORTER'  # We're done
    importer = Importer.gql('WHERE page=:1', page.key()).get()
    if not importer:
        # This requires a request to fetch the page and parse the URLs.
        # It also enqueues the next run.
        create_importer(page)
        return 'CREATED'
    if importer.urls:
        url = importer.urls.pop(0)
        parser = None
        try:
            resp = urlfetch.fetch(url, deadline=10)
            if resp.status_code == 200:
                parser = CSSParser()
                sheet = parser.parseString(resp.content, href=url)
                style = sheet.cssText
                importer.style += '\n\n/* Imported from %s */\n%s' % (url, style)
            else:
                raise Exception('Error fetching %s' % url)
        except Exception as e:
            importer.errors.append('Error importing %s' % url)
            logging.error('Error importing for Page %s from %s:\n%s\n%s',
                          page.key().id(), url, e, traceback.format_exc())
        finally:
            # The original snippet is truncated here; the likely continuation,
            # mirroring create_importer, is to drop the parser reference,
            # persist progress, and enqueue the next run.
            del parser
            importer.put()
            queue_import(page)
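# do_import reads page_key from request.form, so it is presumably the body of
# a POST task handler. A minimal sketch of that wiring, assuming Flask and the
# '/tasks/import' path used in the queue_import sketch above (both are
# assumptions, not from the original):
from flask import Flask

app = Flask(__name__)

@app.route('/tasks/import', methods=['POST'])
def import_task():
    # The task queue retries on non-200 responses; do_import's status strings
    # are returned as the 200 body.
    return do_import()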
from datetime import datetime

import dateutil.parser


def run(self, initialized):
    importer_model = Importer.get(id=self.importer_name, ignore=404)
    if importer_model is None:
        # First run: start from a sentinel date so everything is fetched.
        importer_model = Importer(_id=self.importer_name,
                                  last_updated=datetime(1900, 1, 1))
        importer_model.save()
    try:
        for doc_type, doc in self.fetch(importer_model.last_updated):
            if doc_type == 'case':
                model = Case.get(id=doc['case_id'], ignore=404)
                if model is None:
                    model = Case(_id=doc['case_id'], **doc)
                    model.save()
                elif model.last_updated < dateutil.parser.parse(doc['last_updated']):
                    model.update(**doc)
            elif doc_type == 'notice':
                if dateutil.parser.parse(doc['date_published']) > importer_model.last_updated or not initialized:
                    case_id = doc['case_num'] + doc['case_code']
                    existing_case = Case.get(id=case_id, ignore=404)
                    if existing_case is None:
                        # Create a placeholder case so the notice has a parent.
                        case_doc = {
                            'case_id': case_id,
                            'last_updated': doc['last_updated'],
                            'address': doc['address'],
                            'name': 'NEWCASE',
                        }
                        new_case = Case(_id=case_id, **case_doc)
                        new_case.save()
                    model = Notice(**doc)
                    model.meta.parent = case_id
                    model.save()
                else:
                    continue
            # Parse so the checkpoint stays a datetime, comparable against the
            # parsed timestamps above.
            importer_model.last_updated = dateutil.parser.parse(doc['last_updated'])
    finally:
        importer_model.save()
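# run() consumes self.fetch(last_updated), which must yield (doc_type, doc)
# tuples where doc_type is 'case' or 'notice' and doc carries at least a
# 'last_updated' timestamp plus the fields used above. A minimal sketch of
# that contract; the _load_rows_since helper and the type test are
# hypothetical, not from the original:
def fetch(self, last_updated):
    for row in self._load_rows_since(last_updated):  # hypothetical source
        # Cases carry their own id; everything else is treated as a notice.
        if 'case_id' in row:
            yield 'case', row
        else:
            yield 'notice', row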
def __init__(self, *args, **kwargs):
    method_path = kwargs['method_path']
    self.raw_methods = self.load_raw_data(method_path)
    self.importer = Importer(db_name=kwargs['db_name'])
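# load_raw_data is referenced but not defined in this snippet. A minimal
# sketch, assuming method_path points at a JSON file; the file format is an
# assumption, not from the original:
import json

def load_raw_data(self, method_path):
    # Read the raw method definitions the importer will consume.
    with open(method_path) as f:
        return json.load(f)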