def _generate_index(source, logger):
    """Return the path of the cached user-documentation index, building it if needed.

    Takes the contents of *source*, looks for the lists of tools and
    commands (``<div id="tlist">`` and ``<div id="clist">``), and passes
    each one to ``_update_list`` so that tools and commands from bundles
    that come with documentation can be inserted.  The result is written
    to ``<user_cache_dir>/docs/user/index.html``.

    Args:
        source: The HTML template, handed unchanged to ``lxml.html.parse``.
        logger: Forwarded unchanged to ``_update_list``.

    Returns:
        The path of the index file, or ``None`` when no toolshed is
        available.  An already existing index file is returned as is and
        is not regenerated.
    """
    from chimerax import app_dirs
    user_dir = os.path.join(app_dirs.user_cache_dir, 'docs', 'user')
    path = os.path.join(user_dir, 'index.html')
    if os.path.exists(path):
        return path
    os.makedirs(user_dir, exist_ok=True)

    from chimerax.core import toolshed
    ts = toolshed.get_toolshed()
    if ts is None:
        return None

    # Look for <div id="foobar">
    import lxml.html
    html = lxml.html.parse(source)
    updaters = {
        "clist": ('commands', _update_commands),
        "tlist": ('tools', _update_tools),
    }
    for node in html.iterfind(".//div[@id]"):
        entry = updaters.get(node.attrib["id"])
        if entry is not None:
            what, update = entry
            _update_list(ts, node, what, update, logger)
    data = lxml.html.tostring(html)

    # Write to a temporary name and rename it into place: a partially
    # written index.html would otherwise pass the os.path.exists() check
    # above on every later call and be served as the finished index.
    tmp_path = '%s.%d.tmp' % (path, os.getpid())
    try:
        with open(tmp_path, 'wb') as f:
            f.write(data)
        os.replace(tmp_path, path)
    except BaseException:
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        raise
    return path
def handle(self, **options):
    """Parse the brand tables of an HTML file and optionally store them.

    Every ``<table>`` of ``options['html_file']`` is inspected.  A table
    counts as a brand table when the row following its first child (the
    caption) has at least two cells and the first two both carry
    ``colspan="5"``; other tables are reported and skipped.  For a brand
    table the brand row is handed to ``self.parseBrand``, the resulting
    note, company, brands and sites are created, and the remaining rows
    are handed to ``self.parseProducts``.

    Args:
        **options: Command options.  ``html_file`` is passed to
            ``lxml.html.parse``; when ``save`` is truthy the created
            ``Note``, ``Company``, ``Brand`` and ``Site`` objects are
            saved, otherwise the file is only parsed and reported on.
    """
    self.stdout.write('Loading file %s...' % options['html_file'])
    self.stdout.write('Save %s...' % options['save'])
    save = options['save']

    html = lxml.html.parse(options['html_file'])
    tables = html.findall(".//table")
    self.stdout.write('nb tables %s' % len(tables))

    for table in tables:
        self.error_warning = 0

        if len(table) == 0:
            self.stdout.write('\tNot a brand table')
            continue

        # First child must be a caption
        self.caption = table[0]

        # Next row: parse brand name, url, and date
        row = self.caption.getnext()
        # Compare by value: "is not '5'" tested object identity and only
        # worked through an interpreter implementation detail.
        if (row is None or len(row) < 2
                or row[0].get('colspan') != '5'
                or row[1].get('colspan') != '5'):
            self.stdout.write('\tNot a brand table')
            continue

        # Parse Brand
        # NOTE(review): parseBrand presumably fills self.brand_note,
        # self.date, self.certification, self.brand_name and self.url,
        # which are read below -- confirm against its definition.
        self.parseBrand(row[1])

        brand_note = None
        new_company = None

        if self.brand_note != []:
            brand_note = Note(note='. '.join(self.brand_note))
            self.stdout.write("\tNote: %s" % brand_note.note)
            if save:
                brand_note.save()

        if save:
            new_company = Company(
                name=self.caption.text,
                validation_date=self.date,
                note=brand_note,
                certification=' / '.join(self.certification),
            )
            new_company.save()

        for name in self.brand_name.split('/'):
            if save:
                brand = Brand(name=name.strip(), company=new_company)
                brand.save()

        for url in self.url:
            url = url.strip('/')
            if save:
                new_site = Site(domain=url, company=new_company)
                new_site.save()

        # Next row must be for table header
        row = row.getnext()
        text = row.find("td").xpath("string()")
        if text != "Description":
            self.stdout.write(
                red + ('Header table first column %s is not Description' % text) + reset
            )

        # Parse products
        self.parseProducts(row, new_company, save)
def iter_links(body):
    """Yield the target of every ``<a href=...>`` found in *body*.

    Hrefs without ``../`` are yielded as written.  Hrefs containing
    ``../`` are normalised first: when they carry no scheme they are
    joined to the document's ``<base href>`` (if there is one) and reduced
    to a host-less path, then every ``segment/../`` pair is collapsed.
    The fragment (``#...``) and query string (``?...``) are always
    removed, and links that end up empty are dropped.

    A link that still starts with ``../`` after resolution cannot be
    normalised and is skipped.  If *body* cannot be parsed the error is
    logged and nothing is yielded.

    NOTE(review): hrefs that already carry a scheme keep their scheme and
    host; only base-resolved hrefs are reduced to a path -- confirm this
    is what callers expect.

    Args:
        body: HTML markup, handed unchanged to ``lxml.html.fromstring``.

    Yields:
        str: One normalised href per usable link, in document order.
    """
    try:
        html = lxml.html.fromstring(body)
    except (lxml.etree.ParseError, lxml.etree.ParserError) as exc:
        logger.warning(exc)
        return

    # Looked up lazily, once per document; "" means "no usable base".
    base = None

    for link in html.iterfind('.//a'):
        href = link.attrib.get('href')
        if not href:
            continue

        if '../' in href:
            if '://' not in href:
                if base is None:
                    base = _document_base(html)
                if base:
                    # Join to the base exactly once; re-joining after each
                    # collapsed "../" would duplicate the base path.
                    href = base + href.lstrip('/')
                    if '://' in href:
                        # Drop scheme and host, keep the path.
                        href = '/' + href.split('://', 1)[1].split('/', 1)[-1]
            href = _collapse_parent_refs(href)
            if href is None:
                continue

        href = href.split('#', 1)[0]
        href = href.split('?', 1)[0]
        if href:
            yield href


def _document_base(html):
    """Return the ``<base href>`` of *html* with one trailing slash, or ``""``."""
    node = html.find('.//base')
    if node is None:
        return ""
    value = node.attrib.get('href')
    if not value:
        return ""
    return value.rstrip('/') + '/'


def _collapse_parent_refs(href):
    """Collapse ``segment/../`` pairs in *href*; ``None`` if it starts with ``../``."""
    while '../' in href:
        i = href.find('../')
        if i == 0:
            # Nothing to the left to climb out of.
            return None
        previous = href.rfind('/', 0, i - 1)
        after = href[i + 3:]
        if previous == -1:
            href = after
        else:
            href = href[:previous] + "/" + after
    return href
def parse_html(self, url):
    """Extract spatial coverage, author and image URLs from an archived page.

    The part of *url* after the last ``articles/`` is looked up below
    ``self.base_path``.  If that file exists it is parsed and
    ``self.spatial_coverage``, ``self.author`` and ``self.image_urls`` are
    updated; otherwise the problem is logged and *url* is added to
    ``self.error_urls``.

    Args:
        url: URL of the article; only the part after ``articles/`` is used
            to locate the file.
    """
    page = url.split('articles/')[-1]
    page_path = self.base_path.joinpath(page)
    if not page_path.exists():
        logging.error('Url `{}` does not map to an HTML file in the archive.'.format(url))
        self.error_urls.add(url)
        return

    # Parse the file whose existence was just checked; the bare relative
    # name would be resolved against the current working directory.
    html = lxml.html.parse(str(page_path))
    logging.info('HTML page `{}` exists, and parses.'.format(url))

    # Dateline is in the first p, unless that is an image, then it is in the third.
    dateline = html.find('.//{*}p')
    if dateline.text is None:
        dateline = html.findall('.//{*}p')[2]
    if 'BLACKSBURG, Va.' in dateline.text:
        self.spatial_coverage = 'Blacksburg, Va.'
    else:
        # Everything before ", <abbreviated month>" is taken as the place.
        date_issued = self.date_issued.strftime(', %b')
        self.spatial_coverage = dateline.text.split(date_issued)[0].title()
        if (len(self.spatial_coverage) > 25
                or '\n' in self.spatial_coverage
                or ' ' == self.spatial_coverage):
            # Sanity check: These are symptoms of errors. Change them to Blacksburg.
            self.spatial_coverage = 'Blacksburg, Va.'
    logging.debug('Spatial Coverage: {}'.format(self.spatial_coverage))

    # Author is in the first li of the last ul, or the one before that, if it exists.
    html_lists = html.findall('.//{*}ul')
    author = html_lists[-1].find('./{*}li').text
    if author is None:
        try:
            author = html_lists[-2].find('./{*}li').text
        except IndexError:
            logging.error('No author found.')
    if author is not None:
        # Collapse runs of whitespace into single spaces.
        author = ' '.join(author.split())
    self.author = author
    logging.debug('Author: {}'.format(self.author))

    # Any img tag is a related file.
    for image in html.iterfind('.//{*}img'):
        self.image_urls.add(image.get('src'))
    if self.image_urls:
        logging.debug('All image urls: {}'.format(self.image_urls))