def populate(options):
    """Parse the archive listing at ``options.url`` and commit the results.

    The engine is started first via ``start_engine(options)``.  The text of
    every ``<a>`` that is a direct child of a ``<td>`` in the fetched page is
    then handed, together with ``options``, to a fresh ``ChatState``'s parser.
    Whatever ``state.objects()`` returns is added to ``session`` and committed.

    Args:
        options: Object passed to ``start_engine`` and to the parser; its
            ``url`` attribute names the page to parse.
    """
    start_engine(options)

    listing = html_parse(options.url)
    archives = listing.xpath('//td/a/text()')

    state = ChatState()
    state.parser.parse(archives, options)

    # NOTE(review): `session` looks like a module-level SQLAlchemy-style
    # session -- confirm against the rest of the file.
    session.add_all(state.objects())
    session.commit()
def parse_playlist_form(fp):
    """Extract playlist rows from the text inputs of an HTML playlist form.

    Every ``<input type="text">`` in the document is inspected:

    * The input named ``date`` supplies the show date.
    * Inputs whose name matches ``PATTERN`` yield a ``(field, num)`` pair and
      their value is stored under ``field`` in row number ``num``.  A
      ``timestamp`` field is combined with the show date and stored as
      ``unix_time``, ``date_str`` and ``time_str`` instead.
    * Inputs without a ``name`` attribute, or whose name matches neither of
      the above, are ignored.

    Args:
        fp: Whatever ``html_parse`` accepts as a source.

    Returns:
        A ``(date_str, show)`` tuple.  ``show`` is the list of rows ordered by
        row number, each a dict restricted to ``KEEPER_KEYS`` (missing keys
        are ``None``); rows with no truthy value are dropped.  ``date_str``
        is the ``date_str`` entry of the first kept row.

    Raises:
        IndexError: If no row survives the filtering.
    """
    show_date = None
    doc = html_parse(fp)
    rows = {}

    for element in doc.getroot().body.xpath('//input[@type="text"]'):
        attrib = element.attrib
        name = attrib.get('name')
        if name is None:
            # An unnamed input cannot describe the date or a playlist field.
            continue

        if name == 'date':
            show_date = date_parse(attrib['value'])
            continue

        match = PATTERN.match(name)
        if not match:
            continue
        field, num = match.groups()

        if field == 'timestamp':
            try:
                clock = date_parse(attrib['value'])
                real_date = date_parse(
                    show_date.strftime('%Y-%m-%d') + ' ' +
                    clock.strftime('%H:%M:%S'))
                row = rows.setdefault(int(num), {})
                row['unix_time'] = mktime(real_date.timetuple())
                row['date_str'] = real_date.strftime('%Y-%m-%d')
                row['time_str'] = real_date.strftime('%H:%M:%S')
            except Exception:
                # Best effort: the value is probably 'auto' (or the show date
                # is not known yet), so the timestamp is simply left out.
                # ``Exception`` rather than a bare ``except`` so that
                # KeyboardInterrupt / SystemExit still propagate.
                pass
        else:
            try:
                value = attrib['value']
            except KeyError:
                # Input carries no value; nothing to record for this field.
                continue
            rows.setdefault(int(num), {})[field] = value

    show = []
    for num in sorted(rows):
        row = dict((key, rows[num].get(key)) for key in KEEPER_KEYS)
        # ``any`` instead of ``filter(None, ...)``: on Python 3 ``filter``
        # returns an iterator, which is always truthy, so empty rows were
        # never dropped.
        if any(row.values()):
            show.append(row)

    return (show[0]['date_str'], show)
def show_links():
    """Yield absolute URLs for the archive links on the DJ search page.

    The page at ``DJ_SEARCH_URL`` is fetched and parsed; every ``<a>`` whose
    ``href`` starts with ``'../archive/?date='`` is resolved against
    ``DJ_SEARCH_URL`` and yielded.

    Yields:
        The joined URL for each matching link, in document order.
    """
    anchor_xpath = CSSSelector('a').path

    response = urlopen(DJ_SEARCH_URL)
    try:
        doc = html_parse(response)
    finally:
        # Release the connection as soon as parsing is done instead of
        # leaving it to the garbage collector.
        # NOTE(review): assumes html_parse reads the response eagerly (as
        # lxml.html.parse does) -- confirm.
        response.close()

    for link in doc.getroot().xpath(anchor_xpath):
        href = link.attrib.get('href')
        if href and href.startswith('../archive/?date='):
            yield urljoin(DJ_SEARCH_URL, href)
def _iter(self, fp, *args, **kwargs):
    """Read and parse an XML file to dict.

    The source is parsed with ``html_parse``, converted with
    ``self._etree_to_dict``, and the ``record`` entry found under
    ``html`` -> ``body`` is yielded as the single item.

    Args:
        fp: Source handed to ``html_parse``.
        *args: Accepted but not used here.
        **kwargs: Accepted but not used here.

    Yields:
        The ``record`` entry of the converted document.

    Raises:
        ReaderError: If the ``record`` entry is missing or falsy.
    """
    # NOTE: We parse HTML, to skip XML validation and strip XML namespaces
    xml_tree = html_parse(fp).getroot()
    record = self._etree_to_dict(xml_tree)["html"]["body"].get("record")
    if not record:
        # Plain literal: the message has no placeholders, so no f-prefix.
        raise ReaderError("Record not found in XML entry.")
    yield record
def from_path(cls, path: Union[Path, str]) -> Topic:
    """Build an instance from a file whose name encodes its classification.

    The file stem is split on ``" 分 "``; the first three pieces are used,
    in order, as ``area``, ``theme`` and ``topic``.  The file itself is
    parsed with ``html_parse`` and passed to ``cls.from_etree``.

    Raises:
        IndexError: If the stem splits into fewer than three pieces.
    """
    parts = Path(path).stem.split(" 分 ")
    return cls.from_etree(
        html_parse(str(path)),
        topic=parts[2],
        area=parts[0],
        theme=parts[1],
    )
def html_tree():
    """Return ``fixtures/article.html`` (next to this module) parsed by ``html_parse``."""
    # realpath resolves symlinks, so the fixture is located relative to the
    # real location of this file.
    module_dir = os.path.dirname(os.path.realpath(__file__))
    fixture = os.path.join(module_dir, 'fixtures', 'article.html')
    return html_parse(fixture)
def html_gallery_tree():
    """Return ``fixtures/gallery.html`` (next to this module) parsed by ``html_parse``."""
    # realpath resolves symlinks, so the fixture is located relative to the
    # real location of this file.
    module_dir = os.path.dirname(os.path.realpath(__file__))
    fixture = os.path.join(module_dir, "fixtures", "gallery.html")
    return html_parse(fixture)
def html_article_tree():
    """Return ``fixtures/article.html`` (next to this module) parsed by ``html_parse``."""
    # realpath resolves symlinks, so the fixture is located relative to the
    # real location of this file.
    module_dir = os.path.dirname(os.path.realpath(__file__))
    fixture = os.path.join(module_dir, "fixtures", "article.html")
    return html_parse(fixture)