def test_extraction(self):
    """Train on one sample page, scrape a second one, and confirm the
    scraper still extracts correctly after a serialize/deserialize
    round-trip."""
    encoding = 'latin1'
    (train_html, train_data), (test_html, test_data) = iter_samples(
        'scraper_loadstore', html_encoding=encoding)

    scraper = Scraper()
    train_page = HtmlPage(body=train_html, encoding=encoding)
    scraper.train_from_htmlpage(train_page, train_data)

    test_page = HtmlPage(body=test_html, encoding=encoding)
    self._assert_extracted(scraper.scrape_page(test_page), test_data)

    # Round-trip through serialization and make sure extraction still works.
    buf = StringIO()
    scraper.tofile(buf)
    buf.seek(0)
    restored = Scraper.fromfile(buf)
    self._assert_extracted(restored.scrape_page(test_page), test_data)
class Depta(object):
    """DEPTA-style structured-data extractor.

    Mines repeating data regions, records and aligned fields from an HTML
    DOM tree, and can delegate trained field extraction to a scrapely
    Scraper (see ``train``/``infer``).
    """

    def __init__(self, threshold=0.75, k=5):
        # threshold: similarity threshold used by the region/record miners.
        # k: maximum generalized-node size considered when mining regions.
        self.threshold = threshold
        self.k = k
        self.scraper = Scraper()

    def extract(self, html='', **kwargs):
        """
        Extract data regions from raw html, or from a url passed as the
        ``url`` keyword argument.  Returns the mined regions, each with an
        ``items`` attribute holding the aligned field values.
        """
        if not html and 'url' in kwargs:
            info = urlopen(kwargs.pop('url'))
            # NOTE(review): HTTP headers use hyphens; 'content_type' likely
            # returns None here (html_to_unicode then falls back to
            # auto-detection) -- confirm whether 'content-type' was intended.
            _, html = html_to_unicode(info.headers.get('content_type'),
                                      info.read())

        builder = DomTreeBuilder(html)
        root = builder.build()

        # Pass 1: locate candidate data regions in the DOM.
        region_finder = MiningDataRegion(root, self.k, self.threshold)
        regions = region_finder.find_regions(root)

        # Pass 2/3: split each region into records, then align the records
        # into field columns.
        record_finder = MiningDataRecord(self.threshold)
        field_finder = MiningDataField()

        for region in regions:
            records = record_finder.find_records(region)
            items, _ = field_finder.align_records(records)
            region.items = items
            if 'verbose' in kwargs:
                print region
                for record in records:
                    print '\t', record

        return regions

    def train(self, seed, data):
        """
        Train scrapely from the given seed region and data.

        ``data`` is a dict (or iterable of pairs) mapping field names to a
        value or sequence of values expected inside the seed region.
        """
        assert data, "Cannot train with empty data"
        htmlpage = self._region_to_htmlpage(seed)
        tm = TemplateMaker(htmlpage)
        if isinstance(data, dict):
            data = data.items()
        for field, values in data:
            # Allow a single scalar value per field as well as a sequence.
            if not hasattr(values, '__iter__'):
                values = [values]
            for value in values:
                # Python 2: normalize byte strings to unicode before matching.
                if isinstance(value, str):
                    value = value.decode(htmlpage.encoding or 'utf-8')
                tm.annotate(field, best_match(value), best_match=False)
        self.scraper.add_template(tm.get_template())

    def infer(self, html='', **kwargs):
        """
        Extract data using the templates previously added via ``train``,
        from raw html or from a url passed as the ``url`` keyword argument.
        """
        if 'url' in kwargs:
            info = urlopen(kwargs.pop('url'))
            # NOTE(review): same 'content_type' header-name concern as in
            # ``extract`` above.
            _, html = html_to_unicode(info.headers.get('content_type'),
                                      info.read())

        builder = DomTreeBuilder(html)
        doc = builder.build()
        # Re-serialize the cleaned DOM so scrapely sees normalized markup.
        page = HtmlPage(body=tostring(doc, encoding=unicode, method='html'))
        return self.scraper.scrape_page(page)

    def _region_to_htmlpage(self, region):
        # Serialize only the seed element of the region back to markup so
        # scrapely can annotate it.
        seed_body = tostring(region.parent[region.start], encoding=unicode,
                             method='html')
        return HtmlPage(body=seed_body)
class HTMLParser(BaseParser):
    '''
    A parser that is able to parse html.
    '''

    def __init__(self, **kwargs):
        super(HTMLParser, self).__init__(**kwargs)
        self.scrapely_parser = None
        # Mirror every keyword argument as an instance attribute.
        for key, value in kwargs.items():
            setattr(self, key, value)

    def _prepare_data(self, source):
        '''
        Turn the raw bytes of ``source`` into an lxml HTML tree.

        When ``source.json_key`` is set the payload is treated as JSON and
        the (possibly nested) key is resolved first; returns False when the
        key cannot be found.
        '''
        json_key = source.json_key
        data = source.data.decode('utf8')
        if json_key:  # if the data is json, return it straightaway
            json_raw = json.loads(data)
            # NOTE(review): str also has __iter__, so a plain-string key whose
            # first character happens to be a top-level key takes this branch
            # -- confirm json_key is meant to be a list/tuple here.
            if hasattr(json_key, '__iter__') and json_key[0] in json_raw:
                data = reduce(dict.get, json_key, json_raw)
            elif isinstance(json_key, str) and json_key in json_raw:
                data = json_raw[json_key]
            else:
                return False
        try:  # Create an HTML object from the returned text.
            data = lxhtml.fromstring(data)
        except ValueError:  # This happens when xml is declared in html.
            data = lxhtml.fromstring('\n'.join(data.split('\n')[1:]))
        except TypeError:
            print(data)
            print('Something weird has been returned by the server.')
        data.make_links_absolute(self.domain)
        return data

    def _get_selector(self, model):
        '''
        Build a CSSSelector or XPath from ``model.selector``.

        Returns the selector object unchanged when one is already compiled,
        None when no selector is configured, and raises when the string is
        neither valid CSS nor valid XPath.
        '''
        # assert len(model.selector) == 1, "Only one selector can be used."
        if model.selector:
            if type(model.selector) in (CSSSelector, XPath):
                return model.selector
            try:
                return CSSSelector(model.selector[0])
            except SelectorSyntaxError:
                # Not valid CSS; fall back to interpreting it as XPath.
                return XPath(model.selector[0])
            except Exception:
                raise Exception('Not a valid css or xpath selector',
                                model.selector)
        return None

    def _apply_selector(self, selector, data):
        # With no selector, treat the whole document as the single result.
        if selector:
            return selector(data)
        return (data, )

    def _extract(self, html, template):
        '''
        Extract elements (or JSON decoded from inline <script> text) from
        ``html`` according to ``template``.
        '''
        # We have normal html
        if not template.js_regex:
            if html is not None:
                extracted = self._apply_selector(template.selector, html)
            else:
                extracted = []
        # We want to extract a json variable from the server
        else:
            regex = re.compile(template.js_regex)
            extracted = []
            # Take the first regex capture from every script that matches.
            scripts = (regex.findall(s.text_content())[0]
                       for s in html.cssselect('script')
                       if regex.search(s.text_content()))
            # Each capture is expected to be a JSON array; flatten them all.
            for script in scripts:
                extracted.extend(json.loads(script))
        return extracted

    def _source_from_object(self, objct, source):
        '''
        Derive a follow-up Source from an extracted object and queue it on
        the parent crawler.
        '''
        # TODO fix that the source object can determine for itself where data
        # or params should be placed in the object.
        new_source = objct.source._replicate()
        attrs = {
            attr.name: attr.value
            for attr in objct.attrs.values()
            if attr.name != 'url'
        }

        if not getattr(new_source, 'url', None):
            url = objct.attrs.get('url')
            if url and not isinstance(url, list):
                new_source.url = self.parent._apply_src_template(
                    source, url.value)
            else:
                new_source.url = self.parent._apply_src_template(
                    source, source.url)

        if new_source.copy_attrs:
            new_source = self._copy_attrs(objct, new_source)

        if new_source.parent:
            new_source.attrs['_parent'] = objct.attrs['url']._replicate()

        # POST requests carry the attrs in the body, everything else in the
        # query string.
        if new_source.method == 'post':
            new_source.data = {**new_source.data, **attrs}  # noqa
        else:
            new_source.params = attrs

        self.parent._add_source(new_source)

    def _fallback(self, template, html, source):
        '''
        Use scrapely to scrape attributes when template parsing failed.

        Yields replicated template objects populated from scrapely's output.
        '''
        if not self.scrapely_parser:
            self.scrapely_parser = Scraper()

        # NOTE(review): scrapely's Scraper does not normally expose an
        # HtmlPage attribute -- confirm this wrapper provides it.
        html = self.scrapely_parser.HtmlPage(body=html)
        # Bug fix: the original read `self.db.read(uri, objct)` with both
        # names undefined (guaranteed NameError); reading the stored object
        # for this url/template looks intended -- confirm argument order.
        db_objct = self.db.read(source.url, template)
        # Bug fix: the original trained only when db_objct was falsy and then
        # immediately dereferenced it; train only when we actually have data.
        if db_objct:
            data = db_objct.attrs_to_dict()
            self.scrapely_parser.train_from_htmlpage(html, data)

        attr_dicts = self.scrapely_parser.scrape_page(html)
        for attr_dict in attr_dicts:
            # Create a new object from the template and add the parsed values.
            objct = template._replicate(name=template.name, url=source.url)
            objct.attrs_from_dict(attr_dict)
            yield objct

    def _convert_to_element(self, parsed):
        '''
        Wrap bare strings in ``parsed`` in <p> elements so downstream code can
        treat everything as HTML elements.
        '''
        # NOTE(review): items that already are HtmlElements are dropped, not
        # passed through -- confirm that is intended.
        elements = []
        for p in parsed:
            if not isinstance(p, lxhtml.HtmlElement):
                elem = lxhtml.Element('p')
                elem.text = p
                elements.append(elem)
        return elements

    @add_other_doc(BaseParser.modify_text)
    def sel_text(self, elements, all_text=True, **kwargs):  # noqa
        '''
        Select all text for a given selector.

        With ``all_text=True`` the full text content (including descendants)
        is used; otherwise only each element's direct text.
        '''
        if all_text:
            text = [el.text_content() for el in elements]
        else:
            text = [el.text for el in elements]
        return self._sel_text(text, **kwargs)

    def sel_table(self, elements, columns: int = 2, offset: int = 0):
        '''
        Parses a nxn table into a dictionary.
        Works best when the input is a td selector.
        Specify the amount of columns with the columns parameter.

        example:
            parse a 2x2 table
            {'func': sel_table,
             'params': {
                'selector': CSSSelector('table td'),
                'columns': 2,
                'offset': 0,
             }
            }
            leads to:
            sel_table(html=lxml.etree, selector=CSSSelector('table td'),
                      columns=2, offset=0)
        '''
        keys = [el.text for el in elements[offset::columns]]
        # Bug fix: values previously started at index 1 regardless of offset,
        # which misaligned key/value pairs for any offset != 0.
        values = [el.text for el in elements[offset + 1::columns]]
        return dict(zip(keys, values))

    def sel_row(self, elements, row_selector=None, value: str = '',
                attr=None, index=None):
        '''
        Select rows whose text contains ``value`` and pull either text or an
        attribute out of each row via ``row_selector``.
        '''
        # Bug fixes: lxml elements have text_content(), not text_contents(),
        # and the original comprehensions used `row` before binding it.
        rows = [row for row in elements if value in row.text_content()]
        if attr:
            # NOTE(review): the original called unqualified sel_attr/sel_text
            # with mismatched arguments; assuming the instance methods applied
            # per-row were intended -- confirm against callers.
            selected = [sel for row in rows
                        for sel in self.sel_attr(
                            self._apply_selector(row_selector, row),
                            attr=attr)]
        else:
            selected = [sel for row in rows
                        for sel in self.sel_text(
                            self._apply_selector(row_selector, row))]
        return self._value(selected, index)

    def sel_attr(self, elements, attr: str = '', **kwargs):
        '''
        Extract an attribute of an HTML element. Will return a list of
        attributes if multiple tags match the selector.

        The **kwargs are the keyword arguments that can be added are from
        the BaseParser.modify_text method.
        '''
        attrs = (el.attrib.get(attr) for el in elements)
        return self._sel_text(attrs, **kwargs)

    def sel_url(self, elements, index: int = None, **kwargs):
        '''Shorthand for selecting the href attribute of the elements.'''
        return self.sel_attr(elements, attr='href', index=index, **kwargs)

    def sel_date(self, elements, fmt: str = 'YYYYmmdd', attr: str = None,
                 index: int = None):
        '''
        Returns a python date object parsed with the specified format.
        '''
        # Bug fix: the original referenced undefined `html`/`selector` and
        # unqualified sel_attr/sel_text; operate on ``elements`` instead.
        if attr:
            date = self.sel_attr(elements, attr=attr, index=index)
        else:
            date = self.sel_text(elements, index=index)
        if date:
            # NOTE(review): the default fmt is not strptime syntax ('%Y%m%d'
            # would be); presumably callers always pass an explicit fmt.
            return datetime.strptime(date, fmt)

    def sel_exists(self, elements, key: str = '', index: int = None):
        '''
        Return True if a keyword is in the selector text,
        '''
        text = self.sel_text(elements)
        return bool(text) and key in text

    def sel_raw_html(self, elements):
        '''Return the raw html of every selected element.'''
        return [el.raw_html for el in elements]

    def sel_json(self, obj, selector, key=''):
        '''Fetch ``key`` from an already-decoded JSON object.'''
        return obj.get(key)

    def sel_js_array(self, elements, var_name='', var_type=None):
        '''
        Extract a JavaScript array assigned to ``var_name`` from script text,
        optionally coercing each item with ``var_type``.
        '''
        # Raw strings: the original pattern relied on invalid escape
        # sequences ('\s', '\(') that are deprecated in string literals.
        var_regex = (r'var\s*' + var_name +
                     r'\s*=\s*(?:new Array\(|\[)(.*)(?:\)|\]);')
        array_string = self.sel_text(elements, regex=var_regex)
        if array_string:
            if var_type:
                return list(map(var_type, array_string.split(',')))
            return array_string.split(',')

    def fill_form(self, elements, fields=None, attrs=None):
        '''
        Submit each form in ``elements`` as a new Source, merging ``fields``
        over the form's own values.
        '''
        # Bug fix: mutable default arguments ({} / []) were shared between
        # calls; use None sentinels instead.
        fields = {} if fields is None else fields
        attrs = [] if attrs is None else attrs
        for form in elements:
            data = {**dict(form.form_values()), **fields}
            source = Source(url=form.action, method=form.method,
                            duplicate=True, attrs=attrs)
            if source.method == 'GET':
                source.params = data
            else:
                source.data = data
            self._add_source(source)