def test_extraction(self):
    samples_encoding = 'latin1'
    [(html1, data1), (html2, data2)] = list(iter_samples(
        'scraper_loadstore', html_encoding=samples_encoding))
    sc = Scraper()
    page1 = HtmlPage(body=html1, encoding=samples_encoding)
    sc.train_from_htmlpage(page1, data1)

    page2 = HtmlPage(body=html2, encoding=samples_encoding)
    extracted_data = sc.scrape_page(page2)
    self._assert_extracted(extracted_data, data2)

    # check still works after serialize/deserialize
    f = StringIO()
    sc.tofile(f)
    f.seek(0)
    sc = Scraper.fromfile(f)
    extracted_data = sc.scrape_page(page2)
    self._assert_extracted(extracted_data, data2)
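# A minimal, self-contained sketch of the scrapely train/scrape cycle the
# test above exercises, including the tofile/fromfile round-trip. The URLs
# and training data are made up; note that Scraper.train fetches the page
# itself, unlike train_from_htmlpage, which takes a prebuilt HtmlPage.
def _example_scrapely_roundtrip():
    from StringIO import StringIO  # io.StringIO on Python 3

    from scrapely import Scraper

    s = Scraper()
    # Train on one annotated page, then extract the same fields from a
    # structurally similar page.
    s.train('http://example.com/product1', {'name': 'Product A'})
    first = s.scrape('http://example.com/product2')

    # Trained templates survive a serialize/deserialize cycle.
    f = StringIO()
    s.tofile(f)
    f.seek(0)
    s = Scraper.fromfile(f)
    second = s.scrape('http://example.com/product2')
    return first, second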
def main():
    if len(sys.argv) < len(CLI_ARGS) + 1:
        print "Usage:", sys.argv[0], " ".join(CLI_ARGS)
        sys.exit()
    try:
        with open(sys.argv[1], 'r') as f:
            data_to_match = sys.argv[2]
            body = f.read()
            scraper = Scraper()
            from scrapely.template import FragmentNotFound
            try:
                decoded_body = univ_encode(body)
                scraper.train_from_htmlpage(HtmlPage(body=decoded_body),
                                            {'score': data_to_match})
                # 0: scrapely was able to learn the fragment
                print 0
            except FragmentNotFound:
                # -1: the fragment was not found in the page
                print -1
                return
    except IOError:
        # -2: the input file could not be read
        print -2
        return
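# Hypothetical invocation of the checker above (the script name and CLI_ARGS
# are assumptions). The printed value doubles as a status code for a calling
# process: 0 when scrapely can learn the fragment, -1 when the fragment is
# not found in the page, -2 when the input file cannot be read.
#
#   $ python check_trainable.py saved_page.html "1337 points"
#   0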
# Imports assumed by this fragment (the surrounding module is not shown;
# BaseParser, Source and add_other_doc are project-local):
import json
import re
from datetime import datetime
from functools import reduce

import lxml.html as lxhtml
from lxml.cssselect import CSSSelector
from lxml.etree import XPath
from cssselect import SelectorSyntaxError
from scrapely import Scraper
from scrapely.htmlpage import HtmlPage


class HTMLParser(BaseParser):
    '''
    A parser that is able to parse html.
    '''
    def __init__(self, **kwargs):
        super(HTMLParser, self).__init__(**kwargs)
        self.scrapely_parser = None
        for key, value in kwargs.items():
            setattr(self, key, value)

    def _prepare_data(self, source):
        json_key = source.json_key
        data = source.data.decode('utf8')
        if json_key:
            # If the data is json, return the selected value straight away.
            json_raw = json.loads(data)
            if isinstance(json_key, (list, tuple)) and json_key[0] in json_raw:
                return reduce(dict.get, json_key, json_raw)
            elif isinstance(json_key, str) and json_key in json_raw:
                return json_raw[json_key]
            return False
        try:
            # Create an HTML object from the returned text.
            data = lxhtml.fromstring(data)
        except ValueError:
            # This happens when an xml declaration appears in the html;
            # retry without the first line.
            data = lxhtml.fromstring('\n'.join(data.split('\n')[1:]))
        except TypeError:
            print(data)
            print('Something weird has been returned by the server.')
            return False
        data.make_links_absolute(self.domain)
        return data

    def _get_selector(self, model):
        # assert len(model.selector) == 1, "Only one selector can be used."
        if model.selector:
            if isinstance(model.selector, (CSSSelector, XPath)):
                return model.selector
            try:
                return CSSSelector(model.selector[0])
            except SelectorSyntaxError:
                return XPath(model.selector[0])
            except Exception:
                raise Exception('Not a valid css or xpath selector',
                                model.selector)
        return None

    def _apply_selector(self, selector, data):
        if selector:
            return selector(data)
        return (data, )

    def _extract(self, html, template):
        if not template.js_regex:
            # We have normal html.
            if html is not None:
                extracted = self._apply_selector(template.selector, html)
            else:
                extracted = []
        else:
            # We want to extract a json variable from a script tag.
            regex = re.compile(template.js_regex)
            extracted = []
            # Find all the scripts that match the regex.
            scripts = (regex.findall(s.text_content())[0]
                       for s in html.cssselect('script')
                       if regex.search(s.text_content()))
            for script in scripts:
                extracted.extend(json.loads(script))
        return extracted

    def _source_from_object(self, objct, source):
        # TODO fix that the source object can determine for itself where data
        # or params should be placed in the object.
        new_source = objct.source._replicate()
        attrs = {attr.name: attr.value for attr in objct.attrs.values()
                 if attr.name != 'url'}

        if not getattr(new_source, 'url', None):
            url = objct.attrs.get('url')
            if url and not isinstance(url, list):
                new_source.url = self.parent._apply_src_template(
                    source, url.value)
            else:
                new_source.url = self.parent._apply_src_template(
                    source, source.url)

        if new_source.copy_attrs:
            new_source = self._copy_attrs(objct, new_source)

        if new_source.parent:
            new_source.attrs['_parent'] = objct.attrs['url']._replicate()

        if new_source.method == 'post':
            new_source.data = {**new_source.data, **attrs}  # noqa
        else:
            new_source.params = attrs

        self.parent._add_source(new_source)

    def _fallback(self, template, html, source):
        if not self.scrapely_parser:
            self.scrapely_parser = Scraper()

        # HtmlPage lives in scrapely.htmlpage, not on the Scraper instance.
        html = HtmlPage(body=html)
        # NOTE: 'uri' and 'objct' are undefined at this point in the original
        # fragment; the lookup presumably keys on the source url and the
        # template.
        db_objct = self.db.read(uri, objct)
        if db_objct:
            data = db_objct.attrs_to_dict()
            self.scrapely_parser.train_from_htmlpage(html, data)

        attr_dicts = self.scrapely_parser.scrape_page(html)
        for attr_dict in attr_dicts:
            objct = template._replicate(name=template.name, url=source.url)
            # Add the parsed values.
            objct.attrs_from_dict(attr_dict)
            yield objct

    def _convert_to_element(self, parsed):
        elements = []
        for p in parsed:
            if not isinstance(p, lxhtml.HtmlElement):
                elem = lxhtml.Element('p')
                elem.text = p
                elements.append(elem)
        return elements

    @add_other_doc(BaseParser.modify_text)
    def sel_text(self, elements, all_text=True, **kwargs):  # noqa
        '''
        Select all text for a given selector.
        '''
        if all_text:
            text = [el.text_content() for el in elements]
        else:
            text = [el.text for el in elements]
        return self._sel_text(text, **kwargs)

    def sel_table(self, elements, columns: int = 2, offset: int = 0):
        '''
        Parses an n x m table into a dictionary.
        Works best when the input is a td selector. Specify the number of
        columns with the columns parameter.

        Example: parse a 2x2 table

            {'func': sel_table,
             'params': {
                 'selector': CSSSelector('table td'),
                 'columns': 2,
                 'offset': 0,
             }}

        leads to:

            sel_table(html=lxml.etree, selector=CSSSelector('table td'),
                      columns=2, offset=0)
        '''
        keys = [el.text for el in elements[offset::columns]]
        # Apply the same offset to the value cells (the original hardcoded 1,
        # which only works when offset is 0).
        values = [el.text for el in elements[offset + 1::columns]]
        return dict(zip(keys, values))

    def sel_row(self, elements, row_selector: int = None, value: str = '',
                attr=None, index=None):
        rows = [row for row in elements if value in row.text_content()]
        if attr:
            selected = [sel for row in rows
                        for sel in self.sel_attr(row, row_selector)]
        else:
            selected = [sel for row in rows
                        for sel in self.sel_text(row, row_selector)]
        return self._value(selected, index)

    def sel_attr(self, elements, attr: str = '', **kwargs):
        '''
        Extract an attribute of an HTML element. Will return a list of
        attributes if multiple tags match the selector.
        The **kwargs are the keyword arguments of the
        BaseParser.modify_text method.
        '''
        attrs = (el.attrib.get(attr) for el in elements)
        return self._sel_text(attrs, **kwargs)

    def sel_url(self, elements, index: int = None, **kwargs):
        return self.sel_attr(elements, attr='href', index=index, **kwargs)

    def sel_date(self, elements, fmt: str = '%Y%m%d', attr: str = None,
                 index: int = None):
        '''
        Returns a python datetime object parsed with the specified strptime
        format (the original default 'YYYYmmdd' is not a valid strptime
        pattern).
        '''
        if attr:
            date = self.sel_attr(elements, attr=attr, index=index)
        else:
            date = self.sel_text(elements, index=index)
        if date:
            return datetime.strptime(date, fmt)

    def sel_exists(self, elements, key: str = '', index: int = None):
        '''
        Return True if the key occurs in the selected text.
        '''
        text = self.sel_text(elements)
        if text:
            if key in text:
                return True
        return False

    def sel_raw_html(self, elements):
        # NOTE: lxml elements expose no 'raw_html' attribute;
        # lxhtml.tostring(el) may be what is intended here.
        return [el.raw_html for el in elements]

    def sel_json(self, obj, selector, key=''):
        return obj.get(key)

    def sel_js_array(self, elements, var_name='', var_type=None):
        var_regex = (r'var\s*' + var_name +
                     r'\s*=\s*(?:new Array\(|\[)(.*)(?:\)|\]);')
        array_string = self.sel_text(elements, regex=var_regex)
        if array_string:
            if var_type:
                return list(map(var_type, array_string.split(',')))
            return array_string.split(',')

    def fill_form(self, elements, fields={}, attrs=[]):
        for form in elements:
            data = {**dict(form.form_values()), **fields}
            source = Source(url=form.action, method=form.method,
                            duplicate=True, attrs=attrs)
            if source.method == 'GET':
                source.params = data
            else:
                source.data = data
            self._add_source(source)
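# A standalone illustration of the column-stride logic behind sel_table
# above: a flat list of <td> cells is split into keys and values by stepping
# through it `columns` cells at a time. The table markup is made up; only
# lxml (with the cssselect package installed) is required.
def _example_sel_table_stride():
    import lxml.html as lxhtml

    doc = lxhtml.fromstring(
        '<table>'
        '<tr><td>name</td><td>Ada</td></tr>'
        '<tr><td>born</td><td>1815</td></tr>'
        '</table>')
    cells = doc.cssselect('td')
    keys = [el.text for el in cells[0::2]]    # cells 0, 2, ...
    values = [el.text for el in cells[1::2]]  # cells 1, 3, ...
    return dict(zip(keys, values))  # {'name': 'Ada', 'born': '1815'}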
class TrendingSpider(Spider):
    """docstring for TrendingSpider"""
    name = "trending_monitor"
    start_urls = []
    PRINT_STATS_EVERY_X_CRAWLED_PAGES = 100
    links_rule = None
    urls_seen = set()
    aborted = False
    crawled_all_pages = 0
    score_field_text_negative_matches = []
    # URLs we never want to follow: spam traps, cgi paths, logins and
    # static assets.
    DENY_REGEXP = r'.*(spampoison.*|cgi\/.*|accounts\.google\.com|login.*|\.(js|css|png|jpe?g|gif|bmp|tiff)(\?.*)?)'

    def make_requests_from_url(self, url):
        return Request(url, dont_filter=True,
                       meta={'start_url': url, 'metadepth': 0})

    # rules = (
    #     Rule(SgmlLinkExtractor(allow=r'.+', deny=(r'.*(spampoison.*|cgi\/.*|accounts\.google\.com|login.*|microsoft\.com|\.(js|css|png|jpe?g|gif|bmp|tiff)(\?.*)?)')),
    #          follow=False, callback='parse_item'),
    # )

    def __init__(self, db_path, pid):
        print "\n===============================" * 2
        print "Starting TrendingSpider... FOR PID=", pid
        print "\n===============================" * 2
        self.project_id = int(pid)
        self.db_path = db_path
        self.fetch_project_data()
        if self.aborted:
            return
        print "Loaded", len(self.start_urls), "starting urls"
        self.start_time = time()
        self.crawled_pages = 0
        # These have to be set after we run fetch_project_data().
        self.links_rule = Rule(
            SgmlLinkExtractor(allow='.+', deny=self.DENY_REGEXP),
            follow=False, callback='parse_item')
        self.links_rule_targeted = Rule(
            SgmlLinkExtractor(allow=self.allow_regexp, deny=self.DENY_REGEXP),
            follow=False, callback='parse_item')
        super(TrendingSpider, self).__init__()
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def spider_closed(self):
        print "Closing spider, crawled", self.crawled_pages
        if self.db is not None:
            self.db.commit()

    def done(self):
        return self.urls_limit != 0 and (
            self.crawled_pages > self.urls_limit
            or self.crawled_all_pages > self.urls_limit * 10)

    def parse(self, response):
        self.crawled_all_pages += 1
        # This condition is here because even if we stop adding new requests,
        # we might still do more requests in total than self.urls_limit.
        # Why? Because we stop once we have reached self.urls_limit in terms
        # of _crawled_ URLs, not in terms of URLs added to the queue. This
        # ensures we always crawl _at least_ self.urls_limit URLs, but in
        # return we will most likely crawl more, because we keep adding new
        # URLs before everything already in the (full) queue has been
        # fetched.
        if self.done():
            return
        if self.crawled_pages % self.PRINT_STATS_EVERY_X_CRAWLED_PAGES == 0:
            delta = time() - self.start_time
            print "Current crawl speed:", self.crawled_pages, "urls crawled,", delta, "seconds,", self.crawled_pages / delta, "pages/second"
        if self.links_rule_targeted.link_extractor.matches(response.url):
            print "page targeted", response.url
            self.crawled_pages += 1
            html_p = htmlpage_from_response(response)
            scraped_result = self.scraper.scrape_page(html_p)
            score = scraped_result[0]['score'][0]
            for to_strip_off in self.score_field_text_negative_matches:
                score = score.replace(to_strip_off, '')
            print "\n===============================" * 2
            print "score=", score
            print "\n===============================" * 2
            item = (response.url, score, int(time()))
            self.save_to_db(item)
        if self.done():
            # Wasting a little resources here because of ">" instead of ">=".
            return
        # We do not scrape the links this time.
        unique_new_links = set(
            l for l in self.links_rule.link_extractor.extract_links(response)
            if len(l.url) <= 255
            and TrendingSpider.extract_domain(l.url) == self.our_domain
        ) - self.urls_seen
        print "Got", len(unique_new_links), "new links"
        self.urls_seen |= unique_new_links
        return [Request(link.url) for link in unique_new_links]

    def save_to_db(self, item):
        self.db.execute(
            'INSERT INTO result(TIMESTAMP, SCORE, PAGE, SEARCH_ID) VALUES(?, ?, ?, ?)',
            (item[2], item[1], item[0], self.project_id))
        self.db.commit()

    def init_db(self):
        import sqlite3
        self.db = sqlite3.connect(self.db_path)

    def abort(self):
        sys.stderr.write("\n===============================" * 2)
        sys.stderr.write("\nSomething went wrong, aborting.")
        sys.stderr.write("\n===============================" * 2)
        self.start_urls = []
        self.aborted = True

    def fetch_project_data(self):
        self.init_db()
        # Fetch the project data from the DB.
        c = self.db.execute('SELECT * FROM search WHERE id=?',
                            (str(self.project_id),))
        d = c.fetchone()
        if d is None:
            perr("No project found in DB")
            return self.abort()
        data_to_match = {'score': d[1]}
        body = d[2]
        url = d[3]
        self.our_domain = TrendingSpider.extract_domain(url)
        self.start_urls = [url]
        # This is one of the improvements we could implement.
        from scrapely.template import FragmentNotFound
        try:
            self.setup_scraper(body, url, data_to_match)
        except FragmentNotFound:
            perr("Unable to learn from data")
            # We were not able to learn; cancel the crawl by having no
            # start urls.
            return self.abort()
        self.allow_regexp = d[5]
        self.urls_limit = int(d[6])
        if d[7] != '' and d[7] is not None:
            self.score_field_text_negative_matches = d[7].split(d[8])
        print "urls_limit=", self.urls_limit

    def setup_scraper(self, body, url, data_to_scrape):
        self.scraper = Scraper()
        decoded_body = univ_encode(body)
        self.scraper.train_from_htmlpage(
            HtmlPage(url=url, body=decoded_body), data_to_scrape)

    @staticmethod
    def extract_domain(url):
        try:
            url = url[url.index("//") + 2:]  # getting rid of protocol://
        except ValueError:
            # There was no protocol specified
            pass
        try:
            # getting rid of everything after the first "/"
            url = url[:url.index("/")]
        except ValueError:
            # Maybe it was a domain-only url, with no "/"
            pass
        return url
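# extract_domain above slices the URL by hand. For comparison, a sketch of
# the same idea using Python 2's urlparse from the standard library
# (urllib.parse on Python 3); like the hand-rolled version, it keeps any
# port that follows the host:
def _example_extract_domain_stdlib(url):
    from urlparse import urlparse

    netloc = urlparse(url).netloc
    # urlparse only fills netloc when the URL contains "//", so fall back
    # to prefixing one for scheme-less input such as "example.com/page".
    return netloc or urlparse('//' + url).netloc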