def response(resp):
    """Parse the engine's HTTP *resp* into a list of result dicts.

    Driven by engine-level configuration: when ``results_xpath`` is set,
    each matched result node is parsed individually; otherwise the
    url/title/content XPaths are evaluated over the whole document and
    zipped together.  Appends ``{'suggestion': …}`` entries when
    ``suggestion_xpath`` is configured.
    """
    results = []
    dom = html.fromstring(resp.text)
    # `True if … else False` was redundant — `in` already yields a bool.
    is_onion = 'onions' in categories

    if results_xpath:
        for result in eval_xpath(dom, results_xpath):
            url = extract_url(eval_xpath(result, url_xpath), search_url)
            title = extract_text(eval_xpath(result, title_xpath))
            content = extract_text(eval_xpath(result, content_xpath))
            tmp_result = {'url': url, 'title': title, 'content': content}

            # add thumbnail if available
            if thumbnail_xpath:
                thumbnail_xpath_result = eval_xpath(result, thumbnail_xpath)
                if len(thumbnail_xpath_result) > 0:
                    tmp_result['img_src'] = extract_url(
                        thumbnail_xpath_result, search_url)

            # add alternative cached url if available
            # (eval_xpath for consistency with every other lookup in this
            # function; the raw `result.xpath()` call bypassed it)
            if cached_xpath:
                tmp_result['cached_url'] = cached_url + extract_text(
                    eval_xpath(result, cached_xpath))

            if is_onion:
                tmp_result['is_onion'] = True

            results.append(tmp_result)
    else:
        if cached_xpath:
            for url, title, content, cached in zip(
                    (extract_url(x, search_url)
                     for x in eval_xpath(dom, url_xpath)),
                    map(extract_text, eval_xpath(dom, title_xpath)),
                    map(extract_text, eval_xpath(dom, content_xpath)),
                    map(extract_text, eval_xpath(dom, cached_xpath))):
                results.append({
                    'url': url,
                    'title': title,
                    'content': content,
                    'cached_url': cached_url + cached,
                    'is_onion': is_onion
                })
        else:
            for url, title, content in zip(
                    (extract_url(x, search_url)
                     for x in eval_xpath(dom, url_xpath)),
                    map(extract_text, eval_xpath(dom, title_xpath)),
                    map(extract_text, eval_xpath(dom, content_xpath))):
                results.append({
                    'url': url,
                    'title': title,
                    'content': content,
                    'is_onion': is_onion
                })

    if not suggestion_xpath:
        return results

    for suggestion in eval_xpath(dom, suggestion_xpath):
        results.append({'suggestion': extract_text(suggestion)})

    return results
def test_extract_url(self):
    """utils.extract_url: resolution against a base URL plus error cases."""

    def extract(html_str, search_url):
        return utils.extract_url(html.fromstring(html_str), search_url)

    # (markup, base url, expected absolute url)
    cases = [
        ('<span id="42">https://example.com</span>', 'http://example.com/',
         'https://example.com/'),
        ('https://example.com', 'http://example.com/', 'https://example.com/'),
        ('//example.com', 'http://example.com/', 'http://example.com/'),
        ('//example.com', 'https://example.com/', 'https://example.com/'),
        ('/path?a=1', 'https://example.com', 'https://example.com/path?a=1'),
    ]
    for markup, base, expected in cases:
        self.assertEqual(extract(markup, base), expected)

    # empty markup cannot even be parsed
    with self.assertRaises(lxml.etree.ParserError):
        extract('', 'https://example.com')

    # an empty node list yields no URL at all
    with self.assertRaises(Exception):
        utils.extract_url([], 'https://example.com')
def response(resp):
    """Parse the engine's HTTP *resp* into a list of result dicts.

    Each result carries ``url``, ``title``, ``content`` and a
    ``publishedDate``.  Relative dates ("5 minutes ago", "2 days ago",
    "1 hour, 3 minutes ago") are converted via ``timedelta``; anything else
    is handed to ``dateutil``'s parser, falling back to *now* on failure.
    """
    results = []
    dom = html.fromstring(resp.text)

    # parse results
    for result in dom.xpath(results_xpath):
        urls = result.xpath(url_xpath)
        if len(urls) != 1:
            continue
        url = sanitize_url(parse_url(extract_url(urls, search_url)))
        title = extract_text(result.xpath(title_xpath)[0])
        content = extract_text(result.xpath(content_xpath)[0])

        # parse publishedDate
        publishedDate = extract_text(result.xpath(publishedDate_xpath)[0])

        # still useful ?
        if re.match("^[0-9]+ minute(s|) ago$", publishedDate):
            publishedDate = datetime.now() - timedelta(
                minutes=int(re.match(r'\d+', publishedDate).group()))
        elif re.match("^[0-9]+ days? ago$", publishedDate):
            publishedDate = datetime.now() - timedelta(
                days=int(re.match(r'\d+', publishedDate).group()))
        elif re.match("^[0-9]+ hour(s|), [0-9]+ minute(s|) ago$",
                      publishedDate):
            timeNumbers = re.findall(r'\d+', publishedDate)
            publishedDate = datetime.now()\
                - timedelta(hours=int(timeNumbers[0]))\
                - timedelta(minutes=int(timeNumbers[1]))
        else:
            try:
                publishedDate = parser.parse(publishedDate)
            # a bare `except:` also swallowed SystemExit/KeyboardInterrupt;
            # only catch what dateutil's parser actually raises
            except (ValueError, OverflowError):
                publishedDate = datetime.now()

        # dateutil defaults the year to 1900 when the string has none
        if publishedDate.year == 1900:
            publishedDate = publishedDate.replace(year=datetime.now().year)

        # append result
        results.append({
            'url': url,
            'title': title,
            'content': content,
            'publishedDate': publishedDate
        })

    # return results
    return results
def response(resp):
    """Parse the engine's HTTP *resp* into a list of result dicts.

    Prepends a ``{'number_of_results': …}`` entry when the pagination
    counter can be read, then one ``url``/``title``/``content`` dict per
    result, and finally any ``{'suggestion': …}`` entries.
    """
    results = []
    dom = html.fromstring(resp.text)

    try:
        results_num = int(
            eval_xpath(dom,
                       '//div[@class="compPagination"]/span[last()]/text()')
            [0].split()[0].replace(',', ''))
        results.append({'number_of_results': results_num})
    # best effort: a missing/odd counter must not kill the whole parse,
    # but a bare `except:` also swallowed SystemExit/KeyboardInterrupt
    except Exception:
        pass

    # parse results
    for result in eval_xpath(dom, results_xpath):
        try:
            url = parse_url(
                extract_url(eval_xpath(result, url_xpath), search_url))
            title = extract_text(eval_xpath(result, title_xpath)[0])
        # skip malformed result nodes (missing url/title); extract_url
        # raises a plain Exception, so Exception is the narrowest choice
        except Exception:
            continue
        content = extract_text(eval_xpath(result, content_xpath)[0])

        # append result
        results.append({'url': url, 'title': title, 'content': content})

    # if no suggestion found, return results
    suggestions = eval_xpath(dom, suggestion_xpath)
    if not suggestions:
        return results

    # parse suggestion
    for suggestion in suggestions:
        # append suggestion
        results.append({'suggestion': extract_text(suggestion)})

    # return results
    return results
def response(resp):
    """Parse the engine's HTTP *resp* into a list of result dicts.

    For every node matched by ``results_xpath``, resolve each entry of
    ``field_definition`` (merged over ``default_field_settings``) into a
    value on the result dict; a field that cannot be resolved is logged
    and set to ``unresolvable_value``.
    """
    results = []
    dom = html.fromstring(resp.text)

    for result in eval_xpath_list(dom, results_xpath):
        single_result = {'template': template}

        for single_field in field_definition:
            single_field = {**default_field_settings, **single_field}
            try:
                if single_field['single_element']:
                    node = eval_xpath(result, single_field['xpath'])
                else:
                    node = eval_xpath_list(result, single_field['xpath'])

                # one lookup instead of three `'extract' in …` re-checks;
                # a missing key yields None and falls through to text
                extract_kind = single_field.get('extract')
                if extract_kind == 'url':
                    value = extract_url(node, search_url)
                elif extract_kind == 'boolean':
                    value = (isinstance(node, list) and len(node) > 0)
                elif extract_kind == 'boolean_negate':
                    value = (isinstance(node, list) and len(node) < 1)
                else:
                    value = extract_text(node)

                single_result[single_field['field_name']] = value
            except Exception as e:
                logger.warning('error in resolving field %s:\n%s',
                               single_field['field_name'], e)
                single_result[single_field['field_name']] = unresolvable_value

        results.append(single_result)

    return results
def response(resp):
    '''Scrap *results* from the response (see :ref:`engine results`).
    '''
    results = []
    dom = html.fromstring(resp.text)
    is_onion = 'onions' in categories  # pylint: disable=undefined-variable

    if results_xpath:
        # per-result parsing: one node per result, fields relative to it
        for node in eval_xpath_list(dom, results_xpath):
            entry = {
                'url': extract_url(
                    eval_xpath_list(node, url_xpath, min_len=1), search_url),
                'title': extract_text(
                    eval_xpath_list(node, title_xpath, min_len=1)),
                'content': extract_text(
                    eval_xpath_list(node, content_xpath, min_len=1)),
            }

            # add thumbnail if available
            if thumbnail_xpath:
                thumbs = eval_xpath_list(node, thumbnail_xpath)
                if thumbs:
                    entry['img_src'] = extract_url(thumbs, search_url)

            # add alternative cached url if available
            if cached_xpath:
                entry['cached_url'] = (cached_url + extract_text(
                    eval_xpath_list(node, cached_xpath, min_len=1)))

            if is_onion:
                entry['is_onion'] = True

            results.append(entry)
    else:
        # document-wide parsing: evaluate each field XPath over the whole
        # DOM and zip the columns together
        urls = (extract_url(u, search_url)
                for u in eval_xpath_list(dom, url_xpath))
        titles = map(extract_text, eval_xpath_list(dom, title_xpath))
        contents = map(extract_text, eval_xpath_list(dom, content_xpath))

        if cached_xpath:
            cacheds = map(extract_text, eval_xpath_list(dom, cached_xpath))
            for url, title, content, cached in zip(
                    urls, titles, contents, cacheds):
                results.append({
                    'url': url,
                    'title': title,
                    'content': content,
                    'cached_url': cached_url + cached,
                    'is_onion': is_onion
                })
        else:
            for url, title, content in zip(urls, titles, contents):
                results.append({
                    'url': url,
                    'title': title,
                    'content': content,
                    'is_onion': is_onion
                })

    if suggestion_xpath:
        for suggestion in eval_xpath(dom, suggestion_xpath):
            results.append({'suggestion': extract_text(suggestion)})

    logger.debug("found %s results", len(results))
    return results
def f(html_str, search_url):
    """Parse *html_str* and resolve the URL it contains against *search_url*."""
    dom = html.fromstring(html_str)
    return utils.extract_url(dom, search_url)