def parse_project_description(self, root): for node in root.xpath("//br"): node.tail = (node.tail or "") + "\n" text = strip_tags(decode_entities(render_html(root, encoding="unicode")), normalize_space=False) text = text.split(u"Posted On")[0].strip() text = text.split(u"Budget :")[0].strip() return text
def parse_project_description(self, root):
    # Turn <br> tags into newlines before stripping tags, so line breaks
    # survive in the plain-text description.
    for node in root.xpath('//br'):
        node.tail = '\n' + (node.tail or '')
    text = strip_tags(decode_entities(render_html(root, encoding='unicode')),
                      normalize_space=False)
    text = text.split(u'Category:')[0].strip()
    return text

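# A minimal, self-contained sketch of the <br>-to-newline trick used by the
# parse_project_description() methods above: give every <br> node a newline
# tail before serializing, so the breaks survive tag stripping. Only lxml is
# assumed; the helper name below is illustrative and not part of the spiders.
from lxml import html as lxml_html


def html_to_text_with_breaks(fragment):
    root = lxml_html.fromstring(fragment)
    for node in root.xpath('//br'):
        node.tail = '\n' + (node.tail or '')
    # text_content() drops the tags but keeps the injected newlines
    return root.text_content()


# html_to_text_with_breaks(u'first line<br>second line')
# -> u'first line\nsecond line'
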
def text(self, default=NULL):
    """
    Return the content of the first matching group of the first match,
    with entities decoded and whitespace normalized.
    """
    try:
        return normalize_space(decode_entities(self.one().group(1)))
    except (AttributeError, IndexError):
        if default is NULL:
            raise
        else:
            return default

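# A short sketch of the NULL-sentinel pattern used by text() above: a unique
# module-level object (rather than None) marks "no default supplied", so a
# caller can still pass default=None and get None back instead of an
# exception. The names below are illustrative only.
NULL = object()


def first_group(match, default=NULL):
    try:
        return match.group(1)
    except (AttributeError, IndexError):
        if default is NULL:
            raise
        return default


# first_group(None) re-raises AttributeError;
# first_group(None, default=None) quietly returns None.
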
def rex_text_list(body, rex, flags=0):
    """
    Return found matches with stripped tags.
    """
    items = []
    for match in rex_list(body, rex, flags=flags):
        items.append(normalize_space(decode_entities(match.group(1))))
    return items

def rex_text(body, regexp, flags=0, default=NULL):
    """
    Search `regexp` expression in `body` text and then strip tags
    in found result.
    """
    match = rex(body, regexp, flags=flags, default=default)
    try:
        return normalize_space(decode_entities(match.group(1)))
    except AttributeError:
        if default is NULL:
            raise DataNotFound('Regexp not found')
        else:
            return default

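# A self-contained sketch of what rex_text()/rex_text_list() do, using the
# standard re module instead of the rex()/rex_list() helpers (which are
# assumed to come from the surrounding library). Entity decoding and
# whitespace normalization are inlined here for illustration; html.unescape
# assumes Python 3.
import re
from html import unescape


def rex_text_demo(body, pattern, default=None):
    match = re.search(pattern, body)
    if match is None:
        return default
    # Unescape entities, then collapse runs of whitespace
    return re.sub(r'\s+', ' ', unescape(match.group(1))).strip()


# rex_text_demo(u'<b>Price:&nbsp;10&amp;20  USD</b>', r'Price:(.+)USD')
# -> u'10&20'
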
def parse_projects(self, grab):
    res = []
    for elem in grab.doc('//item'):
        desc_node = parse_html(elem.select('description').text())
        res.append({
            'title': decode_entities(elem.select('title').text()),
            'description': self.parse_project_description(desc_node),
            'date': self.parse_date(elem.select('pubDate').text()),
            'category': self.parse_category(desc_node),
            'country': self.parse_country(desc_node),
            'id': 'odesk-%s' % self.parse_id(desc_node),
            'url': elem.select('link').text(),
        })
    return res

def parse_search_results(grab, parse_index_size=False, anonymizer=False):
    """
    Parse google search results page content.
    """
    #elif grab.search(u'please type the characters below'):
    if grab.search(u'src="/sorry/image'):
        # Captcha!!!
        raise CaptchaError('Captcha found')
    elif anonymizer and grab.search(u'URL Error (0)'):
        # Common anonymizer error
        raise AnonymizerNetworkError('URL Error (0)')
    elif grab.css_exists('#ires'):
        if len(grab.css_list('#ires h3')):
            # Something was found
            # Note: parse_index_size doubles as a flag and as the callable
            # used to extract the index size from the page.
            if parse_index_size:
                index_size = parse_index_size(grab)
            else:
                index_size = None

            # Yield found results
            for elem in grab.css_list('h3.r a'):
                url = elem.get('href')
                if anonymizer:
                    match = ANONYMIZER_ARG.search(url)
                    if match:
                        token = urllib.unquote(match.group(1))
                        url = decode_entities(base64.b64decode(token))
                    else:
                        url = None
                        logging.error('Could not parse url encoded by anonymizer')
                if url:
                    yield {'url': url,
                           'title': get_node_text(elem),
                           'index_size': index_size}
        else:
            pass
            #return []
    else:
        raise ParsingError('Could not identify google page format')

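# A rough, standalone sketch of the anonymizer branch above: the real target
# URL travels as a percent-encoded, base64-encoded query argument, so it is
# unquoted and then base64-decoded. The ANONYMIZER_ARG_DEMO pattern and the
# parameter name are assumptions, not taken from the original module, and
# urllib.parse stands in for the Python 2 urllib.unquote used above.
import base64
import re
from urllib.parse import unquote

ANONYMIZER_ARG_DEMO = re.compile(r'[?&]q=([^&]+)')


def decode_anonymizer_url(url):
    match = ANONYMIZER_ARG_DEMO.search(url)
    if match is None:
        return None
    token = unquote(match.group(1))
    return base64.b64decode(token).decode('utf-8')


# encoded = base64.b64encode(b'http://example.com/').decode('ascii')
# decode_anonymizer_url('http://anon.example/?q=' + encoded)
# -> 'http://example.com/'
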
def parse_projects(self, grab): res = [] for elem in grab.doc("//item"): desc_node = parse_html(elem.select("description").text()) res.append( { "title": decode_entities(elem.select("title").text()).replace(u" | Elance Job", u""), "description": self.parse_project_description(desc_node), "date": self.parse_date(elem.select("pubDate").text()), "category": self.parse_category(desc_node), "country": self.parse_country(desc_node), "id": "elance-%s" % self.parse_id(desc_node), "url": elem.select("link").text(), } ) return res
def rex_text(self, regexp, flags=0, byte=False, default=NULL):
    """
    Search regular expression in response body and return content of
    first matching group.

    :param byte: if False then search is performed in
        `response.unicode_body()` else the rex is searched in `response.body`.
    """
    try:
        match = self.rex_search(regexp, flags=flags, byte=byte)
    except DataNotFound:
        if default is NULL:
            raise DataNotFound('Regexp not found')
        else:
            return default
    else:
        return normalize_space(decode_entities(match.group(1)))