Example 1
def parse_project_description(self, root):
    # A <br> renders between the preceding text and its tail, so the
    # newline that replaces it must be prepended to the tail (appending
    # it would glue the text before and after the <br> together).
    for node in root.xpath("//br"):
        node.tail = "\n" + (node.tail or "")
    text = strip_tags(decode_entities(render_html(root, encoding="unicode")),
                      normalize_space=False)
    # Drop the trailing "Posted On ..." / "Budget : ..." metadata.
    text = text.split(u"Posted On")[0].strip()
    text = text.split(u"Budget :")[0].strip()
    return text
Example 2
def parse_project_description(self, root):
    # Same <br>-to-newline trick as in Example 1: prepend the newline
    # to the tail so it lands where the <br> was.
    for node in root.xpath('//br'):
        node.tail = '\n' + (node.tail or '')
    text = strip_tags(decode_entities(render_html(root, encoding='unicode')),
                      normalize_space=False)
    # Keep only the part before the "Category:" metadata.
    text = text.split(u'Category:')[0].strip()
    return text
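
Both variants above rely on the same trick: each <br> is turned into a newline by editing the node's tail before the markup is flattened to text. A minimal stand-alone sketch of the idea using plain lxml (html_to_text and the sample markup are illustrative, not part of the spiders above):

    # Stand-alone sketch of the <br>-to-newline trick, using plain lxml
    # instead of the project's render_html/strip_tags helpers.
    from lxml import html

    def html_to_text(markup):
        root = html.fromstring(markup)
        # Prepend the newline: the <br> sits before its tail text.
        for node in root.xpath('//br'):
            node.tail = '\n' + (node.tail or '')
        return root.text_content().strip()

    print(html_to_text('<div>line one<br>line two</div>'))
    # -> line one
    #    line two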
Example 3
def text(self, default=NULL):
    try:
        return normalize_space(decode_entities(self.one().group(1)))
    except (AttributeError, IndexError):
        # NULL is a sentinel meaning "no default supplied": re-raise.
        if default is NULL:
            raise
        else:
            return default
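
The NULL default above is a sentinel object that distinguishes "no default was supplied" from a legitimate default of None. A sketch of the same pattern (the names below are illustrative, not the library's):

    # Sketch of the NULL-sentinel pattern used by text() above.
    NULL = object()

    def first_group(match, default=NULL):
        try:
            return match.group(1).strip()
        except (AttributeError, IndexError):
            if default is NULL:
                raise           # no fallback supplied: propagate
            return default      # fallback supplied (possibly None)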
Example 4
def rex_text_list(body, rex, flags=0):
    """
    Return the first group of each match, with entities decoded
    and whitespace normalized.
    """

    items = []
    for match in rex_list(body, rex, flags=flags):
        items.append(normalize_space(decode_entities(match.group(1))))
    return items
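
A possible call site, assuming rex_list accepts a pattern string as well as a compiled regex (the markup and pattern are made up for illustration):

    body = u'<li>First &amp; foremost</li><li>Second</li>'
    items = rex_text_list(body, r'<li>(.+?)</li>')
    # -> [u'First & foremost', u'Second']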
Example 5
def rex_text(body, regexp, flags=0, default=NULL):
    """
    Search `regexp` in `body` and return the first matching group,
    with entities decoded and whitespace normalized.
    """

    match = rex(body, regexp, flags=flags, default=default)
    try:
        return normalize_space(decode_entities(match.group(1)))
    except AttributeError:
        # No match: `match` holds the fallback value, not a match object.
        if default is NULL:
            raise DataNotFound('Regexp not found')
        else:
            return default
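
Usage mirrors rex_text_list; passing a default turns a missing match into a fallback value instead of a DataNotFound error (the pattern below is illustrative):

    price = rex_text(body, r'Price: <b>(.+?)</b>', default=None)
    # price is None when the pattern does not match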
Example 6
def parse_projects(self, grab):
    res = []
    for elem in grab.doc('//item'):
        # The RSS <description> contains escaped HTML: parse it into
        # a tree so the helper methods below can query it with XPath.
        desc_node = parse_html(elem.select('description').text())
        res.append({
            'title': decode_entities(elem.select('title').text()),
            'description': self.parse_project_description(desc_node),
            'date': self.parse_date(elem.select('pubDate').text()),
            'category': self.parse_category(desc_node),
            'country': self.parse_country(desc_node),
            'id': 'odesk-%s' % self.parse_id(desc_node),
            'url': elem.select('link').text(),
        })
    return res
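
Each <item> of the RSS feed is unpacked with Grab's selector API. For reference, a rough stand-alone equivalent of the field extraction using plain lxml (the feed string is made up for illustration):

    # Rough stand-alone equivalent of the per-<item> extraction above.
    from lxml import etree

    feed = b'''<rss><channel><item>
        <title>Sample project</title>
        <link>http://example.com/job/1</link>
        <pubDate>Mon, 01 Jan 2024 00:00:00 GMT</pubDate>
        <description>&lt;b&gt;Budget&lt;/b&gt;: $100</description>
    </item></channel></rss>'''

    for item in etree.fromstring(feed).xpath('//item'):
        title = item.findtext('title')             # 'Sample project'
        desc_html = item.findtext('description')   # HTML string, ready for parse_html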
Example 7
def parse_search_results(grab, parse_index_size=False, anonymizer=False):
    """
    Parse google search results page content.
    """

    # An older captcha marker was u'please type the characters below'.
    if grab.search(u'src="/sorry/image'):
        # Captcha page detected
        raise CaptchaError('Captcha found')

    elif anonymizer and grab.search(u'URL Error (0)'):
        # Common anonymizer error
        raise AnonymizerNetworkError('URL Error (0)')

    elif grab.css_exists('#ires'):
        if len(grab.css_list('#ires h3')):
            # Something was found. Note that parse_index_size must be
            # a callable (or falsy): it is invoked on the grab object.
            if parse_index_size:
                index_size = parse_index_size(grab)
            else:
                index_size = None

            # Yield found results
            for elem in grab.css_list('h3.r a'):
                url = elem.get('href')
                if anonymizer:
                    # The anonymizer wraps the real url in a
                    # base64-encoded, url-quoted query argument.
                    match = ANONYMIZER_ARG.search(url)
                    if match:
                        token = urllib.unquote(match.group(1))
                        url = decode_entities(base64.b64decode(token))
                    else:
                        url = None
                        logging.error('Could not parse url encoded by anonymizer')

                if url:
                    yield {'url': url,
                           'title': get_node_text(elem),
                           'index_size': index_size}
        else:
            # Result container present but empty: yield nothing.
            pass
    else:
        raise ParsingError('Could not identify google page format')
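
Note that parse_index_size is used both as a flag and as a callable (index_size = parse_index_size(grab)), so callers are expected to pass a parsing function rather than True. A hedged usage sketch (extract_index_size is an assumed helper, not part of the source above):

    # Plain call: index_size stays None.
    for item in parse_search_results(grab):
        print(item['url'])

    # With index-size parsing; extract_index_size(grab) is hypothetical.
    results = list(parse_search_results(grab, parse_index_size=extract_index_size))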
Example 8
def parse_projects(self, grab):
    res = []
    for elem in grab.doc("//item"):
        desc_node = parse_html(elem.select("description").text())
        res.append({
            "title": decode_entities(
                elem.select("title").text()).replace(u" | Elance Job", u""),
            "description": self.parse_project_description(desc_node),
            "date": self.parse_date(elem.select("pubDate").text()),
            "category": self.parse_category(desc_node),
            "country": self.parse_country(desc_node),
            "id": "elance-%s" % self.parse_id(desc_node),
            "url": elem.select("link").text(),
        })
    return res
Example 9
    def rex_text(self, regexp, flags=0, byte=False, default=NULL):
        """
        Search regular expression in response body and return content of
        the first matching group.

        :param byte: if False, search in `response.unicode_body()`;
            otherwise search in `response.body`.
        """

        try:
            match = self.rex_search(regexp, flags=flags, byte=byte)
        except DataNotFound:
            if default is NULL:
                raise DataNotFound('Regexp not found')
            else:
                return default
        else:
            return normalize_space(decode_entities(match.group(1)))
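
A typical call on a Grab instance; the self.rex_search call suggests this method is mixed into Grab, and the URL and pattern below are illustrative:

    # Illustrative usage; assumes rex_text is available on Grab objects.
    from grab import Grab

    g = Grab()
    g.go('http://example.com/')
    title = g.rex_text(u'<title>([^<]+)</title>', default=u'')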