コード例 #1
0
ファイル: spider.py プロジェクト: khanhnnvn/job_monitor
 def parse_project_description(self, root):
     """Extract a plain-text description from an HTML fragment.

     Each <br> element contributes a newline so line structure survives
     tag stripping; the trailing "Posted On" and "Budget :" sections
     are cut off the end of the text.
     """
     for br in root.xpath("//br"):
         br.tail = "%s\n" % (br.tail or "")
     markup = render_html(root, encoding="unicode")
     plain = strip_tags(decode_entities(markup), normalize_space=False)
     plain = plain.split(u"Posted On")[0].strip()
     return plain.split(u"Budget :")[0].strip()
コード例 #2
0
 def parse_project_description(self, root):
     """Render the HTML fragment to plain text, dropping the "Category:" tail.

     <br> elements are converted to newlines before the markup is
     stripped, preserving the original line breaks.
     """
     for linebreak in root.xpath('//br'):
         linebreak.tail = (linebreak.tail or '') + '\n'
     rendered = render_html(root, encoding='unicode')
     cleaned = strip_tags(decode_entities(rendered), normalize_space=False)
     return cleaned.split(u'Category:')[0].strip()
コード例 #3
0
 def text(self, default=NULL):
     """Return the normalized, entity-decoded text of group(1) of the
     first match.

     On AttributeError/IndexError (no match or missing group), return
     `default`; if `default` is the NULL sentinel, re-raise instead.
     """
     try:
         raw = self.one().group(1)
         return normalize_space(decode_entities(raw))
     except (AttributeError, IndexError):
         if default is NULL:
             raise
         return default
コード例 #4
0
ファイル: selector.py プロジェクト: Scaurus/grab
 def text(self, default=NULL):
     """Normalized text content of the first match's group(1).

     When no match or group is available, fall back to `default`;
     the NULL sentinel means "propagate the original error".
     """
     try:
         return normalize_space(decode_entities(self.one().group(1)))
     except (AttributeError, IndexError):
         if default is not NULL:
             return default
         raise
コード例 #5
0
ファイル: rex.py プロジェクト: Kuznitsin/grab
def rex_text_list(body, rex, flags=0):
    """
    Return found matches with stripped tags.

    Each match's group(1) is entity-decoded and whitespace-normalized.

    :param body: text to search in.
    :param rex: regular expression to search for.
        NOTE(review): this parameter shadows the module-level `rex()`
        helper inside this function; kept for interface compatibility.
    :param flags: regex flags forwarded to `rex_list`.
    """
    # List comprehension replaces the manual append loop (same order,
    # same elements).
    return [normalize_space(decode_entities(match.group(1)))
            for match in rex_list(body, rex, flags=flags)]
コード例 #6
0
def rex_text_list(body, rex, flags=0):
    """
    Return found matches with stripped tags.

    Each match's group(1) is entity-decoded and whitespace-normalized.

    :param body: text to search in.
    :param rex: regular expression to search for.
        NOTE(review): this parameter shadows the module-level `rex()`
        helper inside this function; kept for interface compatibility.
    :param flags: regex flags forwarded to `rex_list`.
    """
    # List comprehension replaces the manual append loop (same order,
    # same elements).
    return [normalize_space(decode_entities(match.group(1)))
            for match in rex_list(body, rex, flags=flags)]
コード例 #7
0
ファイル: rex.py プロジェクト: Kuznitsin/grab
def rex_text(body, regexp, flags=0, default=NULL):
    """
    Search `regexp` expression in `body` text and then strip tags in found result.

    Falls back to `default` when no usable match is produced; the NULL
    sentinel makes that case raise DataNotFound instead.
    """
    found = rex(body, regexp, flags=flags, default=default)
    # Keep the whole group/decode chain inside the try block: a missing
    # match object (or a None group) surfaces as AttributeError here.
    try:
        return normalize_space(decode_entities(found.group(1)))
    except AttributeError:
        if default is NULL:
            raise DataNotFound('Regexp not found')
        return default
コード例 #8
0
def rex_text(body, regexp, flags=0, default=NULL):
    """
    Search `regexp` expression in `body` text and then strip tags in found result.

    :param default: value returned when nothing usable matches; the NULL
        sentinel means "raise DataNotFound instead".
    """
    result = rex(body, regexp, flags=flags, default=default)
    try:
        # AttributeError covers both "no match object" and "group is None".
        text = normalize_space(decode_entities(result.group(1)))
    except AttributeError:
        if default is not NULL:
            return default
        raise DataNotFound('Regexp not found')
    return text
コード例 #9
0
 def parse_projects(self, grab):
     """Build a list of project dicts from the oDesk RSS feed items.

     Each <item> element yields one dict with id/title/url/date/
     description/category/country keys.
     """
     projects = []
     for item in grab.doc('//item'):
         desc = parse_html(item.select('description').text())
         projects.append({
             'id': 'odesk-%s' % self.parse_id(desc),
             'title': decode_entities(item.select('title').text()),
             'url': item.select('link').text(),
             'date': self.parse_date(item.select('pubDate').text()),
             'description': self.parse_project_description(desc),
             'category': self.parse_category(desc),
             'country': self.parse_country(desc),
         })
     return projects
コード例 #10
0
def parse_search_results(grab, parse_index_size=False, anonymizer=False):
    """
    Parse google search results page content.

    Yields one dict per result link with 'url', 'title' and
    'index_size' keys.

    :param grab: response wrapper exposing `search`/`css_exists`/
        `css_list` helpers.
    :param parse_index_size: NOTE(review): this boolean parameter
        shadows what appears to be a module-level `parse_index_size`
        function and is then *called* below — passing True would raise
        TypeError ('bool' object is not callable). Confirm the intended
        function name before enabling.
    :param anonymizer: if True, result hrefs are assumed to be wrapped
        by an anonymizer proxy and are base64-decoded.
    :raises CaptchaError: when Google serves its captcha page.
    :raises AnonymizerNetworkError: on the anonymizer 'URL Error (0)' page.
    :raises ParsingError: when the page layout is not recognized.
    """

    #elif grab.search(u'please type the characters below'):
    if grab.search(u'src="/sorry/image'):

        # Captcha!!!
        raise CaptchaError('Captcha found')

    elif anonymizer and grab.search(u'URL Error (0)'):

        # Common anonymizer error
        raise AnonymizerNetworkError('URL Error (0)')

    elif grab.css_exists('#ires'):
        if len(grab.css_list('#ires h3')):

            # Something was found
            if parse_index_size:
                # NOTE(review): calls the *parameter* (see docstring).
                index_size = parse_index_size(grab)
            else:
                index_size = None

            # Yield found results
            for elem in grab.css_list('h3.r a'):
                url = elem.get('href')
                if anonymizer:
                    # Anonymizer hides the real URL in a base64 query arg.
                    match = ANONYMIZER_ARG.search(url)
                    if match:
                        token = urllib.unquote(match.group(1))
                        url = decode_entities(base64.b64decode(token))
                    else:
                        # Best-effort: skip the result rather than fail.
                        url = None
                        logging.error(
                            'Could not parse url encoded by anonymizer')

                if url:
                    yield {
                        'url': url,
                        'title': get_node_text(elem),
                        'index_size': index_size
                    }
        else:
            # Results container present but empty: yield nothing.
            pass
            #return []
    else:
        raise ParsingError('Could not identify google page format')
コード例 #11
0
ファイル: spider.py プロジェクト: khanhnnvn/job_monitor
 def parse_projects(self, grab):
     """Build a list of project dicts from the Elance RSS feed items.

     The " | Elance Job" suffix is stripped from every title.
     """
     projects = []
     for item in grab.doc("//item"):
         desc = parse_html(item.select("description").text())
         raw_title = decode_entities(item.select("title").text())
         projects.append({
             "id": "elance-%s" % self.parse_id(desc),
             "title": raw_title.replace(u" | Elance Job", u""),
             "url": item.select("link").text(),
             "date": self.parse_date(item.select("pubDate").text()),
             "description": self.parse_project_description(desc),
             "category": self.parse_category(desc),
             "country": self.parse_country(desc),
         })
     return projects
コード例 #12
0
ファイル: google.py プロジェクト: averrin/demesne_old
def parse_search_results(grab, parse_index_size=False, anonymizer=False):
    """
    Parse google search results page content.

    Yields one dict per result link with 'url', 'title' and
    'index_size' keys.

    :param grab: response wrapper exposing `search`/`css_exists`/
        `css_list` helpers.
    :param parse_index_size: NOTE(review): this boolean parameter
        shadows what appears to be a module-level `parse_index_size`
        function and is then *called* below — passing True would raise
        TypeError ('bool' object is not callable). Confirm the intended
        function name before enabling.
    :param anonymizer: if True, result hrefs are assumed to be wrapped
        by an anonymizer proxy and are base64-decoded.
    :raises CaptchaError: when Google serves its captcha page.
    :raises AnonymizerNetworkError: on the anonymizer 'URL Error (0)' page.
    :raises ParsingError: when the page layout is not recognized.
    """

    #elif grab.search(u'please type the characters below'):
    if grab.search(u'src="/sorry/image'):

        # Captcha!!!
        raise CaptchaError('Captcha found')

    elif anonymizer and grab.search(u'URL Error (0)'):

        # Common anonymizer error
        raise AnonymizerNetworkError('URL Error (0)')

    elif grab.css_exists('#ires'):
        if len(grab.css_list('#ires h3')):

            # Something was found
            if parse_index_size:
                # NOTE(review): calls the *parameter* (see docstring).
                index_size = parse_index_size(grab)
            else:
                index_size = None

            # Yield found results
            for elem in grab.css_list('h3.r a'):
                url = elem.get('href')
                if anonymizer:
                    # Anonymizer hides the real URL in a base64 query arg.
                    match = ANONYMIZER_ARG.search(url)
                    if match:
                        token = urllib.unquote(match.group(1))
                        url = decode_entities(base64.b64decode(token))
                    else:
                        # Best-effort: skip the result rather than fail.
                        url = None
                        logging.error('Could not parse url encoded by anonymizer')

                if url:
                    yield {'url': url, 'title': get_node_text(elem),
                           'index_size': index_size}
        else:
            # Results container present but empty: yield nothing.
            pass
            #return []
    else:
        raise ParsingError('Could not identify google page format')
コード例 #13
0
ファイル: document.py プロジェクト: sergithon/grab
    def rex_text(self, regexp, flags=0, byte=False, default=NULL):
        """
        Search regular expression in response body and return content of first
        matching group.

        :param byte: if False then search is performed in
        `response.unicode_body()` else the rex is searched in `response.body`.
        :param default: returned when nothing matches; the NULL sentinel
        means a DataNotFound is raised instead.
        """

        try:
            found = self.rex_search(regexp, flags=flags, byte=byte)
        except DataNotFound:
            if default is NULL:
                raise DataNotFound('Regexp not found')
            return default
        return normalize_space(decode_entities(found.group(1)))
コード例 #14
0
ファイル: document.py プロジェクト: Kuznitsin/grab
    def rex_text(self, regexp, flags=0, byte=False, default=NULL):
        """
        Return the first matching group of `regexp` found in the response body.

        :param byte: search `response.body` when True, otherwise
            `response.unicode_body()`.
        :param default: returned when nothing matches; the NULL sentinel
            means a DataNotFound is raised instead.
        """

        try:
            hit = self.rex_search(regexp, flags=flags, byte=byte)
        except DataNotFound:
            if default is not NULL:
                return default
            raise DataNotFound('Regexp not found')
        else:
            return normalize_space(decode_entities(hit.group(1)))