Beispiel #1
0
 def _extractor(txt, htmlpage=None):
     """Search ``txt`` with ``ereg`` and wrap the captured text in ``htmlregion``.

     ``ereg`` and ``htmlregion`` come from the surrounding scope.

     Returns ``None`` when ``txt`` is ``None`` or the pattern does not match.
     Otherwise the non-empty capture groups are concatenated (the whole match
     is used when the pattern defines no groups) and the result is passed to
     ``htmlregion``. ``htmlpage`` is accepted but not used here.
     """
     if txt is None:
         return None
     match = ereg.search(txt)
     if not match:
         return None
     # ``groups()`` is an empty tuple for a group-less pattern, so fall back
     # to the full match string; filtering and re-joining its characters
     # yields the same string.
     captured = match.groups() or match.group()
     return htmlregion(u"".join(filter(None, captured)))
Beispiel #2
0
 def _process_css_and_xpath(self, annotations, selector):
     """Fill ``self.fields`` from annotations selected via CSS or XPath.

     For every annotation, the selector method named by its
     ``selection_mode`` is called with the annotation's query (stored under
     ``'selector'`` for CSS mode, otherwise under the mode name). The result
     is narrowed by ``self._pick_elems`` using the elements tagged with this
     object's region ids (``parents``) and, if ``self.parent_region`` is
     set, the elements tagged with the parent region ids (``containers``).

     Annotations whose selection raises ``ValueError`` are skipped. A
     non-empty list of stripped extracted values is stored as an
     ``ItemField`` under the annotation ``id`` (or, when that is falsy, the
     running index that starts at the number of fields present when the
     loop began); an empty result removes any field stored under that key.
     """
     schema = self.schema
     modifiers = self.modifiers
     page = self.htmlpage

     # Root nodes of the elements tagged with one of this object's regions.
     tagged_ids = [rid for rid in map(region_id, self.regions) if rid]
     region_query = ','.join(
         '[data-tagid="%s"]' % rid for rid in tagged_ids)
     parents = {node._root for node in selector.css(region_query)}

     # Root nodes of the parent region(s); stays an empty tuple without one.
     containers = ()
     parent_region = self.parent_region
     if parent_region:
         if isinstance(parent_region, list):
             parent_regions = parent_region
         else:
             parent_regions = [parent_region]
         parent_query = ', '.join(
             '[data-tagid="{}"]'.format(self.get_region_id(region))
             for region in parent_regions)
         containers = {node._root for node in selector.css(parent_query)}

     for index, annotation in enumerate(annotations, start=len(self.fields)):
         mode = annotation.get(u'selection_mode')
         query_key = u'selector' if mode == 'css' else mode
         query = annotation.get(query_key)
         try:
             selected = getattr(selector, mode)(query)
             elems = self._pick_elems(selected, parents, containers)
         except ValueError:
             continue

         # Remove the data-tagid attribute from each picked element before
         # the values are extracted.
         for elem in elems:
             elem._root.attrib.pop('data-tagid', None)

         raw_values = elems.xpath(self.attribute_query(annotation)).extract()
         values = [six.text_type.strip(raw) for raw in raw_values]
         field_id = annotation.get(u'id') or index
         if not values:
             self.fields.pop(field_id, None)
             continue
         regions = [htmlregion(v) for v in arg_to_iter(values)]
         self.fields[field_id] = ItemField(
             regions, annotation, schema, modifiers, page)
Beispiel #3
0
 def _extractor(txt, htmlpage=None):
     """Return an ``htmlregion`` built from what ``ereg`` captures in ``txt``.

     ``ereg`` and ``htmlregion`` are taken from the surrounding scope.
     ``None`` is returned for a ``None`` input or when the pattern does not
     match. The non-empty capture groups are joined together; a pattern
     without groups contributes its whole match. ``htmlpage`` is unused.
     """
     found = None if txt is None else ereg.search(txt)
     if not found:
         return None
     # An empty ``groups()`` tuple means the pattern has no groups, so the
     # full match string is used instead.
     parts = found.groups() or found.group()
     joined = u"".join([part for part in parts if part])
     return htmlregion(joined)
    def safe_html(response):
        """Return cleaned-up HTML for the first matching panel ``div``.

        The first ``div`` whose class attribute contains the string below is
        extracted (an empty string is used when nothing matches), stripped,
        wrapped in ``htmlregion`` and passed through ``safehtml`` and
        ``replace_escape_chars``. Empty ``<p></p>`` pairs are then removed
        and the result is stripped again.
        """
        # NOTE(review): the class string is reproduced as-is; the spelling
        # "anenities" presumably mirrors the target markup -- confirm.
        panel_xpath = (
            '//div[contains(@class, "panel panel-default anenities")]'
        )
        raw_panel = response.xpath(panel_xpath).extract_first('').strip()
        region = htmlregion(raw_panel)
        cleaned = replace_escape_chars(safehtml(region))
        return cleaned.replace('<p></p>', '').strip()
Beispiel #5
0
    def extract_content(self, selector):
        """
            Extract the article content as text.

            The fragments matched by the CSS selector configured under
            'ARTICLE_CONTENT' are joined with single spaces, wrapped in
            ``htmlregion`` and converted with ``text``.

            @param selector Scrapy.Selector object (https://docs.scrapy.org/en/latest/topics/selectors.html)

            @return Whatever ``text`` returns for the joined content.
                    NOTE(review): when nothing matches, an empty string is
                    passed on; what ``text`` returns for it is not visible
                    here -- confirm.
        """
        css_query = self.config_selectors.get('ARTICLE_CONTENT')
        fragments = selector.css(css_query).extract()
        joined = u' '.join(fragments)
        return text(htmlregion(joined))
def raw_to_text(txt):
    """Wrap ``txt`` in an ``htmlregion`` and return ``_text`` applied to it."""
    region = htmlregion(txt)
    return _text(region)
Beispiel #7
0
 def _extractor(txt):
     """Search ``txt`` with ``ereg`` and wrap the captured text in ``htmlregion``.

     ``ereg`` and ``htmlregion`` come from the surrounding scope. Returns
     ``None`` when the pattern does not match. Otherwise the non-empty
     capture groups are concatenated; a pattern without groups contributes
     its whole match.
     """
     match = ereg.search(txt)
     if not match:
         return None
     groups = match.groups()
     # No groups defined in the pattern: use the complete match instead.
     source = groups if groups else match.group()
     return htmlregion(u"".join(filter(None, source)))
Beispiel #8
0
 def _extractor(txt):
     """Return an ``htmlregion`` of the text ``ereg`` captures in ``txt``.

     ``ereg`` and ``htmlregion`` are provided by the surrounding scope. The
     non-empty capture groups are joined; when the pattern has no groups the
     whole match is used. ``None`` is returned when there is no match.
     """
     hit = ereg.search(txt)
     if hit:
         pieces = hit.groups() or hit.group()
         return htmlregion(u"".join([p for p in pieces if p]))
     return None
Beispiel #9
0
 def _extractor(txt):
     """Build an ``htmlregion`` from the part of ``txt`` matched by ``ereg``.

     ``ereg`` and ``htmlregion`` are looked up in the surrounding scope.
     Gives ``None`` when the search finds nothing. Empty or unmatched groups
     are dropped before joining; a group-less pattern yields its full match.
     """
     m = ereg.search(txt)
     if not m:
         return None
     text_parts = m.groups() or m.group()
     kept = [chunk for chunk in text_parts if chunk]
     return htmlregion(u"".join(kept))
Beispiel #10
0
def extract_text(value):
    """Return ``scrapely_extract_text`` applied to ``value`` wrapped in ``htmlregion``."""
    return scrapely_extract_text(htmlregion(value))
Beispiel #11
0
def raw_to_text(txt):
    """Return the result of ``_text`` for ``txt`` wrapped in an ``htmlregion``."""
    wrapped = htmlregion(txt)
    result = _text(wrapped)
    return result