Example no. 1
    def _decorate_article(self, article):
        """在 parse_response 後執行,後處理其輸出"""

        # html post-process
        from lxml.html import tostring, fromstring
        from bs4 import BeautifulSoup
        from lib.util.net import normalize_url
        from lib.util.text import pack_string

        # article['content'] may be list of lxml doms
        if isinstance(article['content'], list):
            article['content'] = \
              fromstring('\n'.join([tostring(x, encoding=unicode) for x in article['content']]))

        # remove unwanted tags
        self.css_sel_drop_tree(article['content'], ['script'])

        # prettify html with BeautifulSoup
        html_bs4 = BeautifulSoup(tostring(article['content'],
                                          encoding=unicode)).body.next

        article['text'] = pack_string(html_bs4.text)
        article['html'] = pack_string(unicode(html_bs4))
        article["ctlr_classname"] = str(self.__class__)

        article['url'] = normalize_url(article['url'])
        article['url_read'] = normalize_url(article['url_read'])
        article['url_canonical'] = normalize_url(article['url_canonical'])

        self.move_out_of_meta(article, 'title')

        return article
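
A minimal usage sketch, assuming a hypothetical controller object that exposes both parse_response and the _decorate_article shown above; the helper name fetch_and_decorate is illustrative only and not part of the original code.

# Hypothetical sketch: wiring parse_response and _decorate_article together.
# 'ctlr' stands for any controller object exposing both methods (an assumption).

def fetch_and_decorate(ctlr, response):
    # parse_response is expected to return the raw article dict
    # (lxml dom(s) under 'content', plus the url fields and 'meta').
    article = ctlr.parse_response(response)
    # _decorate_article then strips unwanted tags, fills in 'text', 'html' and
    # 'ctlr_classname', normalizes the three url fields and moves 'title'
    # out of 'meta'.
    return ctlr._decorate_article(article)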
Example no. 2
def get_fresh_urls(urls, dbi=None):
  """篩選出 urls 中未曾成功抓取過者並回傳"""

  from lib.util.text import md5
  from lib.util.net import normalize_url

  if not urls: return []

  # Compare by MD5 of the normalized URL: `article__urls`.`hash` stores the raw
  # binary digest, hence UNHEX() when building the IN list and HEX() when
  # reading the matches back.
  url_md5 = [{'url': x, 'md5': md5(normalize_url(x))} for x in urls]
  hashes = "(" + (",".join(["UNHEX('%s')" % x['md5'] for x in url_md5 ])) + ")"
  sql = "SELECT HEX(`hash`) FROM `article__urls` WHERE `hash` IN %s" % hashes

  ret = set(DB.query(sql, dbi=dbi))

  output = []
  for x in url_md5:
    if (x['md5'].upper(),) not in ret:
      output.append(x['url'])

  return output
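
A brief usage sketch, assuming get_fresh_urls is importable from the module above and that the default dbi=None path reaches a live database; the candidate URLs are made up for illustration.

# Hypothetical usage: drop already-fetched URLs before queueing a crawl.
candidates = [
  'http://example.com/news/1',
  'http://example.com/news/2?utm_source=rss',
]

# Only URLs whose normalized-url MD5 is absent from `article__urls` come back.
for url in get_fresh_urls(candidates):
  print(url)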