Example #1
0
def request_url(url, headers=HEADERS):
    """Fetch `url` and return the response body re-encoded as UTF-8 bytes.

    Args:
        url: Address passed to ``requests.get``.
        headers: HTTP headers sent with the request; defaults to ``HEADERS``.

    Returns:
        The body decoded with the encoding guessed by ``chardet`` and then
        encoded again as UTF-8.

    Raises:
        UnicodeDecodeError: if the body cannot be decoded with the chosen
            encoding.
    """
    # Fetch the page content. The raw bytes are used directly; wrapping them
    # in a StringIO only to read them back added nothing.
    response = requests.get(url, headers=headers)
    raw_body = response.content

    # Detect the encoding automatically.
    detected = chardet.detect(raw_body)
    # chardet reports ``None`` when it cannot guess (e.g. an empty body);
    # fall back to UTF-8 instead of failing inside ``decode(None)``.
    encoding = detected['encoding'] or 'utf-8'

    # Normalize everything to UTF-8.
    return raw_body.decode(encoding).encode("utf-8")
Example #2
0
 def writer(row, last):
     """Write one row of values to ``outfile`` as wrapped, comma-separated text.

     A scalar or zero-dimensional ``row`` is written directly (its filled
     value converted with ``str``) and nothing else is emitted. Any other row
     is formatted with ``fmt`` into an in-memory byte buffer, wrapped to
     ``line_length`` columns and followed by a newline.

     ``outfile``, ``startindent``, ``fmt`` and ``line_length`` are taken from
     the enclosing scope.

     Args:
         row: The values to write; may be masked.
         last: When true, the trailing separator is turned into ``;``.
     """
     if isscalar(row) or row.ndim == 0:
         outfile.write(startindent + '  ' + str(row.filled().astype(ndarray)))
         return
     # The buffer is created from bytes and its content is decoded below, so
     # everything written to it has to be bytes as well.
     buf = StringIO(bytes('', 'utf-8'))
     if ma.getmaskarray(row).all():
         # Every value is masked: emit one '_' per element. Encoded to bytes
         # because writing a str into the byte buffer would raise TypeError.
         buf.write(bytes(', '.join(['_'] * row.size) + ', ', 'utf-8'))
     else:
         savetxt(buf, ma.filled(row), fmt, delimiter = ', ', newline =', ')
     if last:
         # Step back over the trailing ', ' and overwrite its comma with ';'.
         buf.seek(-2, 1)
         buf.write(b';')
     buf.seek(0, 0)
     text = buf.read()
     # Show the formatted fill value (followed by a comma) as '_,'.
     text = text.replace(bytes(fmt % getattr(row, 'fill_value', 0) + ',', 'utf-8'), bytes('_,', 'utf-8'))
     outfile.write(textwrap.fill(text.decode('utf-8'), line_length, initial_indent = startindent + '  ', subsequent_indent = startindent + '    '))
     outfile.write('\n')
    def for_book(cls, book, length=3):
        """Count which character follows each short context in a book's text.

        The book's raw text is decoded as UTF-8, stripped and lower-cased.
        For every character, the count is stored under the context made of
        the characters that precede it (at most `length` of them). The counts
        of all child books, obtained with ``cls.get``, are then folded in
        with ``cls.join_conts``.

        Args:
            book: Object providing ``wldocument()`` and ``children``.
            length: Maximum number of characters kept as context.

        Returns:
            The result of folding ``cls.join_conts`` over this book's
            ``{context: {next_char: count}}`` dict and its children's.
        """
        # count from this book only
        wldoc = book.wldocument(parse_dublincore=False)
        output = wldoc.as_text(('raw-text',)).get_string()
        # Only the extracted text is needed from here on.
        del wldoc

        # Number of previous context characters carried over each step.
        keep = length - 1
        conts = {}
        last_word = ''
        for letter in output.decode('utf-8').strip().lower():
            followers = conts.setdefault(last_word, {})
            followers[letter] = followers.get(letter, 0) + 1
            # A plain `last_word[-keep:]` with keep == 0 would return the
            # whole string, letting the context grow without bound when
            # length is 1; carry nothing over in that case.
            carried = last_word[-keep:] if keep > 0 else ''
            last_word = carried + letter
        # add children
        return reduce(cls.join_conts,
                      (cls.get(child) for child in book.children.all().iterator()),
                      conts)
Example #4
0
    def for_book(cls, book, length=3):
        """Build next-character counts for a book and merge in its children.

        The raw text of `book` is decoded as UTF-8, stripped and lower-cased.
        Each character is counted under the context formed by the characters
        before it (no more than `length`). The counts of the child books,
        fetched through ``cls.get``, are combined with ``cls.join_conts``.

        Args:
            book: Object providing ``wldocument()`` and ``children``.
            length: Maximum number of characters kept as context.

        Returns:
            The result of reducing ``cls.join_conts`` over this book's
            ``{context: {next_char: count}}`` dict and its children's.
        """
        # count from this book only
        wldoc = book.wldocument(parse_dublincore=False)
        output = wldoc.as_text(('raw-text', )).get_string()
        # The document itself is no longer needed once the text is extracted.
        del wldoc

        # How many previous context characters survive into the next step.
        keep = length - 1
        conts = {}
        last_word = ''
        for letter in output.decode('utf-8').strip().lower():
            followers = conts.setdefault(last_word, {})
            followers[letter] = followers.get(letter, 0) + 1
            # Slicing with `-0` returns the whole string, so for length == 1
            # the context would keep growing; carry nothing over instead.
            carried = last_word[-keep:] if keep > 0 else ''
            last_word = carried + letter
        # add children
        return reduce(cls.join_conts,
                      (cls.get(child)
                       for child in book.children.all().iterator()), conts)
Example #5
0
 def fetch(self, url):
     """Return the parsed YAML document for `url`, caching it in ``self.idx``.

     A URL already present in ``self.idx`` is answered from there. Otherwise
     the text is obtained with ``self.fetch_text``, parsed, indexed and
     returned.

     Args:
         url: Location of the document to load.

     Returns:
         The parsed document.

     Raises:
         validate.ValidationException: if the YAML parser reports a
             ``ParserError``.
     """
     if url in self.idx:
         return self.idx[url]

     try:
         raw = self.fetch_text(url)
         if isinstance(raw, bytes):
             raw = raw.decode('utf-8')
         stream = StringIO(raw)
         # Give the stream a name attribute carrying the source URL.
         stream.name = url
         # NOTE(review): yaml.load is called without an explicit loader on
         # fetched content; confirm the source is trusted or consider a safe
         # loader.
         document = yaml.load(stream)
     except yaml.parser.ParserError as err:
         raise validate.ValidationException("Syntax error %s" % (err))

     if not (isinstance(document, dict) and self.identifiers):
         # Not a mapping, or no identifier fields configured: index by URL.
         self.idx[url] = document
         return document

     for identifier in self.identifiers:
         # A missing identifier field defaults to the URL itself.
         if identifier not in document:
             document[identifier] = url
         self.idx[self.expand_url(document[identifier], url)] = document
     return document
Example #6
0
 def fetch(self, url):
     """Load, parse and index the YAML document found at `url`.

     Documents already stored in ``self.idx`` under `url` are returned as-is.
     Otherwise the text comes from ``self.fetch_text`` and is parsed with
     ``yaml.load``, and the result is stored in ``self.idx`` before being
     returned.

     Args:
         url: Location of the document to load.

     Returns:
         The parsed document.

     Raises:
         validate.ValidationException: if the YAML parser reports a
             ``ParserError``.
     """
     if url in self.idx:
         return self.idx[url]

     try:
         payload = self.fetch_text(url)
         if isinstance(payload, bytes):
             payload = payload.decode('utf-8')
         source = StringIO(payload)
         # Attach the source URL to the stream as its name attribute.
         source.name = url
         # NOTE(review): yaml.load is used without an explicit loader on
         # fetched content; confirm the input is trusted or consider a safe
         # loader.
         parsed = yaml.load(source)
     except yaml.parser.ParserError as err:
         raise validate.ValidationException("Syntax error %s" % (err))

     has_identifiers = isinstance(parsed, dict) and self.identifiers
     if not has_identifiers:
         # Nothing to key on besides the URL itself.
         self.idx[url] = parsed
         return parsed

     for identifier in self.identifiers:
         # Fill in a missing identifier field with the URL.
         if identifier not in parsed:
             parsed[identifier] = url
         self.idx[self.expand_url(parsed[identifier], url)] = parsed
     return parsed