def request_url(url, headers=HEADERS):
    """Fetch *url* and return its body re-encoded as UTF-8 bytes.

    The body's source encoding is guessed with ``chardet`` from the raw
    response bytes, decoded with that encoding and then encoded as UTF-8.

    Args:
        url: Address passed to ``requests.get``.
        headers: HTTP headers sent with the request; defaults to the
            module-level ``HEADERS``.

    Returns:
        The response body as UTF-8 encoded bytes.

    Raises:
        UnicodeDecodeError: If the body cannot be decoded with the
            detected (or fallback) encoding.
    """
    # Fetch the page content.
    response = requests.get(url, headers=headers)
    raw_content = response.content

    # Detect the encoding automatically.
    detected = chardet.detect(raw_content)
    # chardet reports ``None`` when it cannot guess (e.g. an empty body);
    # fall back to UTF-8 rather than passing ``None`` to ``decode``.
    encoding = detected['encoding'] or 'utf-8'

    # Normalise everything to UTF-8.
    return raw_content.decode(encoding).encode("utf-8")
def writer(row, last):
    """Write one row of values to ``outfile`` as wrapped, comma-separated text.

    Relies on ``outfile``, ``startindent``, ``fmt`` and ``line_length`` from
    the enclosing scope (not visible in this block).

    Args:
        row: Scalar, 0-d array, or array of values; masked entries are
            rendered as ``_``.
        last: When true, the trailing separator is overwritten with ``;``.
    """
    if isscalar(row) or row.ndim == 0:
        outfile.write(startindent + ' ' + str(row.filled().astype(ndarray)))
        return

    # Bytes buffer: everything written to it below must be bytes.
    tmpstr = StringIO(bytes('', 'utf-8'))
    if ma.getmaskarray(row).all():
        # Fully masked row: emit one placeholder per element. Encode first,
        # since the buffer is a bytes buffer and rejects ``str``.
        placeholders = ', '.join(['_'] * row.size) + ', '
        tmpstr.write(placeholders.encode('utf-8'))
    else:
        savetxt(tmpstr, ma.filled(row), fmt, delimiter=', ', newline=', ')

    if last:
        # Step back over the trailing ", " and overwrite its comma with ";".
        tmpstr.seek(-2, 1)
        tmpstr.write(b';')

    tmpstr.seek(0, 0)
    text = tmpstr.read()
    # Replace formatted fill values (followed by a comma) with the "_" marker.
    fill_token = bytes(fmt % getattr(row, 'fill_value', 0) + ',', 'utf-8')
    text = text.replace(fill_token, bytes('_,', 'utf-8'))

    outfile.write(textwrap.fill(text.decode('utf-8'),
                                line_length,
                                initial_indent=startindent + ' ',
                                subsequent_indent=startindent + ' '))
    outfile.write('\n')
def for_book(cls, book, length=3):
    """Count which character follows each context string in a book's text.

    The book's raw text is lower-cased and scanned character by character.
    For every character, the counter keyed by the preceding context (up to
    ``length`` characters, shorter at the start of the text) is incremented.
    Results for the book's children, obtained via ``cls.get``, are then
    folded in with ``cls.join_conts``.

    Args:
        book: Object providing ``wldocument(...)`` and ``children``.
        length: Maximum number of characters kept as context.

    Returns:
        The result of reducing ``cls.join_conts`` over this book's counts
        (``{context: {next_char: count}}``) and each child's ``cls.get``.
    """
    # Count from this book only.
    wldoc = book.wldocument(parse_dublincore=False)
    output = wldoc.as_text(('raw-text',)).get_string()
    del wldoc

    conts = {}
    last_word = ''
    # Number of previous context characters carried over to the next step.
    keep = length - 1
    for letter in output.decode('utf-8').strip().lower():
        mydict = conts.setdefault(last_word, {})
        mydict.setdefault(letter, 0)
        mydict[letter] += 1
        # With keep == 0 the slice start ``-0`` would keep the whole string,
        # letting the context grow without bound; carry nothing over instead.
        last_word = (last_word[-keep:] if keep else '') + letter

    # Add children.
    return reduce(cls.join_conts,
                  (cls.get(child) for child in book.children.all().iterator()),
                  conts)
def for_book(cls, book, length=3):
    """Build next-character counts for a book and merge in its children.

    Walks the book's lower-cased raw text one character at a time and, for
    each character, increments its count under the context that preceded it
    (at most ``length`` characters). Child books are looked up through
    ``cls.get`` and combined with ``cls.join_conts``.

    Args:
        book: Object providing ``wldocument(...)`` and ``children``.
        length: Maximum number of characters kept as context.

    Returns:
        The result of reducing ``cls.join_conts`` over this book's counts
        (``{context: {next_char: count}}``) and each child's ``cls.get``.
    """
    # Count from this book only.
    wldoc = book.wldocument(parse_dublincore=False)
    output = wldoc.as_text(('raw-text', )).get_string()
    del wldoc

    conts = {}
    last_word = ''
    for letter in output.decode('utf-8').strip().lower():
        mydict = conts.setdefault(last_word, {})
        mydict.setdefault(letter, 0)
        mydict[letter] += 1
        if length == 1:
            # ``last_word[-0:]`` would be the whole string, so the context
            # would never be trimmed; a one-character context is just the
            # current letter.
            last_word = letter
        else:
            last_word = last_word[-length + 1:] + letter

    # Add children.
    return reduce(cls.join_conts,
                  (cls.get(child) for child in book.children.all().iterator()),
                  conts)
def fetch(self, url):
    """Return the parsed YAML document for *url*, caching it in ``self.idx``.

    A document already present in ``self.idx`` is returned as is. Otherwise
    the text is obtained from ``self.fetch_text``, parsed, indexed and
    returned.

    Args:
        url: Key of the document to load.

    Returns:
        The parsed document.

    Raises:
        validate.ValidationException: If the YAML parser reports a
            ``ParserError``.
    """
    if url in self.idx:
        return self.idx[url]

    try:
        raw = self.fetch_text(url)
        # The parser needs text, so bytes are decoded as UTF-8 first.
        stream = StringIO(raw.decode('utf-8') if isinstance(raw, bytes) else raw)
        stream.name = url
        # NOTE(review): yaml.load without an explicit Loader can construct
        # arbitrary objects; confirm the fetched documents are trusted.
        result = yaml.load(stream)
    except yaml.parser.ParserError as e:
        raise validate.ValidationException("Syntax error %s" % (e))

    # Non-dict documents, or no configured identifiers: index by URL only.
    if not (isinstance(result, dict) and self.identifiers):
        self.idx[url] = result
        return result

    for identifier in self.identifiers:
        # Documents lacking an identifier field get the URL as its value.
        if identifier not in result:
            result[identifier] = url
        self.idx[self.expand_url(result[identifier], url)] = result
    return result