def get_title(url):
    """Fetches the contents of url and extracts (and utf-8 encodes)
    the contents of <title>"""
    if not url or not url.startswith(('http://', 'https://')):
        return None
    try:
        req = Request(url)
        if g.useragent:
            req.add_header('User-Agent', g.useragent)
        opener = urlopen(req, timeout=15)

        # determine the encoding of the response
        for param in opener.info().getplist():
            if param.startswith("charset="):
                param_name, sep, charset = param.partition("=")
                codec = codecs.getreader(charset)
                break
        else:
            codec = codecs.getreader("utf-8")

        with codec(opener, "ignore") as reader:
            # Attempt to find the title in the first 1kb
            data = reader.read(1024)
            title = extract_title(data)

            # Title not found in the first kb, try searching an additional 10kb
            if not title:
                data += reader.read(10240)
                title = extract_title(data)

        return title
    except:
        return None
def get_vcf_handle(fsock=None, infile=None):
    """Open the vcf file and return a handle"""
    vcf = None
    if (fsock or infile):
        if fsock:
            # if not infile and hasattr(fsock, 'name'):
            logger.info("Reading vcf from stdin")
            if sys.version_info < (3, 0):
                logger.info("Using codecs to read stdin")
                sys.stdin = getreader('utf-8')(fsock)
            vcf = sys.stdin
        else:
            logger.info("Reading vcf from file {0}".format(infile))
            file_name, file_extension = os.path.splitext(infile)
            if file_extension == '.gz':
                logger.debug("Vcf is zipped")
                vcf = getreader('utf-8')(gzip.open(infile), errors='replace')
            elif file_extension == '.vcf':
                vcf = open(infile, mode='r', encoding='utf-8', errors='replace')
            else:
                raise IOError("File is not in a supported format!\n"
                              " Use the correct file ending (.vcf or .vcf.gz)")
    else:
        raise IOError("Please provide a fsock or infile")
    return vcf
def _make_tempfile(self):
    transfer_encoding = self.headers.get('content-transfer-encoding', '').lower()
    tf = NamedTemporaryFile()
    start_pos = self._pos + self._headers_length + 2
    file_length = (self._endpos - 2) - start_pos
    bytes_read = 0
    self._data.seek(start_pos)
    while bytes_read < file_length:
        remaining_bytes = (self._endpos - 2) - self._data.tell()
        chunk_size = min(8196, remaining_bytes)
        tf.write(self._data.read(chunk_size))
        bytes_read += chunk_size
    tf.seek(0)
    if transfer_encoding not in ('', '7bit', '8bit', 'binary'):
        decoded_tf = NamedTemporaryFile()
        mimetools.decode(tf, decoded_tf, transfer_encoding)
        try:
            return codecs.getreader(self.charset)(decoded_tf)
        except (TypeError, LookupError):
            return decoded_tf
    else:
        try:
            return codecs.getreader(self.charset)(tf)
        except (TypeError, LookupError):
            return tf
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('config', nargs='*', type=argparse.FileType('r'),
                        help="One or more YAML files to read")
    parser.add_argument('--template-file', '-f', dest='template',
                        type=argparse.FileType('r'), default=sys.stdin,
                        help="Config file template. If not supplied, "
                             "stdin is used")
    parser.add_argument('--out', '-o', dest='out',
                        type=argparse.FileType('w'), default=sys.stdout,
                        help="Output file to write. If not supplied, "
                             "stdout is used")
    parser.add_argument('--strict', dest='strict', action='store_true',
                        default=False,
                        help="Raise an exception on undefined variables")
    args = parser.parse_args()

    context = {}
    for file in args.config:
        context.update(yaml.load(getreader('utf-8')(file).read()))

    template_string = getreader('utf-8')(args.template).read()
    rendered = render(template_string, context, args.strict)
    getwriter('utf-8')(args.out).write(rendered)
def pull(format, stream, kwargs):
    if kwargs.get('utf8_cleanup', False):
        stream = UTF8RecoderWithCleanup(stream, kwargs.get('encoding', 'utf-8'))
    elif codecs.getreader(kwargs.get('encoding', 'utf-8')) != codecs.getreader('utf-8'):
        stream = UTF8Recoder(stream, kwargs.get('encoding', None))
    else:
        pass
    delimiter = kwargs.get('delimiter', None)
    sniff_read = stream.next()
    stream = PrefixReader(sniff_read, stream, linefilter=kwargs.get("linefilter", None))
    dialect = csv.Sniffer().sniff(sniff_read)
    if sniff_read.endswith('\r\n'):
        dialect.lineterminator = '\r\n'
    else:
        dialect.lineterminator = '\n'
    if dialect.delimiter.isalpha() and not delimiter:
        # http://bugs.python.org/issue2078
        for row in linepull(stream, dialect, kwargs):
            yield row
        return
    if delimiter:
        dialect.delimiter = delimiter
    for row in csvpull(stream, dialect, kwargs):
        yield row
def load_json_file(fname):
    if fname.endswith(".bz2"):
        reader = codecs.getreader("utf-8")(BZ2File(fname))
    else:
        reader = codecs.getreader("utf-8")(open(fname))
    dat = reader.read()
    return json.loads(dat)
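# Usage sketch (not from the original source; the file name is hypothetical):
# load_json_file() above reads plain or bz2-compressed JSON through the same
# UTF-8 reader, so the caller does not need to care about compression.
records = load_json_file("dump.json.bz2")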
def test_exceed(self):
    infiles = ['%d.txt' % i for i in range(1, 7)]
    content = {}
    for infile in infiles:
        with open(path_join(self.tmpdir, infile), 'w', encoding='ascii') as f:
            content[infile] = ('FILE=%s' % infile)
            f.write(content[infile])
        sleep(0.1)
    for infile in infiles[:5]:
        with self.cache[infile] as entry:
            reader = getreader('ascii')(entry)
            data = reader.read()
            self.assertEqual('TOUCHED\n' + content[infile], data)
        sleep(0.1)
    self.assertEqual(self.count, 5)
    self.assertTrue(len(self.cache) <= 5)
    infile = infiles[5]
    with self.cache[infile] as entry:
        reader = getreader('ascii')(entry)
        data = reader.read()
        self.assertEqual('TOUCHED\n' + content[infile], data)
    sleep(0.1)
    self.assertEqual(self.count, 6)
    # Should cause an entry to be dropped (it blocks while scrubbing)
    infile = infiles[0]
    with self.cache[infile] as entry:
        reader = getreader('ascii')(entry)
        data = reader.read()
        self.assertEqual('TOUCHED\n' + content[infile], data)
    sleep(0.1)
    self.assertEqual(self.count, 7)
    self.assertTrue(len(self.cache) <= 5)
def run_clang_format_diff(args, file_name):
    try:
        with io.open(file_name, 'r', encoding='utf-8') as f:
            original = f.readlines()
    except IOError as exc:
        raise DiffError(str(exc))
    invocation = [args.clang_format_executable, file_name]
    try:
        proc = subprocess.Popen(
            invocation,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            universal_newlines=True)
    except OSError as exc:
        raise DiffError(str(exc))
    proc_stdout = proc.stdout
    proc_stderr = proc.stderr
    if sys.version_info[0] < 3:
        # make the pipes compatible with Python 3,
        # reading lines should output unicode
        encoding = 'utf-8'
        proc_stdout = codecs.getreader(encoding)(proc_stdout)
        proc_stderr = codecs.getreader(encoding)(proc_stderr)
    # hopefully the stderr pipe won't get full and block the process
    outs = list(proc_stdout.readlines())
    errs = list(proc_stderr.readlines())
    proc.wait()
    if proc.returncode:
        raise DiffError("clang-format exited with status {}: '{}'".format(
            proc.returncode, file_name), errs)
    return make_diff(file_name, original, outs), errs
def xopen(filename, mode='r'):
    """
    Replacement for the "open" function that can also open
    files that have been compressed with gzip or bzip2.
    If the filename is '-', standard output (mode 'w') or input (mode 'r')
    is returned. If the filename ends with .gz, the file is opened with
    gzip.open(). If it ends with .bz2, it's opened as a bz2.BZ2File.
    Otherwise, the regular open() is used.
    """
    assert isinstance(filename, basestring)
    if filename == '-':
        return sys.stdin if 'r' in mode else sys.stdout
    if filename.endswith('.bz2'):
        if bz2 is None:
            raise ImportError("Cannot open bz2 files: The bz2 module is not available")
        if sys.version_info[0] < 3:
            return bz2.BZ2File(filename, mode)
        else:
            if 'r' in mode:
                return getreader('ascii')(bz2.BZ2File(filename, mode))
            else:
                return getwriter('ascii')(bz2.BZ2File(filename, mode))
    elif filename.endswith('.gz'):
        if sys.version_info[0] < 3:
            if 'r' in mode:
                return buffered_reader(gzip.open(filename, mode))
            else:
                return buffered_writer(gzip.open(filename, mode))
        else:
            if 'r' in mode:
                return getreader('ascii')(gzip.open(filename, mode))
            else:
                return getwriter('ascii')(gzip.open(filename, mode))
    else:
        return open(filename, mode)
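# Usage sketch (not from the original source; the path is hypothetical):
# xopen() above returns a text stream regardless of compression, so the
# caller can iterate lines uniformly over plain, .gz, or .bz2 input.
n_lines = sum(1 for _ in xopen("reads.fastq.gz"))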
def __init__(self, source = '-', output_file = "", enable_hashing = False, output_type = "png", scale = 1.0, width = None, height = None): if source == '-': source = codecs.getreader('utf-8')(sys.stdin).readlines() else: file_data = codecs.getreader('utf-8')(open(source, 'r')) source = list(file_data) self.__parsers = [] self.__backends = [] self.__source = source self.__original_source = copy.copy(source) self.__outfile = output_file self.__enable_hashing = enable_hashing self.__additional_source = str(scale) + str(width) + str(height) if not enable_hashing or not hash_check(source + [self.__additional_source], output_file + ".md5"): self.register_parser(YamlParser()) self.register_parser(BackgroundParser()) self.register_parser(TextParser()) self.register_parser(OverlayParser()) self.register_parser(ArrowParser()) self.register_parser(NameParser()) self.register_parser(StyleParser()) backends = { 'svg': CairoSvgBackend, 'pdf': CairoPdfBackend, 'eps': CairoEpsBackend, 'png': CairoBackend } if output_type in backends: self.register_backend(backends[output_type](image_scale = scale, image_width = width, image_height = height)) else: self.register_backend(CairoBackend(image_scale = scale, image_width = width, image_height = height))
def checkEncoding(fileObj):
    '''
    Check that a file honors the declared encoding (default ASCII for Python 2
    and UTF-8 for Python 3).

    Raises a UnicodeDecodeError in case of decoding problems and LookupError
    if the specified codec does not exist.

    See http://www.python.org/dev/peps/pep-0263/
    '''
    from itertools import islice

    # default encoding
    if sys.version_info[0] <= 2:
        enc = 'ascii'
    else:
        enc = 'utf-8'

    # find the encoding of the file, if specified (in the first two lines)
    enc_exp = re.compile(r"coding[:=]\s*([-\w.]+)")
    for l in islice(fileObj, 2):
        m = enc_exp.search(l)
        if m:
            enc = m.group(1)
            break

    if hasattr(fileObj, 'name'):
        logging.getLogger('checkEncoding').debug('checking encoding %s on %s',
                                                 enc, fileObj.name)
    else:
        logging.getLogger('checkEncoding').debug('checking encoding %s on file object',
                                                 enc)

    # try to read the file with the declared encoding
    fileObj.seek(0)
    codecs.getreader(enc)(fileObj).read()
def __init__ (self, langs=[], dictionaryPath="", addPath="", delPath="", full=False, addLang=False, reallyAdd = False): self.langs = langs self.dictionary = semantics.SemanticDictionary(dictionaryPath) self.full = full self.addLang = addLang self.reallyAdd = reallyAdd self.Add = {} if addPath: f = codecs.getreader("windows-1251")(file(addPath, "rb")) for l in f: x = l.replace(u"<ana", "@") \ .replace("lex=", "") \ .replace("gr=", "") \ .replace("/>", "") \ .replace(">", "") \ .replace("\"", " ") \ .replace("=", ",") \ .rstrip() \ .split("@") form = x[0].lstrip().rstrip() if form not in self.Add: self.Add[form] = [] for el in x[1:]: s = el.lstrip().rstrip().split() lemma = s[0] gramm = s[1] (head, _, tail) = gramm.partition("(") head = head.split(",") category = head[0] head = set(head) head.discard("") tail = (tail.partition(")")[0]).split("|") res = [] for tl in tail: s = set(tl.split(",")) s.discard("") res.append(self.createAttrs("", lemma, category, head, s)) self.Add[form].append((lemma, res, 'ru', 'disamb')) f.close() self.Del = set() self.DelPatterns = [] if delPath: f = codecs.getreader("windows-1251")(file(delPath, "rb")) for l in f: x = l.rstrip().split() if x[0].endswith("*"): self.DelPatterns.append((x[0][:-1], x[1], set(x[2].split(',')))) else: self.Del.add(tuple(x[0:3])) f.close()
def download(datafolder):
    # download daily 10 day forecast
    # example url: http://api.wunderground.com/api/944b3f3c879d2394/geolookup/forecast10day/q/Germany/Berlin.json
    for loc in loc_list:
        # get the json object
        f = urllib.request.urlopen('http://api.wunderground.com/api/944b3f3c879d2394/geolookup/forecast10day/q/Germany/' + loc + '.json')
        # need to convert byte object to string
        reader = codecs.getreader('utf-8')  # how is data encoded?
        parsed_json = json.load(reader(f))
        fn_10days = os.path.join(datafolder, "wunderground_" + time.strftime("%d_%m_%Y_%H_%M_") + loc + "_10days.pkl")
        with open(fn_10days, 'wb') as p:
            pickle.dump(parsed_json, p, pickle.HIGHEST_PROTOCOL)
        print(parsed_json['location']['city'] + ' 10 days downloaded')
        time.sleep(10)

        # hourly data
        # example url: http://api.wunderground.com/api/944b3f3c879d2394/geolookup/hourly/q/Germany/Berlin.json
        f = urllib.request.urlopen('http://api.wunderground.com/api/944b3f3c879d2394/geolookup/hourly/q/Germany/' + loc + '.json')
        reader = codecs.getreader('utf-8')  # how is data encoded?
        parsed_json = json.load(reader(f))
        fn_hourly = os.path.join(datafolder, "wunderground_" + time.strftime("%d_%m_%Y_%H_%M_") + loc + "_hourly.pkl")
        with open(fn_hourly, 'wb') as p:
            pickle.dump(parsed_json, p, pickle.HIGHEST_PROTOCOL)
        print(parsed_json['location']['city'] + ' hourly downloaded')
        time.sleep(10)
        f.close()
def xopen(filename, mode='r'):
    """
    Replacement for the "open" function that can also open files that have
    been compressed with gzip or bzip2. If the filename ends with .gz, the
    file is opened with gzip.open(); if it ends with .bz2, bz2.BZ2File is
    used. Otherwise, the regular open() is used. If the filename is '-',
    standard output (mode 'w') or input (mode 'r') is returned.
    """
    assert isinstance(filename, str)
    if filename == '-':
        return sys.stdin if 'r' in mode else sys.stdout
    if filename.endswith('.gz'):
        if sys.version_info[0] < 3:
            if 'r' in mode:
                return buffered_reader(gzip.open(filename, mode))
            else:
                return gzip.open(filename, mode)
        else:
            if 'r' in mode:
                return getreader('ascii')(gzip.open(filename, mode))
            else:
                return getwriter('ascii')(gzip.open(filename, mode))
    elif filename.endswith('.bz2'):
        if sys.version_info[0] < 3:
            return bz2.BZ2File(filename, mode)
        else:
            if 'r' in mode:
                return getreader('ascii')(bz2.BZ2File(filename, mode))
            else:
                return getwriter('ascii')(bz2.BZ2File(filename, mode))
    else:
        return open(filename, mode)
def process_args(args):
    if not (args.ml or args.rb):
        args.rb = True

    if args.infile:
        ifp = io.open(args.infile, encoding='utf-8')
    else:
        if sys.version_info[0] >= 3:
            ifp = codecs.getreader('utf8')(sys.stdin.buffer)
        else:
            ifp = codecs.getreader('utf8')(sys.stdin)

    if args.outfile:
        ofp = io.open(args.outfile, mode='w', encoding='utf-8')
    else:
        if sys.version_info[0] >= 3:
            ofp = codecs.getwriter('utf8')(sys.stdout.buffer)
        else:
            ofp = codecs.getwriter('utf8')(sys.stdout)

    # initialize transliterator object
    trn = Transliterator(args.source, args.target,
                         rb=args.rb, build_lookup=args.build_lookup)

    # transliterate text
    for line in ifp:
        tline = trn.convert(line)
        ofp.write(tline)

    # close files
    ifp.close()
    ofp.close()
def fopen(s, enc="utf-8"):
    """Opens the indicated file, handling special cases including None, "-",
    "stdin" (indicating stdin), and "stderr" (indicating stderr). For files
    that end in ".gz" or ".bz2", automatically handles decompression."""
    if not s or s == '-':
        LOG.info("Returning sys.stdin")
        return sys.stdin

    # Handle http(s):
    if s.startswith('http://') or s.startswith('https://'):
        r = requests.get(s, stream=True)
        return r.raw if enc == 'b' else codecs.getreader(enc)(r.raw)

    fos = []
    fnames = glob.glob(s)
    if not fnames:
        raise IOError("No such file: %s" % s)
    for f in fnames:
        ext = f.rsplit(".", 1)[-1]
        if ext == "bz2":
            fo = bz2.BZ2File(f, 'r', 10 * 1024)
        elif ext == "gz":
            fo = gzip.open(f, 'rb')
        else:
            fo = open(f, 'rb')  # Encoding handled below
        fos = itertools.chain(fos, fo) if len(fnames) > 1 else fo

    # Wrap the raw file handle into one that can decode
    # Wikipedia needs this
    return fos if enc == 'b' else codecs.getreader(enc)(fos)
def initialize(self, arg):
    self.clean_words = set()
    logger.info('Reading in clean words...')
    reader = codecs.getreader('utf8')(BZ2File(CleanWordsFile))
    for line in reader.readlines():
        (word, doc_count, _) = line.split('\t')
        doc_count = int(doc_count)
        if word and doc_count > MinVocabDocThreshold:
            self.clean_words.add(word)
    reader.close()
    logger.info('done.')

    # Read in document link weights
    self.clean_docs = set()
    logger.info('Reading in clean docs...')
    reader = codecs.getreader('utf8')(BZ2File(DocumentLinksFile))
    for line in reader.readlines():
        (doc, incoming, outgoing) = line.split('\t')
        incoming = int(incoming)
        outgoing = int(outgoing)
        if doc and incoming >= MinIncomingLinkWeight:
            self.clean_docs.add(doc)
    reader.close()
    logger.info('done.')
def wrap_fp(fp):
    if suffix == ".gz":
        fp = GzipFile(fileobj=fp, mode=mode)
    elif suffix == ".bz2":
        try:
            fp = BZ2File(fp, mode=mode)
        except TypeError:
            if sys.version_info < (3, 0, 0):
                raise NotImplementedError(
                    "built-in BZ2File is partially broken in python 2, "
                    "install bz2file from pypi or use a compression "
                    "setting other than 'bz2'")
            else:
                raise
    elif suffix == ".xz":
        fp = LZMAFile(fp, mode=mode)
    if (suffix or sys.version_info < (3,)) and "b" not in mode:
        # If mode is not binary (and we expect to be able to
        # write() str values, not bytes), we need to create
        # an additional encoding wrapper. That encoder can
        # probably use UTF-8 without any need for additional
        # configuration
        if "r" in mode and "w" in mode:
            fp = StreamReaderWriter(fp, codecs.getreader("utf-8"),
                                    codecs.getwriter("utf-8"))
        elif "w" in mode:
            fp = codecs.getwriter("utf-8")(fp)
        elif suffix:
            fp = codecs.getreader("utf-8")(fp)
    fp.realname = filename
    return fp
def _bleu(ref_file, trans_file, subword_option=None):
    """Compute BLEU scores, handling BPE if needed."""
    max_order = 4
    smooth = False

    ref_files = [ref_file]
    reference_text = []
    for reference_filename in ref_files:
        with codecs.getreader("utf-8")(
                tf.gfile.GFile(reference_filename, "rb")) as fh:
            reference_text.append(fh.readlines())

    per_segment_references = []
    for references in zip(*reference_text):
        reference_list = []
        for reference in references:
            reference = _clean(reference, subword_option)
            reference_list.append(reference.split(" "))
        per_segment_references.append(reference_list)

    translations = []
    with codecs.getreader("utf-8")(tf.gfile.GFile(trans_file, "rb")) as fh:
        for line in fh:
            line = _clean(line, subword_option=None)
            translations.append(line.split(" "))

    # bleu_score, precisions, bp, ratio, translation_length, reference_length
    bleu_score, _, _, _, _, _ = bleu.compute_bleu(
        per_segment_references, translations, max_order, smooth)
    return 100 * bleu_score
def test_badbom(self):
    s = StringIO.StringIO("\xff\xff")
    f = codecs.getreader(self.encoding)(s)
    self.assertRaises(UnicodeError, f.read)

    s = StringIO.StringIO("\xff\xff\xff\xff")
    f = codecs.getreader(self.encoding)(s)
    self.assertRaises(UnicodeError, f.read)
def delete_rows(to_delete, inpath, outpath):
    delete = set([el.strip().split(";")[0].replace("/", "\\")
                  for el in codecs.getreader("windows-1251")(file(to_delete, "rb"),
                                                             'xmlcharrefreplace').readlines()[1:]])
    f = codecs.getreader("windows-1251")(file(inpath, "rb"), 'xmlcharrefreplace')
    ff = codecs.getwriter("windows-1251")(file(outpath, "wb"), 'xmlcharrefreplace')
    for el in f:
        if not el.strip().split(";")[0].replace("/", "\\") in delete:
            ff.write(el)
        else:
            print el.strip().split(";")[0].replace("/", "\\")
def run_clang_format_diff(args, file): try: with io.open(file, 'r', encoding='utf-8') as f: original = f.readlines() except IOError as exc: raise DiffError(str(exc)) invocation = [args.clang_format_executable, file] # Use of utf-8 to decode the process output. # # Hopefully, this is the correct thing to do. # # It's done due to the following assumptions (which may be incorrect): # - clang-format will returns the bytes read from the files as-is, # without conversion, and it is already assumed that the files use utf-8. # - if the diagnostics were internationalized, they would use utf-8: # > Adding Translations to Clang # > # > Not possible yet! # > Diagnostic strings should be written in UTF-8, # > the client can translate to the relevant code page if needed. # > Each translation completely replaces the format string # > for the diagnostic. # > -- http://clang.llvm.org/docs/InternalsManual.html#internals-diag-translation # # It's not pretty, due to Python 2 & 3 compatibility. encoding_py3 = {} if sys.version_info[0] >= 3: encoding_py3['encoding'] = 'utf-8' try: proc = subprocess.Popen( invocation, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True, **encoding_py3) except OSError as exc: raise DiffError(str(exc)) proc_stdout = proc.stdout proc_stderr = proc.stderr if sys.version_info[0] < 3: # make the pipes compatible with Python 3, # reading lines should output unicode encoding = 'utf-8' proc_stdout = codecs.getreader(encoding)(proc_stdout) proc_stderr = codecs.getreader(encoding)(proc_stderr) # hopefully the stderr pipe won't get full and block the process outs = list(proc_stdout.readlines()) errs = list(proc_stderr.readlines()) proc.wait() if proc.returncode: raise DiffError("clang-format exited with status {}: '{}'".format( proc.returncode, file), errs) return make_diff(file, original, outs), errs
def decoded(f):
    bom = f.read(2)
    f.seek(0)
    # Older versions of Zemax wrote plain ascii files. The
    # output txt file format can be selected in the
    # preferences dialog box.
    if bom == codecs.BOM_UTF16:
        reader = codecs.getreader("utf-16")
    else:
        reader = codecs.getreader("utf-8")
    return reader(f)
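# Usage sketch (not from the original source; the file name is hypothetical):
# decoded() above expects a binary file object and picks a UTF-16 or UTF-8
# reader based on the byte-order mark.
with open("zemax_output.txt", "rb") as raw:
    text = decoded(raw).read()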
def gmail_csv_unicode_open(fin):
    'only supports utf-16 and utf-8'
    # byte order mark
    t = fin.read(2)
    if t == codecs.BOM_UTF16_LE:
        return UTF8Lines(codecs.getreader('utf-16le')(fin))
    elif t == codecs.BOM_UTF16_BE:
        return UTF8Lines(codecs.getreader('utf-16be')(fin))
    else:
        # assume utf-8
        l = fin.readline()
        return itertools.chain([t + l], fin)
def __enter__(self):
    try:
        with gzip.open(self.filename, 'rb') as handle:
            with codecs.getreader('utf-8')(handle) as reader:
                reader.read(50)
        self._gzHandle = gzip.open(self.filename, 'rb')
        self._reader = codecs.getreader('utf-8')(self._gzHandle)
    except IOError:
        self._gzHandle = None
        self._reader = codecs.getreader('utf-8')(open(self.filename))
    return self._reader
def __init__(self, host=PUDB_RDB_HOST, port=PUDB_RDB_PORT, port_search_limit=100, out=sys.stdout, term_size=None): self.active = True self.out = out self._prev_handles = sys.stdin, sys.stdout self._sock, this_port = self.get_avail_port( host, port, port_search_limit) self._sock.setblocking(1) self._sock.listen(1) self.ident = '{0}:{1}'.format(self.me, this_port) self.host = host self.port = this_port self.say(BANNER.format(self=self)) self._client, address = self._sock.accept() self._client.setblocking(1) self.remote_addr = ':'.join(str(v) for v in address) self.say(SESSION_STARTED.format(self=self)) # makefile ignores encoding if there's no buffering. raw_sock_file = self._client.makefile("rwb", 0) import codecs if sys.version_info[0] < 3: sock_file = codecs.StreamRecoder( raw_sock_file, codecs.getencoder("utf-8"), codecs.getdecoder("utf-8"), codecs.getreader("utf-8"), codecs.getwriter("utf-8")) else: sock_file = codecs.StreamReaderWriter( raw_sock_file, codecs.getreader("utf-8"), codecs.getwriter("utf-8")) self._handle = sys.stdin = sys.stdout = sock_file import telnetlib as tn raw_sock_file.write(tn.IAC + tn.WILL + tn.SGA) resp = raw_sock_file.read(3) assert resp == tn.IAC + tn.DO + tn.SGA raw_sock_file.write(tn.IAC + tn.WILL + tn.ECHO) resp = raw_sock_file.read(3) assert resp == tn.IAC + tn.DO + tn.ECHO Debugger.__init__(self, stdin=self._handle, stdout=self._handle, term_size=term_size)
def end_task(self): logger.info('Computing normalization factors...') total_bigrams = 0 for (i, shard) in enumerate(self.shuffle_result_shards): logger.info(' processing shard %d' % i) f = codecs.getreader('utf8')(BZ2File(shard)) for (line_no, line) in enumerate(f.readlines()): # print line.encode('utf8','replace'), try: (word, word2, freq, doc_freq) = line.split('\t') self.word_freq.setdefault(word, 0) self.word_freq[word] += int(freq) total_bigrams += int(freq) except ValueError: logger.info('Line %d of %s is bad.' % (line_no, shard)) f.close() total_words = sum(self.word_freq.values()) # Now write the output to disk logger.info('Writing to disk...') writer = codecs.getwriter('utf8')(BZ2File(OutputFile, 'w')) for (i, shard) in enumerate(self.shuffle_result_shards): logger.info(' processing shard %d' % i) f = codecs.getreader('utf8')(BZ2File(shard)) for (line_no, line) in enumerate(f.readlines()): # print line.encode('utf8','replace'), try: (word, word2, co_occurrence_sum, document_freq_sum) = line.split('\t') co_occurrence_sum = int(co_occurrence_sum) document_freq_sum = int(document_freq_sum) try: #pmi = log(co_occurrence_sum) - log(total_bigrams) \ # - log(self.word_freq[word]) - log(self.word_freq[word2]) + 2*log(total_words) # f.write('%s\t%s\t%f\t%d\t%d\n' % (word, word2, pmi, freq, self.document_freq[(word,word2)])) #writer.write('%s\t%s\t%f\t%d\t%d\n' % (word, word2, pmi, co_occurrence_sum, document_freq_sum)) writer.write('%s\t%s\t%d\t%d\n' % (word, word2, co_occurrence_sum, document_freq_sum)) except KeyError: logger.info('Line %d of %s is bad.' % (line_no, shard)) except ValueError: logger.info('Line %d of %s is bad.' % (line_no, shard)) f.close() writer.close() logger.info('done.') sys.exit()
def _make_file(self):
    start_pos = self._pos + self._headers_length + 2
    chunkfile = MMapChunk(self._data, start_pos, self._endpos - 2)
    transfer_encoding = self.headers.get('content-transfer-encoding', '').lower()
    if transfer_encoding not in ('', '7bit', '8bit', 'binary'):
        try:
            chunkfile = codecs.getreader(transfer_encoding)(chunkfile)
        except (TypeError, LookupError):
            pass
    try:
        return codecs.getreader(self.charset)(chunkfile)
    except (TypeError, LookupError):
        return chunkfile
def _accuracy(label_file, pred_file):
    """Compute accuracy, each line contains a label."""
    with codecs.getreader("utf-8")(tf.gfile.GFile(label_file, "rb")) as label_fh:
        with codecs.getreader("utf-8")(tf.gfile.GFile(pred_file, "rb")) as pred_fh:
            count = 0.0
            match = 0.0
            for label in label_fh:
                label = label.strip()
                pred = pred_fh.readline().strip()
                if label == pred:
                    match += 1
                count += 1
    return 100 * match / count
def callMethod(method, *params): """Call any JSON-RPC method""" def base64_enc(auth): """ Jump through python3 hoops to encode base64 string """ return base64.encodebytes(auth.encode('ascii')).decode('ascii').replace('\n', '') callId = ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(5)) request = urllib.request.Request('http://%s:%d' % ( CONFIG['rpc_host'], CONFIG['rpc_port'], ), json.dumps(dict( jsonrpc="1.0", id=callId, method=method, params=params, )).encode('utf8'), { "Authorization": "Basic %s" % base64_enc('%s:%s' % (CONFIG['rpc_user'], CONFIG['rpc_pass'])), "Content-Type": "application/json", }) # Do the request, parse response try: response = urllib.request.urlopen(request) out = json.load(codecs.getreader('utf8')(response)) except urllib.request.HTTPError as e: try: out = json.load(codecs.getreader('utf8')(e)) if not out['error']: out['error'] = dict(message='', code=e.code) except ValueError: e.seek(0) out = dict(id=callId, error=dict( message=" ".join([e.msg, e.read().decode('utf8')]), code=e.code, )) if out['id'] != callId: raise ValueError("Response ID %s doesn't match %s" % (out['id'], callId)) if out['error'] is None: return out['result'] if out['error']['message'] == "Invalid Smileycoin address": raise ValueError(out['error']['message']) raise RuntimeError("%s (%d)" % (out['error']['message'], out['error']['code']))
stats[most_frequent])) outfile.write('{0} {1}\n'.format(*most_frequent)) changes = replace_pair(most_frequent, sorted_vocab, indices) update_pair_statistics(most_frequent, changes, stats, indices) stats[most_frequent] = 0 if not i % 100: prune_stats(stats, big_stats, threshold) if __name__ == '__main__': # python 2/3 compatibility if sys.version_info < (3, 0): sys.stderr = codecs.getwriter('UTF-8')(sys.stderr) sys.stdout = codecs.getwriter('UTF-8')(sys.stdout) sys.stdin = codecs.getreader('UTF-8')(sys.stdin) else: sys.stderr = codecs.getwriter('UTF-8')(sys.stderr.buffer) sys.stdout = codecs.getwriter('UTF-8')(sys.stdout.buffer) sys.stdin = codecs.getreader('UTF-8')(sys.stdin.buffer) parser = create_parser() args = parser.parse_args() # read/write files as UTF-8 if args.input.name != '<stdin>': args.input = codecs.open(args.input.name, encoding='utf-8') if args.output.name != '<stdout>': args.output = codecs.open(args.output.name, 'w', encoding='utf-8') main(args.input,
def check_graphic(self): """ Check the hash of the current matplotlib figure matches the expected image hash for the current graphic test. To create missing image test results, set the IRIS_TEST_CREATE_MISSING environment variable before running the tests. This will result in new and appropriately "<hash>.png" image files being generated in the image output directory, and the imagerepo.json file being updated. """ import imagehash from PIL import Image dev_mode = os.environ.get('IRIS_TEST_CREATE_MISSING') unique_id = self._unique_id() repo_fname = os.path.join(_RESULT_PATH, 'imagerepo.json') with open(repo_fname, 'rb') as fi: repo = json.load(codecs.getreader('utf-8')(fi)) try: #: The path where the images generated by the tests should go. image_output_directory = os.path.join(os.path.dirname(__file__), 'result_image_comparison') if not os.access(image_output_directory, os.W_OK): if not os.access(os.getcwd(), os.W_OK): raise IOError('Write access to a local disk is required ' 'to run image tests. Run the tests from a ' 'current working directory you have write ' 'access to to avoid this issue.') else: image_output_directory = os.path.join( os.getcwd(), 'iris_image_test_output') result_fname = os.path.join(image_output_directory, 'result-' + unique_id + '.png') if not os.path.isdir(image_output_directory): # Handle race-condition where the directories are # created sometime between the check above and the # creation attempt below. try: os.makedirs(image_output_directory) except OSError as err: # Don't care about "File exists" if err.errno != 17: raise def _create_missing(): fname = '{}.png'.format(phash) base_uri = ('https://scitools.github.io/test-iris-imagehash/' 'images/v4/{}') uri = base_uri.format(fname) hash_fname = os.path.join(image_output_directory, fname) uris = repo.setdefault(unique_id, []) uris.append(uri) print('Creating image file: {}'.format(hash_fname)) figure.savefig(hash_fname) msg = 'Creating imagerepo entry: {} -> {}' print(msg.format(unique_id, uri)) lock = filelock.FileLock( os.path.join(_RESULT_PATH, 'imagerepo.lock')) # The imagerepo.json file is a critical resource, so ensure # thread safe read/write behaviour via platform independent # file locking. with lock.acquire(timeout=600): with open(repo_fname, 'wb') as fo: json.dump(repo, codecs.getwriter('utf-8')(fo), indent=4, sort_keys=True) # Calculate the test result perceptual image hash. buffer = io.BytesIO() figure = plt.gcf() figure.savefig(buffer, format='png') buffer.seek(0) phash = imagehash.phash(Image.open(buffer), hash_size=_HASH_SIZE) if unique_id not in repo: if dev_mode: _create_missing() else: figure.savefig(result_fname) emsg = 'Missing image test result: {}.' raise AssertionError(emsg.format(unique_id)) else: uris = repo[unique_id] # Extract the hex basename strings from the uris. hexes = [ os.path.splitext(os.path.basename(uri))[0] for uri in uris ] # Create the expected perceptual image hashes from the uris. to_hash = imagehash.hex_to_hash expected = [to_hash(uri_hex) for uri_hex in hexes] # Calculate hamming distance vector for the result hash. 
distances = [e - phash for e in expected] if np.all([hd > _HAMMING_DISTANCE for hd in distances]): if dev_mode: _create_missing() else: figure.savefig(result_fname) msg = ('Bad phash {} with hamming distance {} ' 'for test {}.') msg = msg.format(phash, distances, unique_id) if _DISPLAY_FIGURES: emsg = 'Image comparison would have failed: {}' print(emsg.format(msg)) else: emsg = 'Image comparison failed: {}' raise AssertionError(emsg.format(msg)) if _DISPLAY_FIGURES: plt.show() finally: plt.close()
import re
import os.path
import sys
import codecs
import gzip
import tempfile
import shutil
import atexit

# Use word_tokenize to split raw text into words
from string import punctuation
import nltk
from nltk.tokenize import word_tokenize

scriptdir = os.path.dirname(os.path.abspath(__file__))

reader = codecs.getreader('utf8')
writer = codecs.getwriter('utf8')


def prepfile(fh, code):
    if type(fh) is str:
        fh = open(fh, code)
    ret = gzip.open(fh.name, code if code.endswith("t") else code + "t") if fh.name.endswith(".gz") else fh
    if sys.version_info[0] == 2:
        if code.startswith('r'):
            ret = reader(fh)
        elif code.startswith('w'):
            ret = writer(fh)
        else:
            sys.stderr.write("I didn't understand code " + code + "\n")
            sys.exit(1)
    return ret
def reset(self):
    self.dataStream = codecs.getreader(self.charEncoding[0])(
        self.rawStream, 'replace')
    HTMLUnicodeInputStream.reset(self)
def test_bug1175396(self): s = [ '<%!--===================================================\r\n', ' BLOG index page: show recent articles,\r\n', ' today\'s articles, or articles of a specific date.\r\n', '========================================================--%>\r\n', '<%@inputencoding="ISO-8859-1"%>\r\n', '<%@pagetemplate=TEMPLATE.y%>\r\n', '<%@import=import frog.util, frog%>\r\n', '<%@import=import frog.objects%>\r\n', '<%@import=from frog.storageerrors import StorageError%>\r\n', '<%\r\n', '\r\n', 'import logging\r\n', 'log=logging.getLogger("Snakelets.logger")\r\n', '\r\n', '\r\n', 'user=self.SessionCtx.user\r\n', 'storageEngine=self.SessionCtx.storageEngine\r\n', '\r\n', '\r\n', 'def readArticlesFromDate(date, count=None):\r\n', ' entryids=storageEngine.listBlogEntries(date)\r\n', ' entryids.reverse() # descending\r\n', ' if count:\r\n', ' entryids=entryids[:count]\r\n', ' try:\r\n', ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n', ' except StorageError,x:\r\n', ' log.error("Error loading articles: "+str(x))\r\n', ' self.abort("cannot load articles")\r\n', '\r\n', 'showdate=None\r\n', '\r\n', 'arg=self.Request.getArg()\r\n', 'if arg=="today":\r\n', ' #-------------------- TODAY\'S ARTICLES\r\n', ' self.write("<h2>Today\'s articles</h2>")\r\n', ' showdate = frog.util.isodatestr() \r\n', ' entries = readArticlesFromDate(showdate)\r\n', 'elif arg=="active":\r\n', ' #-------------------- ACTIVE ARTICLES redirect\r\n', ' self.Yredirect("active.y")\r\n', 'elif arg=="login":\r\n', ' #-------------------- LOGIN PAGE redirect\r\n', ' self.Yredirect("login.y")\r\n', 'elif arg=="date":\r\n', ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n', ' showdate = self.Request.getParameter("date")\r\n', ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n', ' entries = readArticlesFromDate(showdate)\r\n', 'else:\r\n', ' #-------------------- RECENT ARTICLES\r\n', ' self.write("<h2>Recent articles</h2>")\r\n', ' dates=storageEngine.listBlogEntryDates()\r\n', ' if dates:\r\n', ' entries=[]\r\n', ' SHOWAMOUNT=10\r\n', ' for showdate in dates:\r\n', ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n', ' if len(entries)>=SHOWAMOUNT:\r\n', ' break\r\n', ' \r\n', ] stream = StringIO.StringIO("".join(s).encode(self.encoding)) reader = codecs.getreader(self.encoding)(stream) for (i, line) in enumerate(reader): self.assertEqual(line, s[i])
def getreader(input):
    stream = StringIO.StringIO(input.encode(self.encoding))
    return codecs.getreader(self.encoding)(stream)
def read_source_sentences(inference_input_file):
    """Load inference data."""
    with codecs.getreader("utf-8")(
            tf.io.gfile.GFile(inference_input_file, mode="rb")) as f:
        inference_data = f.read().splitlines()
    return inference_data
iso_3166_1_url = os.environ.get(
    "ISO_3166_1_URL",
    "http://anonscm.debian.org/cgit/pkg-isocodes/iso-codes.git/plain/data/iso_3166-1.json")
iso_639_3_url = os.environ.get(
    "ISO_639_3_URL",
    "http://anonscm.debian.org/cgit/pkg-isocodes/iso-codes.git/plain/data/iso_639-3.json")

langs = set()
countries = set()

# country codes (2 letters)
with urlopen(iso_3166_1_url) as f:
    data = json.load(codecs.getreader("utf-8")(f))
for entry in data["3166-1"]:
    countries.add(entry["alpha_2"])

# language codes (2 or 3 letters; 3 letters only where no 2-letter code exists)
with urlopen(iso_639_3_url) as f:
    data = json.load(codecs.getreader("utf-8")(f))
for entry in data["639-3"]:
    langs.add(entry.get("alpha_2") or entry["alpha_3"])

# Note that we are not pprint()ing the set directly because with
# Python 3 it results in curly brace set initializers that are not
# compatible with Python 2.6, do it with set([...]) instead.
print("# Generated with %s" % sys.argv[0])
print("")
def multi_worker_inference(infer_model, ckpt, inference_input_file, inference_output_file, hparams, num_workers, jobid): """Inference using multiple workers.""" assert num_workers > 1 final_output_infer = inference_output_file output_infer = "%s_%d" % (inference_output_file, jobid) output_infer_done = "%s_done_%d" % (inference_output_file, jobid) # Read data infer_data = load_data(inference_input_file, hparams) # Split data to multiple workers total_load = len(infer_data) load_per_worker = int((total_load - 1) / num_workers) + 1 start_position = jobid * load_per_worker end_position = min(start_position + load_per_worker, total_load) infer_data = infer_data[start_position:end_position] with tf.Session(graph=infer_model.graph, config=utils.get_config_proto()) as sess: loaded_infer_model = model_helper.load_model(infer_model.model, ckpt, sess, "infer") sess.run( infer_model.iterator.initializer, { infer_model.src_placeholder: infer_data, infer_model.batch_size_placeholder: hparams.infer_batch_size }) # Decode utils.print_out("# Start decoding") nmt_utils.decode_and_evaluate( "infer", loaded_infer_model, sess, output_infer, ref_file=None, metrics=hparams.metrics, subword_option=hparams.subword_option, beam_width=hparams.beam_width, tgt_eos=hparams.eos, num_translations_per_input=hparams.num_translations_per_input) # Change file name to indicate the file writing is completed. tf.gfile.Rename(output_infer, output_infer_done, overwrite=True) # Job 0 is responsible for the clean up. if jobid != 0: return # Now write all translations with codecs.getwriter("utf-8")(tf.gfile.GFile(final_output_infer, mode="wb")) as final_f: for worker_id in range(num_workers): worker_infer_done = "%s_done_%d" % (inference_output_file, worker_id) while not tf.gfile.Exists(worker_infer_done): utils.print_out(" waitting job %d to complete." % worker_id) time.sleep(10) with codecs.getreader("utf-8")(tf.gfile.GFile( worker_infer_done, mode="rb")) as f: for translation in f: final_f.write("%s" % translation) for worker_id in range(num_workers): worker_infer_done = "%s_done_%d" % (inference_output_file, worker_id) tf.gfile.Remove(worker_infer_done)
# -*- coding: utf-8 -*-
import codecs
import sys

from regex import Regex

reload(sys)
sys.setdefaultencoding('utf-8')
sys.stdin = codecs.getreader('UTF-8')(sys.stdin)
sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)


def is_ascii(s):
    return all(ord(c) < 128 for c in s)


def replacement(line, symbol=".", repl="。"):
    line = line.replace(". . .", ".")
    line = line.replace(u".", ".")
    # line = line.replace("...", ".")
    dot = Regex(r'(\S\s*)\%s(\s*\S*)' % symbol)
    m = dot.findall(line)
    if m:
        # print "BEFORE:", line
        # print m
        for ele in m:
            b_char = ele[0].strip()
            a_char = ele[1].strip()
            # skip consecutive dots
            if symbol != b_char and symbol != a_char:
                # both are digits or both are letters
                if is_ascii(b_char) and is_ascii(a_char):
from past.builtins import basestring
from pkg_resources import resource_stream
from pyramid.events import (
    ApplicationCreated,
    subscriber,
)
from pyramid.httpexceptions import HTTPNotFound
from pyramid.view import view_config
import codecs
import json

utf8 = codecs.getreader("utf-8")

jsonld_base = 'https://www.encodeproject.org/terms/'
prefix = 'encode:'
term_path = '/terms/'


def aslist(value):
    if isinstance(value, basestring):
        return [value]
    return value


@subscriber(ApplicationCreated)
def make_jsonld_context(event):
    app = event.app
    root = app.root_factory(app)
    context = {
        'encode': jsonld_base,
        '@base': jsonld_base,
def __init__(self, f, encoding):
    self.reader = codecs.getreader(encoding)(f, errors='ignore')
def merge(self, path):
    reader = codecs.getreader("utf-8")
    with open(path, 'rb') as fh:
        extra = json.load(reader(fh))
    self.create_data(extra)
def setUp(self):
    self.reader = codecs.getreader('utf-8')
    self.stream = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')
def test_stream(self):
    import StringIO
    r = codecs.getreader("idna")(StringIO.StringIO("abc"))
    r.read(3)
    self.assertEquals(r.read(), u"")
def __init__(self, filename): re_data = re.compile(r'^\\data\\') re_ngram_count = re.compile(r'^ngram (\d+)=(\d+)') re_ngrams = re.compile(r'^\\(\d+)-grams:') re_end = re.compile(r'^\\end\\') ngram_counts = [] current_ngram = 0 ngrams = {} def found_data_section(line:str): match_object = re_data.search(line) if match_object is not None: return True return False def found_ngrams_section(line:str): nonlocal current_ngram match_object = re_ngrams.search(line) if match_object is not None: current_ngram = int(match_object.group(1)) return True return False def found_ngram_counts(line:str): nonlocal ngram_counts match_object = re_ngram_count.search(line) if match_object is not None: ngram_size = int(match_object.group(1)) ngram_counts.append(int(match_object.group(2))) assert (len(ngram_counts) == ngram_size) def found_end_section(line: str): match_object = re_end.search(line) if match_object is not None: return True return False def mmap_gzippd(filename): handle = open(filename, 'rb') mapped = mmap.mmap(handle.fileno(), 0, access=mmap.ACCESS_READ) gzfile = gzip.GzipFile(mode="r", fileobj=mapped) return gzfile def record_ngram(line: str): nonlocal ngram_counts nonlocal current_ngram nonlocal ngrams parts = line.lower().split() if len(parts) == 0: return # count down from the number of expected ngrams ngram_counts[current_ngram - 1] -= 1 ngram_score = float(parts[0]) ngram = parts[1:current_ngram + 1] backoff_score = float(parts[current_ngram + 1] if len(parts) > current_ngram + 1 else 0) ngrams[tuple(ngram)] = (ngram_score, backoff_score) found_end = False with codecs.getreader('UTF-8')(mmap_gzippd(filename)) as f: # ignore the header, looking for start of data for line in f: if found_data_section(line): break # parse the header for line in f: if found_ngrams_section(line): break elif found_ngram_counts(line): pass assert (current_ngram == 1) # parse the ngram data for line in f: # handle start of new section if found_ngrams_section(line): continue # are we done? if found_end_section(line): found_end = True break record_ngram(line) # sanity checks. did we find hte end? did we read the expected number of ngrams? assert (found_end) for i in ngram_counts: assert(i == 0) self.max_ngram = len(ngram_counts) self.ngrams = ngrams
def __init__(self, f):
    self.reader = codecs.getreader('utf-8')(f)
# for Python 3.x
try:
    reload
except NameError:
    try:
        from importlib import reload
    except ImportError:
        from imp import reload

reload(sys)
try:
    sys.setdefaultencoding(cset)
except AttributeError:
    pass
sys.stdin = codecs.getreader(cset)(sys.stdin)
sys.stdout = codecs.getwriter(cset)(sys.stdout)


class PyExecUtil(object):
    def __init__(self, cmd):
        self.cmd = cmd
        self._process = None
        self._thread = None
        self._callback = None
        self._args = None
        self.stdout_data = None
        self.stderr_data = None

    def onCompletion(self):
        if self._callback:
            self._callback(self._args, self.stdout_data, self.stderr_data)
def read(self, hdfs_path, offset=0, length=None, buffer_size=None, encoding=None, chunk_size=0, delimiter=None, progress=None): """Read a file from HDFS. :param hdfs_path: HDFS path. :param offset: Starting byte position. :param length: Number of bytes to be processed. `None` will read the entire file. :param buffer_size: Size of the buffer in bytes used for transferring the data. Defaults the the value set in the HDFS configuration. :param encoding: Encoding used to decode the request. By default the raw data is returned. This is mostly helpful in python 3, for example to deserialize JSON data (as the decoder expects unicode). :param chunk_size: If set to a positive number, the context manager will return a generator yielding every `chunk_size` bytes instead of a file-like object (unless `delimiter` is also set, see below). :param delimiter: If set, the context manager will return a generator yielding each time the delimiter is encountered. This parameter requires the `encoding` to be specified. :param progress: Callback function to track progress, called every `chunk_size` bytes (not available if the chunk size isn't specified). It will be passed two arguments, the path to the file being uploaded and the number of bytes transferred so far. On completion, it will be called once with `-1` as second argument. This method must be called using a `with` block: .. code-block:: python with client.read('foo') as reader: content = reader.read() This ensures that connections are always properly closed. .. note:: The raw file-like object returned by this method (when called without an encoding, chunk size, or delimiter) can have a very different performance profile than local files. In particular, line-oriented methods are often slower. The recommended workaround is to specify an encoding when possible or read the entire file before splitting it. """ if chunk_size < 0: raise ValueError('Read chunk size must be non-negative.') if progress and not chunk_size: raise ValueError( 'Progress callback requires a positive chunk size.') if delimiter: if not encoding: raise ValueError('Delimiter splitting requires an encoding.') if chunk_size: raise ValueError( 'Delimiter splitting incompatible with chunk size.') _logger.info('Reading file %r.', hdfs_path) res = self._open( hdfs_path, offset=offset, length=length, buffersize=buffer_size, ) try: if not chunk_size and not delimiter: yield codecs.getreader(encoding)( res.raw) if encoding else res.raw else: # Patch in encoding on the response object so that `iter_content` and # `iter_lines` can pick it up. If `None`, it is ignored and no decoding # happens (which is why we can always set `decode_unicode=True`). res.encoding = encoding if delimiter: data = res.iter_lines(delimiter=delimiter, decode_unicode=True) else: data = res.iter_content(chunk_size=chunk_size, decode_unicode=True) if progress: def reader(_hdfs_path, _progress): """Generator that tracks progress.""" nbytes = 0 for chunk in data: nbytes += len(chunk) _progress(_hdfs_path, nbytes) yield chunk _progress(_hdfs_path, -1) yield reader(hdfs_path, progress) else: yield data finally: res.close() _logger.debug('Closed response for reading file %r.', hdfs_path)
def utfopen(filename):
    return codecs.getreader('utf-8')(open(filename))
def load(fp, encoding='utf-8'):
    return json.load(codecs.getreader(encoding)(fp))
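# Usage sketch (not from the original source; the file name is hypothetical):
# load() above decodes a binary stream before handing it to json.load, which
# is mainly useful on Python 3 where the JSON decoder expects text.
with open("settings.json", "rb") as fp:
    settings = load(fp)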
def _spider(url, collect_nested): """Fetches URL and any pages it links to. Prints out a warning only if the root can't be fetched; it ignores errors with pages that the root links to. Args: url (str): url being fetched and searched for links collect_nested (bool): whether we want to collect arguments for nested spidering on the links found in this url Returns: A tuple of: - pages: dict of pages visited (URL) mapped to their full text. - links: set of links encountered while visiting the pages. - spider_args: argument for subsequent call to spider """ pages = {} # dict from page URL -> text content. links = set() # set of all links seen on visited pages. subcalls = [] try: response_url, _, response = read_from_url(url, 'text/html') if not response_url or not response: return pages, links, subcalls page = codecs.getreader('utf-8')(response).read() pages[response_url] = page # Parse out the links in the page link_parser = LinkParser() link_parser.feed(page) while link_parser.links: raw_link = link_parser.links.pop() abs_link = url_util.join(response_url, raw_link.strip(), resolve_href=True) links.add(abs_link) # Skip stuff that looks like an archive if any(raw_link.endswith(s) for s in ALLOWED_ARCHIVE_TYPES): continue # Skip already-visited links if abs_link in _visited: continue # If we're not at max depth, follow links. if collect_nested: subcalls.append((abs_link, )) _visited.add(abs_link) except URLError as e: tty.debug(str(e)) if hasattr(e, 'reason') and isinstance(e.reason, ssl.SSLError): tty.warn("Spack was unable to fetch url list due to a " "certificate verification problem. You can try " "running spack -k, which will not check SSL " "certificates. Use this at your own risk.") except HTMLParseError as e: # This error indicates that Python's HTML parser sucks. msg = "Got an error parsing HTML." # Pre-2.7.3 Pythons in particular have rather prickly HTML parsing. if sys.version_info[:3] < (2, 7, 3): msg += " Use Python 2.7.3 or newer for better HTML parsing." tty.warn(msg, url, "HTMLParseError: " + str(e)) except Exception as e: # Other types of errors are completely ignored, # except in debug mode tty.debug("Error in _spider: %s:%s" % (type(e), str(e)), traceback.format_exc()) finally: tty.debug("SPIDER: [url={0}]".format(url)) return pages, links, subcalls
def main(**args): ''' This corresponds to the |rstlisttable| shell command. :param args: Keyword arguments. If empty the arguments are taken from ``sys.argv``. ``rstfile`` is the file name ``in_place`` defaults to False ``join`` defaults to "012" ''' import argparse import codecs import sys if not args: parser = argparse.ArgumentParser( description='''Convert RST grid tables to list-tables.''') parser.add_argument('rstfile', type=argparse.FileType('r', encoding='utf-8'), nargs='+', help='RST file(s)') parser.add_argument( '-j', '--join', action='store', default='012', help= '''e.g.002. Join method per column: 0="".join; 1=" ".join; 2="\\n".join''' ) parser.add_argument('-i', '--in-place', action='store_true', default=False, help='''change the file itself''') args = parser.parse_args().__dict__ if not 'in_place' in args: args['in_place'] = False if not 'join' in args: args['join'] = '012' if isinstance(args['rstfile'], str): args['rstfile'] = [ argparse.FileType('r', encoding='utf-8')(args['rstfile']) ] for infile in args['rstfile']: data = infile.readlines() infile.close() if args['in_place']: f = open(infile.name, 'w', encoding='utf-8', newline='\n') else: # '≥'.encode('cp1252') # UnicodeEncodeError on Windows, therefore... makes problems with pdb, though sys.stdout = codecs.getwriter("utf-8")(sys.stdout.detach()) sys.stdin = codecs.getreader("utf-8")(sys.stdin.detach()) f = sys.stdout try: f.writelines(gridtable(data, args['join'])) finally: if args['in_place']: f.close()
def test_readline(self):
    sin = "\x80".encode("base64_codec")
    reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin))
    sout = reader.readline()
    self.assertEqual(sout, "\x80")
    self.assert_(isinstance(sout, str))
def get_reader_from_stdin():
    """
    get a utf-8 stream reader for stdin

    @return: stdin-stream
    """
    return codecs.getreader('utf-8')(sys.stdin)
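# Usage sketch (not from the original source): on Python 2, where sys.stdin
# yields bytes, the helper above gives UTF-8-decoded lines to iterate over.
import sys

for line in get_reader_from_stdin():
    sys.stdout.write(line)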
def convertFile(self, input=None, output=None, encoding=None): """Converts a markdown file and returns the HTML as a unicode string. Decodes the file using the provided encoding (defaults to utf-8), passes the file content to markdown, and outputs the html to either the provided stream or the file with provided name, using the same encoding as the source file. The 'xmlcharrefreplace' error handler is used when encoding the output. **Note:** This is the only place that decoding and encoding of unicode takes place in Python-Markdown. (All other code is unicode-in / unicode-out.) Keyword arguments: * input: File object or path. Reads from stdin if `None`. * output: File object or path. Writes to stdout if `None`. * encoding: Encoding of input and output files. Defaults to utf-8. """ encoding = encoding or "utf-8" # Read the source if input: if isinstance(input, util.string_type): input_file = codecs.open(input, mode="r", encoding=encoding) else: input_file = codecs.getreader(encoding)(input) text = input_file.read() input_file.close() else: text = sys.stdin.read() if not isinstance(text, util.text_type): # pragma: no cover text = text.decode(encoding) text = text.lstrip('\ufeff') # remove the byte-order mark # Convert html = self.convert(text) # Write to file or stdout if output: if isinstance(output, util.string_type): output_file = codecs.open(output, "w", encoding=encoding, errors="xmlcharrefreplace") output_file.write(html) output_file.close() else: writer = codecs.getwriter(encoding) output_file = writer(output, errors="xmlcharrefreplace") output_file.write(html) # Don't close here. User may want to write more. else: # Encode manually and write bytes to stdout. html = html.encode(encoding, "xmlcharrefreplace") try: # Write bytes directly to buffer (Python 3). sys.stdout.buffer.write(html) except AttributeError: # pragma: no cover # Probably Python 2, which works with bytes by default. sys.stdout.write(html) return self
def __init__(self, f, encoding):
    self.reader = codecs.getreader(encoding)(f)
def install(self, paths, maker, **kwargs): """ Install a wheel to the specified paths. If kwarg ``warner`` is specified, it should be a callable, which will be called with two tuples indicating the wheel version of this software and the wheel version in the file, if there is a discrepancy in the versions. This can be used to issue any warnings to raise any exceptions. If kwarg ``lib_only`` is True, only the purelib/platlib files are installed, and the headers, scripts, data and dist-info metadata are not written. The return value is a :class:`InstalledDistribution` instance unless ``options.lib_only`` is True, in which case the return value is ``None``. """ dry_run = maker.dry_run warner = kwargs.get('warner') lib_only = kwargs.get('lib_only', False) pathname = os.path.join(self.dirname, self.filename) name_ver = '%s-%s' % (self.name, self.version) data_dir = '%s.data' % name_ver info_dir = '%s.dist-info' % name_ver metadata_name = posixpath.join(info_dir, METADATA_FILENAME) wheel_metadata_name = posixpath.join(info_dir, 'WHEEL') record_name = posixpath.join(info_dir, 'RECORD') wrapper = codecs.getreader('utf-8') with ZipFile(pathname, 'r') as zf: with zf.open(wheel_metadata_name) as bwf: wf = wrapper(bwf) message = message_from_file(wf) wv = message['Wheel-Version'].split('.', 1) file_version = tuple([int(i) for i in wv]) if (file_version != self.wheel_version) and warner: warner(self.wheel_version, file_version) if message['Root-Is-Purelib'] == 'true': libdir = paths['purelib'] else: libdir = paths['platlib'] records = {} with zf.open(record_name) as bf: with CSVReader(stream=bf) as reader: for row in reader: p = row[0] records[p] = row data_pfx = posixpath.join(data_dir, '') info_pfx = posixpath.join(info_dir, '') script_pfx = posixpath.join(data_dir, 'scripts', '') # make a new instance rather than a copy of maker's, # as we mutate it fileop = FileOperator(dry_run=dry_run) fileop.record = True # so we can rollback if needed bc = not sys.dont_write_bytecode # Double negatives. Lovely! outfiles = [] # for RECORD writing # for script copying/shebang processing workdir = tempfile.mkdtemp() # set target dir later # we default add_launchers to False, as the # Python Launcher should be used instead maker.source_dir = workdir maker.target_dir = None try: for zinfo in zf.infolist(): arcname = zinfo.filename if isinstance(arcname, text_type): u_arcname = arcname else: u_arcname = arcname.decode('utf-8') # The signature file won't be in RECORD, # and we don't currently don't do anything with it if u_arcname.endswith('/RECORD.jws'): continue row = records[u_arcname] if row[2] and str(zinfo.file_size) != row[2]: raise DistlibException('size mismatch for ' '%s' % u_arcname) if row[1]: kind, value = row[1].split('=', 1) with zf.open(arcname) as bf: data = bf.read() _, digest = self.get_hash(data, kind) if digest != value: raise DistlibException('digest mismatch for ' '%s' % arcname) if lib_only and u_arcname.startswith((info_pfx, data_pfx)): logger.debug('lib_only: skipping %s', u_arcname) continue is_script = (u_arcname.startswith(script_pfx) and not u_arcname.endswith('.exe')) if u_arcname.startswith(data_pfx): _, where, rp = u_arcname.split('/', 2) outfile = os.path.join(paths[where], convert_path(rp)) else: # meant for site-packages. 
if u_arcname in (wheel_metadata_name, record_name): continue outfile = os.path.join(libdir, convert_path(u_arcname)) if not is_script: with zf.open(arcname) as bf: fileop.copy_stream(bf, outfile) outfiles.append(outfile) # Double check the digest of the written file if not dry_run and row[1]: with open(outfile, 'rb') as bf: data = bf.read() _, newdigest = self.get_hash(data, kind) if newdigest != digest: raise DistlibException('digest mismatch ' 'on write for ' '%s' % outfile) if bc and outfile.endswith('.py'): try: pyc = fileop.byte_compile(outfile) outfiles.append(pyc) except Exception: # Don't give up if byte-compilation fails, # but log it and perhaps warn the user logger.warning('Byte-compilation failed', exc_info=True) else: fn = os.path.basename(convert_path(arcname)) workname = os.path.join(workdir, fn) with zf.open(arcname) as bf: fileop.copy_stream(bf, workname) dn, fn = os.path.split(outfile) maker.target_dir = dn filenames = maker.make(fn) fileop.set_executable_mode(filenames) outfiles.extend(filenames) if lib_only: logger.debug('lib_only: returning None') dist = None else: # Generate scripts # Try to get pydist.json so we can see if there are # any commands to generate. If this fails (e.g. because # of a legacy wheel), log a warning but don't give up. commands = None file_version = self.info['Wheel-Version'] if file_version == '1.0': # Use legacy info ep = posixpath.join(info_dir, 'entry_points.txt') try: with zf.open(ep) as bwf: epdata = read_exports(bwf) commands = {} for key in ('console', 'gui'): k = '%s_scripts' % key if k in epdata: commands['wrap_%s' % key] = d = {} for v in epdata[k].values(): s = '%s:%s' % (v.prefix, v.suffix) if v.flags: s += ' %s' % v.flags d[v.name] = s except Exception: logger.warning('Unable to read legacy script ' 'metadata, so cannot generate ' 'scripts') else: try: with zf.open(metadata_name) as bwf: wf = wrapper(bwf) commands = json.load(wf).get('extensions') if commands: commands = commands.get('python.commands') except Exception: logger.warning('Unable to read JSON metadata, so ' 'cannot generate scripts') if commands: console_scripts = commands.get('wrap_console', {}) gui_scripts = commands.get('wrap_gui', {}) if console_scripts or gui_scripts: script_dir = paths.get('scripts', '') if not os.path.isdir(script_dir): raise ValueError('Valid script path not ' 'specified') maker.target_dir = script_dir for k, v in console_scripts.items(): script = '%s = %s' % (k, v) filenames = maker.make(script) fileop.set_executable_mode(filenames) if gui_scripts: options = {'gui': True} for k, v in gui_scripts.items(): script = '%s = %s' % (k, v) filenames = maker.make(script, options) fileop.set_executable_mode(filenames) p = os.path.join(libdir, info_dir) dist = InstalledDistribution(p) # Write SHARED paths = dict(paths) # don't change passed in dict del paths['purelib'] del paths['platlib'] paths['lib'] = libdir p = dist.write_shared_locations(paths, dry_run) if p: outfiles.append(p) # Write RECORD dist.write_installed_files(outfiles, paths['prefix'], dry_run) return dist except Exception: # pragma: no cover logger.exception('installation failed.') fileop.rollback() raise finally: shutil.rmtree(workdir)
def main(): import sys import os from optparse import OptionParser logging.basicConfig() parser = OptionParser() parser.add_option("-o", "--outfile", dest="outfile", help="name of the object file", metavar="FILE") parser.add_option("-v", "--verbose", action="store_true", dest="verbose", default=False, help="print status messages") parser.add_option("--debug", action="store_true", dest="debug", default=False, help="print debug messages to stdout") parser.add_option("-D", "--define", action="append", dest="defines", metavar="SYM[=VALUE]", default=[], help="define symbol") parser.add_option( "-I", "--include-path", action="append", dest="include_paths", metavar="PATH", default=[], help="Add directory to the search path list for includes") (options, args) = parser.parse_args() if len(args) > 1: sys.stderr.write("Only one file at a time allowed.\n") sys.exit(1) if options.debug: logging.getLogger('cpp').setLevel(logging.DEBUG) elif options.verbose: logging.getLogger('cpp').setLevel(logging.INFO) else: logging.getLogger('cpp').setLevel(logging.WARN) if options.outfile: outfile = codecs.open(options.outfile, 'w', 'utf-8') else: outfile = codecs.getwriter("utf-8")(sys.stdout) cpp = msp430.asm.cpp.Preprocessor() # extend include search path # built in places for msp430.asm d = os.path.join(os.path.dirname(sys.modules['msp430.asm'].__file__), 'include') cpp.include_path.append(d) cpp.include_path.append(os.path.join(d, 'upstream')) # user provided directories (-I) cpp.include_path.extend(options.include_paths) # insert predefined symbols (XXX function like macros not yet supported) for definition in options.defines: if '=' in definition: symbol, value = definition.split('=', 1) else: symbol, value = definition, '1' cpp.namespace.defines[symbol] = value if not args or args[0] == '-': infilename = '<stdin>' infile = codecs.getreader("utf-8")(sys.stdin) else: # search include path for files for path in cpp.include_path: infilename = os.path.join(path, args[0]) if os.path.exists(infilename): infile = codecs.open(infilename, 'r', 'utf-8') break else: sys.stderr.write('h2forth: %s: File not found\n' % (infilename, )) sys.exit(1) try: error_found = cpp.preprocess(infile, msp430.asm.cpp.Discard(), infilename) if error_found: sys.exit(1) except msp430.asm.cpp.PreprocessorError, e: sys.stderr.write('%s:%s: %s\n' % (e.filename, e.line, e)) if options.debug: if hasattr(e, 'text'): sys.stderr.write('%s:%s: input line: %r\n' % (e.filename, e.line, e.text)) sys.exit(1)
def decode(self, text_utf8, text_latex, inputenc=None):
    encoding = 'latex+' + inputenc if inputenc else 'latex'
    stream = BytesIO(text_latex)
    reader = codecs.getreader(encoding)(stream)
    self.assertEqual(text_utf8, reader.read())