Example #1
0
def get_title(url):
    """Fetches the contents of url and extracts (and utf-8 encodes)
       the contents of <title>"""
    if not url or not url.startswith(('http://', 'https://')):
        return None

    try:
        req = Request(url)
        if g.useragent:
            req.add_header('User-Agent', g.useragent)
        opener = urlopen(req, timeout=15)

        # determine the encoding of the response
        for param in opener.info().getplist():
            if param.startswith("charset="):
                param_name, sep, charset = param.partition("=")
                codec = codecs.getreader(charset)
                break
        else:
            codec = codecs.getreader("utf-8")

        with codec(opener, "ignore") as reader:
            # Attempt to find the title in the first 1kb
            data = reader.read(1024)
            title = extract_title(data)

            # Title not found in the first kb, try searching an additional 10kb
            if not title:
                data += reader.read(10240)
                title = extract_title(data)

        return title

    except:
        return None
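For reference, a minimal Python 3 sketch of the same charset-detection idea; the helper name and fallback behaviour are assumptions, not part of the original project.

import codecs
from urllib.request import urlopen

def read_first_kb(url):
    """Hypothetical helper: decode ~1 KB of a response using its declared charset."""
    resp = urlopen(url, timeout=15)
    # Fall back to UTF-8 when the server declares no charset at all.
    charset = resp.headers.get_content_charset() or "utf-8"
    try:
        reader_factory = codecs.getreader(charset)
    except LookupError:
        # Unknown or misspelled charset advertised by the server: fall back to UTF-8.
        reader_factory = codecs.getreader("utf-8")
    with reader_factory(resp, "ignore") as reader:
        return reader.read(1024)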
Example #2
0
def get_vcf_handle(fsock=None, infile=None):
    """Open the vcf file and return a handle"""

    vcf = None
    if (fsock or infile):
    
        if fsock:
            # if not infile and hasattr(fsock, 'name'):
            logger.info("Reading vcf form stdin")
            if sys.version_info < (3, 0):
                logger.info("Using codecs to read stdin")
                sys.stdin = getreader('utf-8')(fsock)
            
            vcf = sys.stdin
        
        else:
            logger.info("Reading vcf from file {0}".format(infile))
            file_name, file_extension = os.path.splitext(infile)
            if file_extension == '.gz':
                logger.debug("Vcf is zipped")
                vcf = getreader('utf-8')(gzip.open(infile), errors='replace')
            elif file_extension == '.vcf':
                vcf = open(infile, mode='r', encoding='utf-8', errors='replace')
            else:
                raise IOError("File is not in a supported format!\n"
                                " Or use correct ending(.vcf or .vcf.gz)")
    else:
        raise IOError("Please provide a fsock or infile")
    
    return vcf
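A short, hedged usage sketch for get_vcf_handle() above; the path is illustrative.

vcf = get_vcf_handle(infile='sample.vcf.gz')   # illustrative path to a gzipped VCF
for line in vcf:
    pass                                       # each line is decoded text

vcf = get_vcf_handle(fsock=sys.stdin)          # or read straight from stdin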
Example #3
0
    def _make_tempfile(self):
        transfer_encoding = self.headers.get('content-transfer-encoding',
            '').lower()
        tf = NamedTemporaryFile()
        start_pos = self._pos + self._headers_length + 2
        file_length = (self._endpos - 2) - start_pos
        bytes_read = 0

        self._data.seek(start_pos)

        while bytes_read < file_length:
            remaining_bytes = (self._endpos - 2) - self._data.tell()
            chunk_size = min(8196, remaining_bytes)
            tf.write(self._data.read(chunk_size))
            bytes_read += chunk_size

        tf.seek(0)

        if transfer_encoding not in ('', '7bit', '8bit', 'binary'):
            decoded_tf = NamedTemporaryFile()
            mimetools.decode(tf, decoded_tf, transfer_encoding)
            try:
                return codecs.getreader(self.charset)(decoded_tf)
            except (TypeError, LookupError):
                return decoded_tf
        else:
            try:
                return codecs.getreader(self.charset)(tf)
            except (TypeError, LookupError):
                return tf
Example #4
0
def main():

    parser = argparse.ArgumentParser()

    parser.add_argument('config', nargs='*', type=argparse.FileType('r'),
                        help="One or more YAML files to read")
    parser.add_argument('--template-file', '-f', dest='template',
                        type=argparse.FileType('r'), default=sys.stdin,
                        help="Config file template. If not supplied, "
                             "stdin is used")
    parser.add_argument('--out', '-o', dest='out',
                        type=argparse.FileType('w'), default=sys.stdout,
                        help="Output file to write. If not supplied, "
                             "stdout is used")
    parser.add_argument('--strict', dest='strict',
                        action='store_true', default=False,
                        help="Raise an exception on undefined variables")

    args = parser.parse_args()

    context = {}
    for file in args.config:
        context.update(yaml.load(getreader('utf-8')(file).read()))

    template_string = getreader('utf-8')(args.template).read()

    rendered = render(template_string, context, args.strict)
    getwriter('utf-8')(args.out).write(rendered)
Example #5
0
def pull(format, stream, kwargs):
    if kwargs.get('utf8_cleanup', False):
        stream = UTF8RecoderWithCleanup(stream, kwargs.get('encoding', 'utf-8'))
    elif codecs.getreader(kwargs.get('encoding', 'utf-8')) != codecs.getreader('utf-8'):
        stream = UTF8Recoder(stream, kwargs.get('encoding', None))
    else:
        pass

    delimiter = kwargs.get('delimiter', None)
        
    sniff_read = stream.next()
    stream = PrefixReader(sniff_read, stream, linefilter=kwargs.get("linefilter", None))
    dialect = csv.Sniffer().sniff(sniff_read)
    if sniff_read.endswith('\r\n'):
        dialect.lineterminator = '\r\n'
    else:
        dialect.lineterminator = '\n'
    if dialect.delimiter.isalpha() and not delimiter:
        # http://bugs.python.org/issue2078
        for row in  linepull(stream,  dialect, kwargs):
            yield row 
        return 
    if delimiter:
        dialect.delimiter = delimiter
    for row in csvpull(stream,  dialect, kwargs):
        yield row 
Example #6
0
def load_json_file(fname):
    if fname.endswith(".bz2"):
        reader = codecs.getreader("utf-8")(BZ2File(fname))
    else:
        reader = codecs.getreader("utf-8")(open(fname))
    dat = reader.read()
    return json.loads(dat)
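A short, hedged usage sketch for load_json_file() above; the path is illustrative.

records = load_json_file("dump.json.bz2")   # illustrative path; plain .json works too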
Example #7
0
		def test_exceed(self):
			infiles = ['%d.txt' % i for i in range(1, 7)]
			content = {}
			for infile in infiles:
				with open(path_join(self.tmpdir, infile), 'w', encoding = 'ascii') as f:
					content[infile] = ('FILE=%s' % infile)
					f.write(content[infile])
					sleep(0.1)

			for infile in infiles[:5]:
				with self.cache[infile] as entry:
					reader = getreader('ascii')(entry)
					data = reader.read()
					self.assertEqual('TOUCHED\n' + content[infile], data)
					sleep(0.1)
			self.assertEqual(self.count, 5)
			self.assertTrue(len(self.cache) <= 5)

			infile = infiles[5]
			with self.cache[infile] as entry:
				reader = getreader('ascii')(entry)
				data = reader.read()
				self.assertEqual('TOUCHED\n' + content[infile], data)
				sleep(0.1)
			self.assertEqual(self.count, 6)

			# Should cause an entry to be dropped (it blocks while scrubbing)
			infile = infiles[0]
			with self.cache[infile] as entry:
				reader = getreader('ascii')(entry)
				data = reader.read()
				self.assertEqual('TOUCHED\n' + content[infile], data)
				sleep(0.1)
			self.assertEqual(self.count, 7)
			self.assertTrue(len(self.cache) <= 5)
Example #8
0
def run_clang_format_diff(args, file_name):
    try:
        with io.open(file_name, 'r', encoding='utf-8') as f:
            original = f.readlines()
    except IOError as exc:
        raise DiffError(str(exc))
    invocation = [args.clang_format_executable, file_name]
    try:
        proc = subprocess.Popen(
            invocation,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            universal_newlines=True)
    except OSError as exc:
        raise DiffError(str(exc))
    proc_stdout = proc.stdout
    proc_stderr = proc.stderr
    if sys.version_info[0] < 3:
        # make the pipes compatible with Python 3,
        # reading lines should output unicode
        encoding = 'utf-8'
        proc_stdout = codecs.getreader(encoding)(proc_stdout)
        proc_stderr = codecs.getreader(encoding)(proc_stderr)
    # hopefully the stderr pipe won't get full and block the process
    outs = list(proc_stdout.readlines())
    errs = list(proc_stderr.readlines())
    proc.wait()
    if proc.returncode:
        raise DiffError("clang-format exited with status {}: '{}'".format(
            proc.returncode, file_name), errs)
    return make_diff(file_name, original, outs), errs
Example #9
0
def xopen(filename, mode='r'):
	"""
	Replacement for the "open" function that can also open files that have
	been compressed with gzip or bzip2. If the filename is '-', standard
	output (mode 'w') or input (mode 'r') is returned. If the filename ends
	with .gz, the file is opened with gzip.open(). If it ends with .bz2, it's
	opened as a bz2.BZ2File. Otherwise, the regular open() is used.
	"""
	assert isinstance(filename, basestring)
	if filename == '-':
		return sys.stdin if 'r' in mode else sys.stdout
	if filename.endswith('.bz2'):
		if bz2 is None:
			raise ImportError("Cannot open bz2 files: The bz2 module is not available")
		if sys.version_info[0] < 3:
			return bz2.BZ2File(filename, mode)
		else:
			if 'r' in mode:
				return getreader('ascii')(bz2.BZ2File(filename, mode))
			else:
				return getwriter('ascii')(bz2.BZ2File(filename, mode))
	elif filename.endswith('.gz'):
		if sys.version_info[0] < 3:
			if 'r' in mode:
				return buffered_reader(gzip.open(filename, mode))
			else:
				return buffered_writer(gzip.open(filename, mode))
		else:
			if 'r' in mode:
				return getreader('ascii')(gzip.open(filename, mode))
			else:
				return getwriter('ascii')(gzip.open(filename, mode))
	else:
		return open(filename, mode)
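A short, hedged usage sketch for xopen() above; both paths are illustrative.

for line in xopen('reads.fastq.gz'):       # transparently decompresses gzip
    pass

out = xopen('filtered.fastq.bz2', 'w')     # bz2-compressed text output
out.write('@read1\nACGT\n+\nFFFF\n')
out.close()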
Example #10
0
    def __init__(self, source = '-', output_file = "", enable_hashing = False, output_type = "png", scale = 1.0, width = None, height = None):
        if source == '-':
            source = codecs.getreader('utf-8')(sys.stdin).readlines()
        else:
            file_data = codecs.getreader('utf-8')(open(source, 'r'))
            source = list(file_data)

            
        self.__parsers = []
        self.__backends = []
        self.__source = source
        self.__original_source = copy.copy(source)
        self.__outfile = output_file
        self.__enable_hashing = enable_hashing
        self.__additional_source = str(scale) + str(width) + str(height)
        if not enable_hashing or not hash_check(source + [self.__additional_source], output_file + ".md5"):
            self.register_parser(YamlParser())
            self.register_parser(BackgroundParser())
            self.register_parser(TextParser())
            self.register_parser(OverlayParser())
            self.register_parser(ArrowParser())
            self.register_parser(NameParser())
            self.register_parser(StyleParser())
            backends = {
                    'svg': CairoSvgBackend,
                    'pdf': CairoPdfBackend,
                    'eps': CairoEpsBackend,
                    'png': CairoBackend
                    }
            if output_type in backends:
                self.register_backend(backends[output_type](image_scale = scale, image_width = width, image_height = height))
            else:
                self.register_backend(CairoBackend(image_scale = scale, image_width = width, image_height = height))
Example #11
0
def checkEncoding(fileObj):
    '''
    Check that a file honors the declared encoding (default ASCII for Python 2
    and UTF-8 for Python 3).

    Raises a UnicodeDecodeError in case of decoding problems and LookupError if
    the specified codec does not exist.

    See http://www.python.org/dev/peps/pep-0263/
    '''
    from itertools import islice

    # default encoding
    if sys.version_info[0] <= 2:
        enc = 'ascii'
    else:
        enc = 'utf-8'

    # find the encoding of the file, if specified (in the first two lines)
    enc_exp = re.compile(r"coding[:=]\s*([-\w.]+)")
    for l in islice(fileObj, 2):
        m = enc_exp.search(l)
        if m:
            enc = m.group(1)
            break

    if hasattr(fileObj, 'name'):
        logging.getLogger('checkEncoding').debug('checking encoding %s on %s',
                                                 enc, fileObj.name)
    else:
        logging.getLogger('checkEncoding').debug('checking encoding %s on file object',
                                                 enc)
    # try to read the file with the declared encoding
    fileObj.seek(0)
    codecs.getreader(enc)(fileObj).read()
Example #12
0
    def __init__ (self,
                  langs=[],
                  dictionaryPath="",
                  addPath="",
                  delPath="",
                  full=False,
                  addLang=False,
                  reallyAdd = False):
        self.langs = langs

        self.dictionary = semantics.SemanticDictionary(dictionaryPath)
        self.full = full
        self.addLang = addLang
        self.reallyAdd = reallyAdd

        self.Add = {}
        if addPath:
            f = codecs.getreader("windows-1251")(file(addPath, "rb"))
            for l in f:
                x = l.replace(u"<ana", "@") \
                    .replace("lex=", "") \
                    .replace("gr=", "") \
                    .replace("/>", "") \
                    .replace(">", "") \
                    .replace("\"", " ") \
                    .replace("=", ",") \
                    .rstrip() \
                    .split("@")
                form = x[0].lstrip().rstrip()
                if form not in self.Add:
                    self.Add[form] = []
                for el in x[1:]:
                    s = el.lstrip().rstrip().split()
                    lemma = s[0]
                    gramm = s[1]
                    (head, _, tail) = gramm.partition("(")
                    head = head.split(",")
                    category = head[0]
                    head = set(head)
                    head.discard("")
                    tail = (tail.partition(")")[0]).split("|")
                    res = []
                    for tl in tail:
                        s = set(tl.split(","))
                        s.discard("")
                        res.append(self.createAttrs("", lemma, category, head, s))
                    self.Add[form].append((lemma, res, 'ru', 'disamb'))
            f.close()

        self.Del = set()
        self.DelPatterns = []
        if delPath:
            f = codecs.getreader("windows-1251")(file(delPath, "rb"))
            for l in f:
                x = l.rstrip().split()
                if x[0].endswith("*"):
                    self.DelPatterns.append((x[0][:-1], x[1], set(x[2].split(','))))
                else:
                    self.Del.add(tuple(x[0:3]))
            f.close()
Example #13
0
def download(datafolder):
    #download daily 10 day forecast
    #example url: http://api.wunderground.com/api/944b3f3c879d2394/geolookup/forecast10day/q/Germany/Berlin.json
    for loc in loc_list:
        #get the json object
        f = urllib.request.urlopen('http://api.wunderground.com/api/944b3f3c879d2394/geolookup/forecast10day/q/Germany/'+loc+'.json')
        #need to convert byte object to string
        reader = codecs.getreader('utf-8') #how is data encoded? 
        parsed_json = json.load(reader(f)) 
        fn_hourly = os.path.join(datafolder,"wunderground_" +    time.strftime("%d_%m_%Y_%H_%M_") + loc + "_hourly.pkl")
        with open(fn_hourly, 'wb') as p:
            pickle.dump(parsed_json, p, pickle.HIGHEST_PROTOCOL)
        print (parsed_json['location']['city'] + ' 10 days downloaded')
        time.sleep(10)
        
    #hourly data
    # example url: http://api.wunderground.com/api/944b3f3c879d2394/geolookup/hourly/q/Germany/Berlin.json
        f = urllib.request.urlopen('http://api.wunderground.com/api/944b3f3c879d2394/geolookup/hourly/q/Germany/'+loc+'.json')
        reader = codecs.getreader('utf-8') #how is data encoded? 
        parsed_json = json.load(reader(f)) 
        fn_10days = os.path.join(datafolder,"wunderground_" +    time.strftime("%d_%m_%Y_%H_%M_") + loc + "_10days.pkl")
        with open(fn_10days, 'wb') as p:
            pickle.dump(parsed_json, p, pickle.HIGHEST_PROTOCOL)
        print (parsed_json['location']['city'] + '  hourly downloaded')
        time.sleep(10)
    f.close()
Example #14
0
def xopen(filename, mode='r'):
    """
    Replacement for the "open" function that can also open
    files that have been compressed with gzip. If the filename ends with .gz,
    the file is opened with gzip.open(). If it doesn't, the regular open()
    is used. If the filename is '-', standard output (mode 'w') or input
    (mode 'r') is returned.
    """
    assert isinstance(filename, str)
    if filename == '-':
        return sys.stdin if 'r' in mode else sys.stdout
    if filename.endswith('.gz'):
        if sys.version_info[0] < 3:
            if 'r' in mode:
                return buffered_reader(gzip.open(filename, mode))
            else:
                return gzip.open(filename, mode)
        else:
            if 'r' in mode:
                return getreader('ascii')(gzip.open(filename, mode))
            else:
                return getwriter('ascii')(gzip.open(filename, mode))
    elif filename.endswith('.bz2'):
        if sys.version_info[0] < 3:
            return bz2.BZ2File(filename, mode)
        else:
            if 'r' in mode:
                return getreader('ascii')(bz2.BZ2File(filename, mode))
            else:
                return getwriter('ascii')(bz2.BZ2File(filename, mode))
    else:
        return open(filename, mode)
Example #15
0
def process_args(args):
    if not (args.ml or args.rb):
        args.rb = True
    if args.infile:
        ifp = io.open(args.infile, encoding='utf-8')
    else:
        if sys.version_info[0] >= 3:
            ifp = codecs.getreader('utf8')(sys.stdin.buffer)
        else:
            ifp = codecs.getreader('utf8')(sys.stdin)

    if args.outfile:
        ofp = io.open(args.outfile, mode='w', encoding='utf-8')
    else:
        if sys.version_info[0] >= 3:
            ofp = codecs.getwriter('utf8')(sys.stdout.buffer)
        else:
            ofp = codecs.getwriter('utf8')(sys.stdout)

    # initialize transliterator object
    trn = Transliterator(args.source,
                         args.target,
                         rb=args.rb,
                         build_lookup=args.build_lookup)

    # transliterate text
    for line in ifp:
        tline = trn.convert(line)
        ofp.write(tline)

    # close files
    ifp.close()
    ofp.close()
Example #16
0
def fopen(s, enc="utf-8"):
    """Opens the indicated source, handling special cases: None or "-" returns stdin,
       http(s) URLs are streamed, glob patterns matching several files are chained
       together, and files ending in ".gz" or ".bz2" are decompressed automatically."""
    if not s or s == '-':
        LOG.info("Returning sys.stdin")
        return sys.stdin
    # Handle http(s):
    if s.startswith('http://') or s.startswith('https://'):
        r = requests.get(s, stream=True)
        return r.raw if enc == 'b' else codecs.getreader(enc)(r.raw)
    fos = []
    fnames = glob.glob(s)
    if not fnames:
        raise IOError("No such file: %s" % s)
    for f in fnames:
        ext = f.rsplit(".", 1)[-1]
        if ext == "bz2":
            fo = bz2.BZ2File(f, 'r', 10*1024)
        elif ext == "gz":
            fo = gzip.open(f, 'rb')
        else:
            fo = open(f, 'rb') # Encoding handled below
        fos = itertools.chain(fos, fo) if len(fnames) > 1 else fo

    # Wrap the raw file handle into one that can decode
    # Wikipedia needs this
    return fos if enc == 'b' else codecs.getreader(enc)(fos)
Example #17
0
    def initialize(self, arg):
        self.clean_words = set()

        logger.info('Reading in clean words...')

        reader = codecs.getreader('utf8')(BZ2File(CleanWordsFile))
        for line in reader.readlines():
            (word,doc_count,_) = line.split('\t')
            doc_count = int(doc_count)
            if word and doc_count > MinVocabDocThreshold:
                self.clean_words.add(word)
        reader.close()

        logger.info('done.')

        # Read in document link weights
        self.clean_docs = set()

        logger.info('Reading in clean docs...')

        reader = codecs.getreader('utf8')(BZ2File(DocumentLinksFile))
        for line in reader.readlines():
            (doc,incoming,outgoing) = line.split('\t')
            incoming = int(incoming)
            outgoing = int(outgoing)
            if doc and incoming >= MinIncomingLinkWeight:
                self.clean_docs.add(doc)
        reader.close()

        logger.info('done.')
Example #18
0
 def wrap_fp(fp):
     if suffix == ".gz":
         fp = GzipFile(fileobj=fp, mode=mode)
     elif suffix == ".bz2":
         try:
             fp = BZ2File(fp, mode=mode)
         except TypeError:
             if sys.version_info < (3, 0, 0):
                 raise NotImplementedError("built-in BZ2File is partially broken in python 2, install bz2file from pypi or use a compression setting other than 'bz2'")
             else:
                 raise
     elif suffix == ".xz":
         fp = LZMAFile(fp, mode=mode)
     if (suffix or sys.version_info < (3,)) and "b" not in mode:
         # If mode is not binary (and we expect to be able to
         # write() str values, not bytes), we need to create
         # an additional encoding wrapper. That encoder can
         # probably use UTF-8 without any need for additional
         # configuration
         if "r" in mode and "w" in mode:
             fp = StreamReaderWriter(fp, codecs.getreader("utf-8"),
                                     codecs.getwriter("utf-8"))
         elif "w" in mode:
             fp = codecs.getwriter("utf-8")(fp)
         elif suffix:
             fp = codecs.getreader("utf-8")(fp)
     fp.realname = filename
     return fp
Example #19
0
def _bleu(ref_file, trans_file, subword_option=None):
  """Compute BLEU scores and handling BPE."""
  max_order = 4
  smooth = False

  ref_files = [ref_file]
  reference_text = []
  for reference_filename in ref_files:
    with codecs.getreader("utf-8")(
        tf.gfile.GFile(reference_filename, "rb")) as fh:
      reference_text.append(fh.readlines())

  per_segment_references = []
  for references in zip(*reference_text):
    reference_list = []
    for reference in references:
      reference = _clean(reference, subword_option)
      reference_list.append(reference.split(" "))
    per_segment_references.append(reference_list)

  translations = []
  with codecs.getreader("utf-8")(tf.gfile.GFile(trans_file, "rb")) as fh:
    for line in fh:
      line = _clean(line, subword_option=None)
      translations.append(line.split(" "))

  # bleu_score, precisions, bp, ratio, translation_length, reference_length
  bleu_score, _, _, _, _, _ = bleu.compute_bleu(
      per_segment_references, translations, max_order, smooth)
  return 100 * bleu_score
Example #20
0
    def test_badbom(self):
        s = StringIO.StringIO("\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = StringIO.StringIO("\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)
Example #21
0
def delete_rows(to_delete, inpath, outpath):
    delete = set([el.strip().split(";")[0].replace("/", "\\") for el in codecs.getreader("windows-1251")(file(to_delete, "rb"), 'xmlcharrefreplace').readlines()[1:]])
    f = codecs.getreader("windows-1251")(file(inpath, "rb"), 'xmlcharrefreplace')
    ff = codecs.getwriter("windows-1251")(file(outpath, "wb"), 'xmlcharrefreplace')
    for el in f:
        if not el.strip().split(";")[0].replace("/", "\\") in delete:
            ff.write(el)
        else:
            print el.strip().split(";")[0].replace("/", "\\")
Example #22
0
def run_clang_format_diff(args, file):
    try:
        with io.open(file, 'r', encoding='utf-8') as f:
            original = f.readlines()
    except IOError as exc:
        raise DiffError(str(exc))
    invocation = [args.clang_format_executable, file]

    # Use of utf-8 to decode the process output.
    #
    # Hopefully, this is the correct thing to do.
    #
    # It's done due to the following assumptions (which may be incorrect):
    # - clang-format will return the bytes read from the files as-is,
    #   without conversion, and it is already assumed that the files use utf-8.
    # - if the diagnostics were internationalized, they would use utf-8:
    #   > Adding Translations to Clang
    #   >
    #   > Not possible yet!
    #   > Diagnostic strings should be written in UTF-8,
    #   > the client can translate to the relevant code page if needed.
    #   > Each translation completely replaces the format string
    #   > for the diagnostic.
    #   > -- http://clang.llvm.org/docs/InternalsManual.html#internals-diag-translation
    #
    # It's not pretty, due to Python 2 & 3 compatibility.
    encoding_py3 = {}
    if sys.version_info[0] >= 3:
        encoding_py3['encoding'] = 'utf-8'

    try:
        proc = subprocess.Popen(
            invocation,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            universal_newlines=True,
            **encoding_py3)
    except OSError as exc:
        raise DiffError(str(exc))
    proc_stdout = proc.stdout
    proc_stderr = proc.stderr
    if sys.version_info[0] < 3:
        # make the pipes compatible with Python 3,
        # reading lines should output unicode
        encoding = 'utf-8'
        proc_stdout = codecs.getreader(encoding)(proc_stdout)
        proc_stderr = codecs.getreader(encoding)(proc_stderr)
    # hopefully the stderr pipe won't get full and block the process
    outs = list(proc_stdout.readlines())
    errs = list(proc_stderr.readlines())
    proc.wait()
    if proc.returncode:
        raise DiffError("clang-format exited with status {}: '{}'".format(
            proc.returncode, file), errs)
    return make_diff(file, original, outs), errs
Example #23
0
 def decoded(f):
     bom = f.read(2)
     f.seek(0)
     # Older versions of Zemax wrote plain ascii files.  The
     # output txt file format can be selected in the
     # preferences dialog box.
     if bom == codecs.BOM_UTF16:
         reader = codecs.getreader("utf-16")
     else:
         reader = codecs.getreader("utf-8")
     return reader(f)
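A short, hedged usage sketch for decoded() above; the path stands in for a Zemax text export.

with open('lens_data.txt', 'rb') as raw:   # illustrative path
    text = decoded(raw).read()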
Example #24
0
def gmail_csv_unicode_open(fin):
    'only supports utf-16 and utf-8'
    # byte order mark
    t = fin.read(2)
    if t == codecs.BOM_UTF16_LE:
        return UTF8Lines(codecs.getreader('utf-16le')(fin))
    elif t == codecs.BOM_UTF16_BE:
        return UTF8Lines(codecs.getreader('utf-16be')(fin))
    else: # assume utf-8
        l = fin.readline()
        return itertools.chain([ t + l ], fin)
Example #25
0
 def __enter__(self):
     try:
         with gzip.open(self.filename, 'rb') as handle:
             with codecs.getreader('utf-8')(handle) as reader:
                 reader.read(50)
         self._gzHandle = gzip.open(self.filename, 'rb')
         self._reader = codecs.getreader('utf-8')(self._gzHandle)
     except IOError:
         self._gzHandle = None
         self._reader = codecs.getreader('utf-8')(open(self.filename))
     return self._reader
Example #26
0
    def __init__(self, host=PUDB_RDB_HOST, port=PUDB_RDB_PORT,
                 port_search_limit=100, out=sys.stdout, term_size=None):
        self.active = True
        self.out = out

        self._prev_handles = sys.stdin, sys.stdout

        self._sock, this_port = self.get_avail_port(
            host, port, port_search_limit)
        self._sock.setblocking(1)
        self._sock.listen(1)
        self.ident = '{0}:{1}'.format(self.me, this_port)
        self.host = host
        self.port = this_port
        self.say(BANNER.format(self=self))

        self._client, address = self._sock.accept()
        self._client.setblocking(1)
        self.remote_addr = ':'.join(str(v) for v in address)
        self.say(SESSION_STARTED.format(self=self))

        # makefile ignores encoding if there's no buffering.
        raw_sock_file = self._client.makefile("rwb", 0)
        import codecs

        if sys.version_info[0] < 3:
            sock_file = codecs.StreamRecoder(
                raw_sock_file,
                codecs.getencoder("utf-8"),
                codecs.getdecoder("utf-8"),
                codecs.getreader("utf-8"),
                codecs.getwriter("utf-8"))
        else:
            sock_file = codecs.StreamReaderWriter(
                raw_sock_file,
                codecs.getreader("utf-8"),
                codecs.getwriter("utf-8"))

        self._handle = sys.stdin = sys.stdout = sock_file

        import telnetlib as tn

        raw_sock_file.write(tn.IAC + tn.WILL + tn.SGA)
        resp = raw_sock_file.read(3)
        assert resp == tn.IAC + tn.DO + tn.SGA

        raw_sock_file.write(tn.IAC + tn.WILL + tn.ECHO)
        resp = raw_sock_file.read(3)
        assert resp == tn.IAC + tn.DO + tn.ECHO

        Debugger.__init__(self, stdin=self._handle, stdout=self._handle,
                term_size=term_size)
Example #27
0
    def end_task(self):
        logger.info('Computing normalization factors...')
        total_bigrams = 0
        for (i, shard) in enumerate(self.shuffle_result_shards):
            logger.info('  processing shard %d' % i)
            f = codecs.getreader('utf8')(BZ2File(shard))
            for (line_no, line) in enumerate(f.readlines()): 
                # print line.encode('utf8','replace'),
                try:
                    (word, word2, freq, doc_freq) = line.split('\t')

                    self.word_freq.setdefault(word, 0)
                    self.word_freq[word] += int(freq)

                    total_bigrams += int(freq)

                except ValueError:
                    logger.info('Line %d of %s is bad.' % (line_no, shard))
            f.close()

        total_words = sum(self.word_freq.values())

        # Now write the output to disk
        logger.info('Writing to disk...')
        writer = codecs.getwriter('utf8')(BZ2File(OutputFile, 'w'))
        for (i, shard) in enumerate(self.shuffle_result_shards):
            logger.info('  processing shard %d' % i)
            f = codecs.getreader('utf8')(BZ2File(shard))
            for (line_no, line) in enumerate(f.readlines()): 
                # print line.encode('utf8','replace'),
                try:
                    (word, word2, co_occurrence_sum, document_freq_sum) = line.split('\t')
                    co_occurrence_sum = int(co_occurrence_sum)
                    document_freq_sum = int(document_freq_sum)

                    try:
                        #pmi = log(co_occurrence_sum) - log(total_bigrams) \
                        #      - log(self.word_freq[word]) - log(self.word_freq[word2]) + 2*log(total_words)
                        # f.write('%s\t%s\t%f\t%d\t%d\n' % (word, word2, pmi, freq, self.document_freq[(word,word2)]))
                        #writer.write('%s\t%s\t%f\t%d\t%d\n' % (word, word2, pmi, co_occurrence_sum, document_freq_sum))
                        writer.write('%s\t%s\t%d\t%d\n' % (word, word2, co_occurrence_sum, document_freq_sum))
                    except KeyError:
                        logger.info('Line %d of %s is bad.' % (line_no, shard))

                except ValueError:
                    logger.info('Line %d of %s is bad.' % (line_no, shard))
            f.close()

        writer.close()
        logger.info('done.')
        sys.exit()
Example #28
0
 def _make_file(self):
     start_pos = self._pos + self._headers_length + 2
     chunkfile = MMapChunk(self._data, start_pos, self._endpos - 2)
     transfer_encoding = self.headers.get('content-transfer-encoding',
         '').lower()
     if transfer_encoding not in ('', '7bit', '8bit', 'binary'):
         try:
             chunkfile = codecs.getreader(transfer_encoding)(chunkfile)
         except (TypeError, LookupError):
             pass
     try:
         return codecs.getreader(self.charset)(chunkfile)
     except (TypeError, LookupError):
         return chunkfile
Example #29
0
def _accuracy(label_file, pred_file):
  """Compute accuracy, each line contains a label."""

  with codecs.getreader("utf-8")(tf.gfile.GFile(label_file, "rb")) as label_fh:
    with codecs.getreader("utf-8")(tf.gfile.GFile(pred_file, "rb")) as pred_fh:
      count = 0.0
      match = 0.0
      for label in label_fh:
        label = label.strip()
        pred = pred_fh.readline().strip()
        if label == pred:
          match += 1
        count += 1
  return 100 * match / count
Example #30
0
def callMethod(method, *params):
    """Call any JSON-RPC method"""
    def base64_enc(auth):
        """
        Jump through python3 hoops to encode base64 string
        """
        return base64.encodebytes(auth.encode('ascii')).decode('ascii').replace('\n', '')

    callId = ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(5))

    request = urllib.request.Request('http://%s:%d' % (
        CONFIG['rpc_host'],
        CONFIG['rpc_port'],
    ), json.dumps(dict(
        jsonrpc="1.0",
        id=callId,
        method=method,
        params=params,
    )).encode('utf8'), {
        "Authorization": "Basic %s" % base64_enc('%s:%s' % (CONFIG['rpc_user'], CONFIG['rpc_pass'])),
        "Content-Type": "application/json",
    })

    # Do the request, parse response
    try:
        response = urllib.request.urlopen(request)
        out = json.load(codecs.getreader('utf8')(response))
    except urllib.request.HTTPError as e:
        try:
            out = json.load(codecs.getreader('utf8')(e))
            if not out['error']:
                out['error'] = dict(message='', code=e.code)
        except ValueError:
            e.seek(0)
            out = dict(id=callId, error=dict(
                message=" ".join([e.msg, e.read().decode('utf8')]),
                code=e.code,
            ))

    if out['id'] != callId:
        raise ValueError("Response ID %s doesn't match %s" % (out['id'], callId))

    if out['error'] is None:
        return out['result']

    if out['error']['message'] == "Invalid Smileycoin address":
        raise ValueError(out['error']['message'])

    raise RuntimeError("%s (%d)" % (out['error']['message'], out['error']['code']))
Example #31
0
                    stats[most_frequent]))
        outfile.write('{0} {1}\n'.format(*most_frequent))
        changes = replace_pair(most_frequent, sorted_vocab, indices)
        update_pair_statistics(most_frequent, changes, stats, indices)
        stats[most_frequent] = 0
        if not i % 100:
            prune_stats(stats, big_stats, threshold)


if __name__ == '__main__':

    # python 2/3 compatibility
    if sys.version_info < (3, 0):
        sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
        sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
        sys.stdin = codecs.getreader('UTF-8')(sys.stdin)
    else:
        sys.stderr = codecs.getwriter('UTF-8')(sys.stderr.buffer)
        sys.stdout = codecs.getwriter('UTF-8')(sys.stdout.buffer)
        sys.stdin = codecs.getreader('UTF-8')(sys.stdin.buffer)

    parser = create_parser()
    args = parser.parse_args()

    # read/write files as UTF-8
    if args.input.name != '<stdin>':
        args.input = codecs.open(args.input.name, encoding='utf-8')
    if args.output.name != '<stdout>':
        args.output = codecs.open(args.output.name, 'w', encoding='utf-8')

    main(args.input,
Example #32
0
    def check_graphic(self):
        """
        Check the hash of the current matplotlib figure matches the expected
        image hash for the current graphic test.

        To create missing image test results, set the IRIS_TEST_CREATE_MISSING
        environment variable before running the tests. This will result in new
        and appropriately "<hash>.png" image files being generated in the image
        output directory, and the imagerepo.json file being updated.

        """
        import imagehash
        from PIL import Image

        dev_mode = os.environ.get('IRIS_TEST_CREATE_MISSING')
        unique_id = self._unique_id()
        repo_fname = os.path.join(_RESULT_PATH, 'imagerepo.json')
        with open(repo_fname, 'rb') as fi:
            repo = json.load(codecs.getreader('utf-8')(fi))

        try:
            #: The path where the images generated by the tests should go.
            image_output_directory = os.path.join(os.path.dirname(__file__),
                                                  'result_image_comparison')
            if not os.access(image_output_directory, os.W_OK):
                if not os.access(os.getcwd(), os.W_OK):
                    raise IOError('Write access to a local disk is required '
                                  'to run image tests.  Run the tests from a '
                                  'current working directory you have write '
                                  'access to to avoid this issue.')
                else:
                    image_output_directory = os.path.join(
                        os.getcwd(), 'iris_image_test_output')
            result_fname = os.path.join(image_output_directory,
                                        'result-' + unique_id + '.png')

            if not os.path.isdir(image_output_directory):
                # Handle race-condition where the directories are
                # created sometime between the check above and the
                # creation attempt below.
                try:
                    os.makedirs(image_output_directory)
                except OSError as err:
                    # Don't care about "File exists"
                    if err.errno != 17:
                        raise

            def _create_missing():
                fname = '{}.png'.format(phash)
                base_uri = ('https://scitools.github.io/test-iris-imagehash/'
                            'images/v4/{}')
                uri = base_uri.format(fname)
                hash_fname = os.path.join(image_output_directory, fname)
                uris = repo.setdefault(unique_id, [])
                uris.append(uri)
                print('Creating image file: {}'.format(hash_fname))
                figure.savefig(hash_fname)
                msg = 'Creating imagerepo entry: {} -> {}'
                print(msg.format(unique_id, uri))
                lock = filelock.FileLock(
                    os.path.join(_RESULT_PATH, 'imagerepo.lock'))
                # The imagerepo.json file is a critical resource, so ensure
                # thread safe read/write behaviour via platform independent
                # file locking.
                with lock.acquire(timeout=600):
                    with open(repo_fname, 'wb') as fo:
                        json.dump(repo,
                                  codecs.getwriter('utf-8')(fo),
                                  indent=4,
                                  sort_keys=True)

            # Calculate the test result perceptual image hash.
            buffer = io.BytesIO()
            figure = plt.gcf()
            figure.savefig(buffer, format='png')
            buffer.seek(0)
            phash = imagehash.phash(Image.open(buffer), hash_size=_HASH_SIZE)

            if unique_id not in repo:
                if dev_mode:
                    _create_missing()
                else:
                    figure.savefig(result_fname)
                    emsg = 'Missing image test result: {}.'
                    raise AssertionError(emsg.format(unique_id))
            else:
                uris = repo[unique_id]
                # Extract the hex basename strings from the uris.
                hexes = [
                    os.path.splitext(os.path.basename(uri))[0] for uri in uris
                ]
                # Create the expected perceptual image hashes from the uris.
                to_hash = imagehash.hex_to_hash
                expected = [to_hash(uri_hex) for uri_hex in hexes]

                # Calculate hamming distance vector for the result hash.
                distances = [e - phash for e in expected]

                if np.all([hd > _HAMMING_DISTANCE for hd in distances]):
                    if dev_mode:
                        _create_missing()
                    else:
                        figure.savefig(result_fname)
                        msg = ('Bad phash {} with hamming distance {} '
                               'for test {}.')
                        msg = msg.format(phash, distances, unique_id)
                        if _DISPLAY_FIGURES:
                            emsg = 'Image comparison would have failed: {}'
                            print(emsg.format(msg))
                        else:
                            emsg = 'Image comparison failed: {}'
                            raise AssertionError(emsg.format(msg))

            if _DISPLAY_FIGURES:
                plt.show()

        finally:
            plt.close()
Example #33
0
import re
import os.path
import sys
import codecs
import gzip
import tempfile
import shutil
import atexit

# Use word_tokenize to split raw text into words
from string import punctuation

import nltk
from nltk.tokenize import word_tokenize

scriptdir = os.path.dirname(os.path.abspath(__file__))

reader = codecs.getreader('utf8')
writer = codecs.getwriter('utf8')

def prepfile(fh, code):
  if type(fh) is str:
    fh = open(fh, code)
  ret = gzip.open(fh.name, code if code.endswith("t") else code+"t") if fh.name.endswith(".gz") else fh
  if sys.version_info[0] == 2:
    if code.startswith('r'):
      ret = reader(fh)
    elif code.startswith('w'):
      ret = writer(fh)
    else:
      sys.stderr.write("I didn't understand code "+code+"\n")
      sys.exit(1)
  return ret
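A short, hedged usage sketch for prepfile() above; the path is illustrative.

infile = prepfile('corpus.txt.gz', 'r')                  # illustrative gzipped corpus
tokens = [word_tokenize(line) for line in infile]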
Example #34
0
 def reset(self):
     self.dataStream = codecs.getreader(self.charEncoding[0])(
         self.rawStream, 'replace')
     HTMLUnicodeInputStream.reset(self)
Example #35
0
 def test_bug1175396(self):
     s = [
         '<%!--===================================================\r\n',
         '    BLOG index page: show recent articles,\r\n',
         '    today\'s articles, or articles of a specific date.\r\n',
         '========================================================--%>\r\n',
         '<%@inputencoding="ISO-8859-1"%>\r\n',
         '<%@pagetemplate=TEMPLATE.y%>\r\n',
         '<%@import=import frog.util, frog%>\r\n',
         '<%@import=import frog.objects%>\r\n',
         '<%@import=from frog.storageerrors import StorageError%>\r\n',
         '<%\r\n',
         '\r\n',
         'import logging\r\n',
         'log=logging.getLogger("Snakelets.logger")\r\n',
         '\r\n',
         '\r\n',
         'user=self.SessionCtx.user\r\n',
         'storageEngine=self.SessionCtx.storageEngine\r\n',
         '\r\n',
         '\r\n',
         'def readArticlesFromDate(date, count=None):\r\n',
         '    entryids=storageEngine.listBlogEntries(date)\r\n',
         '    entryids.reverse() # descending\r\n',
         '    if count:\r\n',
         '        entryids=entryids[:count]\r\n',
         '    try:\r\n',
         '        return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
         '    except StorageError,x:\r\n',
         '        log.error("Error loading articles: "+str(x))\r\n',
         '        self.abort("cannot load articles")\r\n',
         '\r\n',
         'showdate=None\r\n',
         '\r\n',
         'arg=self.Request.getArg()\r\n',
         'if arg=="today":\r\n',
         '    #-------------------- TODAY\'S ARTICLES\r\n',
         '    self.write("<h2>Today\'s articles</h2>")\r\n',
         '    showdate = frog.util.isodatestr() \r\n',
         '    entries = readArticlesFromDate(showdate)\r\n',
         'elif arg=="active":\r\n',
         '    #-------------------- ACTIVE ARTICLES redirect\r\n',
         '    self.Yredirect("active.y")\r\n',
         'elif arg=="login":\r\n',
         '    #-------------------- LOGIN PAGE redirect\r\n',
         '    self.Yredirect("login.y")\r\n',
         'elif arg=="date":\r\n',
         '    #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
         '    showdate = self.Request.getParameter("date")\r\n',
         '    self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
         '    entries = readArticlesFromDate(showdate)\r\n',
         'else:\r\n',
         '    #-------------------- RECENT ARTICLES\r\n',
         '    self.write("<h2>Recent articles</h2>")\r\n',
         '    dates=storageEngine.listBlogEntryDates()\r\n',
         '    if dates:\r\n',
         '        entries=[]\r\n',
         '        SHOWAMOUNT=10\r\n',
         '        for showdate in dates:\r\n',
         '            entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
         '            if len(entries)>=SHOWAMOUNT:\r\n',
         '                break\r\n',
         '                \r\n',
     ]
     stream = StringIO.StringIO("".join(s).encode(self.encoding))
     reader = codecs.getreader(self.encoding)(stream)
     for (i, line) in enumerate(reader):
         self.assertEqual(line, s[i])
Example #36
0
 def getreader(input):
     stream = StringIO.StringIO(input.encode(self.encoding))
     return codecs.getreader(self.encoding)(stream)
Example #37
0
def read_source_sentences(inference_input_file):
  """Load inference data."""
  with codecs.getreader("utf-8")(
      tf.io.gfile.GFile(inference_input_file, mode="rb")) as f:
    inference_data = f.read().splitlines()
  return inference_data
Example #38
0
iso_3166_1_url = os.environ.get(
    "ISO_3166_1_URL",
    "http://anonscm.debian.org/cgit/pkg-isocodes/iso-codes.git/plain/data/iso_3166-1.json"
)
iso_639_3_url = os.environ.get(
    "ISO_639_3_URL",
    "http://anonscm.debian.org/cgit/pkg-isocodes/iso-codes.git/plain/data/iso_639-3.json"
)

langs = set()
countries = set()

# country codes (2 letters)
with urlopen(iso_3166_1_url) as f:
    data = json.load(codecs.getreader("utf-8")(f))
    for entry in data["3166-1"]:
        countries.add(entry["alpha_2"])

# language codes (2 or 3 letters; 3-letter codes only where there is no 2-letter one)
with urlopen(iso_639_3_url) as f:
    data = json.load(codecs.getreader("utf-8")(f))
    for entry in data["639-3"]:
        langs.add(entry.get("alpha_2") or entry["alpha_3"])

# Note that we are not pprint()ing the set directly because with
# Python 3 it results in curly brace set initializers that are not
# compatible with Python 2.6, do it with set([...]) instead.

print("# Generated with %s" % sys.argv[0])
print("")
Example #39
0
def multi_worker_inference(infer_model, ckpt, inference_input_file,
                           inference_output_file, hparams, num_workers, jobid):
    """Inference using multiple workers."""
    assert num_workers > 1

    final_output_infer = inference_output_file
    output_infer = "%s_%d" % (inference_output_file, jobid)
    output_infer_done = "%s_done_%d" % (inference_output_file, jobid)

    # Read data
    infer_data = load_data(inference_input_file, hparams)

    # Split data to multiple workers
    total_load = len(infer_data)
    load_per_worker = int((total_load - 1) / num_workers) + 1
    start_position = jobid * load_per_worker
    end_position = min(start_position + load_per_worker, total_load)
    infer_data = infer_data[start_position:end_position]

    with tf.Session(graph=infer_model.graph,
                    config=utils.get_config_proto()) as sess:
        loaded_infer_model = model_helper.load_model(infer_model.model, ckpt,
                                                     sess, "infer")
        sess.run(
            infer_model.iterator.initializer, {
                infer_model.src_placeholder: infer_data,
                infer_model.batch_size_placeholder: hparams.infer_batch_size
            })
        # Decode
        utils.print_out("# Start decoding")
        nmt_utils.decode_and_evaluate(
            "infer",
            loaded_infer_model,
            sess,
            output_infer,
            ref_file=None,
            metrics=hparams.metrics,
            subword_option=hparams.subword_option,
            beam_width=hparams.beam_width,
            tgt_eos=hparams.eos,
            num_translations_per_input=hparams.num_translations_per_input)

        # Change file name to indicate the file writing is completed.
        tf.gfile.Rename(output_infer, output_infer_done, overwrite=True)

        # Job 0 is responsible for the clean up.
        if jobid != 0: return

        # Now write all translations
        with codecs.getwriter("utf-8")(tf.gfile.GFile(final_output_infer,
                                                      mode="wb")) as final_f:
            for worker_id in range(num_workers):
                worker_infer_done = "%s_done_%d" % (inference_output_file,
                                                    worker_id)
                while not tf.gfile.Exists(worker_infer_done):
                    utils.print_out("  waiting for job %d to complete." %
                                    worker_id)
                    time.sleep(10)

                with codecs.getreader("utf-8")(tf.gfile.GFile(
                        worker_infer_done, mode="rb")) as f:
                    for translation in f:
                        final_f.write("%s" % translation)

            for worker_id in range(num_workers):
                worker_infer_done = "%s_done_%d" % (inference_output_file,
                                                    worker_id)
                tf.gfile.Remove(worker_infer_done)
Example #40
0
# -*- coding: utf-8 -*-
import codecs
import sys
from regex import Regex
reload(sys)
sys.setdefaultencoding('utf-8')

sys.stdin = codecs.getreader('UTF-8')(sys.stdin)
sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)


def is_ascii(s):
    return all(ord(c) < 128 for c in s)


def replacement(line, symbol=".", repl="。"):
    line = line.replace(". . .", ".")
    line = line.replace(u".", ".")
    # line = line.replace("...", ".")
    dot = Regex(r'(\S\s*)\%s(\s*\S*)' % symbol)
    m = dot.findall(line)
    if m:
        # print "BEFORE:", line
        # print m
        for ele in m:
            b_char = ele[0].strip()
            a_char = ele[1].strip()
            # consecutive dot avoid
            if symbol != b_char and symbol != a_char:
                # both are digit or are letters
                if is_ascii(b_char) and is_ascii(a_char):
Example #41
0
from past.builtins import basestring
from pkg_resources import resource_stream
from pyramid.events import (
    ApplicationCreated,
    subscriber,
)
from pyramid.httpexceptions import HTTPNotFound
from pyramid.view import view_config
import codecs
import json

utf8 = codecs.getreader("utf-8")

jsonld_base = 'https://www.encodeproject.org/terms/'
prefix = 'encode:'
term_path = '/terms/'


def aslist(value):
    if isinstance(value, basestring):
        return [value]
    return value


@subscriber(ApplicationCreated)
def make_jsonld_context(event):
    app = event.app
    root = app.root_factory(app)
    context = {
        'encode': jsonld_base,
        '@base': jsonld_base,
Example #42
0
 def __init__(self, f, encoding):
     self.reader = codecs.getreader(encoding)(f, errors='ignore')
Example #43
0
 def merge(self, path):
     reader = codecs.getreader("utf-8")
     with open(path, 'rb') as fh:
         extra = json.load(reader(fh))
         self.create_data(extra)
Example #44
0
 def setUp(self):
     self.reader = codecs.getreader('utf-8')
     self.stream = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')
Example #45
0
 def test_stream(self):
     import StringIO
     r = codecs.getreader("idna")(StringIO.StringIO("abc"))
     r.read(3)
     self.assertEquals(r.read(), u"")
Example #46
0
    def __init__(self, filename):
        re_data = re.compile(r'^\\data\\')
        re_ngram_count = re.compile(r'^ngram (\d+)=(\d+)')
        re_ngrams = re.compile(r'^\\(\d+)-grams:')
        re_end = re.compile(r'^\\end\\')

        ngram_counts = []
        current_ngram = 0
        ngrams = {}

        def found_data_section(line:str):
            match_object = re_data.search(line)
            if match_object is not None:
                return True
            return False
        def found_ngrams_section(line:str):
            nonlocal current_ngram
            match_object = re_ngrams.search(line)
            if match_object is not None:
                current_ngram = int(match_object.group(1))
                return True
            return False
        def found_ngram_counts(line:str):
            nonlocal ngram_counts
            match_object = re_ngram_count.search(line)
            if match_object is not None:
                ngram_size = int(match_object.group(1))
                ngram_counts.append(int(match_object.group(2)))
                assert (len(ngram_counts) == ngram_size)
        def found_end_section(line: str):
            match_object = re_end.search(line)
            if match_object is not None:
                return True
            return False
        def mmap_gzippd(filename):
            handle = open(filename, 'rb')
            mapped = mmap.mmap(handle.fileno(), 0, access=mmap.ACCESS_READ)
            gzfile = gzip.GzipFile(mode="r", fileobj=mapped)
            return gzfile
        def record_ngram(line: str):
            nonlocal ngram_counts
            nonlocal current_ngram
            nonlocal ngrams

            parts = line.lower().split()
            if len(parts) == 0:
                return

            # count down from the number of expected ngrams
            ngram_counts[current_ngram - 1] -= 1

            ngram_score = float(parts[0])
            ngram = parts[1:current_ngram + 1]
            backoff_score = float(parts[current_ngram + 1] if len(parts) > current_ngram + 1 else 0)

            ngrams[tuple(ngram)] = (ngram_score, backoff_score)

        found_end = False
        with codecs.getreader('UTF-8')(mmap_gzippd(filename)) as f:

            # ignore the header, looking for start of data
            for line in f:
                if found_data_section(line):
                    break

            # parse the header
            for line in f:
                if found_ngrams_section(line):
                    break
                elif found_ngram_counts(line):
                    pass
            assert (current_ngram == 1)

            # parse the ngram data
            for line in f:
                # handle start of new section
                if found_ngrams_section(line):
                    continue
                # are we done?
                if found_end_section(line):
                    found_end = True
                    break
                record_ngram(line)

        # sanity checks: did we find the end? did we read the expected number of ngrams?
        assert (found_end)
        for i in ngram_counts:
            assert(i == 0)

        self.max_ngram = len(ngram_counts)
        self.ngrams = ngrams
Example #47
0
 def __init__(self, f):
     self.reader = codecs.getreader('utf-8')(f)
Example #48
0
# for Python 3.x
try:
	reload
except NameError:
	try:
		from importlib import reload
	except ImportError:
		from imp import reload

reload(sys)
try:
	sys.setdefaultencoding(cset)
except AttributeError:
	pass

sys.stdin = codecs.getreader(cset)(sys.stdin)
sys.stdout = codecs.getwriter(cset)(sys.stdout)

class PyExecUtil(object):
	def __init__(self, cmd):
		self.cmd = cmd
		self._process = None
		self._thread = None
		self._callback = None
		self._args = None
		self.stdout_data = None
		self.stderr_data = None

	def onCompletion(self):
		if self._callback:
			self._callback(self._args, self.stdout_data, self.stderr_data)
Example #49
0
    def read(self,
             hdfs_path,
             offset=0,
             length=None,
             buffer_size=None,
             encoding=None,
             chunk_size=0,
             delimiter=None,
             progress=None):
        """Read a file from HDFS.

    :param hdfs_path: HDFS path.
    :param offset: Starting byte position.
    :param length: Number of bytes to be processed. `None` will read the entire
      file.
    :param buffer_size: Size of the buffer in bytes used for transferring the
      data. Defaults to the value set in the HDFS configuration.
    :param encoding: Encoding used to decode the request. By default the raw
      data is returned. This is mostly helpful in python 3, for example to
      deserialize JSON data (as the decoder expects unicode).
    :param chunk_size: If set to a positive number, the context manager will
      return a generator yielding every `chunk_size` bytes instead of a
      file-like object (unless `delimiter` is also set, see below).
    :param delimiter: If set, the context manager will return a generator
      yielding each time the delimiter is encountered. This parameter requires
      the `encoding` to be specified.
    :param progress: Callback function to track progress, called every
      `chunk_size` bytes (not available if the chunk size isn't specified). It
      will be passed two arguments, the path to the file being uploaded and the
      number of bytes transferred so far. On completion, it will be called once
      with `-1` as second argument.

    This method must be called using a `with` block:

    .. code-block:: python

      with client.read('foo') as reader:
        content = reader.read()

    This ensures that connections are always properly closed.

    .. note::

      The raw file-like object returned by this method (when called without an
      encoding, chunk size, or delimiter) can have a very different performance
      profile than local files. In particular, line-oriented methods are often
      slower. The recommended workaround is to specify an encoding when
      possible or read the entire file before splitting it.

    """
        if chunk_size < 0:
            raise ValueError('Read chunk size must be non-negative.')
        if progress and not chunk_size:
            raise ValueError(
                'Progress callback requires a positive chunk size.')
        if delimiter:
            if not encoding:
                raise ValueError('Delimiter splitting requires an encoding.')
            if chunk_size:
                raise ValueError(
                    'Delimiter splitting incompatible with chunk size.')
        _logger.info('Reading file %r.', hdfs_path)
        res = self._open(
            hdfs_path,
            offset=offset,
            length=length,
            buffersize=buffer_size,
        )
        try:
            if not chunk_size and not delimiter:
                yield codecs.getreader(encoding)(
                    res.raw) if encoding else res.raw
            else:
                # Patch in encoding on the response object so that `iter_content` and
                # `iter_lines` can pick it up. If `None`, it is ignored and no decoding
                # happens (which is why we can always set `decode_unicode=True`).
                res.encoding = encoding
                if delimiter:
                    data = res.iter_lines(delimiter=delimiter,
                                          decode_unicode=True)
                else:
                    data = res.iter_content(chunk_size=chunk_size,
                                            decode_unicode=True)
                if progress:

                    def reader(_hdfs_path, _progress):
                        """Generator that tracks progress."""
                        nbytes = 0
                        for chunk in data:
                            nbytes += len(chunk)
                            _progress(_hdfs_path, nbytes)
                            yield chunk
                        _progress(_hdfs_path, -1)

                    yield reader(hdfs_path, progress)
                else:
                    yield data
        finally:
            res.close()
            _logger.debug('Closed response for reading file %r.', hdfs_path)
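
A usage sketch for the read() method above, following its own docstring; the InsecureClient class and the endpoint are assumptions taken from the hdfs package, not part of this snippet:

from hdfs import InsecureClient  # assumed client class

client = InsecureClient('http://namenode:50070', user='alice')  # hypothetical endpoint

# Stream a UTF-8 text file line by line (delimiter requires an encoding).
with client.read('logs/app.log', encoding='utf-8', delimiter='\n') as lines:
    for line in lines:
        handle(line)  # hypothetical per-line handler

# Download a binary file in 1 MiB chunks while reporting progress.
def report(path, nbytes):
    print(path, nbytes)  # nbytes is -1 once the transfer completes

with client.read('data/blob.bin', chunk_size=1 << 20, progress=report) as chunks:
    with open('blob.bin', 'wb') as out:
        for chunk in chunks:
            out.write(chunk)
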
Example #50
0
def utfopen(filename):
    return codecs.getreader('utf-8')(open(filename))
Example #51
0
def load(fp, encoding='utf-8'):
    return json.load(codecs.getreader(encoding)(fp))
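
A quick usage sketch for the load() helper above: wrapping the binary handle with codecs.getreader lets json.load see text on both Python 2 and 3 (the file name is hypothetical):

with open('config.json', 'rb') as fp:
    config = load(fp)  # decodes the byte stream as UTF-8 before parsing
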
Example #52
0
File: web.py Project: mcuma/spack
    def _spider(url, collect_nested):
        """Fetches URL and any pages it links to.

        Prints out a warning only if the root can't be fetched; it ignores
        errors with pages that the root links to.

        Args:
            url (str): url being fetched and searched for links
            collect_nested (bool): whether we want to collect arguments
                for nested spidering on the links found in this url

        Returns:
            A tuple of:
            - pages: dict of pages visited (URL) mapped to their full text.
            - links: set of links encountered while visiting the pages.
            - spider_args: argument for subsequent call to spider
        """
        pages = {}  # dict from page URL -> text content.
        links = set()  # set of all links seen on visited pages.
        subcalls = []

        try:
            response_url, _, response = read_from_url(url, 'text/html')
            if not response_url or not response:
                return pages, links, subcalls

            page = codecs.getreader('utf-8')(response).read()
            pages[response_url] = page

            # Parse out the links in the page
            link_parser = LinkParser()
            link_parser.feed(page)

            while link_parser.links:
                raw_link = link_parser.links.pop()
                abs_link = url_util.join(response_url,
                                         raw_link.strip(),
                                         resolve_href=True)
                links.add(abs_link)

                # Skip stuff that looks like an archive
                if any(raw_link.endswith(s) for s in ALLOWED_ARCHIVE_TYPES):
                    continue

                # Skip already-visited links
                if abs_link in _visited:
                    continue

                # If we're not at max depth, follow links.
                if collect_nested:
                    subcalls.append((abs_link, ))
                    _visited.add(abs_link)

        except URLError as e:
            tty.debug(str(e))

            if hasattr(e, 'reason') and isinstance(e.reason, ssl.SSLError):
                tty.warn("Spack was unable to fetch url list due to a "
                         "certificate verification problem. You can try "
                         "running spack -k, which will not check SSL "
                         "certificates. Use this at your own risk.")

        except HTMLParseError as e:
            # This error indicates that Python's HTML parser sucks.
            msg = "Got an error parsing HTML."

            # Pre-2.7.3 Pythons in particular have rather prickly HTML parsing.
            if sys.version_info[:3] < (2, 7, 3):
                msg += " Use Python 2.7.3 or newer for better HTML parsing."

            tty.warn(msg, url, "HTMLParseError: " + str(e))

        except Exception as e:
            # Other types of errors are completely ignored,
            # except in debug mode
            tty.debug("Error in _spider: %s:%s" % (type(e), str(e)),
                      traceback.format_exc())

        finally:
            tty.debug("SPIDER: [url={0}]".format(url))

        return pages, links, subcalls
Example #53
0
def main(**args):
    '''
    This corresponds to the |rstlisttable| shell command.

    :param args: Keyword arguments. If empty the arguments are taken from ``sys.argv``.

    ``rstfile`` is the file name

    ``in_place`` defaults to False

    ``join`` defaults to "012"


    '''

    import argparse
    import codecs
    import sys

    if not args:
        parser = argparse.ArgumentParser(
            description='''Convert RST grid tables to list-tables.''')
        parser.add_argument('rstfile',
                            type=argparse.FileType('r', encoding='utf-8'),
                            nargs='+',
                            help='RST file(s)')
        parser.add_argument(
            '-j',
            '--join',
            action='store',
            default='012',
            help=
            '''e.g. 002. Join method per column: 0="".join; 1=" ".join; 2="\\n".join'''
        )
        parser.add_argument('-i',
                            '--in-place',
                            action='store_true',
                            default=False,
                            help='''change the file itself''')
        args = parser.parse_args().__dict__

    if 'in_place' not in args:
        args['in_place'] = False
    if 'join' not in args:
        args['join'] = '012'

    if isinstance(args['rstfile'], str):
        args['rstfile'] = [
            argparse.FileType('r', encoding='utf-8')(args['rstfile'])
        ]

    for infile in args['rstfile']:
        data = infile.readlines()
        infile.close()
        if args['in_place']:
            f = open(infile.name, 'w', encoding='utf-8', newline='\n')
        else:
            # '≥'.encode('cp1252') raises UnicodeEncodeError on Windows, hence the re-wrapping below (it can interfere with pdb, though)
            sys.stdout = codecs.getwriter("utf-8")(sys.stdout.detach())
            sys.stdin = codecs.getreader("utf-8")(sys.stdin.detach())
            f = sys.stdout
        try:
            f.writelines(gridtable(data, args['join']))
        finally:
            if args['in_place']:
                f.close()
Example #54
0
 def test_readline(self):
     sin = "\x80".encode("base64_codec")
     reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin))
     sout = reader.readline()
     self.assertEqual(sout, "\x80")
     self.assert_(isinstance(sout, str))
Example #55
0
 def get_reader_from_stdin():
     """
     get a utf-8 stream reader for stdin
     @return: stdin-stream
     """
     return codecs.getreader('utf-8')(sys.stdin)
Example #56
0
    def convertFile(self, input=None, output=None, encoding=None):
        """Converts a markdown file and returns the HTML as a unicode string.

        Decodes the file using the provided encoding (defaults to utf-8),
        passes the file content to markdown, and outputs the html to either
        the provided stream or the file with provided name, using the same
        encoding as the source file. The 'xmlcharrefreplace' error handler is
        used when encoding the output.

        **Note:** This is the only place that decoding and encoding of unicode
        takes place in Python-Markdown.  (All other code is unicode-in /
        unicode-out.)

        Keyword arguments:

        * input: File object or path. Reads from stdin if `None`.
        * output: File object or path. Writes to stdout if `None`.
        * encoding: Encoding of input and output files. Defaults to utf-8.

        """

        encoding = encoding or "utf-8"

        # Read the source
        if input:
            if isinstance(input, util.string_type):
                input_file = codecs.open(input, mode="r", encoding=encoding)
            else:
                input_file = codecs.getreader(encoding)(input)
            text = input_file.read()
            input_file.close()
        else:
            text = sys.stdin.read()
            if not isinstance(text, util.text_type):  # pragma: no cover
                text = text.decode(encoding)

        text = text.lstrip('\ufeff')  # remove the byte-order mark

        # Convert
        html = self.convert(text)

        # Write to file or stdout
        if output:
            if isinstance(output, util.string_type):
                output_file = codecs.open(output,
                                          "w",
                                          encoding=encoding,
                                          errors="xmlcharrefreplace")
                output_file.write(html)
                output_file.close()
            else:
                writer = codecs.getwriter(encoding)
                output_file = writer(output, errors="xmlcharrefreplace")
                output_file.write(html)
                # Don't close here. User may want to write more.
        else:
            # Encode manually and write bytes to stdout.
            html = html.encode(encoding, "xmlcharrefreplace")
            try:
                # Write bytes directly to buffer (Python 3).
                sys.stdout.buffer.write(html)
            except AttributeError:  # pragma: no cover
                # Probably Python 2, which works with bytes by default.
                sys.stdout.write(html)

        return self
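
A minimal usage sketch for convertFile(); the method matches Python-Markdown's Markdown.convertFile API, so the example assumes that class (paths and content are hypothetical):

import io
import markdown

md = markdown.Markdown()

# Path in, path out: the source is decoded and the HTML re-encoded as UTF-8.
md.convertFile(input='README.md', output='README.html')

# A binary stream also works; it is wrapped with codecs.getreader(encoding).
buf = io.BytesIO(u'# H\u00e9llo\n'.encode('utf-8'))
md.convertFile(input=buf, output='hello.html', encoding='utf-8')
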
Example #57
0
 def __init__(self, f, encoding):
     self.reader = codecs.getreader(encoding)(f)
Example #58
0
    def install(self, paths, maker, **kwargs):
        """
        Install a wheel to the specified paths. If kwarg ``warner`` is
        specified, it should be a callable, which will be called with two
        tuples indicating the wheel version of this software and the wheel
        version in the file, if there is a discrepancy in the versions.
        This can be used to issue any warnings or raise any exceptions.
        If kwarg ``lib_only`` is True, only the purelib/platlib files are
        installed, and the headers, scripts, data and dist-info metadata are
        not written.

        The return value is a :class:`InstalledDistribution` instance unless
        ``lib_only`` is True, in which case the return value is ``None``.
        """

        dry_run = maker.dry_run
        warner = kwargs.get('warner')
        lib_only = kwargs.get('lib_only', False)

        pathname = os.path.join(self.dirname, self.filename)
        name_ver = '%s-%s' % (self.name, self.version)
        data_dir = '%s.data' % name_ver
        info_dir = '%s.dist-info' % name_ver

        metadata_name = posixpath.join(info_dir, METADATA_FILENAME)
        wheel_metadata_name = posixpath.join(info_dir, 'WHEEL')
        record_name = posixpath.join(info_dir, 'RECORD')

        wrapper = codecs.getreader('utf-8')

        with ZipFile(pathname, 'r') as zf:
            with zf.open(wheel_metadata_name) as bwf:
                wf = wrapper(bwf)
                message = message_from_file(wf)
            wv = message['Wheel-Version'].split('.', 1)
            file_version = tuple([int(i) for i in wv])
            if (file_version != self.wheel_version) and warner:
                warner(self.wheel_version, file_version)

            if message['Root-Is-Purelib'] == 'true':
                libdir = paths['purelib']
            else:
                libdir = paths['platlib']

            records = {}
            with zf.open(record_name) as bf:
                with CSVReader(stream=bf) as reader:
                    for row in reader:
                        p = row[0]
                        records[p] = row

            data_pfx = posixpath.join(data_dir, '')
            info_pfx = posixpath.join(info_dir, '')
            script_pfx = posixpath.join(data_dir, 'scripts', '')

            # make a new instance rather than a copy of maker's,
            # as we mutate it
            fileop = FileOperator(dry_run=dry_run)
            fileop.record = True  # so we can rollback if needed

            bc = not sys.dont_write_bytecode  # Double negatives. Lovely!

            outfiles = []  # for RECORD writing

            # for script copying/shebang processing
            workdir = tempfile.mkdtemp()
            # set target dir later
            # we default add_launchers to False, as the
            # Python Launcher should be used instead
            maker.source_dir = workdir
            maker.target_dir = None
            try:
                for zinfo in zf.infolist():
                    arcname = zinfo.filename
                    if isinstance(arcname, text_type):
                        u_arcname = arcname
                    else:
                        u_arcname = arcname.decode('utf-8')
                    # The signature file won't be in RECORD,
                    # and we don't currently do anything with it
                    if u_arcname.endswith('/RECORD.jws'):
                        continue
                    row = records[u_arcname]
                    if row[2] and str(zinfo.file_size) != row[2]:
                        raise DistlibException('size mismatch for '
                                               '%s' % u_arcname)
                    if row[1]:
                        kind, value = row[1].split('=', 1)
                        with zf.open(arcname) as bf:
                            data = bf.read()
                        _, digest = self.get_hash(data, kind)
                        if digest != value:
                            raise DistlibException('digest mismatch for '
                                                   '%s' % arcname)

                    if lib_only and u_arcname.startswith((info_pfx, data_pfx)):
                        logger.debug('lib_only: skipping %s', u_arcname)
                        continue
                    is_script = (u_arcname.startswith(script_pfx)
                                 and not u_arcname.endswith('.exe'))

                    if u_arcname.startswith(data_pfx):
                        _, where, rp = u_arcname.split('/', 2)
                        outfile = os.path.join(paths[where], convert_path(rp))
                    else:
                        # meant for site-packages.
                        if u_arcname in (wheel_metadata_name, record_name):
                            continue
                        outfile = os.path.join(libdir, convert_path(u_arcname))
                    if not is_script:
                        with zf.open(arcname) as bf:
                            fileop.copy_stream(bf, outfile)
                        outfiles.append(outfile)
                        # Double check the digest of the written file
                        if not dry_run and row[1]:
                            with open(outfile, 'rb') as bf:
                                data = bf.read()
                                _, newdigest = self.get_hash(data, kind)
                                if newdigest != digest:
                                    raise DistlibException('digest mismatch '
                                                           'on write for '
                                                           '%s' % outfile)
                        if bc and outfile.endswith('.py'):
                            try:
                                pyc = fileop.byte_compile(outfile)
                                outfiles.append(pyc)
                            except Exception:
                                # Don't give up if byte-compilation fails,
                                # but log it and perhaps warn the user
                                logger.warning('Byte-compilation failed',
                                               exc_info=True)
                    else:
                        fn = os.path.basename(convert_path(arcname))
                        workname = os.path.join(workdir, fn)
                        with zf.open(arcname) as bf:
                            fileop.copy_stream(bf, workname)

                        dn, fn = os.path.split(outfile)
                        maker.target_dir = dn
                        filenames = maker.make(fn)
                        fileop.set_executable_mode(filenames)
                        outfiles.extend(filenames)

                if lib_only:
                    logger.debug('lib_only: returning None')
                    dist = None
                else:
                    # Generate scripts

                    # Try to get pydist.json so we can see if there are
                    # any commands to generate. If this fails (e.g. because
                    # of a legacy wheel), log a warning but don't give up.
                    commands = None
                    file_version = self.info['Wheel-Version']
                    if file_version == '1.0':
                        # Use legacy info
                        ep = posixpath.join(info_dir, 'entry_points.txt')
                        try:
                            with zf.open(ep) as bwf:
                                epdata = read_exports(bwf)
                            commands = {}
                            for key in ('console', 'gui'):
                                k = '%s_scripts' % key
                                if k in epdata:
                                    commands['wrap_%s' % key] = d = {}
                                    for v in epdata[k].values():
                                        s = '%s:%s' % (v.prefix, v.suffix)
                                        if v.flags:
                                            s += ' %s' % v.flags
                                        d[v.name] = s
                        except Exception:
                            logger.warning('Unable to read legacy script '
                                           'metadata, so cannot generate '
                                           'scripts')
                    else:
                        try:
                            with zf.open(metadata_name) as bwf:
                                wf = wrapper(bwf)
                                commands = json.load(wf).get('extensions')
                                if commands:
                                    commands = commands.get('python.commands')
                        except Exception:
                            logger.warning('Unable to read JSON metadata, so '
                                           'cannot generate scripts')
                    if commands:
                        console_scripts = commands.get('wrap_console', {})
                        gui_scripts = commands.get('wrap_gui', {})
                        if console_scripts or gui_scripts:
                            script_dir = paths.get('scripts', '')
                            if not os.path.isdir(script_dir):
                                raise ValueError('Valid script path not '
                                                 'specified')
                            maker.target_dir = script_dir
                            for k, v in console_scripts.items():
                                script = '%s = %s' % (k, v)
                                filenames = maker.make(script)
                                fileop.set_executable_mode(filenames)

                            if gui_scripts:
                                options = {'gui': True}
                                for k, v in gui_scripts.items():
                                    script = '%s = %s' % (k, v)
                                    filenames = maker.make(script, options)
                                    fileop.set_executable_mode(filenames)

                    p = os.path.join(libdir, info_dir)
                    dist = InstalledDistribution(p)

                    # Write SHARED
                    paths = dict(paths)  # don't change passed in dict
                    del paths['purelib']
                    del paths['platlib']
                    paths['lib'] = libdir
                    p = dist.write_shared_locations(paths, dry_run)
                    if p:
                        outfiles.append(p)

                    # Write RECORD
                    dist.write_installed_files(outfiles, paths['prefix'],
                                               dry_run)
                return dist
            except Exception:  # pragma: no cover
                logger.exception('installation failed.')
                fileop.rollback()
                raise
            finally:
                shutil.rmtree(workdir)
Example #59
0
def main():
    import sys
    import os
    from optparse import OptionParser
    logging.basicConfig()

    parser = OptionParser()
    parser.add_option("-o",
                      "--outfile",
                      dest="outfile",
                      help="name of the object file",
                      metavar="FILE")
    parser.add_option("-v",
                      "--verbose",
                      action="store_true",
                      dest="verbose",
                      default=False,
                      help="print status messages")
    parser.add_option("--debug",
                      action="store_true",
                      dest="debug",
                      default=False,
                      help="print debug messages to stdout")
    parser.add_option("-D",
                      "--define",
                      action="append",
                      dest="defines",
                      metavar="SYM[=VALUE]",
                      default=[],
                      help="define symbol")
    parser.add_option(
        "-I",
        "--include-path",
        action="append",
        dest="include_paths",
        metavar="PATH",
        default=[],
        help="Add directory to the search path list for includes")

    (options, args) = parser.parse_args()

    if len(args) > 1:
        sys.stderr.write("Only one file at a time allowed.\n")
        sys.exit(1)

    if options.debug:
        logging.getLogger('cpp').setLevel(logging.DEBUG)
    elif options.verbose:
        logging.getLogger('cpp').setLevel(logging.INFO)
    else:
        logging.getLogger('cpp').setLevel(logging.WARN)

    if options.outfile:
        outfile = codecs.open(options.outfile, 'w', 'utf-8')
    else:
        outfile = codecs.getwriter("utf-8")(sys.stdout)

    cpp = msp430.asm.cpp.Preprocessor()
    # extend include search path
    # built in places for msp430.asm
    d = os.path.join(os.path.dirname(sys.modules['msp430.asm'].__file__),
                     'include')
    cpp.include_path.append(d)
    cpp.include_path.append(os.path.join(d, 'upstream'))
    # user provided directories (-I)
    cpp.include_path.extend(options.include_paths)
    # insert predefined symbols (XXX function like macros not yet supported)
    for definition in options.defines:
        if '=' in definition:
            symbol, value = definition.split('=', 1)
        else:
            symbol, value = definition, '1'
        cpp.namespace.defines[symbol] = value

    if not args or args[0] == '-':
        infilename = '<stdin>'
        infile = codecs.getreader("utf-8")(sys.stdin)
    else:
        # search include path for files
        for path in cpp.include_path:
            infilename = os.path.join(path, args[0])
            if os.path.exists(infilename):
                infile = codecs.open(infilename, 'r', 'utf-8')
                break
        else:
            sys.stderr.write('h2forth: %s: File not found\n' % (infilename, ))
            sys.exit(1)

    try:
        error_found = cpp.preprocess(infile, msp430.asm.cpp.Discard(),
                                     infilename)
        if error_found:
            sys.exit(1)
    except msp430.asm.cpp.PreprocessorError, e:
        sys.stderr.write('%s:%s: %s\n' % (e.filename, e.line, e))
        if options.debug:
            if hasattr(e, 'text'):
                sys.stderr.write('%s:%s: input line: %r\n' %
                                 (e.filename, e.line, e.text))
        sys.exit(1)
 def decode(self, text_utf8, text_latex, inputenc=None):
     encoding = 'latex+' + inputenc if inputenc else 'latex'
     stream = BytesIO(text_latex)
     reader = codecs.getreader(encoding)(stream)
     self.assertEqual(text_utf8, reader.read())
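
A hedged note on the test above: 'latex' and 'latex+<inputenc>' are not standard-library codecs; they appear to be registered by the latexcodec package when it is imported. A minimal sketch assuming that package:

import codecs
from io import BytesIO

import latexcodec  # noqa: F401 -- importing registers the 'latex' codec

stream = BytesIO(br'\"o')                  # LaTeX escape sequence for o-umlaut
reader = codecs.getreader('latex')(stream)
assert reader.read() == u'ö'
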