Example #1
def decode(filename, data):
    if '.m3u8' in filename:
        encoding = 'utf-8'
        data = data.decode(encoding)
    elif '.m3u' in filename or '.pls' in filename:
        try:
            encoding = 'ISO-8859-2'
            data = data.decode(encoding)
        except (UnicodeDecodeError, LookupError):
            if chardet:
                u = UniversalDetector()
                u.feed(data)
                u.close()
                if u.result['confidence'] > 0.5:
                    try:
                        encoding = u.result['encoding']
                        data = data.decode(encoding)
                    except (UnicodeDecodeError, LookupError):
                        encoding = 'ascii'
                else:
                    encoding = 'ascii'
            else:
                encoding = 'ascii'
    elif '.xml' in filename or '.xspf' in filename:
        encoding = 'utf-8'

    return {'data' : data, 'encoding' : encoding}
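The fallback chain above (a fixed codec first, then chardet, then plain ASCII) is easy to lose in the nested try/except blocks. Below is a minimal, self-contained sketch of the same pattern; the 0.5 confidence threshold and the helper name are illustrative, not part of the original code.

from chardet.universaldetector import UniversalDetector

def decode_with_fallback(data, preferred='ISO-8859-2', threshold=0.5):
    """Try a preferred codec, then chardet's guess, then ASCII with replacement."""
    try:
        return data.decode(preferred), preferred
    except (UnicodeDecodeError, LookupError):
        detector = UniversalDetector()
        detector.feed(data)
        detector.close()
        guess = detector.result
        if guess['encoding'] and guess['confidence'] > threshold:
            try:
                return data.decode(guess['encoding']), guess['encoding']
            except (UnicodeDecodeError, LookupError):
                pass
        return data.decode('ascii', errors='replace'), 'ascii'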
Example #2
def validate_csv(f):
    """Return dialect information about given csv file."""
    with open(f.fullpath, 'rU') as csvfile:
        is_valid = False
        try:
            dialect = csv.Sniffer().sniff(csvfile.read(1024))
        except Exception as e:
            current_app.logger.debug(
                'File %s is not valid CSV: %s' % (f.name + f.superformat, e))
            return {
                'delimiter': '',
                'encoding': '',
                'is_valid': is_valid
            }
        u = UniversalDetector()
        dialect.strict = True
        csvfile.seek(0)
        reader = csv.reader(csvfile, dialect)
        try:
            for row in reader:
                u.feed(dialect.delimiter.join(row).encode('utf-8'))
            is_valid = True
        except csv.Error as e:
            current_app.logger.debug(
                'File %s is not valid CSV: %s' % (f.name + f.superformat, e))
        finally:
            u.close()
    return {
        'delimiter': dialect.delimiter,
        'encoding': u.result['encoding'],
        'is_valid': is_valid
    }
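The same sniff-then-detect idea works outside the Flask logging context. A standalone sketch, assuming only the standard library and chardet (the 1024-byte sample size is arbitrary):

import csv
from chardet.universaldetector import UniversalDetector

def sniff_csv(path, sample_size=1024):
    """Return (delimiter, encoding) guesses for a CSV file, or (None, None)."""
    with open(path, 'rb') as raw:
        sample = raw.read(sample_size)
    detector = UniversalDetector()
    detector.feed(sample)
    detector.close()
    encoding = detector.result['encoding'] or 'utf-8'
    try:
        dialect = csv.Sniffer().sniff(sample.decode(encoding, errors='replace'))
    except csv.Error:
        return None, None
    return dialect.delimiter, encoding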
Example #3
 def repo_cat_file(self, repo_path, commit_hash, path):
     (commit_hash, path) = self._all_to_utf8(commit_hash, path)
     if not self._path_check_chdir(repo_path, commit_hash, path):
         return ''
     path = self._get_quote_path(path)
     if path.startswith('./'):
         path = path[2:]
     file_type = path.split('.')[-1]
     if file_type in BINARY_FILE_TYPE:
         return u'二进制文件'
     stage_file = self._get_stage_file(repo_path, commit_hash, path)
     result = self._read_load_stage_file(stage_file)
     if result is not None:
         return result['blob']
     command = '/usr/bin/git show %s:%s | /usr/bin/head -c 524288' % (commit_hash, path)
     try:
         signal.signal(signal.SIGPIPE, signal.SIG_DFL)
         result = check_output(command, shell=True)
         ud = UniversalDetector()
         ud.feed(result)
         ud.close()
         if ud.result['encoding']:
             encoding = ud.result['encoding']
          if encoding not in ('utf-8', 'utf8'):
                 result = result.decode(encoding).encode('utf-8')
         self._dumps_write_stage_file({'blob': result}, stage_file)
         return result
     except Exception, e:
         logger.exception(e)
Example #4
    def detectEncoding(self, parseMeta=True, chardet=True):
        # First look for a BOM
        # This will also read past the BOM if present
        encoding = self.detectBOM()
        confidence = "certain"
        # If there is no BOM need to look for meta elements with encoding
        # information
        if encoding is None and parseMeta:
            encoding = self.detectEncodingMeta()
            confidence = "tentative"
        # Guess with chardet, if available
        if encoding is None and chardet:
            confidence = "tentative"
            try:
                from chardet.universaldetector import UniversalDetector
                buffers = []
                detector = UniversalDetector()
                while not detector.done:
                    buffer = self.rawStream.read(self.numBytesChardet)
                    assert isinstance(buffer, bytes)
                    if not buffer:
                        break
                    buffers.append(buffer)
                    detector.feed(buffer)
                detector.close()
                encoding = lookupEncoding(detector.result['encoding'])
                self.rawStream.seek(0)
            except ImportError:
                pass
        # If all else fails use the default encoding
        if encoding is None:
            confidence = "tentative"
            encoding = lookupEncoding(self.defaultEncoding)

        return encoding, confidence
Example #5
def detect_encoding(file_path):
    with open(file_path, 'rb') as f:
        u = UniversalDetector()
        for line in f:
            u.feed(line)
        u.close()
        return u.result['encoding']
Example #6
def detect_encoding(f, verbose=False):
    """Detects a file's encoding.

    Args:
        f (obj): The file-like object to detect.
        verbose (Optional[bool]): Print the detection result (default: False).

    Returns:
        dict: The encoding result

    Examples:
        >>> filepath = p.join(DATA_DIR, 'test.csv')
        >>> with open(filepath, 'rb') as f:
        ...     result = detect_encoding(f)
        ...     result == {'confidence': 0.99, 'encoding': 'utf-8'}
        True
    """
    pos = f.tell()
    detector = UniversalDetector()

    for line in f:
        detector.feed(line)

        if detector.done:
            break

    detector.close()
    f.seek(pos)

    if verbose:
        print('result', detector.result)

    return detector.result
Example #7
def detect(view, file_name, cnt):
	#traceback.print_stack()
	print("detect...")
	if not file_name or not os.path.exists(file_name) or os.path.getsize(file_name) == 0:
		return
	encoding = encoding_cache.pop(file_name)
	
	if encoding:
		print("it is already at cache encoding_cache.json:",encoding)
		sublime.set_timeout(lambda: init_encoding_vars(view, encoding, detect_on_fail=True), 0)
		return
	sublime.set_timeout(lambda: view.set_status('origin_encoding', 'Detecting encoding, please wait...'), 0)
	detector = UniversalDetector()
	fp = open(file_name, 'rb')
	for line in fp:
		# cut MS-Windows CR code
		line = line.replace(b'\r',b'')
		detector.feed(line)
		#print(line)
		cnt -= 1
		if detector.done or cnt == 0:
			break
	fp.close()
	detector.close()
	encoding = detector.result['encoding']
	print(encoding)
	if encoding:
		encoding = encoding.upper()
	confidence = detector.result['confidence']
	print(confidence)
	sublime.set_timeout(lambda: check_encoding(view, encoding, confidence), 0)
Example #8
def get_coding(text, force_chardet=False):
    """
    Function to get the coding of a text.
    @param text text to inspect (string)
    @return coding string
    """
    if not force_chardet:
        for line in text.splitlines()[:2]:
            try:
                result = CODING_RE.search(to_text_string(line))
            except UnicodeDecodeError:
                # This could fail because to_text_string assume the text
                # is utf8-like and we don't know the encoding to give
                # it to to_text_string
                pass
            else:
                if result:
                    codec = result.group(1)
                    # sometimes we find a false encoding that can
                    # result in errors
                    if codec in CODECS:
                        return codec

    # Fallback using chardet
    if is_binary_string(text):
        detector = UniversalDetector()
        for line in text.splitlines()[:2]:
            detector.feed(line)
            if detector.done: break

        detector.close()
        return detector.result['encoding']

    return None
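CODING_RE, CODECS and the helper functions are not shown in this snippet. A rough, hypothetical reconstruction of the same two-stage idea uses the PEP 263 coding-cookie pattern before falling back to chardet:

import re
from chardet.universaldetector import UniversalDetector

# Hypothetical stand-in for CODING_RE: the PEP 263 coding cookie, e.g. "# -*- coding: utf-8 -*-"
CODING_COOKIE = re.compile(rb'coding[:=]\s*([-\w.]+)')

def guess_source_encoding(source_bytes):
    """Check the first two lines for a coding cookie, then ask chardet."""
    for line in source_bytes.splitlines()[:2]:
        match = CODING_COOKIE.search(line)
        if match:
            return match.group(1).decode('ascii')
    detector = UniversalDetector()
    detector.feed(source_bytes)
    detector.close()
    return detector.result['encoding']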
Example #9
def detect_local_charset(filepath):
    global VERBOSE
    # Open to read in binary.
    fp = open(filepath, "rb")
    detector = UniversalDetector()

    if VERBOSE:
        print "Reading file to detect encoding..."

    for line in fp:
        line = line.replace(b'\r',b'')
        detector.feed(line)
        if detector.done:
            break

    fp.close()
    detector.close()

    if VERBOSE:
        print "Encoding: %s" % detector.result["encoding"]
        print "Confidence: {0:.0f}% ".format(detector.result["confidence"]*100)

    if detector.result["confidence"] > 0.75:
        encoding = detector.result["encoding"]
        return encoding.replace('-','_').lower() # Format for codecs
    else:
        return None
Example #10
File: mime.py Project: Darshnik/dxr
def decode_data(data, encoding_guess, can_be_binary=True):
    """Given string data, return an (is_text, data) tuple, where data is
    returned as unicode if we think it's text and were able to determine an
    encoding for it.
    If can_be_binary is False, then skip the initial is_binary check.
    """
    if not (can_be_binary and is_binary_string(data[:1024])):
        try:
            # Try our default encoding.
            data = data.decode(encoding_guess)
            return True, data
        except UnicodeDecodeError:
            # Fall back to chardet - chardet is really slow, which is why we
            # don't just do chardet from the start.
            detector = UniversalDetector()
            for chunk in ichunks(80, data):
                detector.feed(chunk)
                if detector.done:
                    break
            detector.close()
            if detector.result['encoding']:
                try:
                    data = data.decode(detector.result['encoding'])
                    return True, data
                except (UnicodeDecodeError, LookupError):
                    # Either we couldn't decode or chardet gave us an encoding
                    # that python doesn't recognize (yes, it can do that).
                    pass  # Leave data as str.
    return False, data
Example #11
def detect_encoding(bytes, encoding=None):
    """Detect encoding of a byte stream.
    """
    # To reduce tabulator import time
    from chardet.universaldetector import UniversalDetector
    if encoding is not None:
        if encoding.lower() == 'utf-8':
            prefix = bytes.read(len(codecs.BOM_UTF8))
            if prefix == codecs.BOM_UTF8:
                encoding = 'utf-8-sig'
            bytes.seek(0)
        return encoding
    detector = UniversalDetector()
    num_lines = config.ENCODING_DETECTION_MAX_LINES
    while num_lines > 0:
        line = bytes.readline()
        detector.feed(line)
        if detector.done:
            break
        num_lines -= 1
    detector.close()
    bytes.seek(0)
    confidence = detector.result['confidence']
    encoding = detector.result['encoding']
    # Do not use if not confident
    if confidence < config.ENCODING_DETECTION_MIN_CONFIDENCE:
        encoding = config.DEFAULT_ENCODING
    # Default to utf-8 for safety
    if encoding == 'ascii':
        encoding = config.DEFAULT_ENCODING
    return encoding
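The only case above that bypasses the detector entirely is the UTF-8 BOM check. A self-contained sketch of just that check, with literal defaults in place of the config constants:

import codecs
from io import BytesIO

def utf8_variant(stream):
    """Return 'utf-8-sig' if the stream starts with a UTF-8 BOM, else 'utf-8'."""
    prefix = stream.read(len(codecs.BOM_UTF8))
    stream.seek(0)
    return 'utf-8-sig' if prefix == codecs.BOM_UTF8 else 'utf-8'

print(utf8_variant(BytesIO(codecs.BOM_UTF8 + b'hello')))  # utf-8-sig
print(utf8_variant(BytesIO(b'hello')))                    # utf-8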
Example #12
def guessWithChardet(content):
    u = UniversalDetector()
    for line in content:
        u.feed(line)
    u.close()
    result = u.result
    return result
Example #13
def detect_encoding(file):
    detector = UniversalDetector()
    with open(file, 'rb') as fp:
        for line in fp:
            detector.feed(line)
            if detector.done:
                break
    detector.close()
    return detector.result['encoding']
Example #14
def get_file_encoding(file_name):
    if not os.path.isfile(file_name):
        return ""
    u = UniversalDetector()
    with open(file_name, "rb") as f:
        for index, line in enumerate(f):
            u.feed(line)
            if index > 500:
                break
        u.close()
    if u.result["encoding"].lower() == "gb2312":
        try:
            _file = codecs.open(file_name, encoding="gb2312")
            _file.readlines()
            result = "gb2312"
        except Exception as e:
            print e
            try:
                _file = codecs.open(file_name, encoding="gbk")
                _file.readlines()
                result = "gbk"
            except Exception as e:
                print e
                result = "gb18030"
    else:
        result = u.result["encoding"]
    return result
Example #15
def detect(view, file_name, encoding):
	if not file_name or not os.path.exists(file_name):
		return
	if not encoding.endswith(' with BOM'):
		encoding = encoding_cache.pop(file_name)
	if encoding:
		sublime.set_timeout(lambda: init_encoding_vars(view, encoding, detect_on_fail=True), 0)
		return
	sublime.set_timeout(lambda: view.set_status('origin_encoding', 'Detecting encoding, please wait...'), 0)
	detector = UniversalDetector()
	cnt = SETTINGS['max_detect_lines']
	fp = open(file_name, 'rb')
	for line in fp:
		# cut MS-Windows CR code
		line = line.replace(b'\r',b'')
		detector.feed(line)
		cnt -= 1
		if detector.done or cnt == 0:
			break
	fp.close()
	detector.close()
	encoding = detector.result['encoding']
	if encoding:
		encoding = encoding.upper()
	confidence = detector.result['confidence']
	sublime.set_timeout(lambda: check_encoding(view, encoding, confidence), 0)
Example #16
def process_buffer(buf, d):
    if not buf:
        return
    header = buf[0]
    url = header.split()[1]
    skip = 0
    empty_lines = 0
    while empty_lines < 2:
        skip += 1
        if not buf[skip].strip():
            empty_lines += 1

    rawhtml = "".join(buf[skip + 1:])
    html = None
    try:
        html = rawhtml.decode("utf-8")
    except UnicodeDecodeError:
        try:
            detector = UniversalDetector()
            for line in buf[skip + 1:]:
                detector.feed(line)
                if detector.done:
                    break
            detector.close()
            encoding = detector.result["encoding"]
            html = rawhtml.decode(encoding)
        except (UnicodeDecodeError, LookupError, TypeError):
            html = rawhtml.decode("utf-8", errors='ignore')
    assert html is not None, "Error processing %s\n" % rawhtml
    html = html.replace(r"\r", "")
    d[url] = (header, html)
Example #17
def transferToEncoding(filename, toCode):
	if os.path.isdir(filename):
		print "error:not file"
		return False

	try:
		detector = UniversalDetector()
		f = open(filename, 'r')
		ls = f.readlines()
		f.close()

		# An empty file cannot be detected, so just bail out with a message
		if len(ls) == 0: 
			print printRed(filename), printRed(' is blank file, can not detect encoding')
			return False;

		# Detect the encoding
		for l in ls:
			detector.feed(l)
			if detector.done: break
		detector.close()
		
		encode = gb(detector.result['encoding'])
		if encode.lower() != toCode.lower():
			f = open(filename, 'w')
			print printGreen(filename) + ' ====> ' + toCode + ' SUCCESS'
			for l in ls:
				f.write(unicode(l, encode).encode(toCode))
			f.close()
		else:
			pass		
	except Exception, e:
		traceback.print_exc()
		print 'exception'
Example #18
def detect(view, file_name):
	if not os.path.exists(file_name):
		return
	encoding = encoding_cache.pop(file_name)
	if encoding:
		sublime.set_timeout(lambda: init_encoding_vars(view, encoding, detect_on_fail=True), 0)
		return
	sublime.set_timeout(lambda: view.set_status('origin_encoding', 'Detecting encoding, please wait...'), 0)
	detector = UniversalDetector()
	cnt = SETTINGS['max_detect_lines']
	fp = file(file_name, 'rb')
	for line in fp:
		detector.feed(line)
		cnt -= 1
		if detector.done or cnt == 0:
			break
	fp.close()
	detector.close()
	encoding = detector.result['encoding']
	confidence = detector.result['confidence']
	if not encoding or confidence < 0.7:
		sublime.set_timeout(lambda: view.set_status('origin_encoding', 'Encoding can not be detected, please choose one manually. (%s/%.2f)' % (encoding, confidence)), 0)
		return
	encoding = encoding.upper()
	if encoding == 'BIG5':
		encoding = 'BIG5-HKSCS'
	elif encoding == 'GB2312':
		encoding = 'GBK'
	sublime.set_timeout(lambda: init_encoding_vars(view, encoding), 0)
Example #19
def get_unicode_content(file_path, encoding=None):
    """
    Return a unicode string of the file's contents using the given encoding. If no encoding is given,
    chardet will be used to determine the encoding.
    Note that chardet detection may fail; if an error is thrown, a utf-8 encoding is assumed and
    unrecognized characters are discarded.
    """
    from chardet.universaldetector import UniversalDetector
    
    try:
        if not encoding:
            detector = UniversalDetector()
            contents = ''
            with open(file_path, 'rb') as f:
                contents = f.read()
                detector.feed(contents)
            detector.close()
            determined_encoding = detector.result['encoding']
            return contents.decode(encoding=determined_encoding)
        else:
            with open(file_path, 'r') as f:
                return unicode(f.read(), encoding=encoding, errors='ignore')
    except UnicodeError:
        with open(file_path, 'r') as f:
            return unicode(f.read(), encoding='utf-8', errors='ignore')
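Hypothetical usage of the helper above; 'notes.txt' is a placeholder path and the calls assume the function is importable from its module:

text = get_unicode_content('notes.txt')                      # let chardet pick the codec
latin = get_unicode_content('notes.txt', encoding='latin-1') # or force one explicitly
print(text[:80])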
Example #20
def detect(view, file_name, encoding):
    if not file_name or not os.path.exists(file_name):
        return
    if not encoding.endswith(" with BOM"):
        encoding = encoding_cache.pop(file_name)
    if encoding:
        sublime.set_timeout(lambda: init_encoding_vars(view, encoding, detect_on_fail=True), 0)
        return
    sublime.set_timeout(lambda: view.set_status("origin_encoding", "Detecting encoding, please wait..."), 0)
    detector = UniversalDetector()
    cnt = SETTINGS["max_detect_lines"]
    fp = open(file_name, "rb")
    for line in fp:
        # cut MS-Windows CR code
        line = line.replace(b"\r", b"")
        detector.feed(line)
        cnt -= 1
        if detector.done or cnt == 0:
            break
    fp.close()
    detector.close()
    encoding = detector.result["encoding"]
    if encoding:
        encoding = encoding.upper()
        if encoding == "BIG5":
            encoding = "BIG5-HKSCS"
        elif encoding == "GB2312":
            encoding = "GBK"
    confidence = detector.result["confidence"]
    sublime.set_timeout(lambda: check_encoding(view, encoding, confidence), 0)
Example #21
File: dyr.py Project: iacopy/dyr
def description_of(file_path, name='stdin', byte=1000000):
    """
    Return a string describing the probable encoding of a file.
    """
    from chardet.universaldetector import UniversalDetector
    file = open(file_path, 'rb')
    u = UniversalDetector()
    i = 0
    for line in file:
        l = len(line)
        if i + l > byte:
            bytoread = byte-i
            u.feed(line[:bytoread])
            break
        else:
            bytoread = l
            u.feed(line)
        i += bytoread
    file.close()
    u.close()
    result = u.result
    if result['encoding']:
        return '%s: %s with confidence %s' % (name,
                                              result['encoding'],
                                              result['confidence'])
    else:
        return '%s: no result' % name
Example #22
def sanitize_texts(directory):
    """
    Strip all header and copyright information from downloaded text files in the
    specified directory using gutenberg.strip_headers module and ensure proper
    file encodings.

    :param directory: <String> A string containing the full path to directory containing files to strip
    :return:
    """

    for item in os.listdir(directory):
        file_path = os.path.join(directory, item)
        if os.path.isfile(file_path):

            # Detect file encoding, takes time to run
            with open(file_path, 'rb') as inf:
                text = inf.readlines()
            detector = UniversalDetector()
            for line in text:
                detector.feed(line)
                if detector.done:
                    break
            detector.close()
            encoding = detector.result['encoding']

            # Open file, strip headers, and save result
            with open(file_path, 'r', encoding=encoding) as inf:
                text = inf.read()
            text = strip_headers(text).strip()
            os.remove(file_path)
            with open(file_path, 'w+', encoding=encoding) as outf:
                outf.write(text)
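Hypothetical usage, pointing the function at a directory of downloaded Gutenberg texts:

# '/tmp/gutenberg_texts' is a placeholder for the download directory.
sanitize_texts('/tmp/gutenberg_texts')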
Example #23
def safe_open(path, mode='r'):
    '''
    Retrieves a file's encoding and returns the opened file. If the opened file
    begins with a BOM, it is read before the file object is returned. This
    allows callers to not have to handle BOMs of files.

    :param str path: file path to open
    :param str mode: the mode to open the file (see :func:`open`)
    :returns file: the opened file object
    '''
    u = UniversalDetector()
    first = None
    with open(path, 'rb') as fp:
        bin = first = fp.read(0x1000)

        while not u.done and bin:
            u.feed(bin)
            if not u.done:
                bin = fp.read(0x1000)
    u.close()

    if not first:
        return open(path, mode)

    fp = codecs.open(path, mode, encoding=u.result['encoding'])
    for bom in (codecs.BOM_UTF32_BE, codecs.BOM_UTF32_LE, codecs.BOM_UTF8,
                codecs.BOM_UTF16_BE, codecs.BOM_UTF16_LE):
        if first.startswith(bom):
            fp.seek(len(bom))
            break

    return fp
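A quick round-trip check of the BOM handling, assuming write access to the working directory (file name and contents are illustrative):

import codecs

with open('bom_example.txt', 'wb') as out:
    out.write(codecs.BOM_UTF8 + b'hello world')

with safe_open('bom_example.txt') as fp:
    print(fp.read())  # 'hello world' -- the BOM has already been skipped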
Example #24
    def _guessEncoding(self, path):
        """Opens a file from the given `path` and checks the file encoding.

        The file must exist on the file system and end with the extension
        `.csv`. The file is read line by line until the encoding can be
        guessed.
        On a successful identification, the widgets of this dialog will be
        updated.

        Args:
            path (string): Path to a csv file on the file system.

        """
        if os.path.exists(path) and path.lower().endswith('csv'):
            encodingDetector = UniversalDetector()
            with open(path, 'r') as fp:
                for line in fp:
                    encodingDetector.feed(line)
                    if encodingDetector.done:
                        break
            encodingDetector.close()
            result = encodingDetector.result['encoding']
            result = result.replace('-','_')

            self._encodingKey = _calculateEncodingKey(result)
            if self._encodingKey:
                index = self._encodingComboBox.findText(result.upper())
                self._encodingComboBox.setCurrentIndex(index)
Example #25
def listTextBasedFiles(file):
    try:
        # Detect MIME type for file
        # https://github.com/kaito834/myNotes/blob/master/snippets/python/magic_from_file.py
        # https://github.com/ahupp/python-magic#usage
        f_mimetype = magic.from_file(file, mime=True)
    except Exception as e:
        print("[!] Exception: {0} ({1})".format(e, type(e)))

    # Open and count lines if MIME type of the file is text/*
    if f_mimetype.split('/')[0] == 'text':
        # Detect encoding by chardet.universaldetector.UniversalDetector()
        # https://chardet.readthedocs.io/en/latest/usage.html#advanced-usage
        detector = UniversalDetector()
        with open(file, 'rb') as f:
            for line in f.readlines():
                detector.feed(line)
                if detector.done:
                    break
        detector.close()

        with open(file, "r", encoding=detector.result['encoding']) as f:
            line_count = 0
            for line in f.readlines():
                line_count += 1
            print("{0}: {1}, {2}, {3} lines".format(file, f_mimetype, detector.result['encoding'], line_count))
    else:
        print("{0}: NOT txet based file (reason: MIME type isn't text/*: {1})".format(file, f_mimetype))
Example #26
def description_of(lines, name='stdin'):
    """
    Return a string describing the probable encoding of a file or
    list of strings.

    :param lines: The lines to get the encoding of.
    :type lines: Iterable of bytes
    :param name: Name of file or collection of lines
    :type name: str
    """
    u = UniversalDetector()
    for line in lines:
        line = bytearray(line)
        u.feed(line)
        # shortcut out of the loop to save reading further - particularly useful if we read a BOM.
        if u.done:
            break
    u.close()
    result = u.result
    if PY2:
        name = name.decode(sys.getfilesystemencoding(), 'ignore')
    if result['encoding']:
        return '{0}: {1} with confidence {2}'.format(name, result['encoding'],
                                                     result['confidence'])
    else:
        return '{0}: no result'.format(name)
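Hypothetical usage, feeding the raw lines of a file on disk (the snippet comes from chardet's command-line tool, so PY2 is assumed to be defined in that module):

with open('example.txt', 'rb') as fp:
    print(description_of(fp, name='example.txt'))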
Example #27
def validate_csv(document):
    """Return dialect information about given csv file."""
    with open(document.document.uri, 'rU') as csvfile:
        is_valid = False
        try:
            dialect = csv.Sniffer().sniff(csvfile.read(1024))
        except Exception as e:
            current_app.logger.debug(
                'File %s is not valid CSV: %s' % (document.get_filename(), e))
            return {
                'delimiter': '',
                'encoding': '',
                'is_valid': is_valid
            }
        universal_detector = UniversalDetector()
        dialect.strict = True
        csvfile.seek(0)
        reader = csv.reader(csvfile, dialect)
        try:
            for row in reader:
                universal_detector.feed(
                    dialect.delimiter.join(row).encode('utf-8'))
            is_valid = True
        except csv.Error as e:
            current_app.logger.debug(
                'File %s is not valid CSV: %s' % (document.get_filename(), e))
        finally:
            universal_detector.close()
    return {
        'delimiter': dialect.delimiter,
        'encoding': universal_detector.result['encoding'],
        'is_valid': is_valid
    }
Example #28
def decode(string):
    """ detects string encoding and returns decoded string"""
    u = UniversalDetector()
    u.feed(string)
    u.close()
    result = u.result
    return string.decode(result['encoding'])
Example #29
    def deserialize(file_bytes):
        try:
            file_string = file_bytes.decode('utf-8')
        except UnicodeDecodeError as ude:
            detector = UniversalDetector()
            for line in BytesIO(file_bytes):
                detector.feed(line)
                if detector.done:
                    break
            detector.close()
            if detector.result['confidence'] < 0.5:
                raise ValueError("Failed to guess the encoding of the file (it's not utf-8). Use utf-8 encoded files.")
            try:
                file_string = file_bytes.decode(detector.result['encoding'])
            except UnicodeDecodeError:
                raise ValueError("Failed to guess the encoding of the file (it's not utf-8). Use utf-8 encoded files. "
                                 "(The invalid character is '{char:#x}' at {pos})".format(pos=ude.start,
                                                                                          char=file_bytes[ude.start]))
        csv_lines = file_string.splitlines()
        first_line = csv_lines[:1]
        first_row_tab = next(csv.reader(first_line, delimiter="\t"))
        first_row_semicolon = next(csv.reader(first_line, delimiter=";"))
        if len(first_row_tab) > 1:
            rows = csv.reader(csv_lines, delimiter="\t")
        elif len(first_row_semicolon) > 1:
            rows = csv.reader(csv_lines, delimiter=";")
        else:
            raise ValueError("Csv file is not delimited by ';' or 'tab'")

        return rows
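A quick sanity check of the delimiter sniffing, assuming the csv and io imports used by the method are in scope and that deserialize is exposed as a staticmethod:

rows = deserialize(u'a;b;c\n1;2;3\n'.encode('utf-8'))
print(list(rows))  # [['a', 'b', 'c'], ['1', '2', '3']]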
Example #30
def get_csv_reader(filename, charset=None):
    logger.info("Reading CSV file %s", filename)

    myfile = open(filename, "rb")

    if not charset:
        # Detect encoding
        detector = UniversalDetector()
        for line in myfile.xreadlines():
            detector.feed(line)

            if detector.result["confidence"] > 0.01:
                logger.debug("Result so far: %s", detector.result)

            if detector.done:
                break
        detector.close()
        charset = detector.result["encoding"]

        logger.info("Found encoding %s", charset)

        # Reset the file index
        myfile.seek(0)

    # Attempt to detect the dialect
    encodedfile = codecs.EncodedFile(myfile, charset)
    dialect = csv.Sniffer().sniff(encodedfile.read(1024))

    logger.info("Found dialect %s", dialect)

    # Reset the file index
    myfile.seek(0)

    return UnicodeReader(myfile, dialect=dialect, encoding=charset)
Example #31
class FileOpener(object):
    def __init__(self, use_chardet, quiet_level):
        self.use_chardet = use_chardet
        if use_chardet:
            self.init_chardet()
        self.quiet_level = quiet_level

    def init_chardet(self):
        try:
            from chardet.universaldetector import UniversalDetector
        except ImportError:
            raise ImportError("There's no chardet installed to import from. "
                              "Please, install it and check your PYTHONPATH "
                              "environment variable")

        self.encdetector = UniversalDetector()

    def open(self, filename):
        if self.use_chardet:
            return self.open_with_chardet(filename)
        else:
            return self.open_with_internal(filename)

    def open_with_chardet(self, filename):
        self.encdetector.reset()
        with codecs.open(filename, 'rb') as f:
            for line in f:
                self.encdetector.feed(line)
                if self.encdetector.done:
                    break
        self.encdetector.close()
        encoding = self.encdetector.result['encoding']

        try:
            f = codecs.open(filename, 'r', encoding=encoding)
        except UnicodeDecodeError:
            print("ERROR: Could not detect encoding: %s" % filename,
                  file=sys.stderr)
            raise
        except LookupError:
            print("ERROR: Don't know how to handle encoding %s: %s"
                  % (encoding, filename,), file=sys.stderr)
            raise
        else:
            lines = f.readlines()
            f.close()

        return lines, encoding

    def open_with_internal(self, filename):
        curr = 0
        while True:
            try:
                f = codecs.open(filename, 'r', encoding=encodings[curr])
            except UnicodeDecodeError:
                if not self.quiet_level & QuietLevels.ENCODING:
                    print("WARNING: Decoding file using encoding=%s failed: %s"
                          % (encodings[curr], filename,), file=sys.stderr)
                    try:
                        print("WARNING: Trying next encoding %s"
                              % encodings[curr + 1], file=sys.stderr)
                    except IndexError:
                        pass

                curr += 1
            else:
                lines = f.readlines()
                f.close()
                break
        if not lines:
            raise Exception('Unknown encoding')

        encoding = encodings[curr]

        return lines, encoding
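Hypothetical usage of the class above (the chardet path needs only the import shown; the internal fallback additionally relies on module-level encodings and QuietLevels definitions that are not part of this snippet):

opener = FileOpener(use_chardet=True, quiet_level=0)
lines, encoding = opener.open('example.txt')  # 'example.txt' is a placeholder path
print(encoding, len(lines))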
Example #32
import sys, glob

sys.path.insert(0, '..')
from chardet.universaldetector import UniversalDetector

count = 0
u = UniversalDetector()
for f in glob.glob(sys.argv[1]):
    print f.ljust(60),
    u.reset()
    for line in file(f, 'rb'):
        u.feed(line)
        if u.done: break
    u.close()
    result = u.result
    if result['encoding']:
        print result['encoding'], 'with confidence', result['confidence']
    else:
        print '******** no result'
    count += 1
print count, 'tests'
Example #33
from chardet.universaldetector import UniversalDetector

detector = UniversalDetector()

with open("tmp.txt", mode='rb') as f:
    for b in f:
        detector.feed(b)
        if detector.done:
            break

detector.close()
print(detector.result)
Example #34
def read_into_dataframe(file, filename=None, nrows=100, max_characters=50):
    """
    Reads a file into a DataFrame.
    Infers the file encoding and whether a header column exists
    The file can be in any format (.csv, .txt, .zip, .gif, ...).
    If it cannot be parsed as CSV, an exception (pandas.errors.EmptyDataError) is raised.
    A one-column .csv triggers the csv.Error branch of the try/except below.

    Parameters
    ----------
    file : IO
        File buffer.
    filename : str
        Filename. Used to infer compression. Default to None.
    nrows : int
        Number of rows to peek. Default to 100.
    max_characters : int
        Max characters a column name can have to be distinguished from a real text value. Default to 50.

    Returns
    -------
    pd.DataFrame
        The dataframe content.

    Raises
    ------
    pandas.errors.EmptyDataError

    Notes
    -----
    If no filename is given, a hex uuid will be used as the file name.
    """

    detector = UniversalDetector()
    for line, text in enumerate(file):
        detector.feed(text)
        if detector.done or line > nrows:
            break
    detector.close()
    encoding = detector.result.get("encoding")

    if filename is None:
        filename = uuid4().hex

    compression = infer_compression(filename, "infer")

    file.seek(0, SEEK_SET)

    pdread = TextIOWrapper(file, encoding=encoding)

    try:
        # check if the file has header.
        sniffer = csv.Sniffer()
        pdread.seek(0, SEEK_SET)
        pdreadline = pdread.readline()
        pdreadline += pdread.readline()
        has_header = sniffer.has_header(pdreadline)
        sep = None

    except csv.Error:
        sep = ","
        has_header = True

    # Prefix and header
    header = "infer" if has_header else None
    prefix = None if header else "col"

    pdread.seek(0, SEEK_SET)
    df = pd.read_csv(
        pdread,
        encoding=encoding,
        compression=compression,
        sep=sep,
        engine="python",
        header=header,
        nrows=nrows,
        prefix=prefix,
    )
    return df
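A minimal in-memory usage sketch, assuming the imports the function relies on (pandas, chardet's UniversalDetector, io, uuid) are in scope and a pandas version that still accepts the prefix keyword:

from io import BytesIO

sample = BytesIO(b'name,value\nalpha,1\nbeta,2\n')
df = read_into_dataframe(sample, filename='sample.csv')
print(df.head())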
Example #35
def parse_dat_file(dat_path, spec_csv_path, out_folder):
    """Parse a .DAT file (CSPro fixed-width text datafile) into a series of CSV files 
    containing the tabular data for each table contained in the .DAT and described in the 
    associated .DCD file. 
    
    Developed for use in particular with DAT files provided in the "hierarchical data"
    from DHS, but may be more generally applicable to CSPro format files. The .DCF file 
    must be parsed first, using DCF_Parser, and the table specification file it 
    generates is used by this function to parse the data file.
    
    Produces one CSV data file for every table (recordtype) defined in the .DCF and occurring in 
    the .DAT. """
    filecode = os.path.extsep.join(os.path.basename(dat_path).split(os.path.extsep)[:-1])

    # See if we've already done this one
    test_fn = os.path.join(out_folder, f"{filecode}.REC01.csv")
    if os.path.exists(test_fn):
        print("Already parsed " + filecode)
        return
    print("Parsing "+dat_path)

    # read the parsed file specification in CSV form which was created by parsing the .dcf file
    # The first row specifies where, on all subsequent rows, the "record type" is found i.e. the identifier
    # that specifies which table the row defines a variable for. This is constant throughout the file.
    # Each remaining item in the parsed DCF spec defines one field from one table, specifying what position that
    # field's data is found in the fixed-width text format row when the row's record_type_info
    # (destination table name) is for this table
    with open(spec_csv_path, 'r') as dict_file:
        dict_file_reader = csv.DictReader(dict_file)
        # the record type position info must be in the first line
        recordtype_info = next(dict_file_reader)
        rt_start = int(recordtype_info['Start']) - 1
        rt_end = int(recordtype_info['Len']) + rt_start
        all_vars_this_file = [row for row in dict_file_reader]
    for field_info in all_vars_this_file:
        field_info['Start'] = int(field_info['Start'])
        field_info['Len'] = int(field_info['Len'])
    # sort them by record type (i.e. destination table) then position in the row (order of fields)
    sorted_fields = sorted(all_vars_this_file, key=(itemgetter('RecordTypeValue', 'Start')))

    # build a dictionary of record type (i.e. tablename) : list of its fields (i.e. field infos)
    rt_field_info = {}
    for field_info in sorted_fields:
        record_tag = field_info['RecordTypeValue']
        if record_tag not in rt_field_info:
            rt_field_info[record_tag] = []
        rt_field_info[record_tag].append(field_info)

    # now parse the data file
    result = {}
    n_cols_per_table = {}

    detector = UniversalDetector()
    with open(dat_path, 'rb') as f:
        for line in f:
            detector.feed(line)
            if detector.done: break
        detector.close()
        enc = detector.result['encoding']

    with open(dat_path, 'r', encoding=enc) as data:
        for i, line in enumerate(data):
            #if i == 0 and line.startswith(codecs.BOM_UTF8):
            #    print(f"File {dat_path} appears to contain BOM; ignoring it")
            #    line = line[len(codecs.BOM_UTF8):]
            record_type = line[rt_start:rt_end]
            if record_type not in rt_field_info:
                print("Specification for recordtype '{0!s}' not found in file for {1!s} at line {2!s}".format(
                    record_type, filecode, i))
                continue
            record_spec = rt_field_info[record_type]
            if record_type not in result:
                result[record_type] = []

            # split the column-aligned text according to the row specification

            # The .DAT format allows a fixed width for each column of each recordtype.
            # Should we strip the whitespace on shorter values? This is difficult.
            # In general, yes we should, because values are stored as fixed-width and where 
            # shorter than the field, are padded with spaces, which would take up unnecessary space 
            # and would prevent joining/comparison between surveys. 
            # HOWEVER in the case of the CASEID / HHID variables we must NOT strip the whitespace. 
            # The HHID is usually the CASEID with the last 3 chars trimmed off, but if we
            # trim "some" whitespace from HHID here then we can break that association and
            # damage referential integrity.
            # On the other hand some joins are based on e.g. BIDX (recorded as len 2)
            # to MIDX (recorded as len 1, despite containing the same data), and we need
            # to join on a single digit found in both so BIDX would need to be stripped.

            # Define a lambda to strip or not strip accordingly, and use it in a list comp to
            # split the row into its field values
            strip_or_not = lambda data, name: data if name in ('CASEID', 'HHID') else data.strip()
            rowParts = [strip_or_not(
                (line[i['Start'] - 1: i['Start'] + i['Len'] - 1]),
                i['Name'])
                for i in record_spec]

            if record_type not in n_cols_per_table:
                n_cols_per_table[record_type] = len(rowParts)
            else:
                assert len(rowParts) == n_cols_per_table[record_type]
            # add as a list to the list of rows for this record type
            result[record_type].append(rowParts)  # (",".join(rowParts))

    for record_type, field_infos in rt_field_info.items():
        if not record_type in result:
            print(f"No rows were found for record type {record_type} in file {filecode} despite DCF specification")
            continue
        field_header = [i['Name'] for i in field_infos]
        field_records = set([i['RecordName'] for i in field_infos])
        assert len(field_records) == 1
        rec_name = field_records.pop()
        if not os.path.exists(out_folder):
            os.makedirs(out_folder)
        out_fn = os.path.join(out_folder, f"{filecode}.{rec_name}.csv")
        with open(out_fn, 'w', newline='', encoding='utf-8') as out_csv:
            csv_writer = csv.writer(out_csv)
            csv_writer.writerow(field_header)
            csv_writer.writerows(result[record_type])
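Hypothetical invocation, with placeholder paths standing in for a DHS hierarchical .DAT file, the specification CSV produced by the DCF parser, and an output folder:

parse_dat_file('data/SURVEY.DAT', 'data/SURVEY_spec.csv', 'parsed_csv')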
Example #36
class autoSettings(object):
    def __init__(self):
        # possible row separators (bytes, because the row separator is found in a byte string)
        self.row_separators = [b'\r\n', b'\n\r', b'\r', b'\n']
        # possible column separators
        self.column_separators = pd.Series([' ', ';', ',', '\t', '.', '\s+'])
        # detector used to determine the file encoding
        self.detector = UniversalDetector()
        # column separator
        self.column_sep = ''
        # row separator
        self.row_sep = ''
        # decimal separator
        self.decimal_sep = ''
        # file encoding
        self.code_standart = ''
        # number of junk lines after the header
        self.rubbish_lines_afterHead = 0
        # number of header lines
        self.head_lines = 0
        # number of junk lines before the header
        self.rubbish_lines_toHead = 0
        # number of meaningful data lines
        self.meaning_data_lines = 0

    # find the first separator in the file
    def __separator(self, byteString):
        for sep in self.row_separators:
            if (sep in byteString) == True:
                return sep
        return None

    # check whether any separators remain in the data after splitting into rows with the given separator
    def __separatorChecking(self, separator, byteString):
        splittedS = byteString.split(sep=separator)
        for s in splittedS:
            if self.__separator(s) is not None:
                return False
        return True

    # determine the row separator
    def __searchRowSeparator(self, string):
        Sep = self.__separator(string)
        if self.__separatorChecking(Sep, string) == True:
            return Sep
        else:
            if b'\r\n' in string and b'\n\r' in string:
                Sep = b'\n\r'
                if self.__separatorChecking(Sep, string) == True:
                    return Sep
            # the string contains newline characters (either \n or \r)
            _, Sep = max(
                list(
                    zip(map(string.count, self.row_separators),
                        self.row_separators)))
            return Sep

    # split the data into rows
    def __splitToRows(self, string):
        for sep in self.row_separators:
            string = string.replace(sep, b'\n')
        return string.split(sep=b'\n')

    # determine the encoding
    def __codeStandart(self, byte_rows):
        # encoding detection
        for line in byte_rows:
            self.detector.feed(line)
            if self.detector.done == True:
                break
        self.detector.close()
        code = self.detector.result['encoding']
        return code

    # determine whether the given string is a row of numeric data
    def __isStringOfNumbers(self, string, column_separator):
        meaning_data_line = [
            ch for ch in re.split(column_separator, string) if ch != ''
        ]
        for number in meaning_data_line:
            try:
                float(number)
            except ValueError:
                try:
                    float(
                        number.replace(',', '').replace(':', '').replace(
                            ' ', '').replace('.', '').replace('-', ''))
                except ValueError:
                    return False
        return True

    # split a row into columns
    def __splitToColumns(self, string):
        # drop the empty strings that appear when several separators occur in a row
        columns = [ch for ch in re.split(self.column_sep, string) if ch != '']
        if string != '' and re.split(
                self.column_sep,
                string).count('') != 0 and self.column_sep in [' ', '\t']:
            self.column_sep = '\s+'
        return columns

    # split a row into columns using the given separator
    def __splitToColumns_specSep(self, string, column_separator):
        # drop the empty strings that appear when several separators occur in a row
        columns = [ch for ch in re.split(column_separator, string) if ch != '']
        return columns

    # determine the number of junk lines at the end of the file
    # the function works on a reversed list of rows
    def __rubbish_afterData(self, rows_of_data_reverse):
        # count the junk lines that follow the meaningful data
        rubbishRows_afterMeaningData = -1
        number_of_columns = -1
        for line in rows_of_data_reverse:
            # number of columns in the row
            count = len(self.__splitToColumns(line))
            if count != 1 and count == number_of_columns and self.__isStringOfNumbers(
                    line, self.column_sep) == True:
                return rubbishRows_afterMeaningData, number_of_columns
            else:
                rubbishRows_afterMeaningData = rubbishRows_afterMeaningData + 1
                number_of_columns = count
        # number of junk lines after the meaningful data
        return rubbishRows_afterMeaningData, number_of_columns

    # determine whether the string contains letters
    def __haveStringLettersOrDigits(self, string):
        for cell in self.__splitToColumns(string):
            if re.search(r'[^\W_]', string) is None:
                return False
            else:
                continue
        return True

    # determine the number of junk lines after the header
    # the function works on a reversed list of rows
    def __rubbish_afterHead(self, rows_of_data_reverse, number_of_columns):
        rubbishRows_afterHead = 0
        # find where the rows of meaningful data begin
        for i, line in enumerate(rows_of_data_reverse):
            # number of columns in the row
            count = len(self.__splitToColumns(line))
            if count == number_of_columns and self.__isStringOfNumbers(
                    line, self.column_sep) == True:
                continue
            else:
                rows_of_data_reverse = rows_of_data_reverse[i:]
                break
        else:
            rows_of_data_reverse = []

        for line in rows_of_data_reverse:
            # number of columns in the row
            count = len(self.__splitToColumns(line))
            if count == number_of_columns and self.__haveStringLettersOrDigits(
                    line) == True:
                return rubbishRows_afterHead, rows_of_data_reverse
            else:
                rubbishRows_afterHead = rubbishRows_afterHead + 1
        # number of junk lines after the header
        return rubbishRows_afterHead, rows_of_data_reverse

    # determine the number of header lines
    # the function works on a reversed list of rows
    def __headRows(self, rows_of_data_reverse, number_of_columns):
        # count the header lines
        headRows = 0
        for line in rows_of_data_reverse:
            count = len(self.__splitToColumns(line))
            if count == number_of_columns and self.__haveStringLettersOrDigits(
                    line) == True:
                headRows = headRows + 1
            else:
                return headRows
        return headRows

    # # determine the number of junk lines before the header
    # def __rubbish_toHead(self, rows_of_data_reverse):
    #     return len(rows_of_data_reverse)

    # determine the decimal separator
    def __decimalSeparator(self, numbers):
        for number in numbers:
            try:
                float(number)
                continue
            except ValueError:
                try:
                    float(number.replace(',', '.'))
                except ValueError:
                    continue
                else:
                    return ','
        return '.'

    # determine the column separator
    # the search starts from the last row
    def __searchColumnSeparator(self, rows_of_data):
        for i, line in enumerate(rows_of_data):
            # find where the rows of meaningful data begin (skip junk rows)
            # if the row contains letters or any characters other than those listed in the brackets, move on to the next row
            if re.search(r'[^\d\t- :;.,e]', line) is None and line != '':
                rows_of_data = rows_of_data[i:]
                break
            else:
                continue

        columns_sep = ' '
        columns_count = -1
        # start searching for the column separator from the last row
        column_separators = self.column_separators.copy()
        for line in rows_of_data:
            if columns_count == len(
                    self.__splitToColumns_specSep(line, columns_sep)):
                break
            else:
                l = zip(
                    list(column_separators.keys()),
                    list(
                        map(
                            lambda x: len(
                                self.__splitToColumns_specSep(line, x)),
                            column_separators)))
                for i, count in l:
                    if count <= 1 or self.__isStringOfNumbers(
                            line, column_separators[i]) == False:
                        del column_separators[i]
                    else:
                        if (columns_sep == ' '
                                or columns_sep == '\t') and re.search(
                                    r'(\s\s)', line) is not None:
                            columns_sep = r'\s+'
                            columns_count = len(
                                self.__splitToColumns_specSep(
                                    line, columns_sep))
                        else:
                            columns_sep = self.column_separators[i]
                            columns_count = count
                        break
        return columns_sep

    def get_auto_settings(self, filename):
        # open the file in binary mode
        with open(filename, 'rb') as dataBytes:
            s = dataBytes.read()

            dataRows_begin = s[:10000]
            dataRows_end = s[-10000:]

            # determine the row separator
            self.row_sep = self.__searchRowSeparator(dataRows_begin)
            # split into rows
            dataRows_begin = self.__splitToRows(dataRows_begin)
            dataRows_end = self.__splitToRows(dataRows_end)

            # determine the file encoding
            self.code_standart = self.__codeStandart(dataRows_begin +
                                                     dataRows_end)

            # decode the data
            dataRows_begin = [
                x.decode(self.code_standart) for x in dataRows_begin
            ]
            dataRows_end = [x.decode(self.code_standart) for x in dataRows_end]

            # strip extra whitespace from the start and end of each row
            # drop the last and the first row respectively, since they may be incomplete (we read N characters,
            # not a whole number of rows)
            dataRows_begin = (list(map(lambda x: x.strip(),
                                       dataRows_begin)))[:-1]
            dataRows_end = (list(map(lambda x: x.strip(), dataRows_end)))[1:]

            # determine the column separator
            # search from the end of the file!
            dataRows_end_reversed = dataRows_end.copy()
            dataRows_end_reversed.reverse()
            self.column_sep = self.__searchColumnSeparator(
                dataRows_end_reversed)

            # number of junk lines after the data
            # also from the end of the file!
            rubbish_lines_afterData, number_of_columns = self.__rubbish_afterData(
                dataRows_end_reversed)
            # number of junk lines after the header
            # working with the reversed file!
            dataRows_begin_reversed = dataRows_begin.copy()
            dataRows_begin_reversed.reverse()
            self.rubbish_lines_afterHead, dataRows_begin_reversed = self.__rubbish_afterHead(
                dataRows_begin_reversed, number_of_columns)

            # number of header lines
            # working with the reversed file!
            self.head_lines = self.__headRows(
                dataRows_begin_reversed[self.rubbish_lines_afterHead:],
                number_of_columns)

            # number of junk lines before the header
            # working with the reversed file!
            self.rubbish_lines_toHead = len(
                dataRows_begin_reversed[self.head_lines +
                                        self.rubbish_lines_afterHead:])

            # number of rows of meaningful data
            self.meaning_data_lines = len(
                self.__splitToRows(s)
            ) - self.rubbish_lines_toHead - self.rubbish_lines_afterHead - rubbish_lines_afterData - self.head_lines

            # decimal separator
            # the function works on a row that contains meaningful numeric data
            decimal_sep = self.__decimalSeparator(
                self.__splitToColumns(
                    dataRows_end_reversed[rubbish_lines_afterData]))

        return dict(
            column_separator=self.column_sep,
            row_separator=self.row_sep.decode(self.code_standart),
            decimal_separator=decimal_sep,
            code_standart=self.code_standart,
            number_of_rows_with_rubbish_toHead=self.rubbish_lines_toHead,
            number_of_head_lines=self.head_lines,
            number_of_rows_with_rubbish_afterHead=self.rubbish_lines_afterHead,
            number_of_rows_with_meaningful_data=self.meaning_data_lines)

    # returns True if everything is fine, and False if the file is malformed
    def check_settings(self, filename, settings):
        column_sep = settings['column_separator']
        row_sep = settings['row_separator']
        decimal_sep = settings['decimal_separator']
        code_std = settings['code_standart']
        rubbish_toHead = settings['number_of_rows_with_rubbish_toHead']
        rubbish_afterHead = settings['number_of_rows_with_rubbish_afterHead']
        head = settings['number_of_head_lines']
        meaning_data = settings['number_of_rows_with_meaningful_data']

        if column_sep == '\s+':
            engine = 'python'
        else:
            engine = 'c'

        try:
            header = pd.read_csv(
                filename,
                sep=column_sep,
                engine=engine,
                decimal=decimal_sep,  #lineterminator=row_sep,
                warn_bad_lines=True,
                header=None,
                skiprows=rubbish_toHead,
                nrows=head,
                encoding=code_std)

            data = pd.read_csv(
                filename,
                sep=column_sep,
                header=list(range(head)),
                engine=engine,
                decimal=decimal_sep,  #lineterminator=row_sep,
                warn_bad_lines=True,
                skip_blank_lines=False,
                skiprows=list(range(rubbish_toHead)) + list(
                    range(rubbish_toHead + head,
                          rubbish_toHead + head + rubbish_afterHead)),
                #list(range(rubbish_toHead+head+rubbish_afterHead)),
                nrows=meaning_data + head,
                encoding=code_std)

        except Exception:
            print('Exception')
            return False

        if (None not in data.values or head == 0) and len(data.columns) > 1:
            return True
        else:
            return False

    def get_data(self, filename):
        settings = self.get_auto_settings(filename)
        if self.check_settings(filename, settings) == False:
            return None
        else:
            column_sep = settings['column_separator']
            row_sep = settings['row_separator']
            decimal_sep = settings['decimal_separator']
            code_std = settings['code_standart']
            rubbish_toHead = settings['number_of_rows_with_rubbish_toHead']
            rubbish_afterHead = settings[
                'number_of_rows_with_rubbish_afterHead']
            head = settings['number_of_head_lines']
            meaning_data = settings['number_of_rows_with_meaningful_data']

            if column_sep == '\s+':
                engine = 'python'
            else:
                engine = 'c'

            header = pd.read_csv(
                filename,
                sep=column_sep,
                engine=engine,
                decimal=decimal_sep,
                # lineterminator=row_sep,
                warn_bad_lines=True,
                header=None,
                skiprows=rubbish_toHead,
                nrows=head,
                encoding=code_std)

            data = pd.read_csv(
                filename,
                sep=column_sep,
                header=None,
                engine=engine,
                decimal=decimal_sep,  # lineterminator=row_sep,
                warn_bad_lines=True,
                skip_blank_lines=False,
                skiprows=rubbish_toHead + head + rubbish_afterHead,
                nrows=meaning_data,
                encoding=code_std)
            return header.values, data.values
Example #37
0
    def determineEncoding(self, chardet=True):
        # BOMs take precedence over everything
        # This will also read past the BOM if present
        charEncoding = self.detectBOM(), "certain"
        if charEncoding[0] is not None:
            return charEncoding

        # If we've been overridden, we've been overridden
        charEncoding = lookupEncoding(self.override_encoding), "certain"
        if charEncoding[0] is not None:
            return charEncoding

        # Now check the transport layer
        charEncoding = lookupEncoding(self.transport_encoding), "certain"
        if charEncoding[0] is not None:
            return charEncoding

        # Look for meta elements with encoding information
        charEncoding = self.detectEncodingMeta(), "tentative"
        if charEncoding[0] is not None:
            return charEncoding

        # Parent document encoding
        charEncoding = lookupEncoding(
            self.same_origin_parent_encoding), "tentative"
        if charEncoding[0] is not None and not charEncoding[0].name.startswith(
                "utf-16"):
            return charEncoding

        # "likely" encoding
        charEncoding = lookupEncoding(self.likely_encoding), "tentative"
        if charEncoding[0] is not None:
            return charEncoding

        # Guess with chardet, if available
        if chardet:
            try:
                from chardet.universaldetector import UniversalDetector
            except ImportError:
                pass
            else:
                buffers = []
                detector = UniversalDetector()
                while not detector.done:
                    buffer = self.rawStream.read(self.numBytesChardet)
                    assert isinstance(buffer, bytes)
                    if not buffer:
                        break
                    buffers.append(buffer)
                    detector.feed(buffer)
                detector.close()
                encoding = lookupEncoding(detector.result['encoding'])
                self.rawStream.seek(0)
                if encoding is not None:
                    return encoding, "tentative"

        # Try the default encoding
        charEncoding = lookupEncoding(self.default_encoding), "tentative"
        if charEncoding[0] is not None:
            return charEncoding

        # Fallback to html5lib's default if even that hasn't worked
        return lookupEncoding("windows-1252"), "tentative"
Example #38
0
def getEncodingByContent(content):
    detector = UniversalDetector()
    detector.feed(content)
    detector.close()
    return detector.result["encoding"]
Example #39
0
def build_schema(infile, outfile, delimiter=None, quotechar='\"', encoding=None, dataset_name=None, base="https://iisg.amsterdam/"):
    """
    Build a CSVW schema based on the ``infile`` CSV file, and write the resulting JSON CSVW schema to ``outfile``.

    Takes various optional parameters for instructing the CSV reader, but is also quite good at guessing the right values.
    """

    url = os.path.basename(infile)
    # Get the current date and time (UTC)
    today = datetime.datetime.utcnow().strftime("%Y-%m-%d")

    if dataset_name is None:
        dataset_name = url

    if encoding is None:
        detector = UniversalDetector()
        with open(infile, 'rb') as f:
            for line in f.readlines():
                detector.feed(line)
                if detector.done:
                    break
        detector.close()
        encoding = detector.result['encoding']
        logger.info("Detected encoding: {} ({} confidence)".format(detector.result['encoding'],
                                                                   detector.result['confidence']))

    if delimiter is None:
        try: #Python 3
            with open(infile, 'r', errors='ignore') as csvfile:
                # dialect = csv.Sniffer().sniff(csvfile.read(1024), delimiters=";,$\t")
                dialect = csv.Sniffer().sniff(csvfile.readline()) #read only the header instead of the entire file to determine delimiter
                csvfile.seek(0)
        except TypeError: #Python 2
            with open(infile, 'r') as csvfile:
                # dialect = csv.Sniffer().sniff(csvfile.read(1024), delimiters=";,$\t")
                dialect = csv.Sniffer().sniff(csvfile.readline()) #read only the header instead of the entire file to determine delimiter
                csvfile.seek(0)
        logger.info("Detected dialect: {} (delimiter: '{}')".format(dialect, dialect.delimiter))
        delimiter = dialect.delimiter


    logger.info("Delimiter is: {}".format(delimiter))

    if base.endswith('/'):
        base = base[:-1]

    metadata = {
        u"@id": iribaker.to_iri(u"{}/{}".format(base, url)),
        u"@context": [u"https://raw.githubusercontent.com/CLARIAH/COW/master/csvw.json",
                     {u"@language": u"en",
                      u"@base": u"{}/".format(base)},
                     get_namespaces(base)],
        u"url": url,
        u"dialect": {u"delimiter": delimiter,
                    u"encoding": encoding,
                    u"quoteChar": quotechar
                    },
        u"dc:title": dataset_name,
        u"dcat:keyword": [],
        u"dc:publisher": {
            u"schema:name": u"CLARIAH Structured Data Hub - Datalegend",
            u"schema:url": {u"@id": u"http://datalegend.net"}
        },
        u"dc:license": {u"@id": u"http://opendefinition.org/licenses/cc-by/"},
        u"dc:modified": {u"@value": today, u"@type": u"xsd:date"},
        u"tableSchema": {
            u"columns": [],
            u"primaryKey": None,
            u"aboutUrl": u"{_row}"
        }
    }

    with io.open(infile, 'rb') as infile_file:
        r = csv.reader(infile_file, delimiter=delimiter, quotechar=quotechar)

        try:
            # Python 2
            header = r.next()
        except AttributeError:
            # Python 3
            header = next(r)

        logger.info(u"Found headers: {}".format(header))

        if u'' in header:
            logger.warning("WARNING: You have one or more empty column headers in your CSV file. Conversion might produce incorrect results because of conflated URIs or worse")
        if len(set(header)) < len(header):
            logger.warning("WARNING: You have two or more column headers that are syntactically the same. Conversion might produce incorrect results because of conflated URIs or worse")

        # First column is primary key
        metadata[u'tableSchema'][u'primaryKey'] = header[0]

        for head in header:
            col = {
                u"@id": iribaker.to_iri(u"{}/{}/column/{}".format(base, url, head)),
                u"name": head,
                u"titles": [head],
                u"dc:description": head,
                u"datatype": u"string"
            }

            metadata[u'tableSchema'][u'columns'].append(col)

    with open(outfile, 'w') as outfile_file:
        outfile_file.write(json.dumps(metadata, indent=True))

    logger.info("Done")
    return
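
# A minimal usage sketch for build_schema above; the file names and dataset
# name are hypothetical. Only infile and outfile are required; the delimiter,
# quote character and encoding are guessed from the file when omitted.
build_schema('records.csv', 'records.csv-metadata.json',
             dataset_name='Example records')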
Example #40
0
def read_into_dataframe(file: IO,
                        filename: str = "",
                        nrows: int = 100,
                        max_characters: int = 50) -> pd.DataFrame:
    """Reads a file into a DataFrame.
    Infers the file encoding and whether a header column exists
    Args:
        file (IO): file buffer.
        filename (str): filename. Used to infer compression.
        nrows (int, optional): number of rows to peek. Default: 100.
        max_characters (int, optional): max characters a column name can have to be distinguished from a real text value
    Returns:
        A pandas.DataFrame.
    """
    detector = UniversalDetector()
    for line, text in enumerate(file):
        detector.feed(text)
        if detector.done or line > nrows:
            break
    detector.close()
    encoding = detector.result.get("encoding")

    compression = infer_compression(filename, "infer")

    file.seek(0, SEEK_SET)
    contents = file.read()

    with BytesIO(contents) as file:
        df0 = pd.read_csv(
            file,
            encoding=encoding,
            compression=compression,
            sep=None,
            engine="python",
            header="infer",
            nrows=nrows,
        )

    df0_cols = list(df0.columns)

    #Check if all column names are strings and short strings (text values tend to be long)
    column_names_checker = all([type(item) == str for item in df0_cols])
    if column_names_checker:
        column_names_checker = all(
            [len(item) < max_characters for item in df0_cols])

    #Check if any column name can be converted to a float
    conversion_checker = True
    for item in df0_cols:
        try:
            item = float(item)
            conversion_checker = False
            break
        except:
            pass

    #Prefix and header
    final_checker = column_names_checker and conversion_checker
    header = "infer" if final_checker else None
    prefix = None if header else "col"

    with BytesIO(contents) as file:
        df = pd.read_csv(
            file,
            encoding=encoding,
            compression=compression,
            sep=None,
            engine="python",
            header=header,
            prefix=prefix,
        )
    return df
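
# A minimal usage sketch for read_into_dataframe above; 'data.csv.gz' is a
# hypothetical file name, passed both as the open binary buffer and as the
# filename so that the compression can be inferred from the extension.
with open('data.csv.gz', 'rb') as f:
    df = read_into_dataframe(f, filename='data.csv.gz', nrows=100)
print(df.head())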
Example #41
0
    def show_encode(text):
        detector = UniversalDetector()
        detector.feed(text)
        detector.close()

        return detector.result['encoding']
Example #42
0
# Write the test lines to a file; the with-statement closes it automatically
with open('test.txt', 'w') as file:
    for line in LINES_LST:
        file.write(f'{line}\n')

# determine the file encoding

"""
Если файл имеет большой размер, то вместо считывания его целиком в строку
и использования функции detect() можно воспользоваться классом UniversalDetector.
В этом случае можно читать файл построчно и передавать текущую строку методу feed().
Если определение кодировки прошло успешно, атрибут done будет иметь значение True.
Это условие можно использовать для выхода из цикла.
После окончания проверки следует вызвать метод close().
Получить результат определения кодировки позволяет атрибут result
"""

DETECTOR = UniversalDetector()
with open('test.txt', 'rb') as test_file:
    for i in test_file:
        DETECTOR.feed(i)
        if DETECTOR.done:
            break
    DETECTOR.close()
print(DETECTOR.result['encoding'])

# open the file using the detected encoding
with open('test.txt', 'r', encoding=DETECTOR.result['encoding']) as file:
    CONTENT = file.read()
print(CONTENT)
Example #43
0
    def process(self, identifier, infile):

        # Need to use some guesswork to detect the file encoding of the latex files.
        u = UniversalDetector()
        for line in open(infile, 'rb'):
            u.feed(line)
        u.close()
        result = u.result
        if result['encoding']:
            encoding = result['encoding']
            print "Detected encoding for %s: %s" % (identifier, encoding)
        else:
            encoding = self.DEFAULT_ENCODING
            print "Warning: using default encoding (%s) - as a file encoding could not be detected for %s" % (
                encoding, infile)

        file_handle = codecs.open(infile, encoding=encoding, mode='r')

        try:
            #Always re-encode files as UTF-8 for processing, as this is what ElasticSearch is expecting
            raw_data = file_handle.read().encode("UTF-8")
        except UnicodeDecodeError as e:
            #Otherwise, give up trying to read this file
            print "Error: could not re-encode %s to UTF-8: %s %s" % (
                identifier, e, infile)
            raw_data = ""

        file_handle.close()
        citations = []

        #remove latex comments, to avoid confusion in processing
        data = re.sub(r"^\s*\%.*$", "", raw_data, 0, re.MULTILINE)

        #remove whitespace and newlines
        data = re.sub(r"\s+", " ", data, 0, re.MULTILINE)

        #find the bibliography section
        match = re.search(
            r'\\begin{thebibliography(?P<bibliography>.*)\\end{thebibliography',
            data, re.DOTALL)
        if match:
            data = match.group('bibliography')

            #get a list of bibitems. Start at [1:] to ignore the stuff between \begin{thebibliography} and \bibitem
            counter = 1
            for bibitem in re.split(r"\\bibitem", data)[1:]:

                #trim the string
                bibitem_trimmed = bibitem.strip()
                citation = {"_latex": bibitem_trimmed, "cite_order": counter}

                bibstring_to_process = bibitem_trimmed

                (arxiv_id, bibstring_to_process
                 ) = self.extract_arxiv_id(bibstring_to_process)
                if arxiv_id is not None:
                    citation["identifier"] = [{
                        "type": "arXiv",
                        "id": arxiv_id,
                        "canonical": "arXiv:" + arxiv_id
                    }]

                (label, key, bibstring_to_process
                 ) = self.extract_label_key(bibstring_to_process)
                if (label is not None): citation["label"] = label
                if (key is not None): citation["key"] = key

                (url,
                 bibstring_to_process) = self.extract_url(bibstring_to_process)
                if (url is not None): citation["url"] = url

                (year, bibstring_to_process
                 ) = self.extract_year(bibstring_to_process)
                if (year is not None): citation["year"] = year

                (authors, bibstring_to_process
                 ) = self.extract_authors(bibstring_to_process)
                if (authors is not None): citation["authors"] = authors

                (title, bibstring_to_process
                 ) = self.extract_title(bibstring_to_process)
                if (title is not None): citation["title"] = title

                (publisher, bibstring_to_process
                 ) = self.extract_publisher(bibstring_to_process)
                if (publisher is not None): citation["publisher"] = publisher

                citations.append(citation)

                #print "Counter: %i\tarxiv_id: %s\tlabel: %s\tkey: %s\turl: %s\tauthors: %s\ttitle: %s" % (counter, arxiv_id, label, key, url, authors, title)
                #print "COUNTER: %i \t AUTHORS: %s \t TITLE: %s" % (counter, authors, title)
                #print "bibstring_to_process:\t", bibstring_to_process
                #print bibitem, "\n"
                counter += 1

        return citations
Example #44
0
def parse_csv(myfile, newsletter, ignore_errors=False):
    from newsletter.addressimport.csv_util import UnicodeReader
    import codecs
    import csv

    # Detect encoding
    from chardet.universaldetector import UniversalDetector

    detector = UniversalDetector()

    for line in myfile.readlines():
        detector.feed(line)
        if detector.done:
            break

    detector.close()
    charset = detector.result['encoding']

    # Reset the file index
    myfile.seek(0)

    # Attempt to detect the dialect
    encodedfile = codecs.EncodedFile(myfile, charset)
    dialect = csv.Sniffer().sniff(encodedfile.read(1024))

    # Reset the file index
    myfile.seek(0)

    logger.info('Detected encoding %s and dialect %s for CSV file', charset,
                dialect)

    myreader = UnicodeReader(myfile, dialect=dialect, encoding=charset)

    firstrow = myreader.next()

    # Find name column
    colnum = 0
    namecol = None
    for column in firstrow:
        if "name" in column.lower() or ugettext("name") in column.lower():
            namecol = colnum

            if "display" in column.lower() or \
                    ugettext("display") in column.lower():
                break

        colnum += 1

    if namecol is None:
        raise forms.ValidationError(
            _("Name column not found. The name of this column should be "
              "either 'name' or '%s'.") % ugettext("name"))

    logger.debug("Name column found: '%s'", firstrow[namecol])

    # Find email column
    colnum = 0
    mailcol = None
    for column in firstrow:
        if 'email' in column.lower() or \
                'e-mail' in column.lower() or \
                ugettext("e-mail") in column.lower():

            mailcol = colnum

            break

        colnum += 1

    if mailcol is None:
        raise forms.ValidationError(
            _("E-mail column not found. The name of this column should be "
              "either 'email', 'e-mail' or '%(email)s'.") %
            {'email': ugettext("e-mail")})

    logger.debug("E-mail column found: '%s'", firstrow[mailcol])

    #assert namecol != mailcol, \
    #    'Name and e-mail column should not be the same.'
    if namecol == mailcol:
        raise forms.ValidationError(
            _("Could not properly determine the proper columns in the "
              "CSV-file. There should be a field called 'name' or "
              "'%(name)s' and one called 'e-mail' or '%(e-mail)s'.") % {
                  "name": _("name"),
                  "e-mail": _("e-mail")
              })

    logger.debug('Extracting data.')

    addresses = {}
    for row in myreader:
        if not max(namecol, mailcol) < len(row):
            logger.warn("Column count does not match for row number %d",
                        myreader.line_num,
                        extra=dict(data={'row': row}))

            if ignore_errors:
                # Skip this record
                continue
            else:
                raise forms.ValidationError(
                    _("Row with content '%(row)s' does not contain a name and "
                      "email field.") % {'row': row})

        name = check_name(row[namecol], ignore_errors)
        email = check_email(row[mailcol], ignore_errors)

        logger.debug("Going to add %s <%s>", name, email)

        try:
            validate_email(email)
            addr = make_subscription(newsletter, email, name)
        except ValidationError:
            if ignore_errors:
                logger.warn(
                    "Entry '%s' at line %d does not contain a valid "
                    "e-mail address.",
                    name,
                    myreader.line_num,
                    extra=dict(data={'row': row}))
            else:
                raise forms.ValidationError(
                    _("Entry '%s' does not contain a valid "
                      "e-mail address.") % name)

        if addr:
            if email in addresses:
                logger.warn(
                    "Entry '%s' at line %d contains a "
                    "duplicate entry for '%s'",
                    name,
                    myreader.line_num,
                    email,
                    extra=dict(data={'row': row}))

                if not ignore_errors:
                    raise forms.ValidationError(
                        _("The address file contains duplicate entries "
                          "for '%s'.") % email)

            addresses.update({email: addr})
        else:
            logger.warn(
                "Entry '%s' at line %d is already subscribed to "
                "with email '%s'",
                name,
                myreader.line_num,
                email,
                extra=dict(data={'row': row}))

            if not ignore_errors:
                raise forms.ValidationError(
                    _("Some entries are already subscribed to."))

    return addresses
Example #45
0
    def GetFileBody(self, get):
        if sys.version_info[0] == 2: get.path = get.path.encode('utf-8')
        if not os.path.exists(get.path):
            if get.path.find('rewrite') == -1:
                return public.returnMsg(False, 'FILE_NOT_EXISTS', (get.path, ))
            public.writeFile(get.path, '')

        if os.path.getsize(get.path) > 2097152:
            return public.returnMsg(False, 'CANT_EDIT_ONLINE_FILE')
        fp = open(get.path, 'rb')
        data = {}
        data['status'] = True

        try:
            if fp:
                from chardet.universaldetector import UniversalDetector
                detector = UniversalDetector()
                srcBody = b""
                for line in fp.readlines():
                    detector.feed(line)
                    srcBody += line
                detector.close()
                char = detector.result
                data['encoding'] = char['encoding']
                if char['encoding'] == 'GB2312' or not char[
                        'encoding'] or char['encoding'] == 'TIS-620' or char[
                            'encoding'] == 'ISO-8859-9':
                    data['encoding'] = 'GBK'
                if char['encoding'] == 'ascii' or char[
                        'encoding'] == 'ISO-8859-1':
                    data['encoding'] = 'utf-8'
                if char['encoding'] == 'Big5': data['encoding'] = 'BIG5'
                if not char['encoding'] in ['GBK', 'utf-8', 'BIG5']:
                    data['encoding'] = 'utf-8'
                try:
                    if sys.version_info[0] == 2:
                        data['data'] = srcBody.decode(data['encoding']).encode(
                            'utf-8', errors='ignore')
                    else:
                        data['data'] = srcBody.decode(data['encoding'])
                except:
                    data['encoding'] = char['encoding']
                    if sys.version_info[0] == 2:
                        data['data'] = srcBody.decode(data['encoding']).encode(
                            'utf-8', errors='ignore')
                    else:
                        data['data'] = srcBody.decode(data['encoding'])
            else:
                if sys.version_info[0] == 2:
                    data['data'] = srcBody.decode('utf-8').encode('utf-8')
                else:
                    data['data'] = srcBody.decode('utf-8')
                data['encoding'] = u'utf-8'

            return data
        except Exception as ex:
            return public.returnMsg(
                False,
                'INCOMPATIBLE_FILECODE',
                (str(ex)),
            )
Example #46
0
    def __init__(self, open_file):
        try:
            detector = UniversalDetector()

            for line in open_file:
                detector.feed(line)
                if detector.done:
                    break
            detector.close()
            encoding = detector.result['encoding']
            if encoding == 'UTF-8-SIG':
                encoding = 'utf-8'
            parser = ET.XMLParser(encoding=encoding)
        except TypeError:
            parser = None

        open_file.seek(0)
        iterator = ET.iterparse(open_file, parser=parser)

        self.services = {}

        # element = None
        serviced_organisations = None

        journey_pattern_sections = {}

        for _, element in iterator:
            tag = element.tag[33:]

            if tag == 'StopPoints':
                stops = (Stop(stop) for stop in element)
                self.stops = {stop.atco_code: stop for stop in stops}
                element.clear()
            elif tag == 'Routes':
                # routes = {
                #     route.get('id'): route.find('txc:Description', NS).text
                #     for route in element
                # }
                element.clear()
            elif tag == 'RouteSections':
                element.clear()
            elif tag == 'Operators':
                self.operators = element
            elif tag == 'JourneyPatternSections':
                for section in element:
                    section = JourneyPatternSection(section, self.stops)
                    if section.timinglinks:
                        journey_pattern_sections[section.id] = section
                element.clear()
            elif tag == 'ServicedOrganisations':
                serviced_organisations = (ServicedOrganisation(child)
                                          for child in element)
                serviced_organisations = {
                    organisation.code: organisation
                    for organisation in serviced_organisations
                }
            elif tag == 'VehicleJourneys':
                try:
                    self.journeys = self.__get_journeys(
                        element, serviced_organisations)
                except (AttributeError, KeyError) as e:
                    logger.error(e, exc_info=True)
                    return
                element.clear()
            elif tag == 'Service':
                service = Service(element, serviced_organisations,
                                  journey_pattern_sections)
                self.services[service.service_code] = service
            elif tag == 'Garages':
                # print(ET.tostring(element).decode())
                element.clear()

        self.element = element

        self.transxchange_date = max(
            element.attrib['CreationDateTime'],
            element.attrib['ModificationDateTime'])[:10]
Example #47
0
def analyse_HTTP(serveurHTTP, protocole_serveur, port_serveur,
                 wordlist_bruteforce, extensions_a_bruteforcer, nb_processus):

    liste_dossiers_a_powned = ["/"]
    liste_dossiers_deja_testes = []

    protocole_serveur = protocole_serveur + "://"
    contenu_robots_txt = requests.get(protocole_serveur + serveurHTTP +
                                      "/robots.txt")
    if contenu_robots_txt.status_code == 200:
        print("Contenu du fichier robots.txt du serveur %s :" % serveurHTTP)
        for line in (contenu_robots_txt.iter_lines()):
            if "Disallow:" in str(line):
                ligne_robots_string = str(line)
                fichier_ou_dossier_robot = ligne_robots_string[(
                    ligne_robots_string.find("/")):(
                        ligne_robots_string.rfind("'"))]
                print(fichier_ou_dossier_robot)
                if "." not in fichier_ou_dossier_robot or "/" in fichier_ou_dossier_robot:
                    test_requete = requests.get(protocole_serveur +
                                                serveurHTTP +
                                                fichier_ou_dossier_robot + "/")
                    if test_requete.status_code != 404 and fichier_ou_dossier_robot[
                            -1] == "/":
                        liste_dossiers_a_powned.append(
                            fichier_ou_dossier_robot)
                    else:
                        liste_dossiers_a_powned.append(
                            fichier_ou_dossier_robot + "/")
    else:
        print("Le serveur %s ne contient pas de fichier robots.txt" %
              serveurHTTP)

    verification_gestion_fichier_non_present = requests.get(
        protocole_serveur + serveurHTTP + ":" + str(port_serveur) +
        "/TESTERREUR404_ElTito.php")
    if verification_gestion_fichier_non_present.status_code != 404:
        print(
            "Le serveur %s ne retourne pas d'erreur 404 si on essaye d'accéder à un fichier inexistant"
            % serveurHTTP)
        exit(0)

    verification_methode_http = requests.head(protocole_serveur + serveurHTTP +
                                              ":" + str(port_serveur))
    if verification_methode_http.status_code != 405:
        methode_http_a_utiliser = "HEAD"
    else:
        methode_http_a_utiliser = "GET"

    nb_lignes_wordlist = 0

    detection_encodage = UniversalDetector()
    detection_encodage.reset()

    with open(wordlist_bruteforce, mode='rb') as e:
        for b in e:
            detection_encodage.feed(b)
            if detection_encodage.done:
                break
    detection_encodage.close()

    encodage_wordlist = detection_encodage.result["encoding"]

    with open(wordlist_bruteforce, encoding=encodage_wordlist) as f:
        for line in f:
            nb_lignes_wordlist = nb_lignes_wordlist + 1

    nb_lignes_a_lire_par_processus = nb_lignes_wordlist // nb_processus

    manager = multiprocessing.Manager()
    dictionnaire_fichier_decouverts = manager.dict()
    dictionnare_dossiers_decouverts = manager.dict()
    liste_dossier_directory_indexing = manager.list()

    while (len(liste_dossiers_a_powned)) > 0:

        jobs = []

        for i in range(nb_processus):
            tache = multiprocessing.Process(
                target=brute_force_http,
                args=(serveurHTTP, protocole_serveur, port_serveur,
                      wordlist_bruteforce, liste_dossiers_a_powned[0],
                      extensions_a_bruteforcer, i,
                      nb_lignes_a_lire_par_processus, encodage_wordlist,
                      methode_http_a_utiliser, dictionnaire_fichier_decouverts,
                      dictionnare_dossiers_decouverts,
                      liste_dossier_directory_indexing))
            jobs.append(tache)
            tache.start()

        for proc in jobs:
            proc.join()

        liste_dossiers_deja_testes.append(liste_dossiers_a_powned[0])

        for dossier in dictionnare_dossiers_decouverts.keys():
            if ".htpasswd" in dossier or ".htaccess" in dossier:
                dictionnaire_fichier_decouverts[
                    dossier] = dictionnare_dossiers_decouverts.get(dossier)
                del dictionnare_dossiers_decouverts[dossier]
            elif dossier == "/server-status":
                dictionnaire_fichier_decouverts[
                    dossier] = dictionnare_dossiers_decouverts.get(dossier)
                del dictionnare_dossiers_decouverts[dossier]
            elif (dossier + "/") not in liste_dossiers_deja_testes and (
                    dossier + "/") not in liste_dossiers_a_powned:
                liste_dossiers_a_powned.append(dossier + "/")

        liste_dossiers_a_powned.remove(liste_dossiers_a_powned[0])

    print(dictionnaire_fichier_decouverts)
    print(dictionnare_dossiers_decouverts)
    print(liste_dossier_directory_indexing)

    for page, message_retour in dictionnaire_fichier_decouverts.items():
        print(page)
        print(message_retour)
        if ".php" in page or ".aspx" in page and message_retour == "200":
            analyse_formulaire_php(serveurHTTP, protocole_serveur,
                                   port_serveur, page)
Example #48
0
def detect_file_encoding(training_file, file_encoding, max_passwords=10000):

    ## Try to import chardet
    #
    # If that package is not installed, print out a warning and, if the user
    # says that is OK, fall back to 'ascii' as the default encoding
    #
    try:
        from chardet.universaldetector import UniversalDetector
        detector = UniversalDetector()
    except ImportError as error:
        print("FAILED: chardet not installed")
        print(
            "It is highly recommended that you install the 'chardet' Python package"
        )
        print(
            "or manually specify the file encoding of the training set via the command line"
        )
        print(
            "You can download chardet from https://pypi.python.org/pypi/chardet"
        )
        if get_confirmation(
                "Do you want to continue using the default encoding 'ascii'?"):
            file_encoding.append('ascii')
            return True

        else:
            # User wanted to exit instead
            print(
                "Understood. Please install chardet or specify an encoding " +
                "format on the command line")
            return False

    try:
        cur_count = 0
        with open(training_file, 'rb') as file:
            for line in file.readlines():
                detector.feed(line)
                if detector.done:
                    break
                cur_count = cur_count + 1
                if cur_count >= max_passwords:
                    break
            detector.close()

    except IOError as error:
        print("Error opening file " + training_file)
        print("Error is " + str(error))
        return False

    try:
        file_encoding.append(detector.result['encoding'])
        print("File Encoding Detected: " + str(detector.result['encoding']))
        print("Confidence for file encoding: " +
              str(detector.result['confidence']))
        print(
            "If you think another file encoding might have been used please ")
        print(
            "manually specify the file encoding and run the training program again"
        )
        print()
    except KeyError as error:
        print("Error encountered with file encoding autodetection")
        print("Error : " + str(error))
        return False

    return True
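
# A minimal usage sketch for detect_file_encoding above; 'training.txt' is a
# hypothetical file name. The detected encoding is appended to the list that
# is passed in, and the boolean return value signals success.
file_encoding = []
if detect_file_encoding('training.txt', file_encoding):
    print("Using encoding: " + str(file_encoding[0]))
else:
    print("Could not determine a usable file encoding")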
Example #49
0
 def set_source(self, name):
     # source _dependent_ initialization goes here
     if name is None or not os.path.isfile(name):
         return False
     IP.set_source(self, name)
     self.__source_name = name
     # auto-detect file-encoding (optional)
     try:
         from chardet.universaldetector import UniversalDetector
         detector = UniversalDetector()
         detector.reset()
         lines = 0
         for line in file(self.__source_name, 'rb'):
             detector.feed(line)
             lines += 1
             if detector.done or lines == 50:
                 break
         detector.close()
         encoding = string.lower(detector.result['encoding'])
     except:
         log.exception('')
         encoding = 'utf_8'
     encoding = self._encoding_cleanup.sub('', encoding)
     model = self.gtk.get_widget('e_encoding').get_model()
     itempos = 0
     for item in model:
         pos1 = string.find(
             self._encoding_cleanup.sub('', string.lower(str(item[0]))),
             encoding)
         if pos1 == 0:
             break
         itempos += 1
     self.gtk.get_widget('e_encoding').set_active(itempos)
     # auto-detect CSV import settings (optional)
     try:
         import csv
         sniffer = csv.Sniffer()
         csvfilesize = os.path.getsize(self.__source_name)
         if csvfilesize > 65535:
             csvfilesize = 65535
         csvfile = file(self.__source_name, 'rb')
         try:
             # quote char, line terminator and field delimiter
             proposed_dialect = sniffer.sniff(csvfile.read(csvfilesize))
             self.gtk.get_widget('e_delimiter').set_text(
                 proposed_dialect.delimiter)
             self.gtk.get_widget('e_quotechar').set_text(
                 proposed_dialect.quotechar)
             if proposed_dialect.lineterminator == '\r\n':
                 self.gtk.get_widget('e_lineterminator').set_active(1)
             # first row with column headers
             csvfile.seek(0)
             if sniffer.has_header(csvfile.read(csvfilesize)):
                 self.gtk.get_widget('e_startrow').set_text('1')
             else:
                 self.gtk.get_widget('e_startrow').set_text('0')
         finally:
             csvfile.close()
     except:
         log.exception('')
     # run dialog
     response = self.gtk.get_widget('d_import').run()
     if response == gtk.RESPONSE_OK:
         return True
     else:
         return False
Example #50
0
##
toolFileName = sys.argv[0]
if len(toolFileName) <= 0:
    toolDirName = os.path.dirname(os.getcwd())
elif os.path.isdir(toolFileName):
    toolDirName = toolFileName
else:
    toolDirName = os.path.dirname(toolFileName)
print(toolDirName, sys.argv[1])
# use the encoding detection tool
encode_detector = UniversalDetector()
encode_detector.reset()
raw_doc = open(sys.argv[1].replace('\\', '/'), 'rb').read()
encode_detector.feed(raw_doc)
if encode_detector.done:
    encode_detector.close()
    raw_doc = raw_doc.decode(encode_detector.result['encoding'], errors='ignore')  # .encode('utf-8', 'ignore')
else:
    encode_detector.close()
    raw_doc = raw_doc.decode('utf-8', errors='ignore') 
doc3 = gensim.parsing.preprocess_string(raw_doc)
raw_doc = None
model = gensim.models.doc2vec.Doc2Vec.load(os.path.join(toolDirName,'data/doc2vec.model'))

# doc2vec gives better similarity results!
# doc2vec turns adjacent word sequences into n-grams, so the similarity between whole documents looks natural.
new_doc_vec3 = model.infer_vector(doc3)
similar_docs = sorted(model.docvecs.most_similar([new_doc_vec3], topn=topN), key=lambda item: -item[1])
print('doc2vec most_similar', len(similar_docs))
for docName, similarity in similar_docs:
    print('{:3.5f}'.format(similarity), docName)
Example #51
0
class FileOpener:
    def __init__(self, use_chardet):
        self.use_chardet = use_chardet
        if use_chardet:
            self.init_chardet()

    def init_chardet(self):
        try:
            from chardet.universaldetector import UniversalDetector
        except ImportError:
            raise Exception("There's no chardet installed to import from. "
                            "Please, install it and check your PYTHONPATH "
                            "environment variable")

        self.encdetector = UniversalDetector()

    def open(self, filename):
        if self.use_chardet:
            return self.open_with_chardet(filename)
        else:
            return self.open_with_internal(filename)

    def open_with_chardet(self, filename):
        self.encdetector.reset()
        with open(filename, 'rb') as f:
            for line in f:
                self.encdetector.feed(line)
                if self.encdetector.done:
                    break
        self.encdetector.close()
        encoding = self.encdetector.result['encoding']

        try:
            f = open(filename, encoding=encoding)
            lines = f.readlines()
        except UnicodeDecodeError:
            print('ERROR: Could not detect encoding: %s' % filename,
                                                        file=sys.stderr)
            raise
        except LookupError:
            print('ERROR: %s -- Don\'t know how to handle encoding %s'
                                % (filename, encoding), file=sys.stderr)
            raise
        finally:
            f.close()

        return lines, encoding


    def open_with_internal(self, filename):
        curr = 0
        global encodings

        while True:
            try:
                f = open(filename, 'r', encoding=encodings[curr])
                lines = f.readlines()
                break
            except UnicodeDecodeError:
                if not quiet_level & QuietLevels.ENCODING:
                    print('WARNING: Decoding file %s' % filename,
                                                        file=sys.stderr)
                    print('WARNING: using encoding=%s failed. '
                                                        % encodings[curr],
                                                        file=sys.stderr)
                    print('WARNING: Trying next encoding: %s' % encodings[curr],
                                                        file=sys.stderr)

                curr += 1

            finally:
                f.close()

        if not lines:
            raise Exception('Unknown encoding')

        encoding = encodings[curr]

        return lines, encoding