Ejemplo n.º 1
0
def utf16InfoDecode(b_value):
    r"""
        Decode a UTF-16 text value stored in an info block.

        b_value is byte array
        returns str, or None (on errors, or when there is no message)

        block type = 3
        block format: <2 byte code1><2 byte code2>
        if code2 == 0: then the block ends
        if code2 == 1: then the block continues as follows:
        <4 byte len1> \x00 \x00 <message in utf-16>
        len1 - length of message in 2-byte chars

        What the code checks: byte 0 must be 0, byte 1 must be 0 or 1,
        bytes 2:6 hold the length, bytes 6:8 should be null and the
        message starts at byte 8.
    """
    if not b_value:
        # empty input: nothing to decode
        return None

    if b_value[0] != 0:
        log.warning(
            'utf16InfoDecode: b_value=%s, null expected at 0',
            list(b_value),
        )
        return None

    if len(b_value) < 2:
        # without this guard b_value[1] below would raise IndexError
        log.warning(
            'utf16InfoDecode: unexpected b_value size: %s',
            len(b_value),
        )
        return None

    if b_value[1] == 0:
        # block ends here; anything after the two header bytes is unexpected
        if len(b_value) > 2:
            log.warning(
                'utf16InfoDecode: unexpected b_value size: %s',
                len(b_value),
            )
        return None

    if b_value[1] > 1:
        log.warning(
            'utf16InfoDecode: b_value=%s, unexpected byte at 1',
            list(b_value),
        )
        return None

    ## now b_value[1] == 1
    size = 2 * binStrToInt(b_value[2:6])
    # the two checks below only warn: decoding is still attempted (best effort)
    if tuple(b_value[6:8]) != (0, 0):
        log.warning(
            'utf16InfoDecode: b_value=%s, null expected at 6:8',
            list(b_value),
        )
    if size != len(b_value) - 8:
        log.warning(
            'utf16InfoDecode: b_value=%s, size does not match',
            list(b_value),
        )

    try:
        return b_value[8:].decode('utf16')  ## str
    except UnicodeDecodeError:
        # e.g. truncated (odd-length) payload; report as an error -> None
        log.warning(
            'utf16InfoDecode: b_value=%s, invalid utf-16 data',
            list(b_value),
        )
        return None
Ejemplo n.º 2
0
def aboutInfoDecode(b_value):
    """
    Split an "about" value into its file extension and its contents.

    The value is expected to look like ``<extension>\\x00<contents>``.
    Both ``str`` and bytes-like (``bytes`` / ``bytearray``) values are
    accepted; the returned parts have the same type as the input.

    returns a tuple (aboutExt, aboutContents), or None when the value is
    empty or has no extension before the first null
    """
    if not b_value:
        return None

    # partition() needs a separator of the same type as the value:
    # a str separator raises TypeError for bytes input
    if isinstance(b_value, str):
        separator = '\x00'
    else:
        separator = b'\x00'

    aboutExt, _, aboutContents = b_value.partition(separator)
    if not aboutExt:
        log.warning('read_type_3: about: no file extension')
        return None
    return (aboutExt, aboutContents)
Ejemplo n.º 3
0
def aboutInfoDecode(b_value):
    """
    Decode an "about" value of the form ``<extension>\\x00<contents>``.

    Accepts ``str`` as well as ``bytes`` / ``bytearray``; the separator is
    chosen to match the input type, so the result parts keep that type.

    returns (aboutExt, aboutContents), or None if b_value is empty or the
    part before the first null (the extension) is empty
    """
    if not b_value:
        return None
    # a str separator cannot be used to partition bytes (TypeError),
    # so pick the null separator that matches the value's type
    null = '\x00' if isinstance(b_value, str) else b'\x00'
    aboutExt, _, aboutContents = b_value.partition(null)
    if not aboutExt:
        log.warning('read_type_3: about: no file extension')
        return None
    return (aboutExt, aboutContents)
Ejemplo n.º 4
0
def languageInfoDecode(b_value):
    """
    Look up a language by its binary code.

    b_value is converted to an integer with binStrToInt and used to index
    languageByCode.

    returns BabylonLanguage instance (the languageByCode entry),
    or None when the code is unknown
    """
    intValue = binStrToInt(b_value)
    try:
        return languageByCode[intValue]
    except LookupError:
        # LookupError is the common base of IndexError and KeyError, so an
        # unknown code is reported whether languageByCode is a sequence or
        # a mapping (NOTE(review): its type is not visible here)
        log.warning("read_type_3: unknown language code = %s", intValue)
        return None
Ejemplo n.º 5
0
def languageInfoDecode(b_value):
	"""
		Look up a language by its binary code.

		b_value is converted to an integer with binStrToInt and used to
		index languageByCode.

		returns BabylonLanguage instance (the languageByCode entry),
		or None when the code is unknown
	"""
	intValue = binStrToInt(b_value)
	try:
		return languageByCode[intValue]
	except LookupError:
		# LookupError covers both IndexError and KeyError, so the warning
		# is emitted whether languageByCode is a sequence or a mapping
		# (NOTE(review): its type is not visible here)
		log.warning("read_type_3: unknown language code = %s", intValue)
		return None
Ejemplo n.º 6
0
def languageInfoDecode(b_value):
    """
    Look up a language by its binary code.

    b_value is converted to an integer with uintFromBytes and used to
    index languageByCode.

    returns BabylonLanguage instance (the languageByCode entry),
    or None when the code is unknown
    """
    intValue = uintFromBytes(b_value)
    try:
        return languageByCode[intValue]
    except LookupError:
        # catch the common base of IndexError and KeyError: the lookup
        # failure is then handled for a sequence as well as for a mapping
        # (NOTE(review): the type of languageByCode is not visible here)
        log.warning("read_type_3: unknown language code = %s", intValue)
        return None
Ejemplo n.º 7
0
def aboutInfoDecode(b_value):
    """
    Split a bytes "about" value at its first null byte.

    The part before the null is the file extension, the part after it is
    the about contents.

    returns a dict with keys "about_extension" and "about", or None when
    b_value is empty or the extension part is empty (the latter is logged)
    """
    if b_value:
        pieces = b_value.partition(b"\x00")
        extension, contents = pieces[0], pieces[2]
        if extension:
            return dict(about_extension=extension, about=contents)
        log.warning("read_type_3: about: no file extension")
    return None
Ejemplo n.º 8
0
def aboutInfoDecode(b_value):
	"""
		Decode a bytes "about" value: <extension> null <contents>.

		returns {"about_extension": ..., "about": ...}, or None when
		b_value is empty or nothing precedes the first null byte
		(a warning is logged in the second case)
	"""
	if not b_value:
		return None
	extension, _sep, contents = b_value.partition(b"\x00")
	if extension:
		return {"about_extension": extension, "about": contents}
	log.warning("read_type_3: about: no file extension")
	return None
Ejemplo n.º 9
0
def utf16InfoDecode(b_value):
    r"""
    Decode a UTF-16 text value stored in an info block.

    b_value is byte array
    returns str, or None (on errors, or when there is no message)

    block type = 3
    block format: <2 byte code1><2 byte code2>
    if code2 == 0: then the block ends
    if code2 == 1: then the block continues as follows:
    <4 byte len1> \x00 \x00 <message in utf-16>
    len1 - length of message in 2-byte chars

    What the code checks: byte 0 must be 0, byte 1 must be 0 or 1,
    bytes 2:6 hold the length, bytes 6:8 should be null and the
    message starts at byte 8.
    """
    if not b_value:
        # empty input: nothing to decode
        return None

    if b_value[0] != 0:
        log.warning(
            "utf16InfoDecode: b_value=%s, null expected at 0",
            list(b_value),
        )
        return None

    if len(b_value) < 2:
        # without this guard b_value[1] below would raise IndexError
        log.warning(
            "utf16InfoDecode: unexpected b_value size: %s",
            len(b_value),
        )
        return None

    if b_value[1] == 0:
        # block ends here; anything after the two header bytes is unexpected
        if len(b_value) > 2:
            log.warning(
                "utf16InfoDecode: unexpected b_value size: %s",
                len(b_value),
            )
        return None

    if b_value[1] > 1:
        log.warning(
            "utf16InfoDecode: b_value=%s, unexpected byte at 1",
            list(b_value),
        )
        return None

    # now b_value[1] == 1
    size = 2 * binStrToInt(b_value[2:6])
    # the two checks below only warn: decoding is still attempted (best effort)
    if tuple(b_value[6:8]) != (0, 0):
        log.warning(
            "utf16InfoDecode: b_value=%s, null expected at 6:8",
            list(b_value),
        )
    if size != len(b_value) - 8:
        log.warning(
            "utf16InfoDecode: b_value=%s, size does not match",
            list(b_value),
        )

    try:
        return b_value[8:].decode("utf16")  # str
    except UnicodeDecodeError:
        # e.g. truncated (odd-length) payload; report as an error -> None
        log.warning(
            "utf16InfoDecode: b_value=%s, invalid utf-16 data",
            list(b_value),
        )
        return None
Ejemplo n.º 10
0
def charsetInfoDecode(b_value):
    """
    Look up a charset by the first byte of b_value.

    b_value is byte array; only its first element is used as the key
    into charsetByCode.

    returns the charsetByCode entry, or None when b_value is empty or the
    code is unknown (the unknown code is logged)
    """
    if not b_value:
        # b_value[0] would raise IndexError on empty input
        return None
    value = b_value[0]
    try:
        return charsetByCode[value]
    except KeyError:
        log.warning("read_type_3: unknown charset %s", value)
        return None
Ejemplo n.º 11
0
def write_entries(glos, f, cleanHTML, indexes):
    """
    Write every entry of ``glos`` to ``f`` as a ``<d:entry>`` element.

    :param glos: iterable of entries; each entry provides getWords(),
        getDefi() and getDefiFormat(). setDefaultDefiFormat('h') is called
        on it before iterating.
    :param f: file-like object with a write() method accepting str
    :param cleanHTML: when true, BeautifulSoup is requested through
        get_beautiful_soup(); if it is unavailable a warning is logged and
        processing continues without it
    :param indexes: str | None, passed to indexes_generator()

    Entries whose normalized long title is empty are skipped.
    """
    # local import: keeps this block self-contained
    from xml.sax.saxutils import quoteattr

    if cleanHTML:
        BeautifulSoup = get_beautiful_soup()
        if not BeautifulSoup:
            log.warning(
                'cleanHTML option passed but BeautifulSoup not found.  ' +
                'to fix this run `sudo pip3 install lxml beautifulsoup4 html5lib`'
            )
    else:
        BeautifulSoup = None

    # write entries
    generate_id = id_generator()
    generate_indexes = indexes_generator(indexes)
    # pieces of pending output; joined once per flush instead of growing a
    # string with repeated +=
    chunks = []

    xdxf.xdxf_init()

    glos.setDefaultDefiFormat('h')

    for i, entry in enumerate(glos):
        words = entry.getWords()
        word, alts = words[0], words[1:]
        defi = entry.getDefi()
        defi_format = entry.getDefiFormat()

        long_title = _normalize.title_long(_normalize.title(word, BeautifulSoup))
        if not long_title:
            continue

        _id = next(generate_id)
        if BeautifulSoup:
            title_attr = BeautifulSoup.dammit.EntitySubstitution.substitute_xml(long_title, True)
        else:
            # quoteattr() escapes &, <, > and picks/escapes the quote
            # character, so a title containing '"' or '&' no longer yields
            # malformed XML (assumes long_title is a str)
            title_attr = quoteattr(long_title)

        begin_entry = '<d:entry id="%(id)s" d:title=%(title)s>\n' % {
            'id': _id,
            'title': title_attr,
        }

        if defi_format == 'x':
            content = xdxf.xdxf_to_html(defi)
            content = format_clean_content(None, content, BeautifulSoup)
        else:
            content = defi
            content = format_clean_content(long_title, content, BeautifulSoup)

        entry_indexes = generate_indexes(long_title, alts, content, BeautifulSoup)

        end_entry = '\n</d:entry>\n'

        chunks.append(begin_entry)
        chunks.append(entry_indexes)
        chunks.append(content)
        chunks.append(end_entry)

        # flush periodically so the pending output stays bounded
        if i % 1000 == 0:
            f.write(''.join(chunks))
            chunks = []
    f.write(''.join(chunks))
Ejemplo n.º 12
0
def charsetInfoDecode(b_value):
	"""
		Look up a charset by the first byte of b_value.

		b_value is byte array; only its first element is used as the key
		into charsetByCode.

		returns the charsetByCode entry, or None when b_value is empty or
		the code is unknown (the unknown code is logged)
	"""
	if not b_value:
		# b_value[0] would raise IndexError on empty input
		return None
	value = b_value[0]
	try:
		return charsetByCode[value]
	except KeyError:
		# pass the value as a logging argument so formatting is deferred
		log.warning("read_type_3: unknown charset %s", value)
		return None