Example #1
def prepare_file_paths(string, extension):
    file_paths = []
    for prefix in string.split(","):
        file_path = add_extension(prefix, extension)
        if not os.path.exists(file_path):
            print_error("File '%s' does not exist" % file_path)
            return None
        file_paths.append(file_path)
    return file_paths
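
A minimal usage sketch (hypothetical call; it assumes add_extension("foo", ".idx") returns "foo.idx"):

paths = prepare_file_paths("foo,bar", ".idx")
# -> ["foo.idx", "bar.idx"] if both files exist, otherwise None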
Example #2
    def read_ifo(ifo_path, has_syn, args):
        ifo_dict = {}
        ifo_obj = open(ifo_path, "rb")
        ifo_bytes = ifo_obj.read() # bytes
        ifo_unicode = ifo_bytes.decode("utf-8") # unicode, always utf-8 by spec
        ifo_obj.close()
        for line in ifo_unicode.splitlines():
            array = line.split("=")
            if len(array) >= 2:
                key = array[0]
                val = "=".join(array[1:])
                ifo_dict[key] = val
        
        if not "version" in ifo_dict:
            print_error("No 'version' found in the .ifo file (see StarDict spec)")
            return None
        if ifo_dict["version"] not in ["2.4.2", "3.0.0"]:
            print_error("The .ifo file must have a 'version' value equal to '2.4.2' or '3.0.0' (see StarDict spec)")
            return None

        required_keys = ["bookname", "wordcount", "idxfilesize"]
        if has_syn:
            required_keys.append("synwordcount")
        # TODO not used => disabling this
        #if ifo_dict["version"] == "3.0.0":
        #    required_keys.append("idxoffsetbits")
        for key in required_keys:
            if key not in ifo_dict:
                print_error("No '%s' found in the .ifo file (see StarDict spec)" % key)
                return None

        ifo_dict["wordcount"] = int(ifo_dict["wordcount"])
        ifo_dict["idxfilesize"] = int(ifo_dict["idxfilesize"])
        if has_syn:
            ifo_dict["synwordcount"] = int(ifo_dict["synwordcount"])
        # TODO not used => disabling this
        #if ifo_dict["version"] == "3.0.0":
        #    ifo_dict["idxoffsetbits"] = int(ifo_dict["idxoffsetbits"])

        if args.sd_ignore_sametypesequence:
            print_debug("Ignoring sametypesequence value", args.debug)
        else:
            # TODO limitation: we require sametypesequence to be present
            if not "sametypesequence" in ifo_dict:
                print_error("The .ifo file must have a 'sametypesequence' value (see README).")
                return None
            # TODO limitation: we require sametypesequence to have a value in SAMETYPESEQUENCE_SUPPORTED_VALUES
            if not ifo_dict["sametypesequence"] in SAMETYPESEQUENCE_SUPPORTED_VALUES:
                print_error("The .ifo file must have a 'sametypesequence' value of %s (see README)." % "|".join(SAMETYPESEQUENCE_SUPPORTED_VALUES))
                return None

        return ifo_dict
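
The key=value loop above is line-oriented and preserves any "=" inside the value; a small sketch of the dict it builds from a made-up .ifo body:

sample = u"version=2.4.2\nbookname=My Dict\nwordcount=42\nidxfilesize=1024"
ifo = {}
for line in sample.splitlines():
    parts = line.split("=")
    if len(parts) >= 2:
        ifo[parts[0]] = "=".join(parts[1:])
# ifo == {u"version": u"2.4.2", u"bookname": u"My Dict",
#         u"wordcount": u"42", u"idxfilesize": u"1024"}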
Example #3
def check_arguments(args):
    """
    Check that we have all the required command line arguments,
    and that the input/output format values are supported.
    """
    for required in REQUIRED_PARAMETERS:
        if required not in args:
            print_error("Argument '%s' is required" % required)
            sys.exit(2)
    if args.input_format not in INPUT_FORMATS:
        print_error("Format '%s' is not a valid input format" % args.input_format)
        print_error("Valid input formats: %s" % INPUT_FORMATS)
        sys.exit(4)
    if args.output_format not in OUTPUT_FORMATS:
        print_error("Format '%s' is not a valid output format" % args.output_format)
        print_error("Valid output formats: %s" % OUTPUT_FORMATS)
        sys.exit(4)
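
Note that the membership test 'required not in args' works on an argparse.Namespace because Namespace implements __contains__ over its attributes; a quick illustration:

import argparse
ns = argparse.Namespace(input_format="csv")
"input_format" in ns   # True
"output_format" in ns  # False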
Example #4
def write(dictionary, args, output_file_path):
    csv_fs = escape(args.csv_fs)
    csv_ls = escape(args.csv_ls)
    try:
        print_debug("Writing to file '%s'..." % (output_file_path), args.debug)
        output_file_obj = io.open(output_file_path, "wb")
        for index in dictionary.entries_index_sorted:
            entry = dictionary.entries[index]
            string = u"%s%s%s%s" % (entry.headword, csv_fs, entry.definition,
                                    csv_ls)
            output_file_obj.write(string.encode("utf-8"))
        output_file_obj.close()
        print_debug("Writing to file '%s'... success" % (output_file_path),
                    args.debug)
        return [output_file_path]
    except Exception:
        print_error("Writing to file '%s'... failure" % (output_file_path))
        return None
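
The escape helper is not defined in this snippet; a plausible stand-in, assuming it only needs to turn literal backslash sequences such as "\t" or "\n" from the command line into the real separator characters (ASCII separators only):

import codecs

def escape(string):
    # hypothetical helper: decode literal "\t", "\n", ... escape sequences
    return codecs.decode(string, "unicode_escape")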
Example #5
def write(dictionary, args, output_file_path):
    try:
        print_debug("Writing to file '%s'..." % (output_file_path), args.debug)
        output_file_obj = open(output_file_path, "wb")
        for index in dictionary.entries_index_sorted:
            entry = dictionary.entries[index]
            string = u"%s%s%s%s" % (
                entry.headword,
                args.csv_fs,
                entry.definition,
                args.csv_ls
            )
            output_file_obj.write(string.encode("utf-8"))
        output_file_obj.close()
        print_debug("Writing to file '%s'... success" % (output_file_path), args.debug)
        return [output_file_path]
    except Exception:
        print_error("Writing to file '%s'... failure" % (output_file_path))
        return None
Example #6
def write(dictionary, args, output_file_path):
    try:
        print_debug("Creating XML tree...", args.debug)
        dictionary_elem = etree.Element("dictionary")
        for index in dictionary.entries_index_sorted:
            entry = dictionary.entries[index]
            entry_elem = etree.SubElement(dictionary_elem, "entry")
            key_elem = etree.SubElement(entry_elem, "key")
            key_elem.text = entry.headword
            def_elem = etree.SubElement(entry_elem, "def")
            def_elem.text = entry.definition
        tree = etree.ElementTree(dictionary_elem)
        print_debug("Creating XML tree... done", args.debug)
        print_debug("Writing to file '%s'..." % (output_file_path), args.debug)
        tree.write(output_file_path, pretty_print=True, xml_declaration=True)
        print_debug("Writing to file '%s'... success" % (output_file_path),
                    args.debug)
        return [output_file_path]
    except Exception:
        print_error("Writing to file '%s'... failure" % (output_file_path))
        return None
Example #7
def write(dictionary, args, output_file_path):
    try:
        print_debug("Creating XML tree...", args.debug)
        dictionary_elem = etree.Element("dictionary")
        for index in dictionary.entries_index_sorted:
            entry = dictionary.entries[index]
            entry_elem = etree.SubElement(dictionary_elem, "entry")
            key_elem = etree.SubElement(entry_elem, "key")
            key_elem.text = entry.headword
            def_elem = etree.SubElement(entry_elem, "def")
            def_elem.text = entry.definition
        tree = etree.ElementTree(dictionary_elem)
        print_debug("Creating XML tree... done", args.debug)
        print_debug("Writing to file '%s'..." % (output_file_path), args.debug)
        tree.write(
            output_file_path,
            pretty_print=True,
            xml_declaration=True
        )
        print_debug("Writing to file '%s'... success" % (output_file_path), args.debug)
        return [output_file_path]
    except Exception:
        print_error("Writing to file '%s'... failure" % (output_file_path))
        return None
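
Note the pretty_print keyword: it implies etree here is lxml.etree, since the standard library's xml.etree.ElementTree write() does not accept pretty_print:

from lxml import etree  # xml.etree.ElementTree would raise TypeError on pretty_print=True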
Example #8
 def find_files(entries):
     found = {}
     for entry in entries:
         if entry.endswith(".ifo"):
             found["d.ifo"] = entry
             break
     if "d.ifo" not in found:
         print_error(
             "Cannot find .ifo file in the given StarDict file (see StarDict spec)"
         )
         return {}
     # remove .ifo extension
     base = found["d.ifo"][:-4]
     # attempt to find these ones
     tentative_idx = base + ".idx"
     tentative_idx_gz = base + ".idx.gz"
     tentative_dict = base + ".dict"
     tentative_dict_dz = base + ".dict.dz"
     tentative_dz = base + ".dz"
     if tentative_idx in entries:
         found["d.idx"] = tentative_idx
     if tentative_idx_gz in entries:
         found["d.idx.gz"] = tentative_idx_gz
     if not (("d.idx" in found) or ("d.idx.gz" in found)):
         print_error(
             "Cannot find .idx or .idx.gz file in the given StarDict file (see StarDict spec)"
         )
         return {}
     if tentative_dict in entries:
         found["d.dict"] = tentative_dict
     if tentative_dict_dz in entries:
         found["d.dict.dz"] = tentative_dict_dz
     if tentative_dz in entries:
         found["d.dz"] = tentative_dz
     if not (("d.dict" in found) or ("d.dict.dz" in found) or
             ("d.dz" in found)):
         print_error(
             "Cannot find .dict, .dict.dz, or .dz file in the given StarDict file (see StarDict spec)"
         )
         return {}
     # syn is optional
     tentative_syn = base + ".syn"
     if tentative_syn in entries:
         found["d.syn"] = tentative_syn
     return found
Example #9
 def find_files(entries):
     found = {}
     for entry in entries:
         if entry.endswith(".ifo"):
             found["d.ifo"] = entry
             break
     if not "d.ifo" in found:
         print_error("Cannot find .ifo file in the given StarDict file (see StarDict spec)")
         return {}
     # remove .ifo extension
     base = found["d.ifo"][:-4]
     # attempt to find these ones
     tentative_idx = base + ".idx"
     tentative_idx_gz = base + ".idx.gz"
     tentative_dict = base + ".dict"
     tentative_dict_dz = base + ".dict.dz"
     tentative_dz = base + ".dz"
     if tentative_idx in entries:
         found["d.idx"] = tentative_idx
     if tentative_idx_gz in entries:
         found["d.idx.gz"] = tentative_idx_gz
     if not (("d.idx" in found) or ("d.idx.gz" in found)):
         print_error("Cannot find .idx or .idx.gz file in the given StarDict file (see StarDict spec)")
         return {}
     if tentative_dict in entries:
         found["d.dict"] = tentative_dict
     if tentative_dict_dz in entries:
         found["d.dict.dz"] = tentative_dict_dz
     if tentative_dz in entries:
         found["d.dz"] = tentative_dz
     if not (("d.dict" in found) or ("d.dict.dz" in found) or ("d.dz" in found)):
         print_error("Cannot find .dict, .dict.dz, or .dz file in the given StarDict file (see StarDict spec)")
         return {}
     # syn is optional
     tentative_syn = base + ".syn"
     if tentative_syn in entries:
         found["d.syn"] = tentative_syn
     return found
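
A quick illustration of the lookup with a made-up archive listing:

entries = ["foo.ifo", "foo.idx", "foo.dict.dz"]
find_files(entries)
# -> {"d.ifo": "foo.ifo", "d.idx": "foo.idx", "d.dict.dz": "foo.dict.dz"}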
Example #10
def read(dictionary, args, input_file_paths):
    print_error("Read function not implemented for EPUB dictionaries")
    return None
Example #11
def write(dictionary, args, output_file_path):
    # result to be returned
    result = None

    # get absolute path
    output_file_path_absolute = os.path.abspath(output_file_path)

    # sort by headword, optionally ignoring case
    dictionary.sort(by_headword=True, ignore_case=args.sort_ignore_case)

    # create groups
    special_group, group_keys, group_dict = dictionary.group(
        prefix_function_path=args.group_by_prefix_function,
        prefix_length=int(args.group_by_prefix_length),
        merge_min_size=int(args.group_by_prefix_merge_min_size),
        merge_across_first=args.group_by_prefix_merge_across_first)
    all_group_keys = group_keys
    if special_group is not None:
        all_group_keys += [u"SPECIAL"]

    # create mobi object
    mobi = DictionaryEbook(ebook_format=DictionaryEbook.MOBI, args=args)

    # add groups
    for key in all_group_keys:
        if key == u"SPECIAL":
            group_entries = special_group
        else:
            group_entries = group_dict[key]
        mobi.add_group(key, group_entries)

    # create output file
    print_debug("Writing to file '%s'..." % (output_file_path_absolute),
                args.debug)
    mobi.write(output_file_path_absolute, compress=False)
    result = [output_file_path]
    print_debug("Writing to file '%s'... done" % (output_file_path_absolute),
                args.debug)

    # run kindlegen
    tmp_path = mobi.get_tmp_path()
    if args.mobi_no_kindlegen:
        print_info("Not running kindlegen, the raw files are located in '%s'" %
                   tmp_path)
        result = [tmp_path]
    else:
        try:
            print_debug("Creating .mobi file with kindlegen...", args.debug)
            kindlegen_path = KINDLEGEN
            opf_file_path_absolute = os.path.join(tmp_path, "OEBPS",
                                                  "content.opf")
            mobi_file_path_relative = u"content.mobi"
            mobi_file_path_absolute = os.path.join(tmp_path, "OEBPS",
                                                   mobi_file_path_relative)
            if args.kindlegen_path is None:
                print_info("  Running '%s' from $PATH" % KINDLEGEN)
            else:
                kindlegen_path = args.kindlegen_path
                print_info("  Running '%s' from '%s'" %
                           (KINDLEGEN, kindlegen_path))
            proc = subprocess.Popen(
                [kindlegen_path, opf_file_path_absolute, "-o", mobi_file_path_relative],
                stdout=subprocess.PIPE,
                stdin=subprocess.PIPE,
                stderr=subprocess.PIPE)
            output = proc.communicate()
            if args.debug:
                output_unicode = (output[0]).decode("utf-8")
                print_debug(output_unicode, args.debug)
            copy_file(mobi_file_path_absolute, output_file_path_absolute)
            result = [output_file_path]
            print_debug("Creating .mobi file with kindlegen... done",
                        args.debug)
        except OSError as exc:
            print_error("  Unable to run '%s' as '%s'" %
                        (KINDLEGEN, kindlegen_path))
            print_error("  Please make sure '%s':" % KINDLEGEN)
            print_error("    1. is available on your $PATH or")
            print_error("    2. specify its path with --kindlegen-path")

    # delete tmp directory
    tmp_path = mobi.get_tmp_path()
    if args.keep:
        print_info("Not deleting temp dir '%s'" % (tmp_path))
    else:
        mobi.delete()
        print_debug("Deleted temp dir '%s'" % (tmp_path), args.debug)

    return result
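
Before spawning the process, one could verify that the binary is resolvable at all; a small sketch (Python 3.3+, reusing the kindlegen_path variable from above):

import shutil

if shutil.which(kindlegen_path) is None:
    print_error("kindlegen not found; add it to $PATH or pass --kindlegen-path")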
Example #12
    def read_single_dict(dictionary, args, single_dict):
        # create tmp directory
        tmp_path = create_temp_directory()
        print_debug("Working in temp dir '%s'" % (tmp_path), args.debug)

        if len(single_dict) == 1:
            print_debug("Unzipping .install file...", args.debug)
            zip_file_path = single_dict[0]
            idx_file_path = os.path.join(tmp_path, "d.dict.idx")
            dict_file_path = os.path.join(tmp_path, "d.dict")
            zip_file_obj = zipfile.ZipFile(zip_file_path, "r")
            for entry in zip_file_obj.namelist():
                if entry.endswith(".dict.idx"):
                    zip_entry = zip_file_obj.open(entry)
                    idx_file_obj = open(idx_file_path, "wb")
                    idx_file_obj.write(zip_entry.read())
                    idx_file_obj.close()
                    zip_entry.close()
                elif entry.endswith(".dict"):
                    zip_entry = zip_file_obj.open(entry)
                    dict_file_obj = open(dict_file_path, "wb")
                    dict_file_obj.write(zip_entry.read())
                    dict_file_obj.close()
                    zip_entry.close()
            zip_file_obj.close()
            print_debug("Unzipping .install file... done", args.debug)
        else:
            print_debug("Files .dict.idx and .dict already uncompressed...", args.debug)
            idx_file_path = single_dict[0]
            dict_file_path = single_dict[1]
            for file_path in [idx_file_path, dict_file_path]:
                if not os.path.exists(file_path):
                    print_error("File '%s' does not exist" % file_path)
                    return False
            print_debug("Files .dict.idx and .dict already uncompressed... done", args.debug)

        # unzip .dict file into tmp_path
        print_debug("Unzipping .dict file...", args.debug)
        zip_file_obj = zipfile.ZipFile(dict_file_path, "r")
        for entry in zip_file_obj.namelist():
            if not entry.endswith("/"):
                zip_entry = zip_file_obj.open(entry)
                entry_file_path = os.path.join(tmp_path, os.path.basename(entry))
                entry_file_obj = open(entry_file_path, "wb")
                entry_file_obj.write(zip_entry.read())
                entry_file_obj.close()
                zip_entry.close()
        zip_file_obj.close()
        print_debug("Unzipping .dict file... done", args.debug)

        # read .dict.idx
        print_debug("Reading .dict.idx file...", args.debug)
        sql_connection = sqlite3.connect(idx_file_path)
        sql_cursor = sql_connection.cursor()
        sql_cursor.execute("select * from T_DictIndex")
        index_data = sql_cursor.fetchall()
        chunk_index_to_entries = {}
        max_chunk_index = 1
        for index_entry in index_data:
            headword = index_entry[1]
            if args.ignore_case:
                headword = headword.lower()
            offset = index_entry[2]
            size = index_entry[3]
            chunk_index = index_entry[4]
            if chunk_index not in chunk_index_to_entries:
                chunk_index_to_entries[chunk_index] = []
            if chunk_index > max_chunk_index:
                max_chunk_index = chunk_index
            chunk_index_to_entries[chunk_index].append([headword, offset, size])
        sql_cursor.close()
        sql_connection.close()
        print_debug("Reading .dict.idx file... done", args.debug)

        # read c_* files
        print_debug("Reading c_* files...", args.debug)
        for chunk_index in range(1, max_chunk_index + 1):
            print_debug("  Reading c_%d file..." % (chunk_index), args.debug)
            chunk_file_path = os.path.join(tmp_path, "%s%d" % (CHUNK_FILE_PREFIX, chunk_index))
            chunk_file_obj = open(chunk_file_path, "rb")
            for entry in chunk_index_to_entries[chunk_index]:
                headword = entry[0]
                offset = entry[1]
                size = entry[2]
                chunk_file_obj.seek(offset)
                definition_bytes = chunk_file_obj.read(size)
                definition_unicode = definition_bytes.decode(args.input_file_encoding)
                dictionary.add_entry(headword=headword, definition=definition_unicode)
            chunk_file_obj.close()
            print_debug("  Reading c_%d file... done" % (chunk_index), args.debug)
        print_debug("Reading c_* files... done", args.debug)

        # delete tmp directory
        if args.keep:
            print_info("Not deleting temp dir '%s'" % (tmp_path))
        else:
            delete_directory(tmp_path)
            print_debug("Deleted temp dir '%s'" % (tmp_path), args.debug)
        return True 
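
The positional indexing above implies a five-column T_DictIndex row; an equivalent, more readable unpacking (column 0 is not used here):

for _unused, headword, offset, size, chunk_index in index_data:
    pass  # same fields as index_entry[1] .. index_entry[4] above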
Example #13
    def read_single_file(dictionary, args, input_file_path):
        # result flag
        result = False

        # create a tmp file
        tmp_handler, tmp_path = create_temp_file()

        # copy the index file from the zip to the tmp file
        input_file_obj = zipfile.ZipFile(input_file_path)
        tmp_file_obj = io.open(tmp_path, "wb")
        tmp_file_obj.write(input_file_obj.read(WORDS_FILE_NAME))
        tmp_file_obj.close()
        input_file_obj.close()

        # read index with MARISA
        try:
            # call MARISA with marisa_trie module
            import marisa_trie
            trie = marisa_trie.Trie()
            trie.load(tmp_path)
            for pair in trie.items():
                dictionary.add_entry(headword=pair[0], definition=u"")
            result = True
        except ImportError as exc:
            # call MARISA with subprocess
            print_info("  MARISA cannot be imported as Python module. You might want to install it with:")
            print_info("  $ [sudo] pip install marisa_trie")
            marisa_reverse_lookup_path = MARISA_REVERSE_LOOKUP
            if args.marisa_bin_path is None:
                print_info("  Running '%s' from $PATH" % MARISA_REVERSE_LOOKUP)
            else:
                marisa_reverse_lookup_path = os.path.join(args.marisa_bin_path, MARISA_REVERSE_LOOKUP)
                print_info("  Running '%s' from '%s'" % (MARISA_REVERSE_LOOKUP, args.marisa_bin_path))
            # TODO this is ugly, but it works
            query = (u"\n".join([str(x) for x in range(int(args.marisa_index_size))]) + u"\n").encode("utf-8")

            try:
                proc = subprocess.Popen(
                    [marisa_reverse_lookup_path, tmp_path],
                    stdout=subprocess.PIPE,
                    stdin=subprocess.PIPE,
                    stderr=subprocess.PIPE
                )
                stdout = proc.communicate(input=query)[0].decode("utf-8")
                for line in stdout.splitlines():
                    array = line.split("\t")
                    if len(array) >= 2:
                        key = array[1]
                        if args.ignore_case:
                            key = key.lower()
                        dictionary.add_entry(headword=key, definition=u"")
                result = True
            except OSError as exc:
                print_error("  Unable to run '%s' as '%s'" % (MARISA_REVERSE_LOOKUP, marisa_reverse_lookup_path))
                print_error("  Please make sure '%s':" % MARISA_REVERSE_LOOKUP)
                print_error("    1. is available on your $PATH or")
                print_error("    2. specify its path with --marisa-bin-path or")
                print_error("    3. install the marisa_trie Python module")
        except Exception:
            print_debug("Reading from file '%s'... failed" % (input_file_path), args.debug)

        # delete the tmp file
        delete_file(tmp_handler, tmp_path)
        return result
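
For reference, marisa_trie.Trie.items() yields (key, id) pairs, which is why only pair[0] is used above:

import marisa_trie
trie = marisa_trie.Trie([u"cat", u"dog"])
trie.items()  # [(u"cat", ...), (u"dog", ...)] -- ids are assigned internally by the trie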
Example #14
def write(dictionary, args, output_file_path):
    # result to be returned
    result = None

    # get absolute path
    output_file_path_absolute = os.path.abspath(output_file_path)

    # create tmp directory
    cwd = os.getcwd()
    tmp_path = create_temp_directory()
    print_debug("Working in temp dir '%s'" % (tmp_path), args.debug)
    os.chdir(tmp_path)

    # sort by headword
    dictionary.sort(by_headword=True)

    # group by prefix
    files_to_compress = []
    prefix_length = int(args.group_by_prefix_length)
    special_group, group_keys, group_dict = dictionary.group(
        prefix_function=get_prefix_kobo,
        prefix_length=prefix_length,
        merge_min_size=int(args.group_by_prefix_merge_min_size),
        merge_across_first=args.group_by_prefix_merge_across_first
    )
    if special_group is not None:
        special_group_key = u"1" * prefix_length
        group_dict[special_group_key] = special_group
        group_keys = [special_group_key] + group_keys

    # write files
    for key in group_keys:
        # write html file
        file_html_path = key + u".html"
        file_html_obj = io.open(file_html_path, "wb")
        file_html_obj.write(u"<?xml version=\"1.0\" encoding=\"utf-8\"?><html>".encode("utf-8"))
        for entry in group_dict[key]:
            headword = entry.headword
            definition = entry.definition
            file_html_obj.write((u"<w><a name=\"%s\"/><div><b>%s</b><br/>%s</div></w>" % (headword, headword, definition)).encode("utf-8"))
        file_html_obj.write((u"</html>").encode("utf-8"))
        file_html_obj.close()

        # compress in gz format
        file_html_obj = io.open(file_html_path, "rb")
        file_gz_path = file_html_path + u".gz"
        file_gz_obj = gzip.open(file_gz_path, "wb")
        file_gz_obj.writelines(file_html_obj)
        file_gz_obj.close()
        file_html_obj.close()

        # delete .html file
        delete_file(None, file_html_path)
        # rename .html.gz file into .html
        rename_file(file_gz_path, file_html_path)
        files_to_compress.append(file_html_path)

    # write words
    file_words_path = WORDS_FILE_NAME
    keys = sorted(dictionary.entries_index.keys())
    try:
        import marisa_trie
        trie = marisa_trie.Trie(keys)
        trie.save(file_words_path)
        result = [file_words_path]
    except ImportError as exc:
        # call MARISA with subprocess
        print_info("  MARISA cannot be imported as Python module. You might want to install it with:")
        print_info("  $ [sudo] pip install marisa_trie")
        marisa_build_path = MARISA_BUILD
        if args.marisa_bin_path is None:
            print_info("  Running '%s' from $PATH" % MARISA_BUILD)
        else:
            marisa_build_path = os.path.join(args.marisa_bin_path, MARISA_BUILD)
            print_info("  Running '%s' from '%s'" % (MARISA_BUILD, args.marisa_bin_path))
        # TODO this is ugly, but it works
        query = (u"\n".join([x for x in keys]) + u"\n").encode("utf-8")

        try:
            proc = subprocess.Popen(
                [marisa_build_path, "-l", "-o", file_words_path],
                stdout=subprocess.PIPE,
                stdin=subprocess.PIPE,
                stderr=subprocess.PIPE
            )
            proc.communicate(input=query)[0].decode("utf-8")
            result = [file_words_path]
        except OSError as exc:
            print_error("  Unable to run '%s' as '%s'" % (MARISA_BUILD, marisa_build_path))
            print_error("  Please make sure '%s':" % MARISA_BUILD)
            print_error("    1. is available on your $PATH or")
            print_error("    2. specify its path with --marisa-bin-path or")
            print_error("    3. install the marisa_trie Python module")
            result = None

    if result is not None:
        # add file_words_path to files to compress
        files_to_compress.append(file_words_path)
        # create output zip file
        try:
            print_debug("Writing to file '%s'..." % (output_file_path_absolute), args.debug)
            file_zip_obj = zipfile.ZipFile(output_file_path_absolute, "w", zipfile.ZIP_DEFLATED)
            for file_to_compress in files_to_compress:
                file_to_compress = os.path.basename(file_to_compress)
                file_zip_obj.write(file_to_compress)
            file_zip_obj.close()
            result = [output_file_path]
            print_debug("Writing to file '%s'... success" % (output_file_path_absolute), args.debug)
        except Exception:
            print_error("Writing to file '%s'... failure" % (output_file_path_absolute))

    # delete tmp directory
    os.chdir(cwd)
    if args.keep:
        print_info("Not deleting temp dir '%s'" % (tmp_path))
    else:
        delete_directory(tmp_path)
        print_debug("Deleted temp dir '%s'" % (tmp_path), args.debug)

    return result
Example #15
def read(dictionary, args, input_file_paths):
    def find_files(entries):
        found = {}
        for entry in entries:
            if entry.endswith(".ifo"):
                found["d.ifo"] = entry
                break
        if "d.ifo" not in found:
            print_error(
                "Cannot find .ifo file in the given StarDict file (see StarDict spec)"
            )
            return {}
        # remove .ifo extension
        base = found["d.ifo"][:-4]
        # attempt to find these ones
        tentative_idx = base + ".idx"
        tentative_idx_gz = base + ".idx.gz"
        tentative_dict = base + ".dict"
        tentative_dict_dz = base + ".dict.dz"
        tentative_dz = base + ".dz"
        if tentative_idx in entries:
            found["d.idx"] = tentative_idx
        if tentative_idx_gz in entries:
            found["d.idx.gz"] = tentative_idx_gz
        if not (("d.idx" in found) or ("d.idx.gz" in found)):
            print_error(
                "Cannot find .idx or .idx.gz file in the given StarDict file (see StarDict spec)"
            )
            return {}
        if tentative_dict in entries:
            found["d.dict"] = tentative_dict
        if tentative_dict_dz in entries:
            found["d.dict.dz"] = tentative_dict_dz
        if tentative_dz in entries:
            found["d.dz"] = tentative_dz
        if not (("d.dict" in found) or ("d.dict.dz" in found) or
                ("d.dz" in found)):
            print_error(
                "Cannot find .dict, .dict.dz, or .dz file in the given StarDict file (see StarDict spec)"
            )
            return {}
        # syn is optional
        tentative_syn = base + ".syn"
        if tentative_syn in entries:
            found["d.syn"] = tentative_syn
        return found

    def uncompress_file(compressed_path, tmp_path, key):
        uncompressed_path = os.path.join(tmp_path, key)
        u_obj = io.open(uncompressed_path, "wb")
        c_obj = gzip.open(compressed_path, "rb")
        u_obj.write(c_obj.read())
        c_obj.close()
        u_obj.close()
        print_debug("Uncompressed %s" % (uncompressed_path), args.debug)
        return uncompressed_path

    def read_ifo(ifo_path, has_syn, args):
        ifo_dict = {}
        ifo_obj = io.open(ifo_path, "rb")
        ifo_bytes = ifo_obj.read()  # bytes
        ifo_unicode = ifo_bytes.decode(
            "utf-8")  # unicode, always utf-8 by spec
        ifo_obj.close()
        for line in ifo_unicode.splitlines():
            array = line.split("=")
            if len(array) >= 2:
                key = array[0]
                val = "=".join(array[1:])
                ifo_dict[key] = val

        if "version" not in ifo_dict:
            print_error(
                "No 'version' found in the .ifo file (see StarDict spec)")
            return None
        if ifo_dict["version"] not in ["2.4.2", "3.0.0"]:
            print_error(
                "The .ifo file must have a 'version' value equal to '2.4.2' or '3.0.0' (see StarDict spec)"
            )
            return None

        required_keys = ["bookname", "wordcount", "idxfilesize"]
        if has_syn:
            required_keys.append("synwordcount")
        # TODO not used => disabling this
        # if ifo_dict["version"] == "3.0.0":
        #     required_keys.append("idxoffsetbits")
        for key in required_keys:
            if key not in ifo_dict:
                print_error(
                    "No '%s' found in the .ifo file (see StarDict spec)" % key)
                return None

        ifo_dict["wordcount"] = int(ifo_dict["wordcount"])
        ifo_dict["idxfilesize"] = int(ifo_dict["idxfilesize"])
        if has_syn:
            ifo_dict["synwordcount"] = int(ifo_dict["synwordcount"])
        # TODO not used => disabling this
        # if ifo_dict["version"] == "3.0.0":
        #     ifo_dict["idxoffsetbits"] = int(ifo_dict["idxoffsetbits"])

        if args.sd_ignore_sametypesequence:
            print_debug("Ignoring sametypesequence value", args.debug)
        else:
            # TODO limitation: we require sametypesequence to be present
            if "sametypesequence" not in ifo_dict:
                print_error(
                    "The .ifo file must have a 'sametypesequence' value (see README)."
                )
                return None
            # TODO limitation: we require sametypesequence to have a value in SAMETYPESEQUENCE_SUPPORTED_VALUES
            if ifo_dict["sametypesequence"] not in SAMETYPESEQUENCE_SUPPORTED_VALUES:
                print_error(
                    "The .ifo file must have a 'sametypesequence' value of %s (see README)."
                    % "|".join(SAMETYPESEQUENCE_SUPPORTED_VALUES))
                return None

        return ifo_dict

    def read_single_file(dictionary, args, input_file_path):
        # result flag
        result = False

        # create a tmp directory
        tmp_path = create_temp_directory()
        print_debug("Working in temp dir '%s'" % (tmp_path), args.debug)

        # find .ifo, .idx, .dict[.dz] and .syn files inside the zip
        # and extract them to tmp_path
        input_file_obj = zipfile.ZipFile(input_file_path)
        found_files = find_files(input_file_obj.namelist())
        extracted_files = {}
        if len(found_files) > 0:
            for key in found_files:
                entry = found_files[key]
                ext_file_path = os.path.join(tmp_path, key)
                ext_file_obj = io.open(ext_file_path, "wb")
                zip_entry = input_file_obj.open(entry)
                ext_file_obj.write(zip_entry.read())
                zip_entry.close()
                ext_file_obj.close()
                print_debug("Extracted %s" % (ext_file_path), args.debug)
                extracted_files[key] = ext_file_path
                # extract from compressed file, but only if ".idx" is not present as well
                if (key == "d.idx.gz") and ("d.idx" not in found_files):
                    extracted_files["d.idx"] = uncompress_file(
                        ext_file_path, tmp_path, "d.idx")
                # extract from compressed file, but only if ".dict" is not present as well
                if ((key == "d.dict.dz") or
                    (key == "d.dz")) and ("d.dict" not in found_files):
                    extracted_files["d.dict"] = uncompress_file(
                        ext_file_path, tmp_path, "d.dict")
        input_file_obj.close()

        # here we have d.ifo, d.idx and d.dict (all uncompressed) and possibly d.syn

        has_syn = "d.syn" in extracted_files
        if (has_syn) and (args.ignore_synonyms):
            has_syn = False
            print_debug(
                "Dictionary has synonyms, but ignoring them (--ignore-synonym)",
                args.debug)
        ifo_dict = read_ifo(extracted_files["d.ifo"], has_syn, args)
        print_debug("Read .ifo file with values:\n%s" % (str(ifo_dict)),
                    args.debug)

        # read dict file
        dict_file_obj = io.open(extracted_files["d.dict"], "rb")
        dict_file_bytes = dict_file_obj.read()
        dict_file_obj.close()

        # read idx file
        idx_file_obj = io.open(extracted_files["d.idx"], "rb")
        byte_read = idx_file_obj.read(1)
        headword = b""
        while byte_read:
            if byte_read == b"\0":
                # end of current word: read offset and size
                offset_bytes = idx_file_obj.read(4)
                offset_int = int((struct.unpack('>i', offset_bytes))[0])
                size_bytes = idx_file_obj.read(4)
                size_int = int((struct.unpack('>i', size_bytes))[0])
                definition = dict_file_bytes[offset_int:offset_int + size_int].decode(args.input_file_encoding)
                headword = headword.decode("utf-8")
                if args.ignore_case:
                    headword = headword.lower()
                dictionary.add_entry(headword=headword, definition=definition)
                headword = b""
            else:
                # read next byte
                headword += byte_read
            byte_read = idx_file_obj.read(1)
        idx_file_obj.close()
        result = True

        # read syn file, if present
        if has_syn:
            print_debug(
                "The input StarDict file contains a .syn file, parsing it...",
                args.debug)
            result = False
            syn_file_obj = io.open(extracted_files["d.syn"], "rb")
            byte_read = syn_file_obj.read(1)
            synonym = b""
            while byte_read:
                if byte_read == b"\0":
                    # end of current synonym: read index of original word
                    index_bytes = syn_file_obj.read(4)
                    index_int = int((struct.unpack('>i', index_bytes))[0])
                    synonym = synonym.decode("utf-8")
                    if index_int < len(dictionary):
                        dictionary.add_synonym(synonym=synonym,
                                               headword_index=index_int)
                    else:
                        # emit a warning?
                        print_debug(
                            "Synonym '%s' points to index %d >= len(dictionary), skipping it"
                            % (synonym, index_int), args.debug)
                    synonym = b""
                else:
                    # read next byte
                    synonym += byte_read
                byte_read = syn_file_obj.read(1)
            syn_file_obj.close()
            result = True
            print_debug(
                "The input StarDict file contains a .syn file, parsing it... done",
                args.debug)
        else:
            print_debug("The input StarDict file does not contain a .syn file",
                        args.debug)

        # delete tmp directory
        if args.keep:
            print_info("Not deleting temp dir '%s'" % (tmp_path))
        else:
            delete_directory(tmp_path)
            print_debug("Deleted temp dir '%s'" % (tmp_path), args.debug)

        return result

    for input_file_path in input_file_paths:
        print_debug("Reading from file '%s'..." % (input_file_path),
                    args.debug)
        result = read_single_file(dictionary, args, input_file_path)
        if result:
            print_debug(
                "Reading from file '%s'... success" % (input_file_path),
                args.debug)
        else:
            print_error("Reading from file '%s'... failed" % (input_file_path))
            return None
    return dictionary
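
The .idx parsing loop above follows the StarDict index record layout: a NUL-terminated UTF-8 headword followed by two 32-bit big-endian integers (offset and size into the .dict data). Building one record for illustration:

import struct

record = b"word\x00" + struct.pack(">ii", 0, 42)  # headword, offset=0, size=42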
Example #16
def write(dictionary, args, output_file_path):
    # result to be returned
    result = None

    # get absolute path
    output_file_path_absolute = os.path.abspath(output_file_path)

    # get absolute path for collation function file 
    bookeen_collation_function_path = None
    if args.bookeen_collation_function is not None:
        bookeen_collation_function_path = os.path.abspath(args.bookeen_collation_function)

    # create tmp directory
    cwd = os.getcwd()
    tmp_path = create_temp_directory()
    print_debug("Working in temp dir '%s'" % (tmp_path), args.debug)
    os.chdir(tmp_path)

    # get the basename
    base = os.path.basename(output_file_path)
    if base.endswith(".zip"):
        base = base[:-4]

    # copy empty.idx into tmp_path
    idx_file_path = base + u".dict.idx"
    dict_file_path = base + u".dict"
    copy_file(EMPTY_FILE_PATH, idx_file_path)

    # open index
    sql_connection = sqlite3.connect(idx_file_path)

    # install collation in the index
    collation_function = collate_function_default
    if bookeen_collation_function_path is not None:
        try:
            collation_function = imp.load_source("", bookeen_collation_function_path).collate_function
            print_debug("Using collation function from '%s'" % (bookeen_collation_function_path), args.debug)
        except Exception:
            print_error("Unable to load collation function from '%s'. Using the default collation function instead." % (bookeen_collation_function_path))
    sql_connection.create_collation("IcuNoCase", collation_function)
    sql_connection.text_factory = str

    # get a cursor and delete any data from the index file
    sql_cursor = sql_connection.cursor()
    sql_cursor.execute("delete from T_DictIndex")

    # write c_* files
    # each c_* file has MAX_CHUNK_SIZE < size <= (MAX_CHUNK_SIZE * 2) bytes (tentatively)
    print_debug("Writing c_* files...", args.debug)
    files_to_compress = []
    current_offset = 0
    chunk_index = 1
    chunk_file_path = "%s%d" % (CHUNK_FILE_PREFIX, chunk_index)
    files_to_compress.append(chunk_file_path)
    chunk_file_obj = open(chunk_file_path, "wb")
    for entry_index in dictionary.entries_index_sorted:
        entry = dictionary.entries[entry_index]
        definition_bytes = entry.definition.encode("utf-8")
        definition_size = len(definition_bytes)
        chunk_file_obj.write(definition_bytes)
        # insert headword into index file
        sql_tuple = (0, entry.headword, current_offset, definition_size, chunk_index)
        sql_cursor.execute("insert into T_DictIndex values (?,?,?,?,?)", sql_tuple)
        # insert synonyms into index file
        if not args.ignore_synonyms:
            for synonym in entry.get_synonyms():
                sql_tuple = (0, synonym[0], current_offset, definition_size, chunk_index)
                sql_cursor.execute("insert into T_DictIndex values (?,?,?,?,?)", sql_tuple)
        # update offset
        current_offset += definition_size
        # if we reached CHUNK_SIZE, open the next c_* file
        if current_offset > CHUNK_SIZE:
            chunk_file_obj.close()
            chunk_index += 1
            chunk_file_path = "%s%d" % (CHUNK_FILE_PREFIX, chunk_index)
            files_to_compress.append(chunk_file_path)
            chunk_file_obj = open(chunk_file_path, "wb")
            current_offset = 0
    chunk_file_obj.close()
    print_debug("Writing c_* files... done", args.debug)

    # compress
    print_debug("Compressing c_* files...", args.debug)
    file_zip_obj = zipfile.ZipFile(dict_file_path, "w", zipfile.ZIP_DEFLATED)
    for file_to_compress in files_to_compress:
        file_to_compress = os.path.basename(file_to_compress)
        file_zip_obj.write(file_to_compress)
    file_zip_obj.close()
    print_debug("Compressing c_* files... done", args.debug)

    # update index metadata
    print_debug("Updating index metadata...", args.debug)
    header = HEADER % (args.language_from)
    sql_cursor.execute("update T_DictInfo set F_xhtmlHeader=?", (header,))
    sql_cursor.execute("update T_DictInfo set F_LangFrom=?", (args.language_from,))
    sql_cursor.execute("update T_DictInfo set F_LangTo=?", (args.language_to,))
    sql_cursor.execute("update T_DictInfo set F_Licence=?", (args.license,))
    sql_cursor.execute("update T_DictInfo set F_Copyright=?", (args.copyright,))
    sql_cursor.execute("update T_DictInfo set F_Title=?", (args.title,))
    sql_cursor.execute("update T_DictInfo set F_Description=?", (args.description,))
    sql_cursor.execute("update T_DictInfo set F_Year=?", (args.year,))
    # the meaning of the following is unknown 
    sql_cursor.execute("update T_DictInfo set F_Alphabet=?", ("Z",))
    sql_cursor.execute("update T_DictInfo set F_CollationLevel=?", ("1",))
    sql_cursor.execute("update T_DictVersion set F_DictType=?", ("stardict",))
    sql_cursor.execute("update T_DictVersion set F_Version=?", ("11",))
    print_debug("Updating index metadata... done", args.debug)

    # compact and close
    sql_cursor.execute("vacuum")
    sql_cursor.close()
    sql_connection.close()

    # create .install file or copy .dict.idx and .dict into requested output directory
    parent_output_directory = os.path.split(output_file_path_absolute)[0]
    if args.bookeen_install_file:
        print_debug("Creating .install file...", args.debug)
        file_zip_path = os.path.join(parent_output_directory, base + u".install")
        file_zip_obj = zipfile.ZipFile(file_zip_path, "w", zipfile.ZIP_DEFLATED)
        for file_to_compress in [dict_file_path, idx_file_path]:
            file_to_compress = os.path.basename(file_to_compress)
            file_zip_obj.write(file_to_compress)
        file_zip_obj.close()
        result = [file_zip_path]
        print_debug("Creating .install file... done", args.debug)
    else:
        print_debug("Copying .dict.idx and .dict files...", args.debug)
        dict_file_path_final = os.path.join(parent_output_directory, os.path.basename(dict_file_path))
        idx_file_path_final = os.path.join(parent_output_directory, os.path.basename(idx_file_path))
        copy_file(dict_file_path, dict_file_path_final)
        copy_file(idx_file_path, idx_file_path_final)
        result = [idx_file_path_final, dict_file_path_final]
        print_debug("Copying .dict.idx and .dict files... done", args.debug)

    # delete tmp directory
    os.chdir(cwd)
    if args.keep:
        print_info("Not deleting temp dir '%s'" % (tmp_path))
    else:
        delete_directory(tmp_path)
        print_debug("Deleted temp dir '%s'" % (tmp_path), args.debug)

    return result
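
Note that imp.load_source is deprecated (and removed in Python 3.12); an equivalent loader using importlib, assuming the plug-in file defines collate_function as above:

import importlib.util

spec = importlib.util.spec_from_file_location("collation", bookeen_collation_function_path)
module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(module)
collation_function = module.collate_function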
Example #17
def write(dictionary, args, output_file_path):
    # result to be returned
    result = None

    # get absolute path
    output_file_path_absolute = os.path.abspath(output_file_path)

    # get absolute path for collation function file
    bookeen_collation_function_path = None
    if args.bookeen_collation_function is not None:
        bookeen_collation_function_path = os.path.abspath(args.bookeen_collation_function)

    # create tmp directory
    cwd = os.getcwd()
    tmp_path = create_temp_directory()
    print_debug("Working in temp dir '%s'" % (tmp_path), args.debug)
    os.chdir(tmp_path)

    # get the basename
    base = os.path.basename(output_file_path)
    if base.endswith(".zip"):
        base = base[:-4]

    # copy empty.idx into tmp_path
    idx_file_path = base + u".dict.idx"
    dict_file_path = base + u".dict"
    copy_file(EMPTY_FILE_PATH, idx_file_path)

    # open index
    sql_connection = sqlite3.connect(idx_file_path)

    # install collation in the index
    collation_function = collate_function_default
    if bookeen_collation_function_path is not None:
        try:
            collation_function = imp.load_source("", bookeen_collation_function_path).collate_function
            print_debug("Using collation function from '%s'" % (bookeen_collation_function_path), args.debug)
        except Exception:
            print_error("Unable to load collation function from '%s'. Using the default collation function instead." % (bookeen_collation_function_path))
    sql_connection.create_collation("IcuNoCase", collation_function)
    sql_connection.text_factory = str

    # get a cursor and delete any data from the index file
    sql_cursor = sql_connection.cursor()
    sql_cursor.execute("delete from T_DictIndex")

    # write c_* files
    # each c_* file has MAX_CHUNK_SIZE < size <= (MAX_CHUNK_SIZE * 2) bytes (tentatively)
    print_debug("Writing c_* files...", args.debug)
    files_to_compress = []
    current_offset = 0
    chunk_index = 1
    chunk_file_path = "%s%d" % (CHUNK_FILE_PREFIX, chunk_index)
    files_to_compress.append(chunk_file_path)
    chunk_file_obj = io.open(chunk_file_path, "wb")
    for entry_index in dictionary.entries_index_sorted:
        entry = dictionary.entries[entry_index]
        definition_bytes = entry.definition.encode("utf-8")
        definition_size = len(definition_bytes)
        chunk_file_obj.write(definition_bytes)
        # insert headword into index file
        sql_tuple = (0, entry.headword, current_offset, definition_size, chunk_index)
        sql_cursor.execute("insert into T_DictIndex values (?,?,?,?,?)", sql_tuple)
        # insert synonyms into index file
        if not args.ignore_synonyms:
            for synonym in entry.get_synonyms():
                sql_tuple = (0, synonym[0], current_offset, definition_size, chunk_index)
                sql_cursor.execute("insert into T_DictIndex values (?,?,?,?,?)", sql_tuple)
        # update offset
        current_offset += definition_size
        # if we reached CHUNK_SIZE, open the next c_* file
        if current_offset > CHUNK_SIZE:
            chunk_file_obj.close()
            chunk_index += 1
            chunk_file_path = "%s%d" % (CHUNK_FILE_PREFIX, chunk_index)
            files_to_compress.append(chunk_file_path)
            chunk_file_obj = io.open(chunk_file_path, "wb")
            current_offset = 0
    chunk_file_obj.close()
    print_debug("Writing c_* files... done", args.debug)

    # compress
    print_debug("Compressing c_* files...", args.debug)
    file_zip_obj = zipfile.ZipFile(dict_file_path, "w", zipfile.ZIP_DEFLATED)
    for file_to_compress in files_to_compress:
        file_to_compress = os.path.basename(file_to_compress)
        file_zip_obj.write(file_to_compress)
    file_zip_obj.close()
    print_debug("Compressing c_* files... done", args.debug)

    # update index metadata
    print_debug("Updating index metadata...", args.debug)
    header = HEADER % (args.language_from)
    sql_cursor.execute("update T_DictInfo set F_xhtmlHeader=?", (header,))
    sql_cursor.execute("update T_DictInfo set F_LangFrom=?", (args.language_from,))
    sql_cursor.execute("update T_DictInfo set F_LangTo=?", (args.language_to,))
    sql_cursor.execute("update T_DictInfo set F_Licence=?", (args.license,))
    sql_cursor.execute("update T_DictInfo set F_Copyright=?", (args.copyright,))
    sql_cursor.execute("update T_DictInfo set F_Title=?", (args.title,))
    sql_cursor.execute("update T_DictInfo set F_Description=?", (args.description,))
    sql_cursor.execute("update T_DictInfo set F_Year=?", (args.year,))
    # the meaning of the following is unknown
    sql_cursor.execute("update T_DictInfo set F_Alphabet=?", ("Z",))
    sql_cursor.execute("update T_DictInfo set F_CollationLevel=?", ("1",))
    sql_cursor.execute("update T_DictVersion set F_DictType=?", ("stardict",))
    sql_cursor.execute("update T_DictVersion set F_Version=?", ("11",))
    print_debug("Updating index metadata... done", args.debug)

    # compact and close
    sql_cursor.execute("vacuum")
    sql_cursor.close()
    sql_connection.close()

    # create .install file or copy .dict.idx and .dict into requested output directory
    parent_output_directory = os.path.split(output_file_path_absolute)[0]
    if args.bookeen_install_file:
        print_debug("Creating .install file...", args.debug)
        file_zip_path = os.path.join(parent_output_directory, base + u".install")
        file_zip_obj = zipfile.ZipFile(file_zip_path, "w", zipfile.ZIP_DEFLATED)
        for file_to_compress in [dict_file_path, idx_file_path]:
            file_to_compress = os.path.basename(file_to_compress)
            file_zip_obj.write(file_to_compress)
        file_zip_obj.close()
        result = [file_zip_path]
        print_debug("Creating .install file... done", args.debug)
    else:
        print_debug("Copying .dict.idx and .dict files...", args.debug)
        dict_file_path_final = os.path.join(parent_output_directory, os.path.basename(dict_file_path))
        idx_file_path_final = os.path.join(parent_output_directory, os.path.basename(idx_file_path))
        copy_file(dict_file_path, dict_file_path_final)
        copy_file(idx_file_path, idx_file_path_final)
        result = [idx_file_path_final, dict_file_path_final]
        print_debug("Copying .dict.idx and .dict files... done", args.debug)

    # delete tmp directory
    os.chdir(cwd)
    if args.keep:
        print_info("Not deleting temp dir '%s'" % (tmp_path))
    else:
        delete_directory(tmp_path)
        print_debug("Deleted temp dir '%s'" % (tmp_path), args.debug)

    return result
Example #18
def write(dictionary, args, output_file_path):
    # result to be returned
    result = None

    # get absolute path
    output_file_path_absolute = os.path.abspath(output_file_path)

    # sort by headword, optionally ignoring case
    dictionary.sort(by_headword=True, ignore_case=args.sort_ignore_case)

    # create groups
    special_group, group_keys, group_dict = dictionary.group(
        prefix_function_path=args.group_by_prefix_function,
        prefix_length=int(args.group_by_prefix_length),
        merge_min_size=int(args.group_by_prefix_merge_min_size),
        merge_across_first=args.group_by_prefix_merge_across_first
    )
    all_group_keys = group_keys
    if special_group is not None:
        all_group_keys += [u"SPECIAL"]

    # create mobi object
    mobi = DictionaryEbook(ebook_format=DictionaryEbook.MOBI, args=args)

    # add groups
    for key in all_group_keys:
        if key == u"SPECIAL":
            group_entries = special_group
        else:
            group_entries = group_dict[key]
        mobi.add_group(key, group_entries)

    # create output file
    print_debug("Writing to file '%s'..." % (output_file_path_absolute), args.debug)
    mobi.write(output_file_path_absolute, compress=False)
    result = [output_file_path]
    print_debug("Writing to file '%s'... done" % (output_file_path_absolute), args.debug)

    # run kindlegen
    tmp_path = mobi.get_tmp_path()
    if args.mobi_no_kindlegen:
        print_info("Not running kindlegen, the raw files are located in '%s'" % tmp_path)
        result = [tmp_path]
    else:
        try:
            print_debug("Creating .mobi file with kindlegen...", args.debug)
            kindlegen_path = KINDLEGEN
            opf_file_path_absolute = os.path.join(tmp_path, "OEBPS", "content.opf")
            mobi_file_path_relative = u"content.mobi"
            mobi_file_path_absolute = os.path.join(tmp_path, "OEBPS", mobi_file_path_relative)
            if args.kindlegen_path is None:
                print_info("  Running '%s' from $PATH" % KINDLEGEN)
            else:
                kindlegen_path = args.kindlegen_path
                print_info("  Running '%s' from '%s'" % (KINDLEGEN, kindlegen_path))
            proc = subprocess.Popen(
                [kindlegen_path, opf_file_path_absolute, "-o", mobi_file_path_relative],
                stdout=subprocess.PIPE,
                stdin=subprocess.PIPE,
                stderr=subprocess.PIPE
            )
            output = proc.communicate()
            if args.debug:
                output_unicode = (output[0]).decode("utf-8")
                print_debug(output_unicode, args.debug)
            copy_file(mobi_file_path_absolute, output_file_path_absolute)
            result = [output_file_path]
            print_debug("Creating .mobi file with kindlegen... done", args.debug)
        except OSError as exc:
            print_error("  Unable to run '%s' as '%s'" % (KINDLEGEN, kindlegen_path))
            print_error("  Please make sure '%s':" % KINDLEGEN)
            print_error("    1. is available on your $PATH or")
            print_error("    2. specify its path with --kindlegen-path")

    # delete tmp directory
    tmp_path = mobi.get_tmp_path()
    if args.keep:
        print_info("Not deleting temp dir '%s'" % (tmp_path))
    else:
        mobi.delete()
        print_debug("Deleted temp dir '%s'" % (tmp_path), args.debug)

    return result
Example #19
    def read_single_file(dictionary, args, input_file_path):
        # result flag
        result = False

        # create a tmp file
        tmp_handler, tmp_path = create_temp_file()

        # copy the index file from the zip to the tmp file
        input_file_obj = zipfile.ZipFile(input_file_path)
        tmp_file_obj = io.open(tmp_path, "wb")
        tmp_file_obj.write(input_file_obj.read(WORDS_FILE_NAME))
        tmp_file_obj.close()
        input_file_obj.close()

        # read index with MARISA
        try:
            # call MARISA with marisa_trie module
            import marisa_trie
            trie = marisa_trie.Trie()
            trie.load(tmp_path)
            for pair in trie.items():
                dictionary.add_entry(headword=pair[0], definition=u"")
            result = True
        except ImportError as exc:
            # call MARISA with subprocess
            print_info(
                "  MARISA cannot be imported as Python module. You might want to install it with:"
            )
            print_info("  $ [sudo] pip install marisa_trie")
            marisa_reverse_lookup_path = MARISA_REVERSE_LOOKUP
            if args.marisa_bin_path is None:
                print_info("  Running '%s' from $PATH" % MARISA_REVERSE_LOOKUP)
            else:
                marisa_reverse_lookup_path = os.path.join(
                    args.marisa_bin_path, MARISA_REVERSE_LOOKUP)
                print_info("  Running '%s' from '%s'" %
                           (MARISA_REVERSE_LOOKUP, args.marisa_bin_path))
            # TODO this is ugly, but it works
            query = (u"\n".join(
                [str(x) for x in range(int(args.marisa_index_size))]) +
                     u"\n").encode("utf-8")

            try:
                proc = subprocess.Popen([marisa_reverse_lookup_path, tmp_path],
                                        stdout=subprocess.PIPE,
                                        stdin=subprocess.PIPE,
                                        stderr=subprocess.PIPE)
                stdout = proc.communicate(input=query)[0].decode("utf-8")
                for line in stdout.splitlines():
                    array = line.split("\t")
                    if len(array) >= 2:
                        key = array[1]
                        if args.ignore_case:
                            key = key.lower()
                        dictionary.add_entry(headword=key, definition=u"")
                result = True
            except OSError as exc:
                print_error(
                    "  Unable to run '%s' as '%s'" %
                    (MARISA_REVERSE_LOOKUP, marisa_reverse_lookup_path))
                print_error("  Please make sure '%s':" % MARISA_REVERSE_LOOKUP)
                print_error("    1. is available on your $PATH or")
                print_error(
                    "    2. specify its path with --marisa-bin-path or")
                print_error("    3. install the marisa_trie Python module")
        except Exception:
            print_debug("Reading from file '%s'... failed" % (input_file_path), args.debug)

        # delete the tmp file
        delete_file(tmp_handler, tmp_path)
        return result
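
A minimal, self-contained sketch of the marisa_trie calls the import-based branch above relies on; restore_key() performs the same id-to-key lookup that the marisa-reverse-lookup subprocess fallback reproduces (file name illustrative):

import marisa_trie

# build a trie, persist it, reload it, and enumerate (key, id) pairs
trie = marisa_trie.Trie([u"apple", u"banana", u"cherry"])
trie.save("words.marisa")
loaded = marisa_trie.Trie()
loaded.load("words.marisa")
for key, key_id in loaded.items():
    print(key, key_id)
# reverse lookup: map an id back to its key
print(loaded.restore_key(0))
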
Example #23
def write(dictionary, args, output_file_path):
    # result to be returned
    result = None

    # get absolute path
    output_file_path_absolute = os.path.abspath(output_file_path)

    # create tmp directory
    cwd = os.getcwd()
    tmp_path = create_temp_directory()
    print_debug("Working in temp dir '%s'" % (tmp_path), args.debug)
    os.chdir(tmp_path)

    # sort by headword
    dictionary.sort(by_headword=True)

    # group by prefix
    files_to_compress = []
    prefix_length = int(args.group_by_prefix_length)
    special_group, group_keys, group_dict = dictionary.group(
        prefix_function=get_prefix_kobo,
        prefix_length=prefix_length,
        merge_min_size=int(args.group_by_prefix_merge_min_size),
        merge_across_first=args.group_by_prefix_merge_across_first)
    if special_group is not None:
        special_group_key = u"1" * prefix_length
        group_dict[special_group_key] = special_group
        group_keys = [special_group_key] + group_keys

    # write files
    for key in group_keys:
        # write html file
        file_html_path = key + u".html"
        file_html_obj = io.open(file_html_path, "wb")
        file_html_obj.write(
            u"<?xml version=\"1.0\" encoding=\"utf-8\"?><html>".encode(
                "utf-8"))
        for entry in group_dict[key]:
            headword = entry.headword
            definition = entry.definition
            file_html_obj.write(
                (u"<w><a name=\"%s\"/><div><b>%s</b><br/>%s</div></w>" %
                 (headword, headword, definition)).encode("utf-8"))
        file_html_obj.write((u"</html>").encode("utf-8"))
        file_html_obj.close()

        # compress in gz format
        file_html_obj = io.open(file_html_path, "rb")
        file_gz_path = file_html_path + u".gz"
        file_gz_obj = gzip.open(file_gz_path, "wb")
        file_gz_obj.writelines(file_html_obj)
        file_gz_obj.close()
        file_html_obj.close()

        # delete .html file
        delete_file(None, file_html_path)
        # rename .html.gz file into .html
        rename_file(file_gz_path, file_html_path)
        files_to_compress.append(file_html_path)

    # write words
    file_words_path = WORDS_FILE_NAME
    keys = sorted(dictionary.entries_index.keys())
    try:
        import marisa_trie
        trie = marisa_trie.Trie(keys)
        trie.save(file_words_path)
        result = [file_words_path]
    except ImportError as exc:
        # call MARISA with subprocess
        print_info("  MARISA cannot be imported as a Python module. You might want to install it with:")
        print_info("  $ [sudo] pip install marisa_trie")
        marisa_build_path = MARISA_BUILD
        if args.marisa_bin_path is None:
            print_info("  Running '%s' from $PATH" % MARISA_BUILD)
        else:
            marisa_build_path = os.path.join(args.marisa_bin_path,
                                             MARISA_BUILD)
            print_info("  Running '%s' from '%s'" %
                       (MARISA_BUILD, args.marisa_bin_path))
        # TODO this is ugly, but it works
        query = (u"\n".join([x for x in keys]) + u"\n").encode("utf-8")

        try:
            proc = subprocess.Popen(
                [marisa_build_path, "-l", "-o", file_words_path],
                stdout=subprocess.PIPE,
                stdin=subprocess.PIPE,
                stderr=subprocess.PIPE)
            proc.communicate(input=query)[0].decode("utf-8")
            result = [file_words_path]
        except OSError as exc:
            print_error("  Unable to run '%s' as '%s'" %
                        (MARISA_BUILD, marisa_build_path))
            print_error("  Please make sure '%s':" % MARISA_BUILD)
            print_error("    1. is available on your $PATH or")
            print_error("    2. specify its path with --marisa-bin-path or")
            print_error("    3. install the marisa_trie Python module")
            result = None

    if result is not None:
        # add file_words_path to files to compress
        files_to_compress.append(file_words_path)
        # create output zip file
        try:
            print_debug(
                "Writing to file '%s'..." % (output_file_path_absolute),
                args.debug)
            file_zip_obj = zipfile.ZipFile(output_file_path_absolute, "w",
                                           zipfile.ZIP_DEFLATED)
            for file_to_compress in files_to_compress:
                file_to_compress = os.path.basename(file_to_compress)
                file_zip_obj.write(file_to_compress)
            file_zip_obj.close()
            result = [output_file_path]
            print_debug(
                "Writing to file '%s'... success" %
                (output_file_path_absolute), args.debug)
        except:
            print_error("Writing to file '%s'... failure" %
                        (output_file_path_absolute))

    # delete tmp directory
    os.chdir(cwd)
    if args.keep:
        print_info("Not deleting temp dir '%s'" % (tmp_path))
    else:
        delete_directory(tmp_path)
        print_debug("Deleted temp dir '%s'" % (tmp_path), args.debug)

    return result
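
The compress-then-rename step above is deliberate: each group file inside the Kobo zip keeps a .html name but actually contains gzip data. A minimal sketch of that step in isolation (file names illustrative):

import gzip
import io
import os

# write a plain .html group file
with io.open("aa.html", "wb") as html_obj:
    html_obj.write(u"<?xml version=\"1.0\" encoding=\"utf-8\"?><html></html>".encode("utf-8"))
# gzip it, then let the gzip bytes take over the .html name
with io.open("aa.html", "rb") as src:
    with gzip.open("aa.html.gz", "wb") as dst:
        dst.writelines(src)
os.remove("aa.html")
os.rename("aa.html.gz", "aa.html")
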
Example #24
def main():
    parser = argparse.ArgumentParser(
        usage=USAGE,
        description=DESCRIPTION,
        epilog=EPILOG,
        formatter_class=argparse.RawDescriptionHelpFormatter
    )
    for param in COMMAND_LINE_PARAMETERS:
        if param["short"] is None:
            parser.add_argument(
                param["long"],
                help=param["help"],
                action=param["action"],
                default=argparse.SUPPRESS
            )
        else:
            parser.add_argument(
                param["short"],
                param["long"],
                help=param["help"],
                action=param["action"],
                default=argparse.SUPPRESS
            )
    arguments = parser.parse_args()

    # no arguments: show help and exit
    if len(sys.argv) < 2:
        parser.print_help()
        sys.exit(0)

    # print version and exit
    if "version" in arguments:
        print_info("Penelope v%s" % (__version__))
        sys.exit(0)

    # check we have all the required arguments
    # if not, it will sys.exit() with some error code
    check_arguments(arguments)

    # set default values
    set_default_values(arguments)
    print_debug(u"Running with the command line arguments:\n%s" % (str(arguments)), arguments.debug)

    # read raw dictionary
    print_info(u"Reading input file(s)...")
    dictionary = read_dictionary(arguments)
    if dictionary is None:
        print_error("Unable to read the input file(s)")
        sys.exit(8)
    print_info(u"Reading input file(s)... done")

    # apply custom input parser, if specified
    if arguments.input_parser is not None:
        input_parser = load_input_parser(arguments.input_parser)
        if input_parser is not None:
            print_info(u"Applying the specified input parser...")
            dictionary = input_parser.parse(dictionary, arguments)
            print_info(u"Applying the specified input parser... done")

    # sort dictionary before, if requested
    if arguments.sort_before:
        print_info(u"Sorting before...")
        dictionary.sort(
            arguments.sort_by_headword,
            arguments.sort_by_definition,
            arguments.sort_reverse,
            arguments.sort_ignore_case
        )
        print_info(u"Sorting before... done")

    # merge definitions, if requested
    if arguments.merge_definitions:
        print_info(u"Merging...")
        dictionary.merge_definitions(merge_separator=arguments.merge_separator)
        print_info(u"Merging... done")

    # flatten synonyms, if requested
    if arguments.flatten_synonyms:
        print_info(u"Flattening synonyms...")
        dictionary.flatten_synonyms()
        print_info(u"Flattening synonyms... done")

    # sort dictionary after, if requested
    if arguments.sort_after:
        print_info(u"Sorting after...")
        dictionary.sort(
            arguments.sort_by_headword,
            arguments.sort_by_definition,
            arguments.sort_reverse,
            arguments.sort_ignore_case
        )
        print_info(u"Sorting after... done")

    # output dictionary
    print_info(u"Writing output file(s)...")
    output_paths = write_dictionary(dictionary, arguments)
    if output_paths is None:
        print_error("Unable to write the output file(s)")
        sys.exit(16)
    print_info(u"Writing output file(s)... done")
    print_info(u"The following file(s) have been created:")
    for op in output_paths:
        print_info(u"  %s" % op)

    sys.exit(0)
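
Passing default=argparse.SUPPRESS is what makes the '"version" in arguments' membership test above work: options the user did not supply never appear in the namespace at all. A minimal illustration:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--version", action="store_true", default=argparse.SUPPRESS)
print("version" in parser.parse_args([]))             # False: attribute never set
print("version" in parser.parse_args(["--version"]))  # True
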
Example #25
    def read_single_dict(dictionary, args, single_dict):
        # create tmp directory
        tmp_path = create_temp_directory()
        print_debug("Working in temp dir '%s'" % (tmp_path), args.debug)

        if len(single_dict) == 1:
            print_debug("Unzipping .install file...", args.debug)
            zip_file_path = single_dict[0]
            idx_file_path = os.path.join(tmp_path, "d.dict.idx")
            dict_file_path = os.path.join(tmp_path, "d.dict")
            zip_file_obj = zipfile.ZipFile(zip_file_path, "r")
            for entry in zip_file_obj.namelist():
                if entry.endswith(".dict.idx"):
                    zip_entry = zip_file_obj.open(entry)
                    idx_file_obj = io.open(idx_file_path, "wb")
                    idx_file_obj.write(zip_entry.read())
                    idx_file_obj.close()
                    zip_entry.close()
                elif entry.endswith(".dict"):
                    zip_entry = zip_file_obj.open(entry)
                    dict_file_obj = io.open(dict_file_path, "wb")
                    dict_file_obj.write(zip_entry.read())
                    dict_file_obj.close()
                    zip_entry.close()
            zip_file_obj.close()
            print_debug("Unzipping .install file... done", args.debug)
        else:
            print_debug("Files .dict.idx and .dict already uncompressed...", args.debug)
            idx_file_path = single_dict[0]
            dict_file_path = single_dict[1]
            for file_path in [idx_file_path, dict_file_path]:
                if not os.path.exists(file_path):
                    print_error("File '%s' does not exist" % file_path)
                    return False
            print_debug("Files .dict.idx and .dict already uncompressed... done", args.debug)

        # unzip .dict file into tmp_path
        print_debug("Unzipping .dict file...", args.debug)
        zip_file_obj = zipfile.ZipFile(dict_file_path, "r")
        for entry in zip_file_obj.namelist():
            if not entry.endswith("/"):
                zip_entry = zip_file_obj.open(entry)
                entry_file_path = os.path.join(tmp_path, os.path.basename(entry))
                entry_file_obj = io.open(entry_file_path, "wb")
                entry_file_obj.write(zip_entry.read())
                entry_file_obj.close()
                zip_entry.close()
        zip_file_obj.close()
        print_debug("Unzipping .dict file... done", args.debug)

        # read .dict.idx
        print_debug("Reading .dict.idx file...", args.debug)
        sql_connection = sqlite3.connect(idx_file_path)
        sql_cursor = sql_connection.cursor()
        sql_cursor.execute("select * from T_DictIndex")
        index_data = sql_cursor.fetchall()
        chunk_index_to_entries = {}
        max_chunk_index = 1
        for index_entry in index_data:
            headword = index_entry[1]
            if args.ignore_case:
                headword = headword.lower()
            offset = index_entry[2]
            size = index_entry[3]
            chunk_index = index_entry[4]
            if chunk_index not in chunk_index_to_entries:
                chunk_index_to_entries[chunk_index] = []
            if chunk_index > max_chunk_index:
                max_chunk_index = chunk_index
            chunk_index_to_entries[chunk_index].append([headword, offset, size])
        sql_cursor.close()
        sql_connection.close()
        print_debug("Reading .dict.idx file... done", args.debug)

        # read c_* files
        print_debug("Reading c_* files...", args.debug)
        for chunk_index in range(1, max_chunk_index + 1):
            print_debug("  Reading c_%d file..." % (chunk_index), args.debug)
            chunk_file_path = os.path.join(tmp_path, "%s%d" % (CHUNK_FILE_PREFIX, chunk_index))
            chunk_file_obj = io.open(chunk_file_path, "rb")
            for entry in chunk_index_to_entries[chunk_index]:
                headword = entry[0]
                offset = entry[1]
                size = entry[2]
                chunk_file_obj.seek(offset)
                definition_bytes = chunk_file_obj.read(size)
                definition_unicode = definition_bytes.decode(args.input_file_encoding)
                dictionary.add_entry(headword=headword, definition=definition_unicode)
            chunk_file_obj.close()
            print_debug("  Reading c_%d file... done" % (chunk_index), args.debug)
        print_debug("Reading c_* files... done", args.debug)

        # delete tmp directory
        if args.keep:
            print_info("Not deleting temp dir '%s'" % (tmp_path))
        else:
            delete_directory(tmp_path)
            print_debug("Deleted temp dir '%s'" % (tmp_path), args.debug)
        return True
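
The .dict.idx index above is a plain SQLite database, so it can also be inspected directly; the column layout (headword, offset, size, chunk index at positions 1 through 4) is an assumption inferred from the positional access in the code above:

import sqlite3

connection = sqlite3.connect("d.dict.idx")  # path illustrative
cursor = connection.cursor()
for row in cursor.execute("select * from T_DictIndex"):
    # assumed layout: row[1]=headword, row[2]=offset, row[3]=size, row[4]=chunk index
    print("%s -> c_%d[%d:%d]" % (row[1], row[4], row[2], row[2] + row[3]))
cursor.close()
connection.close()
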
Example #26
def write(dictionary, args, output_file_path):
    # result to be returned
    result = None

    # get absolute path
    output_file_path_absolute = os.path.abspath(output_file_path)

    # create tmp directory
    cwd = os.getcwd()
    tmp_path = create_temp_directory()
    print_debug("Working in temp dir '%s'" % (tmp_path), args.debug)
    os.chdir(tmp_path)

    # get the basename and compute output file paths
    base = os.path.basename(output_file_path)
    if base.endswith(".zip"):
        base = base[:-4]
    ifo_file_path = base + ".ifo"
    idx_file_path = base + ".idx"
    dict_file_path = base + ".dict"
    dict_dz_file_path = base + ".dict.dz"
    syn_file_path = base + ".syn"

    # TODO by spec, the index should be sorted
    # TODO using the comparator stardict_strcmp() defined in the spec
    # TODO (it calls g_ascii_strcasecmp() and/or strcmp() ),
    # TODO or with a user-defined collation function
    #
    # From https://developer.gnome.org/glib/2.28/glib-String-Utility-Functions.html#g-ascii-strcasecmp
    # gint g_ascii_strcasecmp (const gchar *s1, const gchar *s2);
    # Compare two strings, ignoring the case of ASCII characters.
    # Unlike the BSD strcasecmp() function, this only recognizes standard ASCII letters and ignores the locale, treating all non-ASCII bytes as if they are not letters.
    # This function should be used only on strings that are known to be in encodings where the bytes corresponding to ASCII letters always represent themselves. This includes UTF-8 and the ISO-8859-* charsets, but not for instance double-byte encodings like the Windows Codepage 932, where the trailing bytes of double-byte characters include all ASCII letters. If you compare two CP932 strings using this function, you will get false matches.
    #
    # using Python's builtin lower() and sort() by headword
    # should be equivalent for UTF-8 encoded dictionaries (and it is fast)
    #
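    # Illustrative check (not part of the original code):
    # sorted([u"Zebra", u"apple"], key=lambda s: s.lower())
    # returns [u"apple", u"Zebra"], the same order g_ascii_strcasecmp()
    # yields for ASCII-only headwords.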
    dictionary.sort(by_headword=True, ignore_case=True)

    # write .idx and .dict files
    print_debug("Writing .idx and .dict files...", args.debug)
    idx_file_obj = io.open(idx_file_path, "wb")
    dict_file_obj = io.open(dict_file_path, "wb")
    current_offset = 0
    current_idx_size = 0
    for entry_index in dictionary.entries_index_sorted:
        entry = dictionary.entries[entry_index]
        headword_bytes = entry.headword.encode("utf-8")
        definition_bytes = entry.definition.encode("utf-8")
        definition_size = len(definition_bytes)
        # write .idx
        idx_file_obj.write(headword_bytes)
        idx_file_obj.write(b"\0")
        idx_file_obj.write(struct.pack('>i', current_offset))
        idx_file_obj.write(struct.pack('>i', definition_size))
        current_idx_size += (len(headword_bytes) + 1 + 4 + 4)
        # write .dict
        dict_file_obj.write(definition_bytes)
        current_offset += definition_size
    idx_file_obj.close()
    dict_file_obj.close()
    print_debug("Writing .idx and .dict files... done", args.debug)

    # list files to compress
    files_to_compress = []
    files_to_compress.append(ifo_file_path)
    files_to_compress.append(idx_file_path)

    # write .syn file
    dict_syns_len = 0
    if dictionary.has_synonyms:
        if args.ignore_synonyms:
            print_debug("Dictionary has synonyms, but ignoring them",
                        args.debug)
        else:
            print_debug("Dictionary has synonyms, writing .syn file...",
                        args.debug)
            syn_file_obj = io.open(syn_file_path, "wb")
            dict_syns = dictionary.get_synonyms()
            dict_syns_len = len(dict_syns)
            for pair in dict_syns:
                synonym_bytes = pair[0].encode("utf-8")
                index = pair[1]
                syn_file_obj.write(synonym_bytes)
                syn_file_obj.write(b"\0")
                syn_file_obj.write(struct.pack('>i', index))
            syn_file_obj.close()
            files_to_compress.append(syn_file_path)
            print_debug("Dictionary has synonyms, writing .syn file... done",
                        args.debug)

    # compress .dict file
    if args.sd_no_dictzip:
        print_debug("Not compressing .dict file with dictzip", args.debug)
        files_to_compress.append(dict_file_path)
        result = [dict_file_path]
    else:
        try:
            print_debug("Compressing .dict file with dictzip...", args.debug)
            dictzip_path = DICTZIP
            if args.dictzip_path is None:
                print_info("  Running '%s' from $PATH" % DICTZIP)
            else:
                dictzip_path = args.dictzip_path
                print_info("  Running '%s' from '%s'" %
                           (DICTZIP, dictzip_path))
            proc = subprocess.Popen([dictzip_path, "-k", dict_file_path],
                                    stdout=subprocess.PIPE,
                                    stdin=subprocess.PIPE,
                                    stderr=subprocess.PIPE)
            proc.communicate()
            result = [dict_dz_file_path]
            files_to_compress.append(dict_dz_file_path)
            print_debug("Compressing .dict file with dictzip... done",
                        args.debug)
        except OSError as exc:
            print_error("  Unable to run '%s' as '%s'" %
                        (DICTZIP, dictzip_path))
            print_error("  Please make sure '%s':" % DICTZIP)
            print_error("    1. is available on your $PATH or")
            print_error("    2. specify its path with --dictzip-path or")
            print_error(
                "    3. specify --no-dictzip to avoid compressing the .dict file"
            )
            result = None

    if result is not None:
        # create ifo file
        ifo_file_obj = io.open(ifo_file_path, "wb")
        ifo_file_obj.write((u"StarDict's dict ifo file\n").encode("utf-8"))
        ifo_file_obj.write((u"version=2.4.2\n").encode("utf-8"))
        ifo_file_obj.write(
            (u"wordcount=%d\n" % (len(dictionary))).encode("utf-8"))
        ifo_file_obj.write(
            (u"idxfilesize=%d\n" % (current_idx_size)).encode("utf-8"))
        ifo_file_obj.write((u"bookname=%s\n" % (args.title)).encode("utf-8"))
        ifo_file_obj.write((u"date=%s\n" % (args.year)).encode("utf-8"))
        ifo_file_obj.write((u"sametypesequence=m\n").encode("utf-8"))
        ifo_file_obj.write(
            (u"description=%s\n" % (args.description)).encode("utf-8"))
        ifo_file_obj.write((u"author=%s\n" % (args.author)).encode("utf-8"))
        ifo_file_obj.write((u"email=%s\n" % (args.email)).encode("utf-8"))
        ifo_file_obj.write((u"website=%s\n" % (args.website)).encode("utf-8"))
        if dict_syns_len > 0:
            ifo_file_obj.write(
                (u"synwordcount=%d\n" % (dict_syns_len)).encode("utf-8"))
        ifo_file_obj.close()

        # create output zip file
        try:
            print_debug(
                "Writing to file '%s'..." % (output_file_path_absolute),
                args.debug)
            file_zip_obj = zipfile.ZipFile(output_file_path_absolute, "w",
                                           zipfile.ZIP_DEFLATED)
            for file_to_compress in files_to_compress:
                file_to_compress = os.path.basename(file_to_compress)
                file_zip_obj.write(file_to_compress)
                print_debug("Written %s" % (file_to_compress), args.debug)
            file_zip_obj.close()
            result = [output_file_path]
            print_debug(
                "Writing to file '%s'... success" %
                (output_file_path_absolute), args.debug)
        except:
            print_error("Writing to file '%s'... failure" %
                        (output_file_path_absolute))

    # delete tmp directory
    os.chdir(cwd)
    if args.keep:
        print_info("Not deleting temp dir '%s'" % (tmp_path))
    else:
        delete_directory(tmp_path)
        print_debug("Deleted temp dir '%s'" % (tmp_path), args.debug)

    return result
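
Each .idx record written above is the headword's UTF-8 bytes, a NUL terminator, and two 32-bit big-endian integers (offset into the .dict file, then size). A minimal sketch packing and re-parsing one record; the StarDict spec calls for unsigned integers, so '>I' is used here, which agrees with the '>i' above for values below 2^31:

import struct

record = u"apple".encode("utf-8") + b"\0" + struct.pack(">II", 0, 42)
nul = record.index(b"\0")
headword = record[:nul].decode("utf-8")
offset, size = struct.unpack(">II", record[nul + 1:nul + 9])
print(headword, offset, size)  # apple 0 42
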
Example #27
def main():
    parser = argparse.ArgumentParser(
        usage=USAGE,
        description=DESCRIPTION,
        epilog=EPILOG,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    for param in COMMAND_LINE_PARAMETERS:
        if param["short"] is None:
            parser.add_argument(param["long"],
                                help=param["help"],
                                action=param["action"],
                                default=argparse.SUPPRESS)
        else:
            parser.add_argument(param["short"],
                                param["long"],
                                help=param["help"],
                                action=param["action"],
                                default=argparse.SUPPRESS)
    arguments = parser.parse_args()

    # no arguments: show help and exit
    if len(sys.argv) < 2:
        parser.print_help()
        sys.exit(0)

    # print version and exit
    if "version" in arguments:
        print_info("Penelope v%s" % (__version__))
        sys.exit(0)

    # check we have all the required arguments
    # if not, it will sys.exit() with some error code
    check_arguments(arguments)

    # set default values
    set_default_values(arguments)
    print_debug(
        u"Running with the command line arguments:\n%s" % (str(arguments)),
        arguments.debug)

    # read raw dictionary
    print_info(u"Reading input file(s)...")
    dictionary = read_dictionary(arguments)
    if dictionary is None:
        print_error("Unable to read the input file(s)")
        sys.exit(8)
    print_info(u"Reading input file(s)... done")

    # apply custom input parser, if specified
    if arguments.input_parser is not None:
        input_parser = load_input_parser(arguments.input_parser)
        if input_parser is not None:
            print_info(u"Applying the specified input parser...")
            dictionary = input_parser.parse(dictionary, arguments)
            print_info(u"Applying the specified input parser... done")

    # sort dictionary before, if requested
    if arguments.sort_before:
        print_info(u"Sorting before...")
        dictionary.sort(arguments.sort_by_headword,
                        arguments.sort_by_definition, arguments.sort_reverse,
                        arguments.sort_ignore_case)
        print_info(u"Sorting before... done")

    # merge definitions, if requested
    if arguments.merge_definitions:
        print_info(u"Merging...")
        dictionary.merge_definitions(merge_separator=arguments.merge_separator)
        print_info(u"Merging... done")

    # flatten synonyms, if requested
    if arguments.flatten_synonyms:
        print_info(u"Flattening synonyms...")
        dictionary.flatten_synonyms()
        print_info(u"Flattening synonyms... done")

    # sort dictionary after, if requested
    if arguments.sort_after:
        print_info(u"Sorting after...")
        dictionary.sort(arguments.sort_by_headword,
                        arguments.sort_by_definition, arguments.sort_reverse,
                        arguments.sort_ignore_case)
        print_info(u"Sorting after... done")

    # output dictionary
    print_info(u"Writing output file(s)...")
    output_paths = write_dictionary(dictionary, arguments)
    if output_paths is None:
        print_error("Unable to write the output file(s)")
        sys.exit(16)
    print_info(u"Writing output file(s)... done")
    print_info(u"The following file(s) have been created:")
    for op in output_paths:
        print_info(u"  %s" % op)

    sys.exit(0)
Example #28
def read(dictionary, args, input_file_paths):
    def find_files(entries):
        found = {}
        for entry in entries:
            if entry.endswith(".ifo"):
                found["d.ifo"] = entry
                break
        if not "d.ifo" in found:
            print_error("Cannot find .ifo file in the given StarDict file (see StarDict spec)")
            return {}
        # remove .ifo extension
        base = found["d.ifo"][:-4]
        # attempt to find these ones
        tentative_idx = base + ".idx"
        tentative_idx_gz = base + ".idx.gz"
        tentative_dict = base + ".dict"
        tentative_dict_dz = base + ".dict.dz"
        tentative_dz = base + ".dz"
        if tentative_idx in entries:
            found["d.idx"] = tentative_idx
        if tentative_idx_gz in entries:
            found["d.idx.gz"] = tentative_idx_gz
        if not (("d.idx" in found) or ("d.idx.gz" in found)):
            print_error("Cannot find .idx or .idx.gz file in the given StarDict file (see StarDict spec)")
            return {}
        if tentative_dict in entries:
            found["d.dict"] = tentative_dict
        if tentative_dict_dz in entries:
            found["d.dict.dz"] = tentative_dict_dz
        if tentative_dz in entries:
            found["d.dz"] = tentative_dz
        if not (("d.dict" in found) or ("d.dict.dz" in found) or ("d.dz" in found)):
            print_error("Cannot find .dict, .dict.dz, or .dz file in the given StarDict file (see StarDict spec)")
            return {}
        # syn is optional
        tentative_syn = base + ".syn"
        if tentative_syn in entries:
            found["d.syn"] = tentative_syn
        return found

    def uncompress_file(compressed_path, tmp_path, key):
        uncompressed_path = os.path.join(tmp_path, key)
        u_obj = open(uncompressed_path, "wb")
        c_obj = gzip.open(compressed_path, "rb")
        u_obj.write(c_obj.read())
        c_obj.close()
        u_obj.close()
        print_debug("Uncompressed %s" % (uncompressed_path), args.debug)
        return uncompressed_path

    def read_ifo(ifo_path, has_syn, args):
        ifo_dict = {}
        ifo_obj = open(ifo_path, "rb")
        ifo_bytes = ifo_obj.read() # bytes
        ifo_unicode = ifo_bytes.decode("utf-8") # unicode, always utf-8 by spec
        ifo_obj.close()
        for line in ifo_unicode.splitlines():
            array = line.split("=")
            if len(array) >= 2:
                key = array[0]
                val = "=".join(array[1:])
                ifo_dict[key] = val
        
        if not "version" in ifo_dict:
            print_error("No 'version' found in the .ifo file (see StarDict spec)")
            return None
        if ifo_dict["version"] not in ["2.4.2", "3.0.0"]:
            print_error("The .ifo file must have a 'version' value equal to '2.4.2' or '3.0.0' (see StarDict spec)")
            return None

        required_keys = ["bookname", "wordcount", "idxfilesize"]
        if has_syn:
            required_keys.append("synwordcount")
        # TODO not used => disabling this
        #if ifo_dict["version"] == "3.0.0":
        #    required_keys.append("idxoffsetbits")
        for key in required_keys:
            if key not in ifo_dict:
                print_error("No '%s' found in the .ifo file (see StarDict spec)" % key)
                return None

        ifo_dict["wordcount"] = int(ifo_dict["wordcount"])
        ifo_dict["idxfilesize"] = int(ifo_dict["idxfilesize"])
        if has_syn:
            ifo_dict["synwordcount"] = int(ifo_dict["synwordcount"])
        # TODO not used => disabling this
        #if ifo_dict["version"] == "3.0.0":
        #    ifo_dict["idxoffsetbits"] = int(ifo_dict["idxoffsetbits"])

        if args.sd_ignore_sametypesequence:
            print_debug("Ignoring sametypesequence value", args.debug)
        else:
            # TODO limitation: we require sametypesequence to be present
            if not "sametypesequence" in ifo_dict:
                print_error("The .ifo file must have a 'sametypesequence' value (see README).")
                return None
            # TODO limitation: we require sametypesequence to have a value in SAMETYPESEQUENCE_SUPPORTED_VALUES
            if ifo_dict["sametypesequence"] not in SAMETYPESEQUENCE_SUPPORTED_VALUES:
                print_error("The .ifo file must have a 'sametypesequence' value of %s (see README)." % "|".join(SAMETYPESEQUENCE_SUPPORTED_VALUES))
                return None

        return ifo_dict

    def read_single_file(dictionary, args, input_file_path):
        # result flag
        result = False

        # create a tmp directory
        tmp_path = create_temp_directory()
        print_debug("Working in temp dir '%s'" % (tmp_path), args.debug)

        # find .ifo, .idx, .dict[.dz] and .syn files inside the zip
        # and extract them to tmp_path
        input_file_obj = zipfile.ZipFile(input_file_path)
        found_files = find_files(input_file_obj.namelist())
        extracted_files = {}
        if len(found_files) > 0:
            for key in found_files:
                entry = found_files[key]
                ext_file_path = os.path.join(tmp_path, key)
                ext_file_obj = open(ext_file_path, "wb")
                zip_entry = input_file_obj.open(entry)
                ext_file_obj.write(zip_entry.read())
                zip_entry.close()
                ext_file_obj.close()
                print_debug("Extracted %s" % (ext_file_path), args.debug)
                extracted_files[key] = ext_file_path
                # extract from compressed file, but only if ".idx" is not present as well
                if (key == "d.idx.gz") and ("d.idx" not in found_files):
                    extracted_files["d.idx"] = uncompress_file(ext_file_path, tmp_path, "d.idx")
                # extract from compressed file, but only if ".dict" is not present as well
                if ((key == "d.dict.dz") or (key == "d.dz")) and ("d.dict" not in found_files):
                    extracted_files["d.dict"] = uncompress_file(ext_file_path, tmp_path, "d.dict")
        input_file_obj.close()

        # here we have d.ifo, d.idx and d.dict (all uncompressed) and possibly d.syn

        has_syn = "d.syn" in extracted_files
        if has_syn and args.ignore_synonyms:
            has_syn = False
            print_debug("Dictionary has synonyms, but ignoring them (--ignore-synonyms)", args.debug)
        ifo_dict = read_ifo(extracted_files["d.ifo"], has_syn, args)
        if ifo_dict is None:
            return False
        print_debug("Read .ifo file with values:\n%s" % (str(ifo_dict)), args.debug)

        # read dict file
        dict_file_obj = open(extracted_files["d.dict"], "rb")
        dict_file_bytes = dict_file_obj.read()
        dict_file_obj.close()

        # read idx file
        idx_file_obj = open(extracted_files["d.idx"], "rb")
        byte_read = idx_file_obj.read(1)
        headword = b""
        while byte_read:
            if byte_read == b"\0":
                # end of current word: read offset and size
                offset_bytes = idx_file_obj.read(4)
                offset_int = int((struct.unpack('>i', offset_bytes))[0])
                size_bytes = idx_file_obj.read(4)
                size_int = int((struct.unpack('>i', size_bytes))[0])
                definition = dict_file_bytes[offset_int:offset_int+size_int].decode(args.input_file_encoding)
                headword = headword.decode("utf-8")
                if args.ignore_case:
                    headword = headword.lower()
                dictionary.add_entry(headword=headword, definition=definition)
                headword = b""
            else:
                # read next byte
                headword += byte_read
            byte_read = idx_file_obj.read(1)
        idx_file_obj.close()
        result = True

        # read syn file, if present
        if has_syn:
            print_debug("The input StarDict file contains a .syn file, parsing it...", args.debug)
            result = False
            syn_file_obj = open(extracted_files["d.syn"], "rb")
            byte_read = syn_file_obj.read(1)
            synonym = b""
            while byte_read:
                if byte_read == b"\0":
                    # end of current synonym: read index of original word
                    index_bytes = syn_file_obj.read(4)
                    index_int = int((struct.unpack('>i', index_bytes))[0])
                    synonym = synonym.decode("utf-8")
                    if index_int < len(dictionary):
                        dictionary.add_synonym(synonym=synonym, headword_index=index_int)
                    else:
                        # emit a warning?
                        print_debug("Synonym '%s' points to index %d >= len(dictionary), skipping it" % (synonym, index_int), args.debug)
                    synonym = b""
                else:
                    # read next byte
                    synonym += byte_read
                byte_read = syn_file_obj.read(1)
            syn_file_obj.close()
            result = True
            print_debug("The input StarDict file contains a .syn file, parsing it... done", args.debug)
        else:
            print_debug("The input StarDict file does not contain a .syn file", args.debug)

        # delete tmp directory
        if args.keep:
            print_info("Not deleting temp dir '%s'" % (tmp_path))
        else:
            delete_directory(tmp_path)
            print_debug("Deleted temp dir '%s'" % (tmp_path), args.debug)

        return result

    for input_file_path in input_file_paths:
        print_debug("Reading from file '%s'..." % (input_file_path), args.debug)
        result = read_single_file(dictionary, args, input_file_path)
        if result:
            print_debug("Reading from file '%s'... success" % (input_file_path), args.debug)
        else:
            print_error("Reading from file '%s'... failed" % (input_file_path))
            return None
    return dictionary
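
A .syn record has the same shape with a single trailing integer: the synonym's UTF-8 bytes, a NUL, and the 32-bit big-endian index of the original headword in .idx order. A minimal sketch with illustrative data:

import struct

record = u"pomme".encode("utf-8") + b"\0" + struct.pack(">I", 7)
nul = record.index(b"\0")
synonym = record[:nul].decode("utf-8")
(index,) = struct.unpack(">I", record[nul + 1:nul + 5])
print(synonym, index)  # pomme 7
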
Example #29
def write(dictionary, args, output_file_path):
    # result to be returned
    result = None

    # get absolute path
    output_file_path_absolute = os.path.abspath(output_file_path)

    # create tmp directory
    cwd = os.getcwd()
    tmp_path = create_temp_directory()
    print_debug("Working in temp dir '%s'" % (tmp_path), args.debug)
    os.chdir(tmp_path)

    # get the basename and compute output file paths
    base = os.path.basename(output_file_path)
    if base.endswith(".zip"):
        base = base[:-4]
    ifo_file_path = base + ".ifo"
    idx_file_path = base + ".idx"
    dict_file_path = base + ".dict"
    dict_dz_file_path = base + ".dict.dz"
    syn_file_path = base + ".syn"

    # TODO by spec, the index should be sorted
    # TODO using the comparator stardict_strcmp() defined in the spec
    # TODO (it calls g_ascii_strcasecmp() and/or strcmp() ),
    # TODO or with a user-defined collation function
    #
    # From https://developer.gnome.org/glib/2.28/glib-String-Utility-Functions.html#g-ascii-strcasecmp
    # gint g_ascii_strcasecmp (const gchar *s1, const gchar *s2);
    # Compare two strings, ignoring the case of ASCII characters.
    # Unlike the BSD strcasecmp() function, this only recognizes standard ASCII letters and ignores the locale, treating all non-ASCII bytes as if they are not letters.
    # This function should be used only on strings that are known to be in encodings where the bytes corresponding to ASCII letters always represent themselves. This includes UTF-8 and the ISO-8859-* charsets, but not for instance double-byte encodings like the Windows Codepage 932, where the trailing bytes of double-byte characters include all ASCII letters. If you compare two CP932 strings using this function, you will get false matches. 
    #
    # using Python's builtin lower() and sort() by headword
    # should be equivalent for UTF-8 encoded dictionaries (and it is fast)
    #
    dictionary.sort(by_headword=True, ignore_case=True)

    # write .idx and .dict files
    print_debug("Writing .idx and .dict files...", args.debug)
    idx_file_obj = open(idx_file_path, "wb")
    dict_file_obj = open(dict_file_path, "wb")
    current_offset = 0
    current_idx_size = 0
    for entry_index in dictionary.entries_index_sorted:
        entry = dictionary.entries[entry_index]
        headword_bytes = entry.headword.encode("utf-8")
        definition_bytes = entry.definition.encode("utf-8")
        definition_size = len(definition_bytes)
        # write .idx
        idx_file_obj.write(headword_bytes)
        idx_file_obj.write(b"\0")
        idx_file_obj.write(struct.pack('>i', current_offset))
        idx_file_obj.write(struct.pack('>i', definition_size))
        current_idx_size += (len(headword_bytes) + 1 + 4 + 4)
        # write .dict
        dict_file_obj.write(definition_bytes)
        current_offset += definition_size
    idx_file_obj.close()
    dict_file_obj.close()
    print_debug("Writing .idx and .dict files... done", args.debug)

    # list files to compress
    files_to_compress = []
    files_to_compress.append(ifo_file_path)
    files_to_compress.append(idx_file_path)

    # write .syn file
    dict_syns_len = 0
    if dictionary.has_synonyms:
        if args.ignore_synonyms:
            print_debug("Dictionary has synonyms, but ignoring them", args.debug)
        else:
            print_debug("Dictionary has synonyms, writing .syn file...", args.debug)
            syn_file_obj = open(syn_file_path, "wb")
            dict_syns = dictionary.get_synonyms()
            dict_syns_len = len(dict_syns)
            for pair in dict_syns:
                synonym_bytes = pair[0].encode("utf-8")
                index = pair[1]
                syn_file_obj.write(synonym_bytes)
                syn_file_obj.write(b"\0")
                syn_file_obj.write(struct.pack('>i', index))
            syn_file_obj.close()
            files_to_compress.append(syn_file_path)
            print_debug("Dictionary has synonyms, writing .syn file... done", args.debug)

    # compress .dict file
    if args.sd_no_dictzip:
        print_debug("Not compressing .dict file with dictzip", args.debug)
        files_to_compress.append(dict_file_path)
        result = [dict_file_path] 
    else:
        try:
            print_debug("Compressing .dict file with dictzip...", args.debug)
            dictzip_path = DICTZIP
            if args.dictzip_path is None:
                print_info("  Running '%s' from $PATH" % DICTZIP)
            else:
                dictzip_path = args.dictzip_path
                print_info("  Running '%s' from '%s'" % (DICTZIP, dictzip_path))
            proc = subprocess.Popen(
                [dictzip_path, "-k", dict_file_path],
                stdout=subprocess.PIPE,
                stdin=subprocess.PIPE,
                stderr=subprocess.PIPE
            )
            proc.communicate()
            result = [dict_dz_file_path] 
            files_to_compress.append(dict_dz_file_path)
            print_debug("Compressing .dict file with dictzip... done", args.debug)
        except OSError as exc:
            print_error("  Unable to run '%s' as '%s'" % (DICTZIP, dictzip_path))
            print_error("  Please make sure '%s':" % DICTZIP)
            print_error("    1. is available on your $PATH or")
            print_error("    2. specify its path with --dictzip-path or")
            print_error("    3. specify --no-dictzip to avoid compressing the .dict file")
            result = None 

    if result is not None:
        # create ifo file
        ifo_file_obj = open(ifo_file_path, "wb")
        ifo_file_obj.write((u"StarDict's dict ifo file\n").encode("utf-8"))
        ifo_file_obj.write((u"version=2.4.2\n").encode("utf-8"))
        ifo_file_obj.write((u"wordcount=%d\n" % (len(dictionary))).encode("utf-8"))
        ifo_file_obj.write((u"idxfilesize=%d\n" % (current_idx_size)).encode("utf-8"))
        ifo_file_obj.write((u"bookname=%s\n" % (args.title)).encode("utf-8"))
        ifo_file_obj.write((u"date=%s\n" % (args.year)).encode("utf-8"))
        ifo_file_obj.write((u"sametypesequence=m\n").encode("utf-8"))
        ifo_file_obj.write((u"description=%s\n" % (args.description)).encode("utf-8"))
        ifo_file_obj.write((u"author=%s\n" % (args.author)).encode("utf-8"))
        ifo_file_obj.write((u"email=%s\n" % (args.email)).encode("utf-8"))
        ifo_file_obj.write((u"website=%s\n" % (args.website)).encode("utf-8"))
        if dict_syns_len > 0:
            ifo_file_obj.write((u"synwordcount=%d\n" % (dict_syns_len)).encode("utf-8"))
        ifo_file_obj.close()

        # create output zip file
        try:
            print_debug("Writing to file '%s'..." % (output_file_path_absolute), args.debug)
            file_zip_obj = zipfile.ZipFile(output_file_path_absolute, "w", zipfile.ZIP_DEFLATED)
            for file_to_compress in files_to_compress:
                file_to_compress = os.path.basename(file_to_compress)
                file_zip_obj.write(file_to_compress)
                print_debug("Written %s" % (file_to_compress), args.debug)
            file_zip_obj.close()
            result = [output_file_path]
            print_debug("Writing to file '%s'... success" % (output_file_path_absolute), args.debug)
        except:
            print_error("Writing to file '%s'... failure" % (output_file_path_absolute))

    # delete tmp directory
    os.chdir(cwd)
    if args.keep:
        print_info("Not deleting temp dir '%s'" % (tmp_path))
    else:
        delete_directory(tmp_path)
        print_debug("Deleted temp dir '%s'" % (tmp_path), args.debug)

    return result
Example #30
    def read_ifo(ifo_path, has_syn, args):
        ifo_dict = {}
        ifo_obj = io.open(ifo_path, "rb")
        ifo_bytes = ifo_obj.read()  # bytes
        ifo_unicode = ifo_bytes.decode(
            "utf-8")  # unicode, always utf-8 by spec
        ifo_obj.close()
        for line in ifo_unicode.splitlines():
            array = line.split("=")
            if len(array) >= 2:
                key = array[0]
                val = "=".join(array[1:])
                ifo_dict[key] = val

        if "version" not in ifo_dict:
            print_error(
                "No 'version' found in the .ifo file (see StarDict spec)")
            return None
        if ifo_dict["version"] not in ["2.4.2", "3.0.0"]:
            print_error(
                "The .ifo file must have a 'version' value equal to '2.4.2' or '3.0.0' (see StarDict spec)"
            )
            return None

        required_keys = ["bookname", "wordcount", "idxfilesize"]
        if has_syn:
            required_keys.append("synwordcount")
        # TODO not used => disabling this
        # if ifo_dict["version"] == "3.0.0":
        #     required_keys.append("idxoffsetbits")
        for key in required_keys:
            if key not in ifo_dict:
                print_error(
                    "No '%s' found in the .ifo file (see StarDict spec)" % key)
                return None

        ifo_dict["wordcount"] = int(ifo_dict["wordcount"])
        ifo_dict["idxfilesize"] = int(ifo_dict["idxfilesize"])
        if has_syn:
            ifo_dict["synwordcount"] = int(ifo_dict["synwordcount"])
        # TODO not used => disabling this
        # if ifo_dict["version"] == "3.0.0":
        #     ifo_dict["idxoffsetbits"] = int(ifo_dict["idxoffsetbits"])

        if args.sd_ignore_sametypesequence:
            print_debug("Ignoring sametypesequence value", args.debug)
        else:
            # TODO limitation: we require sametypesequence to be present
            if "sametypesequence" not in ifo_dict:
                print_error(
                    "The .ifo file must have a 'sametypesequence' value (see README)."
                )
                return None
            # TODO limitation: we require sametypesequence to have a value in SAMETYPESEQUENCE_SUPPORTED_VALUES
            if ifo_dict["sametypesequence"] not in SAMETYPESEQUENCE_SUPPORTED_VALUES:
                print_error(
                    "The .ifo file must have a 'sametypesequence' value of %s (see README)."
                    % "|".join(SAMETYPESEQUENCE_SUPPORTED_VALUES))
                return None

        return ifo_dict
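
The parser above splits each .ifo line on "=" and re-joins the tail, so values that themselves contain "=" survive intact. A tiny illustration with made-up data:

ifo_text = u"version=2.4.2\nbookname=My=Dict\nwordcount=2\n"
for line in ifo_text.splitlines():
    array = line.split("=")
    if len(array) >= 2:
        # re-joining with "=" preserves any "=" inside the value
        print(array[0], "->", "=".join(array[1:]))
# prints: version -> 2.4.2 / bookname -> My=Dict / wordcount -> 2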