def delete(self):
    if self.root_directory_path is not None:
        delete_directory(self.root_directory_path)
def write(dictionary, args, output_file_path):
    # result to be returned
    result = None
    # get absolute path
    output_file_path_absolute = os.path.abspath(output_file_path)
    # get absolute path for collation function file
    bookeen_collation_function_path = None
    if args.bookeen_collation_function is not None:
        bookeen_collation_function_path = os.path.abspath(args.bookeen_collation_function)
    # create tmp directory
    cwd = os.getcwd()
    tmp_path = create_temp_directory()
    print_debug("Working in temp dir '%s'" % (tmp_path), args.debug)
    os.chdir(tmp_path)
    # get the basename
    base = os.path.basename(output_file_path)
    if base.endswith(".zip"):
        base = base[:-4]
    # copy empty.idx into tmp_path
    idx_file_path = base + u".dict.idx"
    dict_file_path = base + u".dict"
    copy_file(EMPTY_FILE_PATH, idx_file_path)
    # open index
    sql_connection = sqlite3.connect(idx_file_path)
    # install collation in the index
    collation_function = collate_function_default
    if bookeen_collation_function_path is not None:
        try:
            collation_function = imp.load_source("", bookeen_collation_function_path).collate_function
            print_debug("Using collation function from '%s'" % (bookeen_collation_function_path), args.debug)
        except:
            print_error("Unable to load collation function from '%s'. Using the default collation function instead." % (bookeen_collation_function_path))
    sql_connection.create_collation("IcuNoCase", collation_function)
    sql_connection.text_factory = str
    # get a cursor and delete any data from the index file
    sql_cursor = sql_connection.cursor()
    sql_cursor.execute("delete from T_DictIndex")
    # write c_* files
    # each c_* file has MAX_CHUNK_SIZE < size <= (MAX_CHUNK_SIZE * 2) bytes (tentatively)
    print_debug("Writing c_* files...", args.debug)
    files_to_compress = []
    current_offset = 0
    chunk_index = 1
    chunk_file_path = "%s%d" % (CHUNK_FILE_PREFIX, chunk_index)
    files_to_compress.append(chunk_file_path)
    chunk_file_obj = io.open(chunk_file_path, "wb")
    for entry_index in dictionary.entries_index_sorted:
        entry = dictionary.entries[entry_index]
        definition_bytes = entry.definition.encode("utf-8")
        definition_size = len(definition_bytes)
        chunk_file_obj.write(definition_bytes)
        # insert headword into index file
        sql_tuple = (0, entry.headword, current_offset, definition_size, chunk_index)
        sql_cursor.execute("insert into T_DictIndex values (?,?,?,?,?)", sql_tuple)
        # insert synonyms into index file
        if not args.ignore_synonyms:
            for synonym in entry.get_synonyms():
                sql_tuple = (0, synonym[0], current_offset, definition_size, chunk_index)
                sql_cursor.execute("insert into T_DictIndex values (?,?,?,?,?)", sql_tuple)
        # update offset
        current_offset += definition_size
        # if we reached CHUNK_SIZE, open the next c_* file
        if current_offset > CHUNK_SIZE:
            chunk_file_obj.close()
            chunk_index += 1
            chunk_file_path = "%s%d" % (CHUNK_FILE_PREFIX, chunk_index)
            files_to_compress.append(chunk_file_path)
            chunk_file_obj = io.open(chunk_file_path, "wb")
            current_offset = 0
    chunk_file_obj.close()
    print_debug("Writing c_* files... done", args.debug)
    # compress
    print_debug("Compressing c_* files...", args.debug)
    file_zip_obj = zipfile.ZipFile(dict_file_path, "w", zipfile.ZIP_DEFLATED)
    for file_to_compress in files_to_compress:
        file_to_compress = os.path.basename(file_to_compress)
        file_zip_obj.write(file_to_compress)
    file_zip_obj.close()
    print_debug("Compressing c_* files... done", args.debug)
    # update index metadata
    print_debug("Updating index metadata...", args.debug)
    header = HEADER % (args.language_from)
    sql_cursor.execute("update T_DictInfo set F_xhtmlHeader=?", (header,))
    sql_cursor.execute("update T_DictInfo set F_LangFrom=?", (args.language_from,))
    sql_cursor.execute("update T_DictInfo set F_LangTo=?", (args.language_to,))
    sql_cursor.execute("update T_DictInfo set F_Licence=?", (args.license,))
    sql_cursor.execute("update T_DictInfo set F_Copyright=?", (args.copyright,))
    sql_cursor.execute("update T_DictInfo set F_Title=?", (args.title,))
    sql_cursor.execute("update T_DictInfo set F_Description=?", (args.description,))
    sql_cursor.execute("update T_DictInfo set F_Year=?", (args.year,))
    # the meaning of the following is unknown
    sql_cursor.execute("update T_DictInfo set F_Alphabet=?", ("Z",))
    sql_cursor.execute("update T_DictInfo set F_CollationLevel=?", ("1",))
    sql_cursor.execute("update T_DictVersion set F_DictType=?", ("stardict",))
    sql_cursor.execute("update T_DictVersion set F_Version=?", ("11",))
    print_debug("Updating index metadata... done", args.debug)
    # compact and close
    sql_cursor.execute("vacuum")
    sql_cursor.close()
    sql_connection.close()
    # create .install file or copy .dict.idx and .dict into requested output directory
    parent_output_directory = os.path.split(output_file_path_absolute)[0]
    if args.bookeen_install_file:
        print_debug("Creating .install file...", args.debug)
        file_zip_path = os.path.join(parent_output_directory, base + u".install")
        file_zip_obj = zipfile.ZipFile(file_zip_path, "w", zipfile.ZIP_DEFLATED)
        for file_to_compress in [dict_file_path, idx_file_path]:
            file_to_compress = os.path.basename(file_to_compress)
            file_zip_obj.write(file_to_compress)
        file_zip_obj.close()
        result = [file_zip_path]
        print_debug("Creating .install file... done", args.debug)
    else:
        print_debug("Copying .dict.idx and .dict files...", args.debug)
        dict_file_path_final = os.path.join(parent_output_directory, os.path.basename(dict_file_path))
        idx_file_path_final = os.path.join(parent_output_directory, os.path.basename(idx_file_path))
        copy_file(dict_file_path, dict_file_path_final)
        copy_file(idx_file_path, idx_file_path_final)
        result = [idx_file_path_final, dict_file_path_final]
        print_debug("Copying .dict.idx and .dict files... done", args.debug)
    # delete tmp directory
    os.chdir(cwd)
    if args.keep:
        print_info("Not deleting temp dir '%s'" % (tmp_path))
    else:
        delete_directory(tmp_path)
        print_debug("Deleted temp dir '%s'" % (tmp_path), args.debug)
    return result
def read_single_dict(dictionary, args, single_dict):
    # create tmp directory
    tmp_path = create_temp_directory()
    print_debug("Working in temp dir '%s'" % (tmp_path), args.debug)
    if len(single_dict) == 1:
        print_debug("Unzipping .install file...", args.debug)
        zip_file_path = single_dict[0]
        idx_file_path = os.path.join(tmp_path, "d.dict.idx")
        dict_file_path = os.path.join(tmp_path, "d.dict")
        zip_file_obj = zipfile.ZipFile(zip_file_path, "r")
        for entry in zip_file_obj.namelist():
            if entry.endswith(".dict.idx"):
                zip_entry = zip_file_obj.open(entry)
                idx_file_obj = io.open(idx_file_path, "wb")
                idx_file_obj.write(zip_entry.read())
                idx_file_obj.close()
                zip_entry.close()
            elif entry.endswith(".dict"):
                zip_entry = zip_file_obj.open(entry)
                dict_file_obj = io.open(dict_file_path, "wb")
                dict_file_obj.write(zip_entry.read())
                dict_file_obj.close()
                zip_entry.close()
        zip_file_obj.close()
        print_debug("Unzipping .install file... done", args.debug)
    else:
        print_debug("Files .dict.idx and .dict already uncompressed...", args.debug)
        idx_file_path = single_dict[0]
        dict_file_path = single_dict[1]
        for file_path in [idx_file_path, dict_file_path]:
            if not os.path.exists(file_path):
                print_error("File '%s' does not exist" % file_path)
                return False
        print_debug("Files .dict.idx and .dict already uncompressed... done", args.debug)
    # unzip .dict file into tmp_path
    print_debug("Unzipping .dict file...", args.debug)
    zip_file_obj = zipfile.ZipFile(dict_file_path, "r")
    for entry in zip_file_obj.namelist():
        if not entry.endswith("/"):
            zip_entry = zip_file_obj.open(entry)
            entry_file_path = os.path.join(tmp_path, os.path.basename(entry))
            entry_file_obj = io.open(entry_file_path, "wb")
            entry_file_obj.write(zip_entry.read())
            entry_file_obj.close()
            zip_entry.close()
    zip_file_obj.close()
    print_debug("Unzipping .dict file... done", args.debug)
    # read .dict.idx
    print_debug("Reading .dict.idx file...", args.debug)
    sql_connection = sqlite3.connect(idx_file_path)
    sql_cursor = sql_connection.cursor()
    sql_cursor.execute("select * from T_DictIndex")
    index_data = sql_cursor.fetchall()
    chunk_index_to_entries = {}
    max_chunk_index = 1
    for index_entry in index_data:
        headword = index_entry[1]
        if args.ignore_case:
            headword = headword.lower()
        offset = index_entry[2]
        size = index_entry[3]
        chunk_index = index_entry[4]
        if chunk_index not in chunk_index_to_entries:
            chunk_index_to_entries[chunk_index] = []
        if chunk_index > max_chunk_index:
            max_chunk_index = chunk_index
        chunk_index_to_entries[chunk_index].append([headword, offset, size])
    sql_cursor.close()
    sql_connection.close()
    print_debug("Reading .dict.idx file... done", args.debug)
    # read c_* files
    print_debug("Reading c_* files...", args.debug)
    for chunk_index in range(1, max_chunk_index + 1):
        print_debug(" Reading c_%d file..." % (chunk_index), args.debug)
        chunk_file_path = os.path.join(tmp_path, "%s%d" % (CHUNK_FILE_PREFIX, chunk_index))
        chunk_file_obj = io.open(chunk_file_path, "rb")
        for entry in chunk_index_to_entries[chunk_index]:
            headword = entry[0]
            offset = entry[1]
            size = entry[2]
            chunk_file_obj.seek(offset)
            definition_bytes = chunk_file_obj.read(size)
            definition_unicode = definition_bytes.decode(args.input_file_encoding)
            dictionary.add_entry(headword=headword, definition=definition_unicode)
        chunk_file_obj.close()
        print_debug(" Reading c_%d file... done" % (chunk_index), args.debug)
    print_debug("Reading c_* files... done", args.debug)
    # delete tmp directory
    if args.keep:
        print_info("Not deleting temp dir '%s'" % (tmp_path))
    else:
        delete_directory(tmp_path)
        print_debug("Deleted temp dir '%s'" % (tmp_path), args.debug)
    return True
def write(dictionary, args, output_file_path):
    # result to be returned
    result = None
    # get absolute path
    output_file_path_absolute = os.path.abspath(output_file_path)
    # create tmp directory
    cwd = os.getcwd()
    tmp_path = create_temp_directory()
    print_debug("Working in temp dir '%s'" % (tmp_path), args.debug)
    os.chdir(tmp_path)
    # sort by headword
    dictionary.sort(by_headword=True)
    # group by prefix
    files_to_compress = []
    prefix_length = int(args.group_by_prefix_length)
    special_group, group_keys, group_dict = dictionary.group(
        prefix_function=get_prefix_kobo,
        prefix_length=prefix_length,
        merge_min_size=int(args.group_by_prefix_merge_min_size),
        merge_across_first=args.group_by_prefix_merge_across_first
    )
    if special_group is not None:
        special_group_key = u"1" * prefix_length
        group_dict[special_group_key] = special_group
        group_keys = [special_group_key] + group_keys
    # write files
    for key in group_keys:
        # write html file
        file_html_path = key + u".html"
        file_html_obj = io.open(file_html_path, "wb")
        file_html_obj.write(u"<?xml version=\"1.0\" encoding=\"utf-8\"?><html>".encode("utf-8"))
        for entry in group_dict[key]:
            headword = entry.headword
            definition = entry.definition
            file_html_obj.write((u"<w><a name=\"%s\"/><div><b>%s</b><br/>%s</div></w>" % (headword, headword, definition)).encode("utf-8"))
        file_html_obj.write((u"</html>").encode("utf-8"))
        file_html_obj.close()
        # compress in gz format
        file_html_obj = io.open(file_html_path, "rb")
        file_gz_path = file_html_path + u".gz"
        file_gz_obj = gzip.open(file_gz_path, "wb")
        file_gz_obj.writelines(file_html_obj)
        file_gz_obj.close()
        file_html_obj.close()
        # delete .html file
        delete_file(None, file_html_path)
        # rename .html.gz file into .html
        rename_file(file_gz_path, file_html_path)
        files_to_compress.append(file_html_path)
    # write words
    file_words_path = WORDS_FILE_NAME
    keys = sorted(dictionary.entries_index.keys())
    try:
        import marisa_trie
        trie = marisa_trie.Trie(keys)
        trie.save(file_words_path)
        result = [file_words_path]
    except ImportError as exc:
        # call MARISA with subprocess
        print_info(" MARISA cannot be imported as Python module. You might want to install it with:")
        print_info(" $ [sudo] pip install marisa_trie")
        marisa_build_path = MARISA_BUILD
        if args.marisa_bin_path is None:
            print_info(" Running '%s' from $PATH" % MARISA_BUILD)
        else:
            marisa_build_path = os.path.join(args.marisa_bin_path, MARISA_BUILD)
            print_info(" Running '%s' from '%s'" % (MARISA_BUILD, args.marisa_bin_path))
        # TODO this is ugly, but it works
        query = (u"\n".join([x for x in keys]) + u"\n").encode("utf-8")
        try:
            proc = subprocess.Popen(
                [marisa_build_path, "-l", "-o", file_words_path],
                stdout=subprocess.PIPE,
                stdin=subprocess.PIPE,
                stderr=subprocess.PIPE
            )
            proc.communicate(input=query)[0].decode("utf-8")
            result = [file_words_path]
        except OSError as exc:
            print_error(" Unable to run '%s' as '%s'" % (MARISA_BUILD, marisa_build_path))
            print_error(" Please make sure '%s':" % MARISA_BUILD)
            print_error(" 1. is available on your $PATH or")
            print_error(" 2. specify its path with --marisa-bin-path or")
            print_error(" 3. install the marisa_trie Python module")
            result = None
    if result is not None:
        # add file_words_path to files to compress
        files_to_compress.append(file_words_path)
        # create output zip file
        try:
            print_debug("Writing to file '%s'..." % (output_file_path_absolute), args.debug)
            file_zip_obj = zipfile.ZipFile(output_file_path_absolute, "w", zipfile.ZIP_DEFLATED)
            for file_to_compress in files_to_compress:
                file_to_compress = os.path.basename(file_to_compress)
                file_zip_obj.write(file_to_compress)
            file_zip_obj.close()
            result = [output_file_path]
            print_debug("Writing to file '%s'... success" % (output_file_path_absolute), args.debug)
        except:
            print_error("Writing to file '%s'... failure" % (output_file_path_absolute))
    # delete tmp directory
    os.chdir(cwd)
    if args.keep:
        print_info("Not deleting temp dir '%s'" % (tmp_path))
    else:
        delete_directory(tmp_path)
        print_debug("Deleted temp dir '%s'" % (tmp_path), args.debug)
    return result
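
# A minimal sketch of the in-process marisa_trie path used by write() above
# (the subprocess call to marisa-build is only the fallback when the module is
# missing). The headwords and file name below are placeholders; Trie(), save()
# and load() are the marisa_trie calls assumed here.
def build_words_file_example():
    import marisa_trie
    headwords = [u"abacus", u"abbey", u"zebra"]
    trie = marisa_trie.Trie(headwords)
    trie.save("words")
    # reading it back: the trie behaves like a set of unicode keys
    reloaded = marisa_trie.Trie()
    reloaded.load("words")
    print(u"abbey" in reloaded)  # True
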
def read_single_file(dictionary, args, input_file_path):
    # result flag
    result = False
    # create a tmp directory
    tmp_path = create_temp_directory()
    print_debug("Working in temp dir '%s'" % (tmp_path), args.debug)
    # find .ifo, .idx, .dict[.dz] and .syn files inside the zip
    # and extract them to tmp_path
    input_file_obj = zipfile.ZipFile(input_file_path)
    found_files = find_files(input_file_obj.namelist())
    extracted_files = {}
    if len(found_files) > 0:
        for key in found_files:
            entry = found_files[key]
            ext_file_path = os.path.join(tmp_path, key)
            ext_file_obj = open(ext_file_path, "wb")
            zip_entry = input_file_obj.open(entry)
            ext_file_obj.write(zip_entry.read())
            zip_entry.close()
            ext_file_obj.close()
            print_debug("Extracted %s" % (ext_file_path), args.debug)
            extracted_files[key] = ext_file_path
            # extract from compressed file, but only if ".idx" is not present as well
            if (key == "d.idx.gz") and ("d.idx" not in found_files):
                extracted_files["d.idx"] = uncompress_file(ext_file_path, tmp_path, "d.idx")
            # extract from compressed file, but only if ".dict" is not present as well
            if ((key == "d.dict.dz") or (key == "d.dz")) and ("d.dict" not in found_files):
                extracted_files["d.dict"] = uncompress_file(ext_file_path, tmp_path, "d.dict")
    input_file_obj.close()
    # here we have d.ifo, d.idx and d.dict (all uncompressed) and possibly d.syn
    has_syn = "d.syn" in extracted_files
    if (has_syn) and (args.ignore_synonyms):
        has_syn = False
        print_debug("Dictionary has synonyms, but ignoring them (--ignore-synonyms)", args.debug)
    ifo_dict = read_ifo(extracted_files["d.ifo"], has_syn, args)
    print_debug("Read .ifo file with values:\n%s" % (str(ifo_dict)), args.debug)
    # read dict file
    dict_file_obj = open(extracted_files["d.dict"], "rb")
    dict_file_bytes = dict_file_obj.read()
    dict_file_obj.close()
    # read idx file
    idx_file_obj = open(extracted_files["d.idx"], "rb")
    byte_read = idx_file_obj.read(1)
    headword = b""
    while byte_read:
        if byte_read == b"\0":
            # end of current word: read offset and size
            offset_bytes = idx_file_obj.read(4)
            offset_int = int((struct.unpack('>i', offset_bytes))[0])
            size_bytes = idx_file_obj.read(4)
            size_int = int((struct.unpack('>i', size_bytes))[0])
            definition = dict_file_bytes[offset_int:offset_int + size_int].decode(args.input_file_encoding)
            headword = headword.decode("utf-8")
            if args.ignore_case:
                headword = headword.lower()
            dictionary.add_entry(headword=headword, definition=definition)
            headword = b""
        else:
            # read next byte
            headword += byte_read
        byte_read = idx_file_obj.read(1)
    idx_file_obj.close()
    result = True
    # read syn file, if present
    if has_syn:
        print_debug("The input StarDict file contains a .syn file, parsing it...", args.debug)
        result = False
        syn_file_obj = open(extracted_files["d.syn"], "rb")
        byte_read = syn_file_obj.read(1)
        synonym = b""
        while byte_read:
            if byte_read == b"\0":
                # end of current synonym: read index of original word
                index_bytes = syn_file_obj.read(4)
                index_int = int((struct.unpack('>i', index_bytes))[0])
                synonym = synonym.decode("utf-8")
                if index_int < len(dictionary):
                    dictionary.add_synonym(synonym=synonym, headword_index=index_int)
                else:
                    # emit a warning?
                    print_debug("Synonym '%s' points to index %d >= len(dictionary), skipping it" % (synonym, index_int), args.debug)
                synonym = b""
            else:
                # read next byte
                synonym += byte_read
            byte_read = syn_file_obj.read(1)
        syn_file_obj.close()
        result = True
        print_debug("The input StarDict file contains a .syn file, parsing it... done", args.debug)
    else:
        print_debug("The input StarDict file does not contain a .syn file", args.debug)
    # delete tmp directory
    if args.keep:
        print_info("Not deleting temp dir '%s'" % (tmp_path))
    else:
        delete_directory(tmp_path)
        print_debug("Deleted temp dir '%s'" % (tmp_path), args.debug)
    return result
def write(dictionary, args, output_file_path):
    # result to be returned
    result = None
    # get absolute path
    output_file_path_absolute = os.path.abspath(output_file_path)
    # create tmp directory
    cwd = os.getcwd()
    tmp_path = create_temp_directory()
    print_debug("Working in temp dir '%s'" % (tmp_path), args.debug)
    os.chdir(tmp_path)
    # get the basename and compute output file paths
    base = os.path.basename(output_file_path)
    if base.endswith(".zip"):
        base = base[:-4]
    ifo_file_path = base + ".ifo"
    idx_file_path = base + ".idx"
    dict_file_path = base + ".dict"
    dict_dz_file_path = base + ".dict.dz"
    syn_file_path = base + ".syn"
    # TODO by spec, the index should be sorted
    # TODO using the comparator stardict_strcmp() defined in the spec
    # TODO (it calls g_ascii_strcasecmp() and/or strcmp() ),
    # TODO or with a user-defined collation function
    #
    # From https://developer.gnome.org/glib/2.28/glib-String-Utility-Functions.html#g-ascii-strcasecmp
    # gint g_ascii_strcasecmp (const gchar *s1, const gchar *s2);
    # Compare two strings, ignoring the case of ASCII characters.
    # Unlike the BSD strcasecmp() function, this only recognizes standard ASCII letters
    # and ignores the locale, treating all non-ASCII bytes as if they are not letters.
    # This function should be used only on strings that are known to be in encodings
    # where the bytes corresponding to ASCII letters always represent themselves.
    # This includes UTF-8 and the ISO-8859-* charsets, but not for instance double-byte
    # encodings like the Windows Codepage 932, where the trailing bytes of double-byte
    # characters include all ASCII letters. If you compare two CP932 strings using this
    # function, you will get false matches.
    #
    # using Python's builtin lower() and sort() by headword
    # should be equivalent for UTF-8 encoded dictionaries (and it is fast)
    #
    dictionary.sort(by_headword=True, ignore_case=True)
    # write .idx and .dict files
    print_debug("Writing .idx and .dict files...", args.debug)
    idx_file_obj = open(idx_file_path, "wb")
    dict_file_obj = open(dict_file_path, "wb")
    current_offset = 0
    current_idx_size = 0
    for entry_index in dictionary.entries_index_sorted:
        entry = dictionary.entries[entry_index]
        headword_bytes = entry.headword.encode("utf-8")
        definition_bytes = entry.definition.encode("utf-8")
        definition_size = len(definition_bytes)
        # write .idx
        idx_file_obj.write(headword_bytes)
        idx_file_obj.write(b"\0")
        idx_file_obj.write(struct.pack('>i', current_offset))
        idx_file_obj.write(struct.pack('>i', definition_size))
        current_idx_size += (len(headword_bytes) + 1 + 4 + 4)
        # write .dict
        dict_file_obj.write(definition_bytes)
        current_offset += definition_size
    idx_file_obj.close()
    dict_file_obj.close()
    print_debug("Writing .idx and .dict files... done", args.debug)
    # list files to compress
    files_to_compress = []
    files_to_compress.append(ifo_file_path)
    files_to_compress.append(idx_file_path)
    # write .syn file
    dict_syns_len = 0
    if dictionary.has_synonyms:
        if args.ignore_synonyms:
            print_debug("Dictionary has synonyms, but ignoring them", args.debug)
        else:
            print_debug("Dictionary has synonyms, writing .syn file...", args.debug)
            syn_file_obj = open(syn_file_path, "wb")
            dict_syns = dictionary.get_synonyms()
            dict_syns_len = len(dict_syns)
            for pair in dict_syns:
                synonym_bytes = pair[0].encode("utf-8")
                index = pair[1]
                syn_file_obj.write(synonym_bytes)
                syn_file_obj.write(b"\0")
                syn_file_obj.write(struct.pack('>i', index))
            syn_file_obj.close()
            files_to_compress.append(syn_file_path)
            print_debug("Dictionary has synonyms, writing .syn file... done", args.debug)
    # compress .dict file
    if args.sd_no_dictzip:
        print_debug("Not compressing .dict file with dictzip", args.debug)
        files_to_compress.append(dict_file_path)
        result = [dict_file_path]
    else:
        try:
            print_debug("Compressing .dict file with dictzip...", args.debug)
            dictzip_path = DICTZIP
            if args.dictzip_path is None:
                print_info(" Running '%s' from $PATH" % DICTZIP)
            else:
                dictzip_path = args.dictzip_path
                print_info(" Running '%s' from '%s'" % (DICTZIP, dictzip_path))
            proc = subprocess.Popen(
                [dictzip_path, "-k", dict_file_path],
                stdout=subprocess.PIPE,
                stdin=subprocess.PIPE,
                stderr=subprocess.PIPE
            )
            proc.communicate()
            result = [dict_dz_file_path]
            files_to_compress.append(dict_dz_file_path)
            print_debug("Compressing .dict file with dictzip... done", args.debug)
        except OSError as exc:
            print_error(" Unable to run '%s' as '%s'" % (DICTZIP, dictzip_path))
            print_error(" Please make sure '%s':" % DICTZIP)
            print_error(" 1. is available on your $PATH or")
            print_error(" 2. specify its path with --dictzip-path or")
            print_error(" 3. specify --no-dictzip to avoid compressing the .dict file")
            result = None
    if result is not None:
        # create ifo file
        ifo_file_obj = open(ifo_file_path, "wb")
        ifo_file_obj.write((u"StarDict's dict ifo file\n").encode("utf-8"))
        ifo_file_obj.write((u"version=2.4.2\n").encode("utf-8"))
        ifo_file_obj.write((u"wordcount=%d\n" % (len(dictionary))).encode("utf-8"))
        ifo_file_obj.write((u"idxfilesize=%d\n" % (current_idx_size)).encode("utf-8"))
        ifo_file_obj.write((u"bookname=%s\n" % (args.title)).encode("utf-8"))
        ifo_file_obj.write((u"date=%s\n" % (args.year)).encode("utf-8"))
        ifo_file_obj.write((u"sametypesequence=m\n").encode("utf-8"))
        ifo_file_obj.write((u"description=%s\n" % (args.description)).encode("utf-8"))
        ifo_file_obj.write((u"author=%s\n" % (args.author)).encode("utf-8"))
        ifo_file_obj.write((u"email=%s\n" % (args.email)).encode("utf-8"))
        ifo_file_obj.write((u"website=%s\n" % (args.website)).encode("utf-8"))
        if dict_syns_len > 0:
            ifo_file_obj.write((u"synwordcount=%d\n" % (dict_syns_len)).encode("utf-8"))
        ifo_file_obj.close()
        # create output zip file
        try:
            print_debug("Writing to file '%s'..." % (output_file_path_absolute), args.debug)
            file_zip_obj = zipfile.ZipFile(output_file_path_absolute, "w", zipfile.ZIP_DEFLATED)
            for file_to_compress in files_to_compress:
                file_to_compress = os.path.basename(file_to_compress)
                file_zip_obj.write(file_to_compress)
                print_debug("Written %s" % (file_to_compress), args.debug)
            file_zip_obj.close()
            result = [output_file_path]
            print_debug("Writing to file '%s'... success" % (output_file_path_absolute), args.debug)
        except:
            print_error("Writing to file '%s'... failure" % (output_file_path_absolute))
    # delete tmp directory
    os.chdir(cwd)
    if args.keep:
        print_info("Not deleting temp dir '%s'" % (tmp_path))
    else:
        delete_directory(tmp_path)
        print_debug("Deleted temp dir '%s'" % (tmp_path), args.debug)
    return result