def uncompress_file(compressed_path, tmp_path, key):
    """Gunzip ``compressed_path`` into the file ``key`` inside ``tmp_path``.

    :param compressed_path: path of the gzip-compressed input file
    :param tmp_path: directory in which the uncompressed copy is created
    :param key: file name of the uncompressed copy, relative to ``tmp_path``
    :return: the path of the uncompressed file

    NOTE(review): ``args`` is not a parameter of this function; it is
    resolved from the enclosing (or module) scope -- confirm it is always
    defined where this function is called.
    """
    uncompressed_path = os.path.join(tmp_path, key)
    # Context managers close both handles even if reading or writing fails.
    with open(uncompressed_path, "wb") as u_obj:
        with gzip.open(compressed_path, "rb") as c_obj:
            u_obj.write(c_obj.read())
    print_debug("Uncompressed %s" % (uncompressed_path), args.debug)
    return uncompressed_path
def uncompress_file(compressed_path, tmp_path, key):
    """Write the gunzipped content of ``compressed_path`` to ``tmp_path/key``.

    :param compressed_path: path of the gzip-compressed input file
    :param tmp_path: directory receiving the uncompressed file
    :param key: name given to the uncompressed file inside ``tmp_path``
    :return: the path of the uncompressed file

    NOTE(review): ``args`` is read from the enclosing (or module) scope,
    not passed in -- confirm it is defined wherever this is called.
    """
    uncompressed_path = os.path.join(tmp_path, key)
    # ``with`` guarantees neither handle is leaked when an I/O error occurs.
    with io.open(uncompressed_path, "wb") as u_obj:
        with gzip.open(compressed_path, "rb") as c_obj:
            u_obj.write(c_obj.read())
    print_debug("Uncompressed %s" % (uncompressed_path), args.debug)
    return uncompressed_path
def read_ifo(ifo_path, has_syn, args):
    """Parse and validate a StarDict ``.ifo`` file.

    The file is decoded as UTF-8 and every ``key=value`` line is stored in
    a dictionary (only the first ``=`` separates key from value).

    :param ifo_path: path of the ``.ifo`` file
    :param has_syn: if true, ``synwordcount`` is required as well
    :param args: parsed command line arguments; ``sd_ignore_sametypesequence``
                 and ``debug`` are read
    :return: the dictionary of ``.ifo`` values, with ``wordcount``,
             ``idxfilesize`` (and ``synwordcount`` if ``has_syn``) converted
             to ``int``, or ``None`` if validation fails (an error is printed)
    """
    ifo_dict = {}
    # the .ifo file is always UTF-8 encoded by spec
    with open(ifo_path, "rb") as ifo_obj:
        ifo_unicode = ifo_obj.read().decode("utf-8")
    for line in ifo_unicode.splitlines():
        # split on the first "=" only: the value may itself contain "="
        key, sep, val = line.partition("=")
        if sep:
            ifo_dict[key] = val

    if "version" not in ifo_dict:
        print_error("No 'version' found in the .ifo file (see StarDict spec)")
        return None
    if ifo_dict["version"] not in ["2.4.2", "3.0.0"]:
        print_error("The .ifo file must have a 'version' value equal to '2.4.2' or '3.0.0' (see StarDict spec)")
        return None

    required_keys = ["bookname", "wordcount", "idxfilesize"]
    if has_syn:
        required_keys.append("synwordcount")
    # TODO idxoffsetbits (version 3.0.0) is not used, hence it is not required
    for key in required_keys:
        if key not in ifo_dict:
            print_error("No '%s' found in the .ifo file (see StarDict spec)" % key)
            return None

    integer_keys = ["wordcount", "idxfilesize"]
    if has_syn:
        integer_keys.append("synwordcount")
    for key in integer_keys:
        try:
            ifo_dict[key] = int(ifo_dict[key])
        except ValueError:
            # report a malformed number like any other validation failure
            # instead of letting the ValueError escape
            print_error("The '%s' value in the .ifo file must be an integer (see StarDict spec)" % key)
            return None

    if args.sd_ignore_sametypesequence:
        print_debug("Ignoring sametypesequence value", args.debug)
    else:
        # TODO limitation: we require sametypesequence to be present
        if "sametypesequence" not in ifo_dict:
            print_error("The .ifo file must have a 'sametypesequence' value (see README).")
            return None
        # TODO limitation: we require sametypesequence to have a value in SAMETYPESEQUENCE_SUPPORTED_VALUES
        if ifo_dict["sametypesequence"] not in SAMETYPESEQUENCE_SUPPORTED_VALUES:
            print_error("The .ifo file must have a 'sametypesequence' value of %s (see README)." % "|".join(SAMETYPESEQUENCE_SUPPORTED_VALUES))
            return None
    return ifo_dict
def write(dictionary, args, output_file_path):
    """Write ``dictionary`` as an EPUB 2 dictionary ebook.

    The dictionary is sorted by headword, its entries are grouped by prefix,
    and each group is added to a ``DictionaryEbook`` which is then written
    to ``output_file_path``.

    :param dictionary: the dictionary to be written (it is sorted in place)
    :param args: parsed command line arguments
    :param output_file_path: path of the EPUB file to be created
    :return: a one-element list: the output file path or, if
             ``args.epub_no_compress`` is set, the temp directory holding
             the uncompressed EPUB
    """
    # result to be returned
    result = None

    # get absolute path
    output_file_path_absolute = os.path.abspath(output_file_path)

    # sort by headword, optionally ignoring case
    dictionary.sort(by_headword=True, ignore_case=args.sort_ignore_case)

    # create groups
    special_group, group_keys, group_dict = dictionary.group(
        prefix_function_path=args.group_by_prefix_function,
        prefix_length=int(args.group_by_prefix_length),
        merge_min_size=int(args.group_by_prefix_merge_min_size),
        merge_across_first=args.group_by_prefix_merge_across_first)
    # copy, so that the list returned by dictionary.group() is not mutated
    all_group_keys = list(group_keys)
    if special_group is not None:
        all_group_keys.append(u"SPECIAL")

    # create epub object
    epub = DictionaryEbook(ebook_format=DictionaryEbook.EPUB2, args=args)

    # add groups
    for key in all_group_keys:
        if key == u"SPECIAL":
            group_entries = special_group
        else:
            group_entries = group_dict[key]
        epub.add_group(key, group_entries)

    # create output file
    if args.epub_no_compress:
        # pass args.debug like every other print_debug call in this function
        print_debug("Not compressing the EPUB container", args.debug)
        epub.write(output_file_path_absolute, compress=False)
    else:
        print_debug("Writing to file '%s'..." % (output_file_path_absolute), args.debug)
        epub.write(output_file_path_absolute, compress=True)
        result = [output_file_path]
    print_debug("Writing to file '%s'... done" % (output_file_path_absolute), args.debug)

    # delete tmp directory, unless it holds the requested output
    tmp_path = epub.get_tmp_path()
    if args.epub_no_compress:
        print_info("The uncompressed EPUB is inside dir '%s'" % (tmp_path))
        result = [tmp_path]
    elif args.keep:
        print_info("Not deleting temp dir '%s'" % (tmp_path))
        if result is None:
            result = [tmp_path]
    else:
        epub.delete()
        print_debug("Deleted temp dir '%s'" % (tmp_path), args.debug)

    return result
def write(dictionary, args, output_file_path):
    """Create an EPUB 2 dictionary ebook from ``dictionary``.

    Steps: sort by headword, group the entries by prefix, add every group
    to a ``DictionaryEbook`` and write it (compressed unless
    ``args.epub_no_compress`` is set).

    :param dictionary: the dictionary to be written (it is sorted in place)
    :param args: parsed command line arguments
    :param output_file_path: path of the EPUB file to be created
    :return: ``[output_file_path]`` for a compressed EPUB, or a list holding
             the temp directory path when ``args.epub_no_compress`` is set
    """
    # result to be returned
    result = None

    # get absolute path
    output_file_path_absolute = os.path.abspath(output_file_path)

    # sort by headword, optionally ignoring case
    dictionary.sort(by_headword=True, ignore_case=args.sort_ignore_case)

    # create groups
    special_group, group_keys, group_dict = dictionary.group(
        prefix_function_path=args.group_by_prefix_function,
        prefix_length=int(args.group_by_prefix_length),
        merge_min_size=int(args.group_by_prefix_merge_min_size),
        merge_across_first=args.group_by_prefix_merge_across_first
    )
    # work on a copy: appending to group_keys itself would silently modify
    # the list owned by the dictionary.group() result
    all_group_keys = list(group_keys)
    if special_group is not None:
        all_group_keys.append(u"SPECIAL")

    # create epub object
    epub = DictionaryEbook(ebook_format=DictionaryEbook.EPUB2, args=args)

    # add groups
    for key in all_group_keys:
        if key == u"SPECIAL":
            group_entries = special_group
        else:
            group_entries = group_dict[key]
        epub.add_group(key, group_entries)

    # create output file
    if args.epub_no_compress:
        # args.debug was missing here, unlike all the other debug messages
        print_debug("Not compressing the EPUB container", args.debug)
        epub.write(output_file_path_absolute, compress=False)
    else:
        print_debug("Writing to file '%s'..." % (output_file_path_absolute), args.debug)
        epub.write(output_file_path_absolute, compress=True)
        result = [output_file_path]
    print_debug("Writing to file '%s'... done" % (output_file_path_absolute), args.debug)

    # delete tmp directory, unless the user wants to keep it
    tmp_path = epub.get_tmp_path()
    if args.epub_no_compress:
        print_info("The uncompressed EPUB is inside dir '%s'" % (tmp_path))
        result = [tmp_path]
    elif args.keep:
        print_info("Not deleting temp dir '%s'" % (tmp_path))
        if result is None:
            result = [tmp_path]
    else:
        epub.delete()
        print_debug("Deleted temp dir '%s'" % (tmp_path), args.debug)

    return result
def write(dictionary, args, output_file_path):
    """Write ``dictionary`` as a UTF-8 encoded CSV-like file.

    Each entry is written as ``headword + field separator + definition +
    line separator``, following ``dictionary.entries_index_sorted``. Both
    separators come from ``args`` and are passed through ``escape()`` first.

    :param dictionary: the dictionary to be written
    :param args: parsed command line arguments (``csv_fs``, ``csv_ls``,
                 ``debug`` are read)
    :param output_file_path: path of the file to be created
    :return: ``[output_file_path]`` on success, ``None`` on failure
    """
    csv_fs = escape(args.csv_fs)
    csv_ls = escape(args.csv_ls)
    try:
        print_debug("Writing to file '%s'..." % (output_file_path), args.debug)
        # the context manager closes the file even if a write fails
        with io.open(output_file_path, "wb") as output_file_obj:
            for index in dictionary.entries_index_sorted:
                entry = dictionary.entries[index]
                string = u"%s%s%s%s" % (entry.headword, csv_fs, entry.definition, csv_ls)
                output_file_obj.write(string.encode("utf-8"))
        print_debug("Writing to file '%s'... success" % (output_file_path), args.debug)
        return [output_file_path]
    except Exception:
        # "except Exception" (not a bare except) so that KeyboardInterrupt
        # and SystemExit are not reported as a write failure
        print_error("Writing to file '%s'... failure" % (output_file_path))
        return None
def write(dictionary, args, output_file_path):
    """Write ``dictionary`` as a UTF-8 encoded CSV-like file.

    Each entry is written as ``headword + args.csv_fs + definition +
    args.csv_ls``, in the order given by ``dictionary.entries_index_sorted``.

    :param dictionary: the dictionary to be written
    :param args: parsed command line arguments (``csv_fs``, ``csv_ls``,
                 ``debug`` are read)
    :param output_file_path: path of the file to be created
    :return: ``[output_file_path]`` on success, ``None`` on failure
    """
    try:
        print_debug("Writing to file '%s'..." % (output_file_path), args.debug)
        # the file is closed by the context manager, even when a write fails
        with open(output_file_path, "wb") as output_file_obj:
            for index in dictionary.entries_index_sorted:
                entry = dictionary.entries[index]
                string = u"%s%s%s%s" % (
                    entry.headword,
                    args.csv_fs,
                    entry.definition,
                    args.csv_ls
                )
                output_file_obj.write(string.encode("utf-8"))
        print_debug("Writing to file '%s'... success" % (output_file_path), args.debug)
        return [output_file_path]
    except Exception:
        # do not use a bare except: it would also trap KeyboardInterrupt
        # and SystemExit
        print_error("Writing to file '%s'... failure" % (output_file_path))
        return None
def read(dictionary, args, input_file_paths):
    """Add to ``dictionary`` the entries found in the given XML files.

    Every ``<entry>`` element is scanned for ``<key>`` (headword) and
    ``<def>`` (definition) children; the entry is added only when both have
    a text value. If a tag is repeated, its last occurrence is used.

    NOTE(review): the files are parsed with ``etree.fromstring``; if they
    can come from an untrusted source, confirm the parser in use is
    hardened against malicious XML.

    :param dictionary: the dictionary receiving the entries
    :param args: parsed command line arguments (``debug`` is read)
    :param input_file_paths: list of paths of the XML files to be read
    :return: ``dictionary``
    """
    for input_file_path in input_file_paths:
        print_debug("Reading from file '%s'..." % (input_file_path), args.debug)
        # context manager: the handle is released even if reading fails
        with io.open(input_file_path, "rb") as input_file_object:
            data_bytes = input_file_object.read()
        root = etree.fromstring(data_bytes)
        for entry in root.iter("entry"):
            headword = None
            definition = None
            for child in entry:
                if child.tag == "key":
                    headword = child.text
                if child.tag == "def":
                    definition = child.text
            if (headword is not None) and (definition is not None):
                dictionary.add_entry(headword=headword, definition=definition)
        print_debug("Reading from file '%s'... success" % (input_file_path), args.debug)
    return dictionary
def read(dictionary, args, input_file_paths):
    """Read XML dictionary files and add their entries to ``dictionary``.

    For each ``<entry>`` element, the text of its ``<key>`` child is the
    headword and the text of its ``<def>`` child is the definition. Entries
    lacking either text are skipped; for repeated tags the last one wins.

    NOTE(review): ``etree.fromstring`` is applied to the raw file content;
    confirm the parser is safe if the input files may be untrusted.

    :param dictionary: the dictionary receiving the entries
    :param args: parsed command line arguments (``debug`` is read)
    :param input_file_paths: list of paths of the XML files to be read
    :return: ``dictionary``
    """
    for input_file_path in input_file_paths:
        print_debug("Reading from file '%s'..." % (input_file_path), args.debug)
        # "with" closes the file also when read() raises
        with open(input_file_path, "rb") as input_file_object:
            data_bytes = input_file_object.read()
        root = etree.fromstring(data_bytes)
        for entry in root.iter("entry"):
            headword = None
            definition = None
            for child in entry:
                if child.tag == "key":
                    headword = child.text
                if child.tag == "def":
                    definition = child.text
            if (headword is not None) and (definition is not None):
                dictionary.add_entry(headword=headword, definition=definition)
        print_debug("Reading from file '%s'... success" % (input_file_path), args.debug)
    return dictionary
def read(dictionary, args, input_file_paths):
    """Read CSV-like files and add their entries to ``dictionary``.

    Each file is decoded with ``args.input_file_encoding`` and split into
    lines on ``args.csv_ls``. In every line, the text before the first
    ``args.csv_fs`` is the headword and everything after it is the
    definition. Lines without a field separator are skipped.

    :param dictionary: the dictionary receiving the entries
    :param args: parsed command line arguments (``csv_fs``, ``csv_ls``,
                 ``csv_ignore_first_line``, ``ignore_case``,
                 ``input_file_encoding``, ``debug`` are read)
    :param input_file_paths: list of paths of the files to be read
    :return: ``dictionary``
    """
    for input_file_path in input_file_paths:
        print_debug("Reading from file '%s'..." % (input_file_path), args.debug)
        # the context manager closes the file even if reading fails
        with open(input_file_path, "rb") as input_file_object:
            data_bytes = input_file_object.read()
        data_unicode = data_bytes.decode(args.input_file_encoding)
        lines = data_unicode.split(args.csv_ls)
        if args.csv_ignore_first_line:
            lines = lines[1:]
        for line in lines:
            # partition() removes the whole separator, whatever its length;
            # slicing at len(headword) + 1 was only right for a separator
            # of exactly one character
            headword, separator, definition = line.partition(args.csv_fs)
            if separator:
                if args.ignore_case:
                    headword = headword.lower()
                dictionary.add_entry(headword=headword, definition=definition)
        print_debug("Reading from file '%s'... success" % (input_file_path), args.debug)
    return dictionary
def read(dictionary, args, input_file_paths):
    """Read CSV-like files and add their entries to ``dictionary``.

    The field and line separators are taken from ``args`` and passed
    through ``escape()``. Each file is decoded with
    ``args.input_file_encoding`` and split into lines; in every line, the
    text before the first field separator is the headword and the rest is
    the definition. Lines without a field separator are skipped.

    :param dictionary: the dictionary receiving the entries
    :param args: parsed command line arguments (``csv_fs``, ``csv_ls``,
                 ``csv_ignore_first_line``, ``ignore_case``,
                 ``input_file_encoding``, ``debug`` are read)
    :param input_file_paths: list of paths of the files to be read
    :return: ``dictionary``
    """
    csv_fs = escape(args.csv_fs)
    csv_ls = escape(args.csv_ls)
    for input_file_path in input_file_paths:
        print_debug("Reading from file '%s'..." % (input_file_path), args.debug)
        # "with" releases the handle even if read() raises
        with io.open(input_file_path, "rb") as input_file_object:
            data_bytes = input_file_object.read()
        data_unicode = data_bytes.decode(args.input_file_encoding)
        lines = data_unicode.split(csv_ls)
        if args.csv_ignore_first_line:
            lines = lines[1:]
        for line in lines:
            # split on the first separator only and drop it entirely: the
            # former "len(headword) + 1" slice left part of a
            # multi-character separator at the start of the definition
            headword, separator, definition = line.partition(csv_fs)
            if separator:
                if args.ignore_case:
                    headword = headword.lower()
                dictionary.add_entry(headword=headword, definition=definition)
        print_debug("Reading from file '%s'... success" % (input_file_path), args.debug)
    return dictionary
def write(dictionary, args, output_file_path):
    """Write ``dictionary`` as an XML file.

    The tree has a ``<dictionary>`` root with one ``<entry>`` per dictionary
    entry, each holding a ``<key>`` (headword) and a ``<def>`` (definition)
    child, in the order given by ``dictionary.entries_index_sorted``.

    :param dictionary: the dictionary to be written
    :param args: parsed command line arguments (``debug`` is read)
    :param output_file_path: path of the XML file to be created
    :return: ``[output_file_path]`` on success, ``None`` on failure
    """
    try:
        print_debug("Creating XML tree...", args.debug)
        dictionary_elem = etree.Element("dictionary")
        for index in dictionary.entries_index_sorted:
            entry = dictionary.entries[index]
            entry_elem = etree.SubElement(dictionary_elem, "entry")
            key_elem = etree.SubElement(entry_elem, "key")
            key_elem.text = entry.headword
            def_elem = etree.SubElement(entry_elem, "def")
            def_elem.text = entry.definition
        tree = etree.ElementTree(dictionary_elem)
        print_debug("Creating XML tree... done", args.debug)
        print_debug("Writing to file '%s'..." % (output_file_path), args.debug)
        tree.write(output_file_path, pretty_print=True, xml_declaration=True)
        print_debug("Writing to file '%s'... success" % (output_file_path), args.debug)
        return [output_file_path]
    except Exception:
        # not a bare except: KeyboardInterrupt and SystemExit must not be
        # reported as a write failure
        print_error("Writing to file '%s'... failure" % (output_file_path))
        return None
def write(dictionary, args, output_file_path):
    """Serialize ``dictionary`` to an XML file.

    A ``<dictionary>`` root element is created, with one ``<entry>`` child
    per dictionary entry (following ``dictionary.entries_index_sorted``);
    each entry has a ``<key>`` child for the headword and a ``<def>`` child
    for the definition.

    :param dictionary: the dictionary to be written
    :param args: parsed command line arguments (``debug`` is read)
    :param output_file_path: path of the XML file to be created
    :return: ``[output_file_path]`` on success, ``None`` on failure
    """
    try:
        print_debug("Creating XML tree...", args.debug)
        dictionary_elem = etree.Element("dictionary")
        for index in dictionary.entries_index_sorted:
            entry = dictionary.entries[index]
            entry_elem = etree.SubElement(dictionary_elem, "entry")
            key_elem = etree.SubElement(entry_elem, "key")
            key_elem.text = entry.headword
            def_elem = etree.SubElement(entry_elem, "def")
            def_elem.text = entry.definition
        tree = etree.ElementTree(dictionary_elem)
        print_debug("Creating XML tree... done", args.debug)
        print_debug("Writing to file '%s'..." % (output_file_path), args.debug)
        tree.write(
            output_file_path,
            pretty_print=True,
            xml_declaration=True
        )
        print_debug("Writing to file '%s'... success" % (output_file_path), args.debug)
        return [output_file_path]
    except Exception:
        # a bare except would also swallow KeyboardInterrupt / SystemExit
        print_error("Writing to file '%s'... failure" % (output_file_path))
        return None
def write(dictionary, args, output_file_path):
    """Write ``dictionary`` as a zipped StarDict dictionary.

    The .ifo, .idx, .dict (optionally compressed with dictzip into
    .dict.dz) and, if the dictionary has synonyms, .syn files are created
    inside a temp directory and then zipped into ``output_file_path``.

    The process working directory is changed to the temp directory while
    the files are created; it is always restored, and the temp directory is
    always cleaned up (unless ``args.keep`` is set), even when an error
    occurs.

    :param dictionary: the dictionary to be written (it is sorted in place)
    :param args: parsed command line arguments
    :param output_file_path: path of the zip file to be created
    :return: ``[output_file_path]`` on success, ``None`` if dictzip cannot
             be run or the zip file cannot be written
    """
    # get absolute path before changing the working directory
    output_file_path_absolute = os.path.abspath(output_file_path)

    # create tmp directory and work inside it
    cwd = os.getcwd()
    tmp_path = create_temp_directory()
    print_debug("Working in temp dir '%s'" % (tmp_path), args.debug)
    os.chdir(tmp_path)
    try:
        result = _write_stardict_files(
            dictionary, args, output_file_path, output_file_path_absolute)
    finally:
        # always restore the working directory and handle the tmp directory
        os.chdir(cwd)
        if args.keep:
            print_info("Not deleting temp dir '%s'" % (tmp_path))
        else:
            delete_directory(tmp_path)
            print_debug("Deleted temp dir '%s'" % (tmp_path), args.debug)
    return result


def _write_stardict_files(dictionary, args, output_file_path, output_file_path_absolute):
    """Create the StarDict files in the current directory and zip them.

    Helper of ``write()``: it expects the current working directory to be
    the temp directory. Returns ``[output_file_path]`` or ``None``.
    """
    # get the basename and compute output file paths
    base = os.path.basename(output_file_path)
    if base.endswith(".zip"):
        base = base[:-4]
    ifo_file_path = base + ".ifo"
    idx_file_path = base + ".idx"
    dict_file_path = base + ".dict"
    dict_dz_file_path = base + ".dict.dz"
    syn_file_path = base + ".syn"

    # TODO by spec, the index should be sorted using the comparator
    # TODO stardict_strcmp() defined in the spec (it calls
    # TODO g_ascii_strcasecmp() and/or strcmp()), or with a user-defined
    # TODO collation function.
    #
    # g_ascii_strcasecmp() compares two strings ignoring the case of ASCII
    # letters only, regardless of the locale. Using Python's builtin lower()
    # and sort() by headword should be equivalent for UTF-8 encoded
    # dictionaries (and it is fast).
    dictionary.sort(by_headword=True, ignore_case=True)

    # write .idx and .dict files
    print_debug("Writing .idx and .dict files...", args.debug)
    current_offset = 0
    current_idx_size = 0
    with io.open(idx_file_path, "wb") as idx_file_obj:
        with io.open(dict_file_path, "wb") as dict_file_obj:
            for entry_index in dictionary.entries_index_sorted:
                entry = dictionary.entries[entry_index]
                headword_bytes = entry.headword.encode("utf-8")
                definition_bytes = entry.definition.encode("utf-8")
                definition_size = len(definition_bytes)
                # write .idx: NUL-terminated headword, then offset and size
                # as big-endian 32-bit unsigned integers (unsigned by spec;
                # the bytes are the same as the signed form below 2**31)
                idx_file_obj.write(headword_bytes)
                idx_file_obj.write(b"\0")
                idx_file_obj.write(struct.pack('>I', current_offset))
                idx_file_obj.write(struct.pack('>I', definition_size))
                current_idx_size += (len(headword_bytes) + 1 + 4 + 4)
                # write .dict
                dict_file_obj.write(definition_bytes)
                current_offset += definition_size
    print_debug("Writing .idx and .dict files... done", args.debug)

    # list files to compress
    files_to_compress = [ifo_file_path, idx_file_path]

    # write .syn file
    dict_syns_len = 0
    if dictionary.has_synonyms:
        if args.ignore_synonyms:
            print_debug("Dictionary has synonyms, but ignoring them", args.debug)
        else:
            print_debug("Dictionary has synonyms, writing .syn file...", args.debug)
            dict_syns = dictionary.get_synonyms()
            dict_syns_len = len(dict_syns)
            with io.open(syn_file_path, "wb") as syn_file_obj:
                for pair in dict_syns:
                    synonym_bytes = pair[0].encode("utf-8")
                    index = pair[1]
                    syn_file_obj.write(synonym_bytes)
                    syn_file_obj.write(b"\0")
                    syn_file_obj.write(struct.pack('>i', index))
            files_to_compress.append(syn_file_path)
            print_debug("Dictionary has synonyms, writing .syn file... done", args.debug)

    # compress .dict file
    if args.sd_no_dictzip:
        print_debug("Not compressing .dict file with dictzip", args.debug)
        files_to_compress.append(dict_file_path)
    else:
        dictzip_path = DICTZIP
        try:
            print_debug("Compressing .dict file with dictzip...", args.debug)
            if args.dictzip_path is None:
                print_info(" Running '%s' from $PATH" % DICTZIP)
            else:
                dictzip_path = args.dictzip_path
                print_info(" Running '%s' from '%s'" % (DICTZIP, dictzip_path))
            proc = subprocess.Popen(
                [dictzip_path, "-k", dict_file_path],
                stdout=subprocess.PIPE,
                stdin=subprocess.PIPE,
                stderr=subprocess.PIPE)
            proc.communicate()
            files_to_compress.append(dict_dz_file_path)
            print_debug("Compressing .dict file with dictzip... done", args.debug)
        except OSError:
            print_error(" Unable to run '%s' as '%s'" % (DICTZIP, dictzip_path))
            print_error(" Please make sure '%s':" % DICTZIP)
            print_error(" 1. is available on your $PATH or")
            print_error(" 2. specify its path with --dictzip-path or")
            print_error(" 3. specify --no-dictzip to avoid compressing the .dict file")
            return None

    # create ifo file
    ifo_lines = [
        u"StarDict's dict ifo file\n",
        u"version=2.4.2\n",
        u"wordcount=%d\n" % (len(dictionary)),
        u"idxfilesize=%d\n" % (current_idx_size),
        u"bookname=%s\n" % (args.title),
        u"date=%s\n" % (args.year),
        u"sametypesequence=m\n",
        u"description=%s\n" % (args.description),
        u"author=%s\n" % (args.author),
        u"email=%s\n" % (args.email),
        u"website=%s\n" % (args.website),
    ]
    if dict_syns_len > 0:
        ifo_lines.append(u"synwordcount=%d\n" % (dict_syns_len))
    with io.open(ifo_file_path, "wb") as ifo_file_obj:
        ifo_file_obj.write(u"".join(ifo_lines).encode("utf-8"))

    # create output zip file
    try:
        print_debug("Writing to file '%s'..." % (output_file_path_absolute), args.debug)
        with zipfile.ZipFile(output_file_path_absolute, "w", zipfile.ZIP_DEFLATED) as file_zip_obj:
            for file_to_compress in files_to_compress:
                file_to_compress = os.path.basename(file_to_compress)
                file_zip_obj.write(file_to_compress)
                print_debug("Written %s" % (file_to_compress), args.debug)
        print_debug("Writing to file '%s'... success" % (output_file_path_absolute), args.debug)
    except Exception:
        print_error("Writing to file '%s'... failure" % (output_file_path_absolute))
        # report the failure to the caller instead of returning the path of
        # an intermediate file living in the temp directory
        return None
    return [output_file_path]
def main():
    """Command line entry point.

    Parse the command line, read the input dictionary, apply the optional
    processing steps (custom input parser, sort before, merge definitions,
    flatten synonyms, sort after) and write the output dictionary.

    The process always terminates via ``sys.exit()``: with 0 on success (or
    after showing the help or the version), 8 if the input cannot be read,
    16 if the output cannot be written.
    """
    parser = argparse.ArgumentParser(
        usage=USAGE,
        description=DESCRIPTION,
        epilog=EPILOG,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    for param in COMMAND_LINE_PARAMETERS:
        # the short flag, when there is one, goes before the long one
        if param["short"] is None:
            flags = [param["long"]]
        else:
            flags = [param["short"], param["long"]]
        parser.add_argument(
            *flags,
            help=param["help"],
            action=param["action"],
            default=argparse.SUPPRESS)
    arguments = parser.parse_args()

    # no arguments: show help and exit
    if len(sys.argv) < 2:
        parser.print_help()
        sys.exit(0)

    # print version and exit
    if "version" in arguments:
        print_info("Penelope v%s" % (__version__))
        sys.exit(0)

    # check we have all the required arguments
    # if not, it will sys.exit() with some error code
    check_arguments(arguments)

    # set default values
    set_default_values(arguments)
    print_debug(
        u"Running with the command line arguments:\n%s" % (str(arguments)),
        arguments.debug)

    def sort_dictionary():
        # shared by the "sort before" and "sort after" steps; the sort
        # options are looked up only when a sort is actually requested
        dictionary.sort(
            arguments.sort_by_headword,
            arguments.sort_by_definition,
            arguments.sort_reverse,
            arguments.sort_ignore_case)

    # read raw dictionary
    print_info(u"Reading input file(s)...")
    dictionary = read_dictionary(arguments)
    if dictionary is None:
        print_error("Unable to read the input file(s)")
        sys.exit(8)
    print_info(u"Reading input file(s)... done")

    # apply custom input parser, if specified
    input_parser = None
    if arguments.input_parser is not None:
        input_parser = load_input_parser(arguments.input_parser)
    if input_parser is not None:
        print_info(u"Applying the specified input parser...")
        dictionary = input_parser.parse(dictionary, arguments)
        print_info(u"Applying the specified input parser... done")

    # sort dictionary before, if requested
    if arguments.sort_before:
        print_info(u"Sorting before...")
        sort_dictionary()
        print_info(u"Sorting before... done")

    # merge definitions, if requested
    if arguments.merge_definitions:
        print_info(u"Merging...")
        dictionary.merge_definitions(merge_separator=arguments.merge_separator)
        print_info(u"Merging... done")

    # flatten synonyms, if requested
    if arguments.flatten_synonyms:
        print_info(u"Flattening synonyms...")
        dictionary.flatten_synonyms()
        print_info(u"Flattening synonyms... done")

    # sort dictionary after, if requested
    if arguments.sort_after:
        print_info(u"Sorting after...")
        sort_dictionary()
        print_info(u"Sorting after... done")

    # output dictionary
    print_info(u"Writing output file(s)...")
    output_paths = write_dictionary(dictionary, arguments)
    if output_paths is None:
        print_error("Unable to write the output file(s)")
        sys.exit(16)
    print_info(u"Writing output file(s)... done")
    print_info(u"The following file(s) have been created:")
    for created_path in output_paths:
        print_info(u" %s" % created_path)
    sys.exit(0)
def read(dictionary, args, input_file_paths):
    """Read zipped StarDict dictionaries and add their entries to ``dictionary``.

    Each input file must be a zip archive containing an .ifo file, an .idx
    (or .idx.gz) file, a .dict (or .dict.dz / .dz) file and, optionally, a
    .syn file. The files are extracted into a temp directory, which is
    deleted afterwards unless ``args.keep`` is set.

    :param dictionary: the dictionary receiving the entries
    :param args: parsed command line arguments
    :param input_file_paths: list of paths of the StarDict zip files
    :return: ``dictionary``, or ``None`` if one of the files cannot be read
    """

    def find_files(entries):
        """Map the fixed keys ("d.ifo", "d.idx", ...) to the zip entry names.

        Return an empty dictionary (after printing an error) if a required
        file is missing.
        """
        found = {}
        for entry in entries:
            if entry.endswith(".ifo"):
                found["d.ifo"] = entry
                break
        if "d.ifo" not in found:
            print_error(
                "Cannot find .ifo file in the given StarDict file (see StarDict spec)"
            )
            return {}

        # remove .ifo extension
        base = found["d.ifo"][:-4]

        # attempt to find these ones
        tentative_idx = base + ".idx"
        tentative_idx_gz = base + ".idx.gz"
        tentative_dict = base + ".dict"
        tentative_dict_dz = base + ".dict.dz"
        tentative_dz = base + ".dz"
        if tentative_idx in entries:
            found["d.idx"] = tentative_idx
        if tentative_idx_gz in entries:
            found["d.idx.gz"] = tentative_idx_gz
        if not (("d.idx" in found) or ("d.idx.gz" in found)):
            print_error(
                "Cannot find .idx or .idx.gz file in the given StarDict file (see StarDict spec)"
            )
            return {}
        if tentative_dict in entries:
            found["d.dict"] = tentative_dict
        if tentative_dict_dz in entries:
            found["d.dict.dz"] = tentative_dict_dz
        if tentative_dz in entries:
            found["d.dz"] = tentative_dz
        if not (("d.dict" in found) or ("d.dict.dz" in found) or ("d.dz" in found)):
            print_error(
                "Cannot find .dict, .dict.dz, or .dz file in the given StarDict file (see StarDict spec)"
            )
            return {}

        # syn is optional
        tentative_syn = base + ".syn"
        if tentative_syn in entries:
            found["d.syn"] = tentative_syn
        return found

    def uncompress_file(compressed_path, tmp_path, key):
        """Gunzip ``compressed_path`` into ``tmp_path/key``; return the new path."""
        uncompressed_path = os.path.join(tmp_path, key)
        with io.open(uncompressed_path, "wb") as u_obj:
            with gzip.open(compressed_path, "rb") as c_obj:
                u_obj.write(c_obj.read())
        print_debug("Uncompressed %s" % (uncompressed_path), args.debug)
        return uncompressed_path

    def read_ifo(ifo_path, has_syn, args):
        """Parse and validate the .ifo file; return its values or ``None``."""
        ifo_dict = {}
        # the .ifo file is always UTF-8 encoded by spec
        with io.open(ifo_path, "rb") as ifo_obj:
            ifo_unicode = ifo_obj.read().decode("utf-8")
        for line in ifo_unicode.splitlines():
            # only the first "=" separates key and value
            key, sep, val = line.partition("=")
            if sep:
                ifo_dict[key] = val
        if "version" not in ifo_dict:
            print_error(
                "No 'version' found in the .ifo file (see StarDict spec)")
            return None
        if ifo_dict["version"] not in ["2.4.2", "3.0.0"]:
            print_error(
                "The .ifo file must have a 'version' value equal to '2.4.2' or '3.0.0' (see StarDict spec)"
            )
            return None
        required_keys = ["bookname", "wordcount", "idxfilesize"]
        if has_syn:
            required_keys.append("synwordcount")
        # TODO idxoffsetbits (version 3.0.0) is not used => not required
        for key in required_keys:
            if key not in ifo_dict:
                print_error(
                    "No '%s' found in the .ifo file (see StarDict spec)" % key)
                return None
        ifo_dict["wordcount"] = int(ifo_dict["wordcount"])
        ifo_dict["idxfilesize"] = int(ifo_dict["idxfilesize"])
        if has_syn:
            ifo_dict["synwordcount"] = int(ifo_dict["synwordcount"])
        if args.sd_ignore_sametypesequence:
            print_debug("Ignoring sametypesequence value", args.debug)
        else:
            # TODO limitation: we require sametypesequence to be present
            if "sametypesequence" not in ifo_dict:
                print_error(
                    "The .ifo file must have a 'sametypesequence' value (see README)."
                )
                return None
            # TODO limitation: we require sametypesequence to have a value in SAMETYPESEQUENCE_SUPPORTED_VALUES
            if ifo_dict["sametypesequence"] not in SAMETYPESEQUENCE_SUPPORTED_VALUES:
                print_error(
                    "The .ifo file must have a 'sametypesequence' value of %s (see README)."
                    % "|".join(SAMETYPESEQUENCE_SUPPORTED_VALUES))
                return None
        return ifo_dict

    def parse_archive(dictionary, args, input_file_path, tmp_path):
        """Extract the StarDict files into ``tmp_path`` and parse them.

        Return ``True`` on success, ``False`` if required files are missing.
        """
        # find .ifo, .idx, .dict[.dz] and .syn files inside the zip
        # and extract them to tmp_path
        extracted_files = {}
        with zipfile.ZipFile(input_file_path) as input_file_obj:
            found_files = find_files(input_file_obj.namelist())
            for key in found_files:
                entry = found_files[key]
                ext_file_path = os.path.join(tmp_path, key)
                with io.open(ext_file_path, "wb") as ext_file_obj:
                    zip_entry = input_file_obj.open(entry)
                    try:
                        ext_file_obj.write(zip_entry.read())
                    finally:
                        zip_entry.close()
                print_debug("Extracted %s" % (ext_file_path), args.debug)
                extracted_files[key] = ext_file_path

                # extract from compressed file, but only if ".idx" is not present as well
                if (key == "d.idx.gz") and ("d.idx" not in found_files):
                    extracted_files["d.idx"] = uncompress_file(
                        ext_file_path, tmp_path, "d.idx")

                # extract from compressed file, but only if ".dict" is not present as well
                if ((key == "d.dict.dz") or (key == "d.dz")) and ("d.dict" not in found_files):
                    extracted_files["d.dict"] = uncompress_file(
                        ext_file_path, tmp_path, "d.dict")

        if not found_files:
            # find_files() already printed which required file is missing;
            # fail cleanly instead of raising KeyError on "d.ifo" below
            return False

        # here we have d.ifo, d.idx and d.dict (all uncompressed) and possibly d.syn
        has_syn = "d.syn" in extracted_files
        if (has_syn) and (args.ignore_synonyms):
            has_syn = False
            print_debug(
                "Dictionary has synonyms, but ignoring them (--ignore-synonym)",
                args.debug)
        # NOTE(review): read_ifo() returns None when the .ifo file is
        # invalid, but the values are only logged here and parsing goes on
        # regardless -- confirm whether this leniency is intended.
        ifo_dict = read_ifo(extracted_files["d.ifo"], has_syn, args)
        print_debug("Read .ifo file with values:\n%s" % (str(ifo_dict)), args.debug)

        # read dict file
        with io.open(extracted_files["d.dict"], "rb") as dict_file_obj:
            dict_file_bytes = dict_file_obj.read()

        # read idx file: a sequence of NUL-terminated headwords, each
        # followed by offset and size (big-endian 32-bit unsigned by spec)
        with io.open(extracted_files["d.idx"], "rb") as idx_file_obj:
            byte_read = idx_file_obj.read(1)
            headword = b""
            while byte_read:
                if byte_read == b"\0":
                    # end of current word: read offset and size
                    offset_int = struct.unpack('>I', idx_file_obj.read(4))[0]
                    size_int = struct.unpack('>I', idx_file_obj.read(4))[0]
                    definition = dict_file_bytes[offset_int:(
                        offset_int + size_int)].decode(args.input_file_encoding)
                    headword = headword.decode("utf-8")
                    if args.ignore_case:
                        headword = headword.lower()
                    dictionary.add_entry(headword=headword, definition=definition)
                    headword = b""
                else:
                    # accumulate the bytes of the current headword
                    headword += byte_read
                byte_read = idx_file_obj.read(1)

        # read syn file, if present
        if has_syn:
            print_debug(
                "The input StarDict file contains a .syn file, parsing it...",
                args.debug)
            with io.open(extracted_files["d.syn"], "rb") as syn_file_obj:
                byte_read = syn_file_obj.read(1)
                synonym = b""
                while byte_read:
                    if byte_read == b"\0":
                        # end of current synonym: read index of original word
                        index_int = struct.unpack('>I', syn_file_obj.read(4))[0]
                        synonym = synonym.decode("utf-8")
                        if index_int < len(dictionary):
                            dictionary.add_synonym(synonym=synonym, headword_index=index_int)
                        else:
                            # emit a warning?
                            # (arguments in the order of the placeholders:
                            # they were swapped, which raised a TypeError)
                            print_debug(
                                "Synonym '%s' points to index %d >= len(dictionary), skipping it"
                                % (synonym, index_int), args.debug)
                        synonym = b""
                    else:
                        # accumulate the bytes of the current synonym
                        synonym += byte_read
                    byte_read = syn_file_obj.read(1)
            print_debug(
                "The input StarDict file contains a .syn file, parsing it... done",
                args.debug)
        else:
            print_debug("The input StarDict file does not contain a .syn file", args.debug)
        return True

    def read_single_file(dictionary, args, input_file_path):
        """Read one StarDict zip file; return ``True`` on success."""
        # create a tmp directory
        tmp_path = create_temp_directory()
        print_debug("Working in temp dir '%s'" % (tmp_path), args.debug)
        try:
            return parse_archive(dictionary, args, input_file_path, tmp_path)
        finally:
            # delete tmp directory, even if parsing failed
            if args.keep:
                print_info("Not deleting temp dir '%s'" % (tmp_path))
            else:
                delete_directory(tmp_path)
                print_debug("Deleted temp dir '%s'" % (tmp_path), args.debug)

    for input_file_path in input_file_paths:
        print_debug("Reading from file '%s'..." % (input_file_path), args.debug)
        result = read_single_file(dictionary, args, input_file_path)
        if result:
            print_debug(
                "Reading from file '%s'... success" % (input_file_path),
                args.debug)
        else:
            print_error("Reading from file '%s'... failed" % (input_file_path))
            return None
    return dictionary
def write(dictionary, args, output_file_path):
    """Write ``dictionary`` as a MOBI dictionary ebook.

    The dictionary is sorted by headword, its entries are grouped by prefix
    and added to a ``DictionaryEbook``, whose files are written
    (uncompressed) by ``mobi.write()``. Unless ``args.mobi_no_kindlegen`` is
    set, kindlegen is then run on the generated ``content.opf`` and the
    resulting ``content.mobi`` is copied to ``output_file_path``.

    :param dictionary: the dictionary to be written (it is sorted in place)
    :param args: parsed command line arguments
    :param output_file_path: path of the MOBI file to be created
    :return: ``[output_file_path]`` on success; a list holding the temp
             directory path if ``args.mobi_no_kindlegen`` is set; ``None``
             if kindlegen cannot be run
    """
    # result to be returned
    result = None

    # get absolute path
    output_file_path_absolute = os.path.abspath(output_file_path)

    # sort by headword, optionally ignoring case
    dictionary.sort(by_headword=True, ignore_case=args.sort_ignore_case)

    # create groups
    special_group, group_keys, group_dict = dictionary.group(
        prefix_function_path=args.group_by_prefix_function,
        prefix_length=int(args.group_by_prefix_length),
        merge_min_size=int(args.group_by_prefix_merge_min_size),
        merge_across_first=args.group_by_prefix_merge_across_first)
    # copy, so that the list returned by dictionary.group() is not mutated
    all_group_keys = list(group_keys)
    if special_group is not None:
        all_group_keys.append(u"SPECIAL")

    # create mobi object
    mobi = DictionaryEbook(ebook_format=DictionaryEbook.MOBI, args=args)

    # add groups
    for key in all_group_keys:
        if key == u"SPECIAL":
            group_entries = special_group
        else:
            group_entries = group_dict[key]
        mobi.add_group(key, group_entries)

    # create output file
    print_debug("Writing to file '%s'..." % (output_file_path_absolute), args.debug)
    mobi.write(output_file_path_absolute, compress=False)
    print_debug("Writing to file '%s'... done" % (output_file_path_absolute), args.debug)

    # run kindlegen
    tmp_path = mobi.get_tmp_path()
    if args.mobi_no_kindlegen:
        print_info("Not running kindlegen, the raw files are located in '%s'" % tmp_path)
        result = [tmp_path]
    else:
        kindlegen_path = KINDLEGEN
        try:
            print_debug("Creating .mobi file with kindlegen...", args.debug)
            opf_file_path_absolute = os.path.join(tmp_path, "OEBPS", "content.opf")
            mobi_file_path_relative = u"content.mobi"
            mobi_file_path_absolute = os.path.join(tmp_path, "OEBPS", mobi_file_path_relative)
            if args.kindlegen_path is None:
                print_info(" Running '%s' from $PATH" % KINDLEGEN)
            else:
                kindlegen_path = args.kindlegen_path
                print_info(" Running '%s' from '%s'" % (KINDLEGEN, kindlegen_path))
            proc = subprocess.Popen(
                [kindlegen_path, opf_file_path_absolute, "-o", mobi_file_path_relative],
                stdout=subprocess.PIPE,
                stdin=subprocess.PIPE,
                stderr=subprocess.PIPE)
            output = proc.communicate()
            if args.debug:
                # "replace": undecodable bytes in the kindlegen output must
                # not abort the conversion just because debug is enabled
                output_unicode = (output[0]).decode("utf-8", "replace")
                print_debug(output_unicode, args.debug)
            copy_file(mobi_file_path_absolute, output_file_path_absolute)
            result = [output_file_path]
            print_debug("Creating .mobi file with kindlegen... done", args.debug)
        except OSError:
            print_error(" Unable to run '%s' as '%s'" % (KINDLEGEN, kindlegen_path))
            print_error(" Please make sure '%s':" % KINDLEGEN)
            print_error(" 1. is available on your $PATH or")
            print_error(" 2. specify its path with --kindlegen-path")
            # no .mobi file was produced: do not report one to the caller
            result = None

    # delete tmp directory
    if args.keep:
        print_info("Not deleting temp dir '%s'" % (tmp_path))
    elif args.mobi_no_kindlegen:
        # the temp dir is the result just reported to the user: keep it
        pass
    else:
        mobi.delete()
        print_debug("Deleted temp dir '%s'" % (tmp_path), args.debug)

    return result
def read_single_dict(dictionary, args, single_dict):
    """
    Read one Bookeen dictionary and add its entries to ``dictionary``.

    ``single_dict`` is either a one-element sequence holding the path of a
    ``.install`` archive (a zip containing the ``.dict.idx`` and ``.dict``
    files), or a two-element sequence ``(idx_file_path, dict_file_path)``.

    The ``.dict`` file is itself a zip of chunk files; the ``.dict.idx`` file
    is an SQLite database whose ``T_DictIndex`` rows give, for each headword,
    the offset, size and chunk number of its definition.

    The temporary directory is removed (unless ``args.keep`` is set) on every
    exit path, including early returns and exceptions.

    :param dictionary: the dictionary object receiving the entries
    :param args: the parsed command line arguments
    :param single_dict: paths describing one input dictionary (see above)
    :returns: ``True`` on success, ``False`` if an input file does not exist
    """
    tmp_path = create_temp_directory()
    print_debug("Working in temp dir '%s'" % (tmp_path), args.debug)

    try:
        if len(single_dict) == 1:
            print_debug("Unzipping .install file...", args.debug)
            idx_file_path = os.path.join(tmp_path, "d.dict.idx")
            dict_file_path = os.path.join(tmp_path, "d.dict")
            zip_file_obj = zipfile.ZipFile(single_dict[0], "r")
            try:
                for entry in zip_file_obj.namelist():
                    # test ".dict.idx" first: such names do not end with ".dict"
                    if entry.endswith(".dict.idx"):
                        target_path = idx_file_path
                    elif entry.endswith(".dict"):
                        target_path = dict_file_path
                    else:
                        continue
                    entry_bytes = zip_file_obj.read(entry)
                    with open(target_path, "wb") as target_obj:
                        target_obj.write(entry_bytes)
            finally:
                zip_file_obj.close()
            print_debug("Unzipping .install file... done", args.debug)
        else:
            print_debug("Files .dict.idx and .dict already uncompressed...", args.debug)
            idx_file_path = single_dict[0]
            dict_file_path = single_dict[1]
            for file_path in [idx_file_path, dict_file_path]:
                if not os.path.exists(file_path):
                    print_error("File '%s' does not exist" % file_path)
                    # the finally clause below takes care of the temp dir
                    return False
            print_debug("Files .dict.idx and .dict already uncompressed... done", args.debug)

        # unzip .dict file into tmp_path (flattening any directory structure)
        print_debug("Unzipping .dict file...", args.debug)
        zip_file_obj = zipfile.ZipFile(dict_file_path, "r")
        try:
            for entry in zip_file_obj.namelist():
                if entry.endswith("/"):
                    # directory entry, nothing to extract
                    continue
                entry_bytes = zip_file_obj.read(entry)
                entry_file_path = os.path.join(tmp_path, os.path.basename(entry))
                with open(entry_file_path, "wb") as entry_file_obj:
                    entry_file_obj.write(entry_bytes)
        finally:
            zip_file_obj.close()
        print_debug("Unzipping .dict file... done", args.debug)

        # read .dict.idx
        print_debug("Reading .dict.idx file...", args.debug)
        sql_connection = sqlite3.connect(idx_file_path)
        try:
            sql_cursor = sql_connection.cursor()
            sql_cursor.execute("select * from T_DictIndex")
            index_data = sql_cursor.fetchall()
            sql_cursor.close()
        finally:
            sql_connection.close()
        chunk_index_to_entries = {}
        max_chunk_index = 1
        for index_entry in index_data:
            headword = index_entry[1]
            if args.ignore_case:
                headword = headword.lower()
            offset = index_entry[2]
            size = index_entry[3]
            chunk_index = index_entry[4]
            if chunk_index > max_chunk_index:
                max_chunk_index = chunk_index
            chunk_index_to_entries.setdefault(chunk_index, []).append([headword, offset, size])
        print_debug("Reading .dict.idx file... done", args.debug)

        # read c_* files
        print_debug("Reading c_* files...", args.debug)
        for chunk_index in range(1, max_chunk_index + 1):
            chunk_entries = chunk_index_to_entries.get(chunk_index)
            if not chunk_entries:
                # empty index or gap in the chunk numbering: nothing to read
                continue
            print_debug(" Reading c_%d file..." % (chunk_index), args.debug)
            chunk_file_path = os.path.join(tmp_path, "%s%d" % (CHUNK_FILE_PREFIX, chunk_index))
            with open(chunk_file_path, "rb") as chunk_file_obj:
                for headword, offset, size in chunk_entries:
                    chunk_file_obj.seek(offset)
                    definition_bytes = chunk_file_obj.read(size)
                    definition_unicode = definition_bytes.decode(args.input_file_encoding)
                    dictionary.add_entry(headword=headword, definition=definition_unicode)
            print_debug(" Reading c_%d file... done" % (chunk_index), args.debug)
        print_debug("Reading c_* files... done", args.debug)

        return True
    finally:
        # delete tmp directory
        if args.keep:
            print_info("Not deleting temp dir '%s'" % (tmp_path))
        else:
            delete_directory(tmp_path)
            print_debug("Deleted temp dir '%s'" % (tmp_path), args.debug)
def write(dictionary, args, output_file_path):
    """
    Write ``dictionary`` in Bookeen format (``.dict.idx`` + ``.dict``).

    The definitions are written, in sorted order, into chunk files which are
    then zipped into the ``.dict`` file; the ``.dict.idx`` file is an SQLite
    database (copied from ``EMPTY_FILE_PATH``) whose ``T_DictIndex`` table
    maps each headword (and, unless ``args.ignore_synonyms`` is set, each
    synonym) to the offset, size and chunk number of its definition.

    The work is done inside a temporary directory which becomes the current
    working directory for the duration of the call; the previous working
    directory is restored, and the temporary directory removed (unless
    ``args.keep`` is set), even if an error occurs.

    :param dictionary: the dictionary object to be written
    :param args: the parsed command line arguments
    :param output_file_path: the path of the output file
    :returns: ``[install_file_path]`` if ``args.bookeen_install_file`` is set,
              ``[idx_file_path, dict_file_path]`` otherwise
    """
    # result to be returned
    result = None

    # get absolute path
    output_file_path_absolute = os.path.abspath(output_file_path)

    # get absolute path for collation function file
    # (must be resolved before changing the working directory)
    bookeen_collation_function_path = None
    if args.bookeen_collation_function is not None:
        bookeen_collation_function_path = os.path.abspath(args.bookeen_collation_function)

    # create tmp directory
    cwd = os.getcwd()
    tmp_path = create_temp_directory()
    print_debug("Working in temp dir '%s'" % (tmp_path), args.debug)
    os.chdir(tmp_path)

    try:
        # get the basename
        base = os.path.basename(output_file_path)
        if base.endswith(".zip"):
            base = base[:-4]

        # copy empty.idx into tmp_path
        idx_file_path = base + u".dict.idx"
        dict_file_path = base + u".dict"
        copy_file(EMPTY_FILE_PATH, idx_file_path)

        # open index
        sql_connection = sqlite3.connect(idx_file_path)
        try:
            # install collation in the index
            collation_function = collate_function_default
            if bookeen_collation_function_path is not None:
                try:
                    # NOTE(review): the "imp" module was removed in Python 3.12;
                    # there, this falls back to the default collation function
                    collation_function = imp.load_source("", bookeen_collation_function_path).collate_function
                    print_debug("Using collation function from '%s'" % (bookeen_collation_function_path), args.debug)
                except Exception:
                    print_error("Unable to load collation function from '%s'. Using the default collation function instead." % (bookeen_collation_function_path))
            sql_connection.create_collation("IcuNoCase", collation_function)
            sql_connection.text_factory = str

            # get a cursor and delete any data from the index file
            sql_cursor = sql_connection.cursor()
            sql_cursor.execute("delete from T_DictIndex")

            # write c_* files: a new chunk file is started as soon as
            # the current one grows beyond CHUNK_SIZE bytes
            print_debug("Writing c_* files...", args.debug)
            files_to_compress = []
            current_offset = 0
            chunk_index = 1
            chunk_file_path = "%s%d" % (CHUNK_FILE_PREFIX, chunk_index)
            files_to_compress.append(chunk_file_path)
            chunk_file_obj = open(chunk_file_path, "wb")
            try:
                for entry_index in dictionary.entries_index_sorted:
                    entry = dictionary.entries[entry_index]
                    definition_bytes = entry.definition.encode("utf-8")
                    definition_size = len(definition_bytes)
                    chunk_file_obj.write(definition_bytes)

                    # insert headword into index file
                    sql_tuple = (0, entry.headword, current_offset, definition_size, chunk_index)
                    sql_cursor.execute("insert into T_DictIndex values (?,?,?,?,?)", sql_tuple)

                    # insert synonyms into index file
                    if not args.ignore_synonyms:
                        for synonym in entry.get_synonyms():
                            sql_tuple = (0, synonym[0], current_offset, definition_size, chunk_index)
                            sql_cursor.execute("insert into T_DictIndex values (?,?,?,?,?)", sql_tuple)

                    # update offset
                    current_offset += definition_size

                    # if we reached CHUNK_SIZE, open the next c_* file
                    if current_offset > CHUNK_SIZE:
                        chunk_file_obj.close()
                        chunk_index += 1
                        chunk_file_path = "%s%d" % (CHUNK_FILE_PREFIX, chunk_index)
                        files_to_compress.append(chunk_file_path)
                        chunk_file_obj = open(chunk_file_path, "wb")
                        current_offset = 0
            finally:
                chunk_file_obj.close()
            print_debug("Writing c_* files... done", args.debug)

            # compress
            print_debug("Compressing c_* files...", args.debug)
            with zipfile.ZipFile(dict_file_path, "w", zipfile.ZIP_DEFLATED) as file_zip_obj:
                for file_to_compress in files_to_compress:
                    file_zip_obj.write(os.path.basename(file_to_compress))
            print_debug("Compressing c_* files... done", args.debug)

            # update index metadata
            print_debug("Updating index metadata...", args.debug)
            header = HEADER % (args.language_from)
            metadata_updates = [
                ("update T_DictInfo set F_xhtmlHeader=?", header),
                ("update T_DictInfo set F_LangFrom=?", args.language_from),
                ("update T_DictInfo set F_LangTo=?", args.language_to),
                ("update T_DictInfo set F_Licence=?", args.license),
                ("update T_DictInfo set F_Copyright=?", args.copyright),
                ("update T_DictInfo set F_Title=?", args.title),
                ("update T_DictInfo set F_Description=?", args.description),
                ("update T_DictInfo set F_Year=?", args.year),
                # the meaning of the following is unknown
                ("update T_DictInfo set F_Alphabet=?", "Z"),
                ("update T_DictInfo set F_CollationLevel=?", "1"),
                ("update T_DictVersion set F_DictType=?", "stardict"),
                ("update T_DictVersion set F_Version=?", "11"),
            ]
            for statement, value in metadata_updates:
                sql_cursor.execute(statement, (value,))
            print_debug("Updating index metadata... done", args.debug)

            # compact and close: the pending changes must be committed first,
            # because SQLite refuses to VACUUM from within a transaction and
            # the sqlite3 module no longer commits implicitly before it
            sql_connection.commit()
            sql_cursor.execute("vacuum")
            sql_cursor.close()
        finally:
            sql_connection.close()

        # create .install file or copy .dict.idx and .dict into requested output directory
        parent_output_directory = os.path.split(output_file_path_absolute)[0]
        if args.bookeen_install_file:
            print_debug("Creating .install file...", args.debug)
            file_zip_path = os.path.join(parent_output_directory, base + u".install")
            with zipfile.ZipFile(file_zip_path, "w", zipfile.ZIP_DEFLATED) as file_zip_obj:
                for file_to_compress in [dict_file_path, idx_file_path]:
                    file_zip_obj.write(os.path.basename(file_to_compress))
            result = [file_zip_path]
            print_debug("Creating .install file... done", args.debug)
        else:
            print_debug("Copying .dict.idx and .dict files...", args.debug)
            dict_file_path_final = os.path.join(parent_output_directory, os.path.basename(dict_file_path))
            idx_file_path_final = os.path.join(parent_output_directory, os.path.basename(idx_file_path))
            copy_file(dict_file_path, dict_file_path_final)
            copy_file(idx_file_path, idx_file_path_final)
            result = [idx_file_path_final, dict_file_path_final]
            print_debug("Copying .dict.idx and .dict files... done", args.debug)
    finally:
        # restore the working directory and delete tmp directory
        os.chdir(cwd)
        if args.keep:
            print_info("Not deleting temp dir '%s'" % (tmp_path))
        else:
            delete_directory(tmp_path)
            print_debug("Deleted temp dir '%s'" % (tmp_path), args.debug)

    return result
def write(dictionary, args, output_file_path):
    """
    Write ``dictionary`` in Kobo format.

    The entries are sorted and grouped by prefix; each group is written to a
    gzip-compressed ``<key>.html`` file.  The headwords are stored in a MARISA
    trie (``WORDS_FILE_NAME``), built with the ``marisa_trie`` Python module
    if it can be imported, or with the ``MARISA_BUILD`` executable otherwise.
    All the files are finally zipped into the output file.

    The work is done inside a temporary directory which becomes the current
    working directory for the duration of the call; the previous working
    directory is restored, and the temporary directory removed (unless
    ``args.keep`` is set), even if an error occurs.

    :param dictionary: the dictionary object to be written
    :param args: the parsed command line arguments
    :param output_file_path: the path of the output file
    :returns: ``[output_file_path]`` on success, ``None`` if the trie could
              not be built or the output file could not be written
    """
    # result to be returned
    result = None

    # get absolute path
    output_file_path_absolute = os.path.abspath(output_file_path)

    # create tmp directory
    cwd = os.getcwd()
    tmp_path = create_temp_directory()
    print_debug("Working in temp dir '%s'" % (tmp_path), args.debug)
    os.chdir(tmp_path)

    try:
        # sort by headword
        dictionary.sort(by_headword=True)

        # group by prefix
        files_to_compress = []
        prefix_length = int(args.group_by_prefix_length)
        special_group, group_keys, group_dict = dictionary.group(
            prefix_function=get_prefix_kobo,
            prefix_length=prefix_length,
            merge_min_size=int(args.group_by_prefix_merge_min_size),
            merge_across_first=args.group_by_prefix_merge_across_first
        )
        if special_group is not None:
            # the special group is stored first, under a key made of "1"s
            special_group_key = u"1" * prefix_length
            group_dict[special_group_key] = special_group
            group_keys = [special_group_key] + group_keys

        # write files
        for key in group_keys:
            # write html file
            file_html_path = key + u".html"
            with io.open(file_html_path, "wb") as file_html_obj:
                file_html_obj.write(u"<?xml version=\"1.0\" encoding=\"utf-8\"?><html>".encode("utf-8"))
                for entry in group_dict[key]:
                    headword = entry.headword
                    definition = entry.definition
                    file_html_obj.write((u"<w><a name=\"%s\"/><div><b>%s</b><br/>%s</div></w>" % (headword, headword, definition)).encode("utf-8"))
                file_html_obj.write((u"</html>").encode("utf-8"))

            # compress in gz format
            file_gz_path = file_html_path + u".gz"
            with io.open(file_html_path, "rb") as file_html_obj:
                with gzip.open(file_gz_path, "wb") as file_gz_obj:
                    file_gz_obj.writelines(file_html_obj)

            # delete .html file
            delete_file(None, file_html_path)

            # rename .html.gz file into .html
            rename_file(file_gz_path, file_html_path)
            files_to_compress.append(file_html_path)

        # write words
        file_words_path = WORDS_FILE_NAME
        keys = sorted(dictionary.entries_index.keys())
        try:
            import marisa_trie
            trie = marisa_trie.Trie(keys)
            trie.save(file_words_path)
            result = [file_words_path]
        except ImportError:
            # call MARISA with subprocess
            print_info(" MARISA cannot be imported as Python module. You might want to install it with:")
            print_info(" $ [sudo] pip install marisa_trie")
            marisa_build_path = MARISA_BUILD
            if args.marisa_bin_path is None:
                print_info(" Running '%s' from $PATH" % MARISA_BUILD)
            else:
                marisa_build_path = os.path.join(args.marisa_bin_path, MARISA_BUILD)
                print_info(" Running '%s' from '%s'" % (MARISA_BUILD, args.marisa_bin_path))
            # the keys are fed to the executable on stdin, one per line
            query = (u"\n".join(keys) + u"\n").encode("utf-8")
            try:
                proc = subprocess.Popen(
                    [marisa_build_path, "-l", "-o", file_words_path],
                    stdout=subprocess.PIPE,
                    stdin=subprocess.PIPE,
                    stderr=subprocess.PIPE
                )
                # the output of the executable is not needed
                proc.communicate(input=query)
                result = [file_words_path]
            except OSError:
                print_error(" Unable to run '%s' as '%s'" % (MARISA_BUILD, marisa_build_path))
                print_error(" Please make sure '%s':" % MARISA_BUILD)
                print_error(" 1. is available on your $PATH or")
                print_error(" 2. specify its path with --marisa-bin-path or")
                print_error(" 3. install the marisa_trie Python module")
                result = None

        if result is not None:
            # add file_words_path to files to compress
            files_to_compress.append(file_words_path)
            # create output zip file
            try:
                print_debug("Writing to file '%s'..." % (output_file_path_absolute), args.debug)
                with zipfile.ZipFile(output_file_path_absolute, "w", zipfile.ZIP_DEFLATED) as file_zip_obj:
                    for file_to_compress in files_to_compress:
                        file_zip_obj.write(os.path.basename(file_to_compress))
                result = [output_file_path]
                print_debug("Writing to file '%s'... success" % (output_file_path_absolute), args.debug)
            except Exception:
                print_error("Writing to file '%s'... failure" % (output_file_path_absolute))
                # do not report the (temporary) words file as the output
                result = None
    finally:
        # restore the working directory and delete tmp directory
        os.chdir(cwd)
        if args.keep:
            print_info("Not deleting temp dir '%s'" % (tmp_path))
        else:
            delete_directory(tmp_path)
            print_debug("Deleted temp dir '%s'" % (tmp_path), args.debug)

    return result
def read_single_file(dictionary, args, input_file_path):
    """
    Read the headwords of one Kobo dictionary and add them to ``dictionary``.

    The MARISA trie stored as ``WORDS_FILE_NAME`` inside the zip file is copied
    to a temporary file and read with the ``marisa_trie`` Python module if it
    can be imported, or with the ``MARISA_REVERSE_LOOKUP`` executable
    otherwise.  Each headword is added with an empty definition; headwords are
    lowercased if ``args.ignore_case`` is set, whichever way the trie is read.

    The temporary file is deleted even if an error occurs.

    :param dictionary: the dictionary object receiving the entries
    :param args: the parsed command line arguments
    :param input_file_path: the path of the input zip file
    :returns: ``True`` if the index was read, ``False`` otherwise
    """
    # result flag
    result = False

    # create a tmp file
    tmp_handler, tmp_path = create_temp_file()

    try:
        # copy the index file from the zip to the tmp file
        input_file_obj = zipfile.ZipFile(input_file_path)
        try:
            words_bytes = input_file_obj.read(WORDS_FILE_NAME)
        finally:
            input_file_obj.close()
        with io.open(tmp_path, "wb") as tmp_file_obj:
            tmp_file_obj.write(words_bytes)

        # read index with MARISA
        try:
            # call MARISA with marisa_trie module
            import marisa_trie
            trie = marisa_trie.Trie()
            trie.load(tmp_path)
            for pair in trie.items():
                key = pair[0]
                if args.ignore_case:
                    key = key.lower()
                dictionary.add_entry(headword=key, definition=u"")
            result = True
        except ImportError:
            # call MARISA with subprocess
            print_info(" MARISA cannot be imported as Python module. You might want to install it with:")
            print_info(" $ [sudo] pip install marisa_trie")
            marisa_reverse_lookup_path = MARISA_REVERSE_LOOKUP
            if args.marisa_bin_path is None:
                print_info(" Running '%s' from $PATH" % MARISA_REVERSE_LOOKUP)
            else:
                marisa_reverse_lookup_path = os.path.join(args.marisa_bin_path, MARISA_REVERSE_LOOKUP)
                print_info(" Running '%s' from '%s'" % (MARISA_REVERSE_LOOKUP, args.marisa_bin_path))
            # the executable is asked for the ids 0 .. marisa_index_size - 1,
            # one per line on stdin
            query = (u"\n".join([str(x) for x in range(int(args.marisa_index_size))]) + u"\n").encode("utf-8")
            try:
                proc = subprocess.Popen(
                    [marisa_reverse_lookup_path, tmp_path],
                    stdout=subprocess.PIPE,
                    stdin=subprocess.PIPE,
                    stderr=subprocess.PIPE
                )
                stdout = proc.communicate(input=query)[0].decode("utf-8")
                for line in stdout.splitlines():
                    # only tab-separated lines are used; the second field is the key
                    array = line.split("\t")
                    if len(array) >= 2:
                        key = array[1]
                        if args.ignore_case:
                            key = key.lower()
                        dictionary.add_entry(headword=key, definition=u"")
                result = True
            except OSError:
                print_error(" Unable to run '%s' as '%s'" % (MARISA_REVERSE_LOOKUP, marisa_reverse_lookup_path))
                print_error(" Please make sure '%s':" % MARISA_REVERSE_LOOKUP)
                print_error(" 1. is available on your $PATH or")
                print_error(" 2. specify its path with --marisa-bin-path or")
                print_error(" 3. install the marisa_trie Python module")
        except Exception:
            # NOTE(review): unlike the other print_debug calls, this one does not
            # pass args.debug; kept as is, confirm against print_debug's signature
            print_debug("Reading from file '%s'... failed" % (input_file_path))
    finally:
        # delete the tmp file
        delete_file(tmp_handler, tmp_path)

    return result
def write(dictionary, args, output_file_path):
    """
    Write ``dictionary`` in Kobo format.

    The entries are sorted and grouped by prefix; each group is written to a
    gzip-compressed ``<key>.html`` file.  The headwords are stored in a MARISA
    trie (``WORDS_FILE_NAME``), built with the ``marisa_trie`` Python module
    if it can be imported, or with the ``MARISA_BUILD`` executable otherwise.
    All the files are finally zipped into the output file.

    The work is done inside a temporary directory which becomes the current
    working directory for the duration of the call; the previous working
    directory is restored, and the temporary directory removed (unless
    ``args.keep`` is set), even if an error occurs.

    :param dictionary: the dictionary object to be written
    :param args: the parsed command line arguments
    :param output_file_path: the path of the output file
    :returns: ``[output_file_path]`` on success, ``None`` if the trie could
              not be built or the output file could not be written
    """
    # result to be returned
    result = None

    # get absolute path
    output_file_path_absolute = os.path.abspath(output_file_path)

    # create tmp directory
    cwd = os.getcwd()
    tmp_path = create_temp_directory()
    print_debug("Working in temp dir '%s'" % (tmp_path), args.debug)
    os.chdir(tmp_path)

    try:
        # sort by headword
        dictionary.sort(by_headword=True)

        # group by prefix
        files_to_compress = []
        prefix_length = int(args.group_by_prefix_length)
        special_group, group_keys, group_dict = dictionary.group(
            prefix_function=get_prefix_kobo,
            prefix_length=prefix_length,
            merge_min_size=int(args.group_by_prefix_merge_min_size),
            merge_across_first=args.group_by_prefix_merge_across_first)
        if special_group is not None:
            # the special group is stored first, under a key made of "1"s
            special_group_key = u"1" * prefix_length
            group_dict[special_group_key] = special_group
            group_keys = [special_group_key] + group_keys

        # write files
        for key in group_keys:
            # write html file
            file_html_path = key + u".html"
            with io.open(file_html_path, "wb") as file_html_obj:
                file_html_obj.write(
                    u"<?xml version=\"1.0\" encoding=\"utf-8\"?><html>".encode(
                        "utf-8"))
                for entry in group_dict[key]:
                    headword = entry.headword
                    definition = entry.definition
                    file_html_obj.write(
                        (u"<w><a name=\"%s\"/><div><b>%s</b><br/>%s</div></w>" %
                         (headword, headword, definition)).encode("utf-8"))
                file_html_obj.write((u"</html>").encode("utf-8"))

            # compress in gz format
            file_gz_path = file_html_path + u".gz"
            with io.open(file_html_path, "rb") as file_html_obj:
                with gzip.open(file_gz_path, "wb") as file_gz_obj:
                    file_gz_obj.writelines(file_html_obj)

            # delete .html file
            delete_file(None, file_html_path)

            # rename .html.gz file into .html
            rename_file(file_gz_path, file_html_path)
            files_to_compress.append(file_html_path)

        # write words
        file_words_path = WORDS_FILE_NAME
        keys = sorted(dictionary.entries_index.keys())
        try:
            import marisa_trie
            trie = marisa_trie.Trie(keys)
            trie.save(file_words_path)
            result = [file_words_path]
        except ImportError:
            # call MARISA with subprocess
            print_info(
                " MARISA cannot be imported as Python module. You might want to install it with:"
            )
            print_info(" $ [sudo] pip install marisa_trie")
            marisa_build_path = MARISA_BUILD
            if args.marisa_bin_path is None:
                print_info(" Running '%s' from $PATH" % MARISA_BUILD)
            else:
                marisa_build_path = os.path.join(args.marisa_bin_path,
                                                 MARISA_BUILD)
                print_info(" Running '%s' from '%s'" %
                           (MARISA_BUILD, args.marisa_bin_path))
            # the keys are fed to the executable on stdin, one per line
            query = (u"\n".join(keys) + u"\n").encode("utf-8")
            try:
                proc = subprocess.Popen(
                    [marisa_build_path, "-l", "-o", file_words_path],
                    stdout=subprocess.PIPE,
                    stdin=subprocess.PIPE,
                    stderr=subprocess.PIPE)
                # the output of the executable is not needed
                proc.communicate(input=query)
                result = [file_words_path]
            except OSError:
                print_error(" Unable to run '%s' as '%s'" %
                            (MARISA_BUILD, marisa_build_path))
                print_error(" Please make sure '%s':" % MARISA_BUILD)
                print_error(" 1. is available on your $PATH or")
                print_error(" 2. specify its path with --marisa-bin-path or")
                print_error(" 3. install the marisa_trie Python module")
                result = None

        if result is not None:
            # add file_words_path to files to compress
            files_to_compress.append(file_words_path)
            # create output zip file
            try:
                print_debug(
                    "Writing to file '%s'..." % (output_file_path_absolute),
                    args.debug)
                with zipfile.ZipFile(output_file_path_absolute, "w",
                                     zipfile.ZIP_DEFLATED) as file_zip_obj:
                    for file_to_compress in files_to_compress:
                        file_zip_obj.write(os.path.basename(file_to_compress))
                result = [output_file_path]
                print_debug(
                    "Writing to file '%s'... success" %
                    (output_file_path_absolute), args.debug)
            except Exception:
                print_error("Writing to file '%s'... failure" %
                            (output_file_path_absolute))
                # do not report the (temporary) words file as the output
                result = None
    finally:
        # restore the working directory and delete tmp directory
        os.chdir(cwd)
        if args.keep:
            print_info("Not deleting temp dir '%s'" % (tmp_path))
        else:
            delete_directory(tmp_path)
            print_debug("Deleted temp dir '%s'" % (tmp_path), args.debug)

    return result
def main():
    """
    Entry point of the command line tool.

    Parse the command line, read the input dictionary, apply the optional
    processing steps (custom input parser, sorting, merging of definitions,
    flattening of synonyms) and write the output file(s).

    The process always terminates through ``sys.exit()``: with ``0`` on
    success (or after printing the help or the version), ``8`` if the input
    cannot be read, ``16`` if the output cannot be written.
    """
    parser = argparse.ArgumentParser(
        usage=USAGE,
        description=DESCRIPTION,
        epilog=EPILOG,
        formatter_class=argparse.RawDescriptionHelpFormatter
    )
    for param in COMMAND_LINE_PARAMETERS:
        # a parameter has a long flag and, optionally, a short one
        short_flag = param["short"]
        flags = [param["long"]] if short_flag is None else [short_flag, param["long"]]
        parser.add_argument(
            *flags,
            help=param["help"],
            action=param["action"],
            default=argparse.SUPPRESS
        )
    arguments = parser.parse_args()

    # without arguments there is nothing to do: show the help
    if len(sys.argv) < 2:
        parser.print_help()
        sys.exit(0)

    # version request
    if "version" in arguments:
        print_info("Penelope v%s" % (__version__))
        sys.exit(0)

    # validate the arguments (exits with an error code on failure),
    # then fill in the defaults for the missing ones
    check_arguments(arguments)
    set_default_values(arguments)
    print_debug(u"Running with the command line arguments:\n%s" % (str(arguments)), arguments.debug)

    # input
    print_info(u"Reading input file(s)...")
    dictionary = read_dictionary(arguments)
    if dictionary is None:
        print_error("Unable to read the input file(s)")
        sys.exit(8)
    print_info(u"Reading input file(s)... done")

    # optional custom input parser
    if arguments.input_parser is not None:
        input_parser = load_input_parser(arguments.input_parser)
        if input_parser is not None:
            print_info(u"Applying the specified input parser...")
            dictionary = input_parser.parse(dictionary, arguments)
            print_info(u"Applying the specified input parser... done")

    # optional sort, before merging/flattening
    if arguments.sort_before:
        print_info(u"Sorting before...")
        dictionary.sort(arguments.sort_by_headword, arguments.sort_by_definition,
                        arguments.sort_reverse, arguments.sort_ignore_case)
        print_info(u"Sorting before... done")

    # optional merge of the definitions
    if arguments.merge_definitions:
        print_info(u"Merging...")
        dictionary.merge_definitions(merge_separator=arguments.merge_separator)
        print_info(u"Merging... done")

    # optional flattening of the synonyms
    if arguments.flatten_synonyms:
        print_info(u"Flattening synonyms...")
        dictionary.flatten_synonyms()
        print_info(u"Flattening synonyms... done")

    # optional sort, after merging/flattening
    if arguments.sort_after:
        print_info(u"Sorting after...")
        dictionary.sort(arguments.sort_by_headword, arguments.sort_by_definition,
                        arguments.sort_reverse, arguments.sort_ignore_case)
        print_info(u"Sorting after... done")

    # output
    print_info(u"Writing output file(s)...")
    output_paths = write_dictionary(dictionary, arguments)
    if output_paths is None:
        print_error("Unable to write the output file(s)")
        sys.exit(16)
    print_info(u"Writing output file(s)... done")
    print_info(u"The following file(s) have been created:")
    for created_path in output_paths:
        print_info(u" %s" % created_path)

    sys.exit(0)
def read(dictionary, args, input_file_paths):
    """
    Read the given StarDict files (zip archives) and add their entries to ``dictionary``.

    :param dictionary: the dictionary object receiving the entries
    :param args: the parsed command line arguments
    :param input_file_paths: the paths of the input zip files
    :returns: ``dictionary``, or ``None`` as soon as one file cannot be read
    """

    def find_files(entries):
        """
        Locate the StarDict files among the names ``entries`` of a zip archive.

        Return a dict mapping ``"d.ifo"``, ``"d.idx"``/``"d.idx.gz"``,
        ``"d.dict"``/``"d.dict.dz"``/``"d.dz"`` and, if present, ``"d.syn"``
        to the corresponding entry name, or an empty dict if a mandatory
        file is missing.
        """
        found = {}
        for entry in entries:
            if entry.endswith(".ifo"):
                found["d.ifo"] = entry
                break
        if "d.ifo" not in found:
            print_error("Cannot find .ifo file in the given StarDict file (see StarDict spec)")
            return {}

        # remove .ifo extension
        base = found["d.ifo"][:-4]

        # attempt to find these ones
        tentative_idx = base + ".idx"
        tentative_idx_gz = base + ".idx.gz"
        tentative_dict = base + ".dict"
        tentative_dict_dz = base + ".dict.dz"
        tentative_dz = base + ".dz"
        if tentative_idx in entries:
            found["d.idx"] = tentative_idx
        if tentative_idx_gz in entries:
            found["d.idx.gz"] = tentative_idx_gz
        if not (("d.idx" in found) or ("d.idx.gz" in found)):
            print_error("Cannot find .idx or .idx.gz file in the given StarDict file (see StarDict spec)")
            return {}
        if tentative_dict in entries:
            found["d.dict"] = tentative_dict
        if tentative_dict_dz in entries:
            found["d.dict.dz"] = tentative_dict_dz
        if tentative_dz in entries:
            found["d.dz"] = tentative_dz
        if not (("d.dict" in found) or ("d.dict.dz" in found) or ("d.dz" in found)):
            print_error("Cannot find .dict, .dict.dz, or .dz file in the given StarDict file (see StarDict spec)")
            return {}

        # syn is optional
        tentative_syn = base + ".syn"
        if tentative_syn in entries:
            found["d.syn"] = tentative_syn
        return found

    def uncompress_file(compressed_path, tmp_path, key):
        """Gunzip ``compressed_path`` into ``tmp_path/key`` and return the path of the latter."""
        uncompressed_path = os.path.join(tmp_path, key)
        c_obj = gzip.open(compressed_path, "rb")
        try:
            with open(uncompressed_path, "wb") as u_obj:
                u_obj.write(c_obj.read())
        finally:
            c_obj.close()
        print_debug("Uncompressed %s" % (uncompressed_path), args.debug)
        return uncompressed_path

    def read_ifo(ifo_path, has_syn, args):
        """
        Parse and validate the ``.ifo`` file at ``ifo_path``.

        Return the dict of its ``key=value`` pairs (with the counters
        converted to ``int``), or ``None`` if a required key is missing or
        has an unsupported value.
        """
        ifo_dict = {}
        with open(ifo_path, "rb") as ifo_obj:
            ifo_bytes = ifo_obj.read()  # bytes
        ifo_unicode = ifo_bytes.decode("utf-8")  # unicode, always utf-8 by spec
        for line in ifo_unicode.splitlines():
            # everything after the first "=" belongs to the value
            key, separator, val = line.partition("=")
            if separator:
                ifo_dict[key] = val

        if "version" not in ifo_dict:
            print_error("No 'version' found in the .ifo file (see StarDict spec)")
            return None
        if ifo_dict["version"] not in ["2.4.2", "3.0.0"]:
            print_error("The .ifo file must have a 'version' value equal to '2.4.2' or '3.0.0' (see StarDict spec)")
            return None

        # TODO "idxoffsetbits" (version 3.0.0) is not used, hence not required
        required_keys = ["bookname", "wordcount", "idxfilesize"]
        if has_syn:
            required_keys.append("synwordcount")
        for key in required_keys:
            if key not in ifo_dict:
                print_error("No '%s' found in the .ifo file (see StarDict spec)" % key)
                return None

        ifo_dict["wordcount"] = int(ifo_dict["wordcount"])
        ifo_dict["idxfilesize"] = int(ifo_dict["idxfilesize"])
        if has_syn:
            ifo_dict["synwordcount"] = int(ifo_dict["synwordcount"])

        if args.sd_ignore_sametypesequence:
            print_debug("Ignoring sametypesequence value", args.debug)
        else:
            # TODO limitation: we require sametypesequence to be present
            if "sametypesequence" not in ifo_dict:
                print_error("The .ifo file must have a 'sametypesequence' value (see README).")
                return None
            # TODO limitation: we require sametypesequence to have a value in SAMETYPESEQUENCE_SUPPORTED_VALUES
            if ifo_dict["sametypesequence"] not in SAMETYPESEQUENCE_SUPPORTED_VALUES:
                print_error("The .ifo file must have a 'sametypesequence' value of %s (see README)." % "|".join(SAMETYPESEQUENCE_SUPPORTED_VALUES))
                return None

        return ifo_dict

    def read_single_file(dictionary, args, input_file_path):
        """
        Read one StarDict zip file and add its entries (and synonyms) to ``dictionary``.

        Return ``True`` on success, ``False`` if the mandatory StarDict files
        cannot be found in the archive.  The temporary directory is removed
        (unless ``args.keep`` is set) even if an error occurs.
        """
        # create a tmp directory
        tmp_path = create_temp_directory()
        print_debug("Working in temp dir '%s'" % (tmp_path), args.debug)

        try:
            # find .ifo, .idx, .dict[.dz] and .syn files inside the zip
            # and extract them to tmp_path
            extracted_files = {}
            input_file_obj = zipfile.ZipFile(input_file_path)
            try:
                found_files = find_files(input_file_obj.namelist())
                for key in found_files:
                    ext_file_path = os.path.join(tmp_path, key)
                    entry_bytes = input_file_obj.read(found_files[key])
                    with open(ext_file_path, "wb") as ext_file_obj:
                        ext_file_obj.write(entry_bytes)
                    print_debug("Extracted %s" % (ext_file_path), args.debug)
                    extracted_files[key] = ext_file_path

                    # extract from compressed file, but only if ".idx" is not present as well
                    if (key == "d.idx.gz") and ("d.idx" not in found_files):
                        extracted_files["d.idx"] = uncompress_file(ext_file_path, tmp_path, "d.idx")

                    # extract from compressed file, but only if ".dict" is not present as well
                    if ((key == "d.dict.dz") or (key == "d.dz")) and ("d.dict" not in found_files):
                        extracted_files["d.dict"] = uncompress_file(ext_file_path, tmp_path, "d.dict")
            finally:
                input_file_obj.close()

            if not found_files:
                # find_files() has already reported what is missing
                return False

            # here we have d.ifo, d.idx and d.dict (all uncompressed) and possibly d.syn
            has_syn = "d.syn" in extracted_files
            if (has_syn) and (args.ignore_synonyms):
                has_syn = False
                print_debug("Dictionary has synonyms, but ignoring them (--ignore-synonym)", args.debug)
            # NOTE(review): a None result (invalid .ifo) is reported by read_ifo()
            # but does not stop the reading; kept as is, confirm this is intended
            ifo_dict = read_ifo(extracted_files["d.ifo"], has_syn, args)
            print_debug("Read .ifo file with values:\n%s" % (str(ifo_dict)), args.debug)

            # read dict file
            with open(extracted_files["d.dict"], "rb") as dict_file_obj:
                dict_file_bytes = dict_file_obj.read()

            # read idx file: a sequence of records made of a NUL-terminated
            # headword followed by two 32-bit big-endian unsigned integers
            # (offset and size of the definition in the dict file)
            with open(extracted_files["d.idx"], "rb") as idx_file_obj:
                byte_read = idx_file_obj.read(1)
                headword = b""
                while byte_read:
                    if byte_read == b"\0":
                        # end of current word: read offset and size
                        offset_int = struct.unpack(">I", idx_file_obj.read(4))[0]
                        size_int = struct.unpack(">I", idx_file_obj.read(4))[0]
                        definition = dict_file_bytes[offset_int:offset_int + size_int].decode(args.input_file_encoding)
                        headword = headword.decode("utf-8")
                        if args.ignore_case:
                            headword = headword.lower()
                        dictionary.add_entry(headword=headword, definition=definition)
                        headword = b""
                    else:
                        # read next byte
                        headword += byte_read
                    byte_read = idx_file_obj.read(1)

            # read syn file, if present: a sequence of records made of a
            # NUL-terminated synonym followed by the 32-bit big-endian
            # unsigned index of the original word
            if has_syn:
                print_debug("The input StarDict file contains a .syn file, parsing it...", args.debug)
                with open(extracted_files["d.syn"], "rb") as syn_file_obj:
                    byte_read = syn_file_obj.read(1)
                    synonym = b""
                    while byte_read:
                        if byte_read == b"\0":
                            # end of current synonym: read index of original word
                            index_int = struct.unpack(">I", syn_file_obj.read(4))[0]
                            synonym = synonym.decode("utf-8")
                            if index_int < len(dictionary):
                                dictionary.add_synonym(synonym=synonym, headword_index=index_int)
                            else:
                                # emit a warning?
                                print_debug("Synonym '%s' points to index %d >= len(dictionary), skipping it" % (synonym, index_int), args.debug)
                            synonym = b""
                        else:
                            # read next byte
                            synonym += byte_read
                        byte_read = syn_file_obj.read(1)
                print_debug("The input StarDict file contains a .syn file, parsing it... done", args.debug)
            else:
                print_debug("The input StarDict file does not contain a .syn file", args.debug)

            return True
        finally:
            # delete tmp directory
            if args.keep:
                print_info("Not deleting temp dir '%s'" % (tmp_path))
            else:
                delete_directory(tmp_path)
                print_debug("Deleted temp dir '%s'" % (tmp_path), args.debug)

    for input_file_path in input_file_paths:
        print_debug("Reading from file '%s'..." % (input_file_path), args.debug)
        result = read_single_file(dictionary, args, input_file_path)
        if result:
            print_debug("Reading from file '%s'... success" % (input_file_path), args.debug)
        else:
            print_error("Reading from file '%s'... failed" % (input_file_path))
            return None
    return dictionary
def read_single_file(dictionary, args, input_file_path):
    """
    Read one zipped StarDict dictionary and add its contents to ``dictionary``.

    The .ifo, .idx, .dict[.dz] and (optional) .syn members of the archive
    are extracted into a temporary directory. The index is then parsed and
    every headword is added together with its definition; synonyms are
    added afterwards unless ``args.ignore_synonyms`` is set.

    :param dictionary: object receiving the ``add_entry()`` and
                       ``add_synonym()`` calls
    :param args: parsed options; reads ``debug``, ``keep``, ``ignore_case``,
                 ``ignore_synonyms`` and ``input_file_encoding``
    :param input_file_path: path of the zip archive to read
    :returns: ``True`` once the index (and the .syn file, when used) has
              been parsed; I/O and decoding errors propagate as exceptions
    """
    # result flag
    result = False

    # create a tmp directory
    tmp_path = create_temp_directory()
    print_debug("Working in temp dir '%s'" % (tmp_path), args.debug)

    try:
        # find .ifo, .idx, .dict[.dz] and .syn files inside the zip
        # and extract them to tmp_path
        extracted_files = {}
        input_file_obj = zipfile.ZipFile(input_file_path)
        try:
            found_files = find_files(input_file_obj.namelist())
            for key in found_files:
                entry = found_files[key]
                ext_file_path = os.path.join(tmp_path, key)
                with io.open(ext_file_path, "wb") as ext_file_obj:
                    ext_file_obj.write(input_file_obj.read(entry))
                print_debug("Extracted %s" % (ext_file_path), args.debug)
                extracted_files[key] = ext_file_path

                # extract from compressed file, but only if ".idx" is not present as well
                if (key == "d.idx.gz") and ("d.idx" not in found_files):
                    extracted_files["d.idx"] = uncompress_file(
                        ext_file_path, tmp_path, "d.idx")

                # extract from compressed file, but only if ".dict" is not present as well
                if (key in ("d.dict.dz", "d.dz")) and ("d.dict" not in found_files):
                    extracted_files["d.dict"] = uncompress_file(
                        ext_file_path, tmp_path, "d.dict")
        finally:
            input_file_obj.close()

        # here we have d.ifo, d.idx and d.dict (all uncompressed) and possibly d.syn
        has_syn = "d.syn" in extracted_files
        if (has_syn) and (args.ignore_synonyms):
            has_syn = False
            print_debug(
                "Dictionary has synonyms, but ignoring them (--ignore-synonym)",
                args.debug)

        # NOTE: the parsed .ifo values are only logged here; a None result
        # (invalid .ifo) does not stop the reading
        ifo_dict = read_ifo(extracted_files["d.ifo"], has_syn, args)
        print_debug("Read .ifo file with values:\n%s" % (str(ifo_dict)), args.debug)

        # read dict file
        with io.open(extracted_files["d.dict"], "rb") as dict_file_obj:
            dict_file_bytes = dict_file_obj.read()

        # read idx file: each record is a NUL-terminated headword followed
        # by two big-endian 32 bit integers (offset and size in the .dict)
        with io.open(extracted_files["d.idx"], "rb") as idx_file_obj:
            headword = b""
            byte_read = idx_file_obj.read(1)
            while byte_read:
                if byte_read == b"\0":
                    # end of current word: read offset and size
                    offset_int = struct.unpack('>i', idx_file_obj.read(4))[0]
                    size_int = struct.unpack('>i', idx_file_obj.read(4))[0]
                    definition = dict_file_bytes[offset_int:(
                        offset_int + size_int)].decode(args.input_file_encoding)
                    headword = headword.decode("utf-8")
                    if args.ignore_case:
                        headword = headword.lower()
                    dictionary.add_entry(headword=headword, definition=definition)
                    headword = b""
                else:
                    # accumulate the bytes of the current headword
                    headword += byte_read
                byte_read = idx_file_obj.read(1)
        result = True

        # read syn file, if present
        if has_syn:
            print_debug(
                "The input StarDict file contains a .syn file, parsing it...",
                args.debug)
            result = False
            with io.open(extracted_files["d.syn"], "rb") as syn_file_obj:
                synonym = b""
                byte_read = syn_file_obj.read(1)
                while byte_read:
                    if byte_read == b"\0":
                        # end of current synonym: read index of original word
                        index_int = struct.unpack('>i', syn_file_obj.read(4))[0]
                        synonym = synonym.decode("utf-8")
                        if index_int < len(dictionary):
                            dictionary.add_synonym(synonym=synonym,
                                                   headword_index=index_int)
                        else:
                            # the arguments must follow the order of the
                            # placeholders ('%s' synonym, '%d' index)
                            print_debug(
                                "Synonym '%s' points to index %d >= len(dictionary), skipping it"
                                % (synonym, index_int), args.debug)
                        synonym = b""
                    else:
                        # accumulate the bytes of the current synonym
                        synonym += byte_read
                    byte_read = syn_file_obj.read(1)
            result = True
            print_debug(
                "The input StarDict file contains a .syn file, parsing it... done",
                args.debug)
        else:
            print_debug("The input StarDict file does not contain a .syn file",
                        args.debug)
    finally:
        # delete tmp directory, also when reading failed
        if args.keep:
            print_info("Not deleting temp dir '%s'" % (tmp_path))
        else:
            delete_directory(tmp_path)
            print_debug("Deleted temp dir '%s'" % (tmp_path), args.debug)

    return result
def _read_with_marisa_reverse_lookup(dictionary, args, index_path):
    """
    Add the keys of the MARISA index at ``index_path`` to ``dictionary``
    by running the external ``MARISA_REVERSE_LOOKUP`` executable.

    :returns: ``True`` if the executable could be run, ``False`` if
              starting it raised ``OSError``
    """
    marisa_reverse_lookup_path = MARISA_REVERSE_LOOKUP
    if args.marisa_bin_path is None:
        print_info(" Running '%s' from $PATH" % MARISA_REVERSE_LOOKUP)
    else:
        marisa_reverse_lookup_path = os.path.join(
            args.marisa_bin_path, MARISA_REVERSE_LOOKUP)
        print_info(" Running '%s' from '%s'" %
                   (MARISA_REVERSE_LOOKUP, args.marisa_bin_path))

    # TODO this is ugly, but it works:
    # ask for every key id in [0, marisa_index_size), one id per line
    query = (u"\n".join(
        [str(x) for x in range(int(args.marisa_index_size))]) +
        u"\n").encode("utf-8")
    try:
        proc = subprocess.Popen([marisa_reverse_lookup_path, index_path],
                                stdout=subprocess.PIPE,
                                stdin=subprocess.PIPE,
                                stderr=subprocess.PIPE)
        stdout = proc.communicate(input=query)[0].decode("utf-8")
        # each useful output line is "<id>\t<key>"
        for line in stdout.splitlines():
            array = line.split("\t")
            if len(array) >= 2:
                key = array[1]
                if args.ignore_case:
                    key = key.lower()
                dictionary.add_entry(headword=key, definition=u"")
        return True
    except OSError:
        print_error(
            " Unable to run '%s' as '%s'" %
            (MARISA_REVERSE_LOOKUP, marisa_reverse_lookup_path))
        print_error(" Please make sure '%s':" % MARISA_REVERSE_LOOKUP)
        print_error(" 1. is available on your $PATH or")
        print_error(
            " 2. specify its path with --marisa-bin-path or")
        print_error(" 3. install the marisa_trie Python module")
    return False


def read_single_file(dictionary, args, input_file_path):
    """
    Read the headwords stored in the MARISA index of a zipped dictionary.

    The ``WORDS_FILE_NAME`` member of the archive is copied to a temporary
    file and loaded with the ``marisa_trie`` module; if the module cannot
    be imported, the external reverse lookup executable is used instead.
    Every key is added to ``dictionary`` with an empty definition.

    :param dictionary: object receiving the ``add_entry()`` calls
    :param args: parsed options; reads ``debug``, ``ignore_case``,
                 ``marisa_bin_path`` and ``marisa_index_size``
    :param input_file_path: path of the zip archive to read
    :returns: ``True`` if the index could be read, ``False`` otherwise
    """
    # result flag
    result = False

    # create a tmp file
    tmp_handler, tmp_path = create_temp_file()
    try:
        # copy the index file from the zip to the tmp file
        input_file_obj = zipfile.ZipFile(input_file_path)
        try:
            index_bytes = input_file_obj.read(WORDS_FILE_NAME)
        finally:
            input_file_obj.close()
        with io.open(tmp_path, "wb") as tmp_file_obj:
            tmp_file_obj.write(index_bytes)

        # read index with MARISA
        try:
            # call MARISA with marisa_trie module
            import marisa_trie
            trie = marisa_trie.Trie()
            trie.load(tmp_path)
            for pair in trie.items():
                headword = pair[0]
                # honor --ignore-case here too, as the subprocess fallback does
                if args.ignore_case:
                    headword = headword.lower()
                dictionary.add_entry(headword=headword, definition=u"")
            result = True
        except ImportError:
            # call MARISA with subprocess
            print_info(
                " MARISA cannot be imported as Python module. You might want to install it with:"
            )
            print_info(" $ [sudo] pip install marisa_trie")
            result = _read_with_marisa_reverse_lookup(dictionary, args, tmp_path)
        except Exception:
            # loading the index with marisa_trie failed: report it and
            # let the caller see result == False
            print_debug("Reading from file '%s'... failed" % (input_file_path),
                        args.debug)
    finally:
        # delete the tmp file, also when reading failed
        delete_file(tmp_handler, tmp_path)

    return result
def write(dictionary, args, output_file_path):
    """
    Write ``dictionary`` in Bookeen format (a ``.dict.idx`` SQLite index
    plus a ``.dict`` zip archive of ``c_*`` chunk files).

    The files are built inside a temporary directory, which becomes the
    working directory while the function runs. They are then either packed
    into a single ``.install`` archive (``args.bookeen_install_file``) or
    copied next to ``output_file_path``.

    :param dictionary: source dictionary; reads ``entries`` and
                       ``entries_index_sorted``
    :param args: parsed options; reads ``debug``, ``keep``,
                 ``ignore_synonyms``, ``bookeen_collation_function``,
                 ``bookeen_install_file`` and the metadata fields
                 (``language_from``, ``language_to``, ``license``,
                 ``copyright``, ``title``, ``description``, ``year``)
    :param output_file_path: requested output path; a trailing ``.zip``
                             is stripped to get the base name
    :returns: list with the path(s) of the created file(s)
    """
    # result to be returned
    result = None

    # get absolute paths before changing the working directory
    output_file_path_absolute = os.path.abspath(output_file_path)
    bookeen_collation_function_path = None
    if args.bookeen_collation_function is not None:
        bookeen_collation_function_path = os.path.abspath(args.bookeen_collation_function)

    # create tmp directory
    cwd = os.getcwd()
    tmp_path = create_temp_directory()
    print_debug("Working in temp dir '%s'" % (tmp_path), args.debug)
    os.chdir(tmp_path)

    try:
        # get the basename
        base = os.path.basename(output_file_path)
        if base.endswith(".zip"):
            base = base[:-4]

        # copy empty.idx into tmp_path
        idx_file_path = base + u".dict.idx"
        dict_file_path = base + u".dict"
        copy_file(EMPTY_FILE_PATH, idx_file_path)

        # open index
        sql_connection = sqlite3.connect(idx_file_path)
        try:
            # install collation in the index
            collation_function = collate_function_default
            if bookeen_collation_function_path is not None:
                try:
                    collation_function = imp.load_source(
                        "", bookeen_collation_function_path).collate_function
                    print_debug("Using collation function from '%s'" % (bookeen_collation_function_path), args.debug)
                except Exception:
                    print_error("Unable to load collation function from '%s'. Using the default collation function instead." % (bookeen_collation_function_path))
            sql_connection.create_collation("IcuNoCase", collation_function)
            sql_connection.text_factory = str

            # get a cursor and delete any data from the index file
            sql_cursor = sql_connection.cursor()
            sql_cursor.execute("delete from T_DictIndex")

            # write c_* files: a new chunk is started as soon as the
            # current one has grown past CHUNK_SIZE bytes
            print_debug("Writing c_* files...", args.debug)
            files_to_compress = []
            current_offset = 0
            chunk_index = 1
            chunk_file_path = "%s%d" % (CHUNK_FILE_PREFIX, chunk_index)
            files_to_compress.append(chunk_file_path)
            chunk_file_obj = io.open(chunk_file_path, "wb")
            try:
                for entry_index in dictionary.entries_index_sorted:
                    entry = dictionary.entries[entry_index]
                    definition_bytes = entry.definition.encode("utf-8")
                    definition_size = len(definition_bytes)
                    chunk_file_obj.write(definition_bytes)

                    # insert headword into index file
                    sql_tuple = (0, entry.headword, current_offset, definition_size, chunk_index)
                    sql_cursor.execute("insert into T_DictIndex values (?,?,?,?,?)", sql_tuple)

                    # insert synonyms into index file: they point to the
                    # same definition as their headword
                    if not args.ignore_synonyms:
                        for synonym in entry.get_synonyms():
                            sql_tuple = (0, synonym[0], current_offset, definition_size, chunk_index)
                            sql_cursor.execute("insert into T_DictIndex values (?,?,?,?,?)", sql_tuple)

                    # update offset
                    current_offset += definition_size

                    # if we reached CHUNK_SIZE, open the next c_* file
                    if current_offset > CHUNK_SIZE:
                        chunk_file_obj.close()
                        chunk_index += 1
                        chunk_file_path = "%s%d" % (CHUNK_FILE_PREFIX, chunk_index)
                        files_to_compress.append(chunk_file_path)
                        chunk_file_obj = io.open(chunk_file_path, "wb")
                        current_offset = 0
            finally:
                chunk_file_obj.close()
            print_debug("Writing c_* files... done", args.debug)

            # compress
            print_debug("Compressing c_* files...", args.debug)
            file_zip_obj = zipfile.ZipFile(dict_file_path, "w", zipfile.ZIP_DEFLATED)
            try:
                for file_to_compress in files_to_compress:
                    file_zip_obj.write(os.path.basename(file_to_compress))
            finally:
                file_zip_obj.close()
            print_debug("Compressing c_* files... done", args.debug)

            # update index metadata
            print_debug("Updating index metadata...", args.debug)
            header = HEADER % (args.language_from)
            sql_cursor.execute("update T_DictInfo set F_xhtmlHeader=?", (header,))
            sql_cursor.execute("update T_DictInfo set F_LangFrom=?", (args.language_from,))
            sql_cursor.execute("update T_DictInfo set F_LangTo=?", (args.language_to,))
            sql_cursor.execute("update T_DictInfo set F_Licence=?", (args.license,))
            sql_cursor.execute("update T_DictInfo set F_Copyright=?", (args.copyright,))
            sql_cursor.execute("update T_DictInfo set F_Title=?", (args.title,))
            sql_cursor.execute("update T_DictInfo set F_Description=?", (args.description,))
            sql_cursor.execute("update T_DictInfo set F_Year=?", (args.year,))
            # the meaning of the following is unknown
            sql_cursor.execute("update T_DictInfo set F_Alphabet=?", ("Z",))
            sql_cursor.execute("update T_DictInfo set F_CollationLevel=?", ("1",))
            sql_cursor.execute("update T_DictVersion set F_DictType=?", ("stardict",))
            sql_cursor.execute("update T_DictVersion set F_Version=?", ("11",))
            print_debug("Updating index metadata... done", args.debug)

            # commit explicitly: the statements above run inside an
            # implicit transaction, and VACUUM cannot be executed (nor
            # are the changes saved) while that transaction is open
            sql_connection.commit()

            # compact and close
            sql_cursor.execute("vacuum")
            sql_cursor.close()
        finally:
            sql_connection.close()

        # create .install file or copy .dict.idx and .dict into requested output directory
        parent_output_directory = os.path.split(output_file_path_absolute)[0]
        if args.bookeen_install_file:
            print_debug("Creating .install file...", args.debug)
            file_zip_path = os.path.join(parent_output_directory, base + u".install")
            file_zip_obj = zipfile.ZipFile(file_zip_path, "w", zipfile.ZIP_DEFLATED)
            try:
                for file_to_compress in [dict_file_path, idx_file_path]:
                    file_zip_obj.write(os.path.basename(file_to_compress))
            finally:
                file_zip_obj.close()
            result = [file_zip_path]
            print_debug("Creating .install file... done", args.debug)
        else:
            print_debug("Copying .dict.idx and .dict files...", args.debug)
            dict_file_path_final = os.path.join(parent_output_directory, os.path.basename(dict_file_path))
            idx_file_path_final = os.path.join(parent_output_directory, os.path.basename(idx_file_path))
            copy_file(dict_file_path, dict_file_path_final)
            copy_file(idx_file_path, idx_file_path_final)
            result = [idx_file_path_final, dict_file_path_final]
            print_debug("Copying .dict.idx and .dict files... done", args.debug)
    finally:
        # always restore the working directory, then delete tmp directory
        os.chdir(cwd)
        if args.keep:
            print_info("Not deleting temp dir '%s'" % (tmp_path))
        else:
            delete_directory(tmp_path)
            print_debug("Deleted temp dir '%s'" % (tmp_path), args.debug)

    return result
def write(dictionary, args, output_file_path):
    """
    Write ``dictionary`` as a MOBI dictionary.

    The entries are sorted by headword and grouped by prefix, the groups
    are added to a ``DictionaryEbook`` and its raw files are written out.
    Unless ``args.mobi_no_kindlegen`` is set, kindlegen is then run on the
    generated ``content.opf`` and the resulting ``content.mobi`` is copied
    to the requested output path.

    :param dictionary: source dictionary; ``sort()`` and ``group()`` are called on it
    :param args: parsed options; reads ``debug``, ``keep``, ``sort_ignore_case``,
                 the ``group_by_prefix_*`` values, ``mobi_no_kindlegen``
                 and ``kindlegen_path``
    :param output_file_path: path of the file to be created
    :returns: ``[output_file_path]``, or ``[tmp_path]`` (directory with the
              raw files) when kindlegen is not run
    """
    absolute_output_path = os.path.abspath(output_file_path)

    # sort by headword, optionally ignoring case
    dictionary.sort(by_headword=True, ignore_case=args.sort_ignore_case)

    # create groups; the special group, if any, is listed last
    special_group, group_keys, group_dict = dictionary.group(
        prefix_function_path=args.group_by_prefix_function,
        prefix_length=int(args.group_by_prefix_length),
        merge_min_size=int(args.group_by_prefix_merge_min_size),
        merge_across_first=args.group_by_prefix_merge_across_first
    )
    if special_group is not None:
        group_keys += [u"SPECIAL"]

    # create mobi object and add groups
    mobi = DictionaryEbook(ebook_format=DictionaryEbook.MOBI, args=args)
    for group_key in group_keys:
        members = special_group if group_key == u"SPECIAL" else group_dict[group_key]
        mobi.add_group(group_key, members)

    # create output file
    print_debug("Writing to file '%s'..." % (absolute_output_path), args.debug)
    mobi.write(absolute_output_path, compress=False)
    result = [output_file_path]
    print_debug("Writing to file '%s'... done" % (absolute_output_path), args.debug)

    # run kindlegen
    tmp_path = mobi.get_tmp_path()
    if args.mobi_no_kindlegen:
        print_info("Not running kindlegen, the raw files are located in '%s'" % tmp_path)
        result = [tmp_path]
    else:
        kindlegen_path = KINDLEGEN
        try:
            print_debug("Creating .mobi file with kindlegen...", args.debug)
            oebps_path = os.path.join(tmp_path, "OEBPS")
            opf_path = os.path.join(oebps_path, "content.opf")
            mobi_name = u"content.mobi"
            mobi_path = os.path.join(oebps_path, mobi_name)
            if args.kindlegen_path is not None:
                kindlegen_path = args.kindlegen_path
                print_info(" Running '%s' from '%s'" % (KINDLEGEN, kindlegen_path))
            else:
                print_info(" Running '%s' from $PATH" % KINDLEGEN)
            command = [kindlegen_path, opf_path, "-o", mobi_name]
            proc = subprocess.Popen(
                command,
                stdout=subprocess.PIPE,
                stdin=subprocess.PIPE,
                stderr=subprocess.PIPE
            )
            output = proc.communicate()
            if args.debug:
                print_debug((output[0]).decode("utf-8"), args.debug)
            copy_file(mobi_path, absolute_output_path)
            result = [output_file_path]
            print_debug("Creating .mobi file with kindlegen... done", args.debug)
        except OSError as exc:
            print_error(" Unable to run '%s' as '%s'" % (KINDLEGEN, kindlegen_path))
            print_error(" Please make sure '%s':" % KINDLEGEN)
            print_error(" 1. is available on your $PATH or")
            print_error(" 2. specify its path with --kindlegen-path")

    # delete tmp directory
    tmp_path = mobi.get_tmp_path()
    if not args.keep:
        mobi.delete()
        print_debug("Deleted temp dir '%s'" % (tmp_path), args.debug)
    else:
        print_info("Not deleting temp dir '%s'" % (tmp_path))

    return result
def read_single_dict(dictionary, args, single_dict):
    """
    Read one Bookeen dictionary and add its entries to ``dictionary``.

    ``single_dict`` holds either one path (an ``.install`` zip containing
    the ``.dict.idx`` and ``.dict`` files) or two paths (the ``.dict.idx``
    file and the ``.dict`` file, in this order). The ``.dict`` archive is
    unpacked into a temporary directory, the SQLite index is read and each
    definition is extracted from the ``c_*`` chunk file it points to.

    :param dictionary: object receiving the ``add_entry()`` calls
    :param args: parsed options; reads ``debug``, ``keep``, ``ignore_case``
                 and ``input_file_encoding``
    :param single_dict: list with one or two paths, see above
    :returns: ``True`` on success, ``False`` if one of the two given files
              does not exist
    """
    # create tmp directory
    tmp_path = create_temp_directory()
    print_debug("Working in temp dir '%s'" % (tmp_path), args.debug)

    try:
        if len(single_dict) == 1:
            print_debug("Unzipping .install file...", args.debug)
            zip_file_path = single_dict[0]
            idx_file_path = os.path.join(tmp_path, "d.dict.idx")
            dict_file_path = os.path.join(tmp_path, "d.dict")
            zip_file_obj = zipfile.ZipFile(zip_file_path, "r")
            try:
                for entry in zip_file_obj.namelist():
                    # test ".dict.idx" first: it is the more specific suffix
                    if entry.endswith(".dict.idx"):
                        target_path = idx_file_path
                    elif entry.endswith(".dict"):
                        target_path = dict_file_path
                    else:
                        continue
                    with io.open(target_path, "wb") as target_obj:
                        target_obj.write(zip_file_obj.read(entry))
            finally:
                zip_file_obj.close()
            print_debug("Unzipping .install file... done", args.debug)
        else:
            print_debug("Files .dict.idx and .dict already uncompressed...", args.debug)
            idx_file_path = single_dict[0]
            dict_file_path = single_dict[1]
            for file_path in [idx_file_path, dict_file_path]:
                if not os.path.exists(file_path):
                    print_error("File '%s' does not exist" % file_path)
                    # the finally clause below removes the tmp directory
                    return False
            print_debug("Files .dict.idx and .dict already uncompressed... done", args.debug)

        # unzip .dict file into tmp_path
        print_debug("Unzipping .dict file...", args.debug)
        zip_file_obj = zipfile.ZipFile(dict_file_path, "r")
        try:
            for entry in zip_file_obj.namelist():
                if not entry.endswith("/"):
                    entry_file_path = os.path.join(tmp_path, os.path.basename(entry))
                    with io.open(entry_file_path, "wb") as entry_file_obj:
                        entry_file_obj.write(zip_file_obj.read(entry))
        finally:
            zip_file_obj.close()
        print_debug("Unzipping .dict file... done", args.debug)

        # read .dict.idx
        print_debug("Reading .dict.idx file...", args.debug)
        sql_connection = sqlite3.connect(idx_file_path)
        try:
            sql_cursor = sql_connection.cursor()
            sql_cursor.execute("select * from T_DictIndex")
            index_data = sql_cursor.fetchall()
            sql_cursor.close()
        finally:
            sql_connection.close()

        # group the index rows by the chunk file they point to;
        # columns 1..4 of a row are: headword, offset, size, chunk index
        chunk_index_to_entries = {}
        max_chunk_index = 1
        for index_entry in index_data:
            headword = index_entry[1]
            if args.ignore_case:
                headword = headword.lower()
            offset = index_entry[2]
            size = index_entry[3]
            chunk_index = index_entry[4]
            if chunk_index not in chunk_index_to_entries:
                chunk_index_to_entries[chunk_index] = []
            if chunk_index > max_chunk_index:
                max_chunk_index = chunk_index
            chunk_index_to_entries[chunk_index].append([headword, offset, size])
        print_debug("Reading .dict.idx file... done", args.debug)

        # read c_* files
        print_debug("Reading c_* files...", args.debug)
        for chunk_index in range(1, max_chunk_index + 1):
            chunk_entries = chunk_index_to_entries.get(chunk_index)
            if not chunk_entries:
                # no index row points to this chunk (e.g. empty index):
                # there is nothing to read from it
                continue
            print_debug(" Reading c_%d file..." % (chunk_index), args.debug)
            chunk_file_path = os.path.join(tmp_path, "%s%d" % (CHUNK_FILE_PREFIX, chunk_index))
            with io.open(chunk_file_path, "rb") as chunk_file_obj:
                for headword, offset, size in chunk_entries:
                    chunk_file_obj.seek(offset)
                    definition_bytes = chunk_file_obj.read(size)
                    definition_unicode = definition_bytes.decode(args.input_file_encoding)
                    dictionary.add_entry(headword=headword, definition=definition_unicode)
            print_debug(" Reading c_%d file... done" % (chunk_index), args.debug)
        print_debug("Reading c_* files... done", args.debug)

        return True
    finally:
        # delete tmp directory, on every exit path
        if args.keep:
            print_info("Not deleting temp dir '%s'" % (tmp_path))
        else:
            delete_directory(tmp_path)
            print_debug("Deleted temp dir '%s'" % (tmp_path), args.debug)
def read_single_file(dictionary, args, input_file_path):
    """
    Read one zipped StarDict dictionary and add its contents to ``dictionary``.

    The .ifo, .idx, .dict[.dz] and (optional) .syn members of the archive
    are extracted into a temporary directory. The index is then parsed and
    every headword is added together with its definition; synonyms are
    added afterwards unless ``args.ignore_synonyms`` is set.

    :param dictionary: object receiving the ``add_entry()`` and
                       ``add_synonym()`` calls
    :param args: parsed options; reads ``debug``, ``keep``, ``ignore_case``,
                 ``ignore_synonyms`` and ``input_file_encoding``
    :param input_file_path: path of the zip archive to read
    :returns: ``True`` once the index (and the .syn file, when used) has
              been parsed; I/O and decoding errors propagate as exceptions
    """
    # result flag
    result = False

    # create a tmp directory
    tmp_path = create_temp_directory()
    print_debug("Working in temp dir '%s'" % (tmp_path), args.debug)

    try:
        # find .ifo, .idx, .dict[.dz] and .syn files inside the zip
        # and extract them to tmp_path
        extracted_files = {}
        input_file_obj = zipfile.ZipFile(input_file_path)
        try:
            found_files = find_files(input_file_obj.namelist())
            for key in found_files:
                entry = found_files[key]
                ext_file_path = os.path.join(tmp_path, key)
                with open(ext_file_path, "wb") as ext_file_obj:
                    ext_file_obj.write(input_file_obj.read(entry))
                print_debug("Extracted %s" % (ext_file_path), args.debug)
                extracted_files[key] = ext_file_path

                # extract from compressed file, but only if ".idx" is not present as well
                if (key == "d.idx.gz") and ("d.idx" not in found_files):
                    extracted_files["d.idx"] = uncompress_file(ext_file_path, tmp_path, "d.idx")

                # extract from compressed file, but only if ".dict" is not present as well
                if (key in ("d.dict.dz", "d.dz")) and ("d.dict" not in found_files):
                    extracted_files["d.dict"] = uncompress_file(ext_file_path, tmp_path, "d.dict")
        finally:
            input_file_obj.close()

        # here we have d.ifo, d.idx and d.dict (all uncompressed) and possibly d.syn
        has_syn = "d.syn" in extracted_files
        if (has_syn) and (args.ignore_synonyms):
            has_syn = False
            print_debug("Dictionary has synonyms, but ignoring them (--ignore-synonym)", args.debug)

        # NOTE: the parsed .ifo values are only logged here; a None result
        # (invalid .ifo) does not stop the reading
        ifo_dict = read_ifo(extracted_files["d.ifo"], has_syn, args)
        print_debug("Read .ifo file with values:\n%s" % (str(ifo_dict)), args.debug)

        # read dict file
        with open(extracted_files["d.dict"], "rb") as dict_file_obj:
            dict_file_bytes = dict_file_obj.read()

        # read idx file: each record is a NUL-terminated headword followed
        # by two big-endian 32 bit integers (offset and size in the .dict)
        with open(extracted_files["d.idx"], "rb") as idx_file_obj:
            headword = b""
            byte_read = idx_file_obj.read(1)
            while byte_read:
                if byte_read == b"\0":
                    # end of current word: read offset and size
                    offset_int = struct.unpack('>i', idx_file_obj.read(4))[0]
                    size_int = struct.unpack('>i', idx_file_obj.read(4))[0]
                    definition = dict_file_bytes[offset_int:offset_int+size_int].decode(args.input_file_encoding)
                    headword = headword.decode("utf-8")
                    if args.ignore_case:
                        headword = headword.lower()
                    dictionary.add_entry(headword=headword, definition=definition)
                    headword = b""
                else:
                    # accumulate the bytes of the current headword
                    headword += byte_read
                byte_read = idx_file_obj.read(1)
        result = True

        # read syn file, if present
        if has_syn:
            print_debug("The input StarDict file contains a .syn file, parsing it...", args.debug)
            result = False
            with open(extracted_files["d.syn"], "rb") as syn_file_obj:
                synonym = b""
                byte_read = syn_file_obj.read(1)
                while byte_read:
                    if byte_read == b"\0":
                        # end of current synonym: read index of original word
                        index_int = struct.unpack('>i', syn_file_obj.read(4))[0]
                        synonym = synonym.decode("utf-8")
                        if index_int < len(dictionary):
                            dictionary.add_synonym(synonym=synonym, headword_index=index_int)
                        else:
                            # the arguments must follow the order of the
                            # placeholders ('%s' synonym, '%d' index)
                            print_debug("Synonym '%s' points to index %d >= len(dictionary), skipping it" % (synonym, index_int), args.debug)
                        synonym = b""
                    else:
                        # accumulate the bytes of the current synonym
                        synonym += byte_read
                    byte_read = syn_file_obj.read(1)
            result = True
            print_debug("The input StarDict file contains a .syn file, parsing it... done", args.debug)
        else:
            print_debug("The input StarDict file does not contain a .syn file", args.debug)
    finally:
        # delete tmp directory, also when reading failed
        if args.keep:
            print_info("Not deleting temp dir '%s'" % (tmp_path))
        else:
            delete_directory(tmp_path)
            print_debug("Deleted temp dir '%s'" % (tmp_path), args.debug)

    return result
def write(dictionary, args, output_file_path):
    """
    Write ``dictionary`` as a zipped StarDict dictionary.

    The .idx, .dict, optional .syn and .ifo files are created inside a
    temporary directory, which becomes the working directory while the
    function runs. The .dict file is compressed with dictzip unless
    ``args.sd_no_dictzip`` is set, and the files are then stored in a zip
    archive at ``output_file_path``.

    :param dictionary: source dictionary; it is sorted by headword
                       (ignoring case) as a side effect
    :param args: parsed options; reads ``debug``, ``keep``,
                 ``ignore_synonyms``, ``sd_no_dictzip``, ``dictzip_path``
                 and the metadata fields (``title``, ``year``,
                 ``description``, ``author``, ``email``, ``website``)
    :param output_file_path: path of the zip archive to be created
    :returns: ``[output_file_path]`` on success, ``None`` if dictzip could
              not be run or the output archive could not be written
    """
    # result to be returned
    result = None

    # get absolute path before changing the working directory
    output_file_path_absolute = os.path.abspath(output_file_path)

    # create tmp directory
    cwd = os.getcwd()
    tmp_path = create_temp_directory()
    print_debug("Working in temp dir '%s'" % (tmp_path), args.debug)
    os.chdir(tmp_path)

    try:
        # get the basename and compute output file paths
        base = os.path.basename(output_file_path)
        if base.endswith(".zip"):
            base = base[:-4]
        ifo_file_path = base + ".ifo"
        idx_file_path = base + ".idx"
        dict_file_path = base + ".dict"
        dict_dz_file_path = base + ".dict.dz"
        syn_file_path = base + ".syn"

        # TODO by spec, the index should be sorted using the comparator
        # TODO stardict_strcmp() defined in the spec (it calls
        # TODO g_ascii_strcasecmp() and/or strcmp()), or with a
        # TODO user-defined collation function.
        #
        # g_ascii_strcasecmp() compares two strings ignoring the case of
        # ASCII letters only: it ignores the locale and treats all
        # non-ASCII bytes as if they were not letters.
        #
        # Using Python's builtin lower() and sort() by headword should be
        # equivalent for UTF-8 encoded dictionaries (and it is fast).
        dictionary.sort(by_headword=True, ignore_case=True)

        # write .idx and .dict files
        print_debug("Writing .idx and .dict files...", args.debug)
        current_offset = 0
        current_idx_size = 0
        with open(idx_file_path, "wb") as idx_file_obj:
            with open(dict_file_path, "wb") as dict_file_obj:
                for entry_index in dictionary.entries_index_sorted:
                    entry = dictionary.entries[entry_index]
                    headword_bytes = entry.headword.encode("utf-8")
                    definition_bytes = entry.definition.encode("utf-8")
                    definition_size = len(definition_bytes)
                    # write .idx: NUL-terminated headword, then offset and
                    # size of the definition as big-endian 32 bit integers
                    idx_file_obj.write(headword_bytes)
                    idx_file_obj.write(b"\0")
                    idx_file_obj.write(struct.pack('>i', current_offset))
                    idx_file_obj.write(struct.pack('>i', definition_size))
                    current_idx_size += (len(headword_bytes) + 1 + 4 + 4)
                    # write .dict
                    dict_file_obj.write(definition_bytes)
                    current_offset += definition_size
        print_debug("Writing .idx and .dict files... done", args.debug)

        # list files to compress
        files_to_compress = []
        files_to_compress.append(ifo_file_path)
        files_to_compress.append(idx_file_path)

        # write .syn file
        dict_syns_len = 0
        if dictionary.has_synonyms:
            if args.ignore_synonyms:
                print_debug("Dictionary has synonyms, but ignoring them", args.debug)
            else:
                print_debug("Dictionary has synonyms, writing .syn file...", args.debug)
                dict_syns = dictionary.get_synonyms()
                dict_syns_len = len(dict_syns)
                with open(syn_file_path, "wb") as syn_file_obj:
                    for pair in dict_syns:
                        # NUL-terminated synonym, then the index of the
                        # headword it refers to
                        syn_file_obj.write(pair[0].encode("utf-8"))
                        syn_file_obj.write(b"\0")
                        syn_file_obj.write(struct.pack('>i', pair[1]))
                files_to_compress.append(syn_file_path)
                print_debug("Dictionary has synonyms, writing .syn file... done", args.debug)

        # compress .dict file
        if args.sd_no_dictzip:
            print_debug("Not compressing .dict file with dictzip", args.debug)
            files_to_compress.append(dict_file_path)
            result = [dict_file_path]
        else:
            dictzip_path = DICTZIP
            try:
                print_debug("Compressing .dict file with dictzip...", args.debug)
                if args.dictzip_path is None:
                    print_info(" Running '%s' from $PATH" % DICTZIP)
                else:
                    dictzip_path = args.dictzip_path
                    print_info(" Running '%s' from '%s'" % (DICTZIP, dictzip_path))
                proc = subprocess.Popen(
                    [dictzip_path, "-k", dict_file_path],
                    stdout=subprocess.PIPE,
                    stdin=subprocess.PIPE,
                    stderr=subprocess.PIPE
                )
                proc.communicate()
                result = [dict_dz_file_path]
                files_to_compress.append(dict_dz_file_path)
                print_debug("Compressing .dict file with dictzip... done", args.debug)
            except OSError:
                print_error(" Unable to run '%s' as '%s'" % (DICTZIP, dictzip_path))
                print_error(" Please make sure '%s':" % DICTZIP)
                print_error(" 1. is available on your $PATH or")
                print_error(" 2. specify its path with --dictzip-path or")
                print_error(" 3. specify --no-dictzip to avoid compressing the .dict file")
                result = None

        if result is not None:
            # create ifo file
            ifo_lines = [
                u"StarDict's dict ifo file\n",
                u"version=2.4.2\n",
                u"wordcount=%d\n" % (len(dictionary)),
                u"idxfilesize=%d\n" % (current_idx_size),
                u"bookname=%s\n" % (args.title),
                u"date=%s\n" % (args.year),
                u"sametypesequence=m\n",
                u"description=%s\n" % (args.description),
                u"author=%s\n" % (args.author),
                u"email=%s\n" % (args.email),
                u"website=%s\n" % (args.website),
            ]
            if dict_syns_len > 0:
                ifo_lines.append(u"synwordcount=%d\n" % (dict_syns_len))
            with open(ifo_file_path, "wb") as ifo_file_obj:
                for ifo_line in ifo_lines:
                    ifo_file_obj.write(ifo_line.encode("utf-8"))

            # create output zip file
            try:
                print_debug("Writing to file '%s'..." % (output_file_path_absolute), args.debug)
                file_zip_obj = zipfile.ZipFile(output_file_path_absolute, "w", zipfile.ZIP_DEFLATED)
                try:
                    for file_to_compress in files_to_compress:
                        file_to_compress = os.path.basename(file_to_compress)
                        file_zip_obj.write(file_to_compress)
                        print_debug("Written %s" % (file_to_compress), args.debug)
                finally:
                    file_zip_obj.close()
                result = [output_file_path]
                print_debug("Writing to file '%s'... success" % (output_file_path_absolute), args.debug)
            except Exception:
                print_error("Writing to file '%s'... failure" % (output_file_path_absolute))
                # do not report the intermediate .dict[.dz] path, which
                # lives in the tmp directory, as if it were the output
                result = None
    finally:
        # always restore the working directory, then delete tmp directory
        os.chdir(cwd)
        if args.keep:
            print_info("Not deleting temp dir '%s'" % (tmp_path))
        else:
            delete_directory(tmp_path)
            print_debug("Deleted temp dir '%s'" % (tmp_path), args.debug)

    return result
def read_ifo(ifo_path, has_syn, args):
    """
    Parse and validate a StarDict .ifo file.

    Every ``key=value`` line of the file is stored in a dictionary; the
    values of ``wordcount``, ``idxfilesize`` and (if ``has_syn``)
    ``synwordcount`` are converted to ``int``.

    :param ifo_path: path of the .ifo file
    :param has_syn: if true, ``synwordcount`` is required as well
    :param args: parsed options; reads ``sd_ignore_sametypesequence``
                 and ``debug``
    :returns: the dictionary with the parsed values, or ``None`` (after
              printing an error) if a required key is missing or holds
              an unsupported value
    """
    # the file is always UTF-8 encoded by spec
    ifo_obj = io.open(ifo_path, "rb")
    raw_content = ifo_obj.read()
    content = raw_content.decode("utf-8")
    ifo_obj.close()

    # only the first "=" separates key and value
    ifo_dict = {}
    for line in content.splitlines():
        key, separator, val = line.partition("=")
        if separator:
            ifo_dict[key] = val

    if "version" not in ifo_dict:
        print_error("No 'version' found in the .ifo file (see StarDict spec)")
        return None
    if ifo_dict["version"] not in ("2.4.2", "3.0.0"):
        print_error("The .ifo file must have a 'version' value equal to '2.4.2' or '3.0.0' (see StarDict spec)")
        return None

    # TODO idxoffsetbits (version 3.0.0) is not used, hence not required
    numeric_keys = ["wordcount", "idxfilesize"]
    if has_syn:
        numeric_keys.append("synwordcount")
    missing_keys = [key for key in ["bookname"] + numeric_keys if key not in ifo_dict]
    if missing_keys:
        print_error("No '%s' found in the .ifo file (see StarDict spec)" % missing_keys[0])
        return None
    for key in numeric_keys:
        ifo_dict[key] = int(ifo_dict[key])

    if args.sd_ignore_sametypesequence:
        print_debug("Ignoring sametypesequence value", args.debug)
        return ifo_dict

    # TODO limitation: we require sametypesequence to be present
    if "sametypesequence" not in ifo_dict:
        print_error("The .ifo file must have a 'sametypesequence' value (see README).")
        return None

    # TODO limitation: we require sametypesequence to have a value in SAMETYPESEQUENCE_SUPPORTED_VALUES
    if ifo_dict["sametypesequence"] not in SAMETYPESEQUENCE_SUPPORTED_VALUES:
        print_error("The .ifo file must have a 'sametypesequence' value of %s (see README)." % "|".join(SAMETYPESEQUENCE_SUPPORTED_VALUES))
        return None

    return ifo_dict