Example #1
0
    def read_single_file(dictionary, args, input_file_path):
        """
        Add to ``dictionary`` the headwords indexed in a single zipped file.

        The archive member named ``WORDS_FILE_NAME`` is copied from the zip
        file at ``input_file_path`` into a temporary file. That file is then
        read with the ``marisa_trie`` Python module or, when the module
        cannot be imported, by running the ``MARISA_REVERSE_LOOKUP``
        executable in a subprocess. Each headword is added with an empty
        definition, lowercased when ``args.ignore_case`` is set.

        Errors raised while opening the archive or extracting the index
        propagate to the caller; the temporary file is deleted in every case.

        :param dictionary: object exposing ``add_entry(headword, definition)``
        :param args: options object; ``ignore_case`` is always read, while
                     ``marisa_bin_path`` and ``marisa_index_size`` are read
                     only by the subprocess fallback
        :param input_file_path: path of the zip archive to read
        :return: ``True`` if the index was read, ``False`` otherwise
        """
        # result flag
        result = False

        # create a tmp file
        tmp_handler, tmp_path = create_temp_file()
        try:
            # copy the index file from the zip to the tmp file;
            # the context managers close both files even if reading fails
            with zipfile.ZipFile(input_file_path) as input_file_obj:
                index_data = input_file_obj.read(WORDS_FILE_NAME)
            with io.open(tmp_path, "wb") as tmp_file_obj:
                tmp_file_obj.write(index_data)

            # prefer the marisa_trie module, fall back to the executable
            try:
                import marisa_trie
            except ImportError:
                marisa_trie = None

            if marisa_trie is not None:
                # read index with the marisa_trie module
                try:
                    trie = marisa_trie.Trie()
                    trie.load(tmp_path)
                    for pair in trie.items():
                        key = pair[0]
                        # apply the same case folding as the subprocess path,
                        # so the result does not depend on which backend ran
                        if args.ignore_case:
                            key = key.lower()
                        dictionary.add_entry(headword=key, definition=u"")
                    result = True
                except Exception:
                    # best effort: report the failure and return False
                    print_debug("Reading from file '%s'... failed" % (input_file_path))
            else:
                # call MARISA with subprocess
                print_info("  MARISA cannot be imported as Python module. You might want to install it with:")
                print_info("  $ [sudo] pip install marisa_trie")
                marisa_reverse_lookup_path = MARISA_REVERSE_LOOKUP
                if args.marisa_bin_path is None:
                    print_info("  Running '%s' from $PATH" % MARISA_REVERSE_LOOKUP)
                else:
                    marisa_reverse_lookup_path = os.path.join(args.marisa_bin_path, MARISA_REVERSE_LOOKUP)
                    print_info("  Running '%s' from '%s'" % (MARISA_REVERSE_LOOKUP, args.marisa_bin_path))
                # TODO this is ugly, but it works:
                # feed every integer below marisa_index_size to the executable, one per line
                query = (u"\n".join([str(x) for x in range(int(args.marisa_index_size))]) + u"\n").encode("utf-8")

                try:
                    proc = subprocess.Popen(
                        [marisa_reverse_lookup_path, tmp_path],
                        stdout=subprocess.PIPE,
                        stdin=subprocess.PIPE,
                        stderr=subprocess.PIPE
                    )
                    stdout = proc.communicate(input=query)[0].decode("utf-8")
                    for line in stdout.splitlines():
                        # only tab-separated lines carry a key, in the second field
                        array = line.split("\t")
                        if len(array) >= 2:
                            key = array[1]
                            if args.ignore_case:
                                key = key.lower()
                            dictionary.add_entry(headword=key, definition=u"")
                    result = True
                except OSError:
                    print_error("  Unable to run '%s' as '%s'" % (MARISA_REVERSE_LOOKUP, marisa_reverse_lookup_path))
                    print_error("  Please make sure '%s':" % MARISA_REVERSE_LOOKUP)
                    print_error("    1. is available on your $PATH or")
                    print_error("    2. specify its path with --marisa-bin-path or")
                    print_error("    3. install the marisa_trie Python module")
        finally:
            # delete the tmp file, whatever happened above
            delete_file(tmp_handler, tmp_path)
        return result
Example #2
0
    def read_single_file(dictionary, args, input_file_path):
        """
        Read the headword index of one zipped input file into ``dictionary``.

        The ``WORDS_FILE_NAME`` member of the zip archive is extracted to a
        temporary file and loaded with the ``marisa_trie`` module. If that
        module cannot be imported, the ``MARISA_REVERSE_LOOKUP`` executable
        is run in a subprocess instead. Headwords are added with an empty
        definition and are lowercased when ``args.ignore_case`` is set.

        Failures while opening the archive or extracting the index are
        raised to the caller. The temporary file is always removed.

        :param dictionary: object exposing ``add_entry(headword, definition)``
        :param args: options object; reads ``ignore_case`` and, in the
                     subprocess fallback, ``marisa_bin_path`` and
                     ``marisa_index_size``
        :param input_file_path: path of the zip archive to read
        :return: ``True`` on success, ``False`` otherwise
        """
        # result flag
        result = False

        # create a tmp file
        tmp_handler, tmp_path = create_temp_file()

        try:
            # copy the index file from the zip to the tmp file
            # (both files are closed even when extraction fails)
            with zipfile.ZipFile(input_file_path) as input_file_obj:
                with io.open(tmp_path, "wb") as tmp_file_obj:
                    tmp_file_obj.write(input_file_obj.read(WORDS_FILE_NAME))

            # read index with MARISA
            try:
                # call MARISA with marisa_trie module
                import marisa_trie
                trie = marisa_trie.Trie()
                trie.load(tmp_path)
                for pair in trie.items():
                    key = pair[0]
                    # keep both backends consistent: the subprocess fallback
                    # below lowercases keys when ignore_case is set
                    if args.ignore_case:
                        key = key.lower()
                    dictionary.add_entry(headword=key, definition=u"")
                result = True
            except ImportError:
                # call MARISA with subprocess
                print_info(
                    "  MARISA cannot be imported as Python module. You might want to install it with:"
                )
                print_info("  $ [sudo] pip install marisa_trie")
                marisa_reverse_lookup_path = MARISA_REVERSE_LOOKUP
                if args.marisa_bin_path is None:
                    print_info("  Running '%s' from $PATH" %
                               MARISA_REVERSE_LOOKUP)
                else:
                    marisa_reverse_lookup_path = os.path.join(
                        args.marisa_bin_path, MARISA_REVERSE_LOOKUP)
                    print_info("  Running '%s' from '%s'" %
                               (MARISA_REVERSE_LOOKUP, args.marisa_bin_path))
                # TODO this is ugly, but it works: send every integer below
                # marisa_index_size to the executable, one per line
                query = (u"\n".join(
                    [str(x) for x in range(int(args.marisa_index_size))]) +
                         u"\n").encode("utf-8")

                try:
                    proc = subprocess.Popen(
                        [marisa_reverse_lookup_path, tmp_path],
                        stdout=subprocess.PIPE,
                        stdin=subprocess.PIPE,
                        stderr=subprocess.PIPE)
                    stdout = proc.communicate(input=query)[0].decode("utf-8")
                    for line in stdout.splitlines():
                        # the key is the second tab-separated field
                        array = line.split("\t")
                        if len(array) >= 2:
                            key = array[1]
                            if args.ignore_case:
                                key = key.lower()
                            dictionary.add_entry(headword=key,
                                                 definition=u"")
                    result = True
                except OSError:
                    print_error(
                        "  Unable to run '%s' as '%s'" %
                        (MARISA_REVERSE_LOOKUP, marisa_reverse_lookup_path))
                    print_error("  Please make sure '%s':" %
                                MARISA_REVERSE_LOOKUP)
                    print_error("    1. is available on your $PATH or")
                    print_error(
                        "    2. specify its path with --marisa-bin-path or")
                    print_error(
                        "    3. install the marisa_trie Python module")
            except Exception:
                # best effort: any other failure of the module-based read is
                # reported and turned into a False result
                print_debug("Reading from file '%s'... failed" %
                            (input_file_path))
        finally:
            # delete the tmp file, even if an exception is propagating
            delete_file(tmp_handler, tmp_path)

        return result
Example #3
0
def _write_group_file(file_html_path, entries):
    """
    Write ``entries`` to ``file_html_path`` as a gzip-compressed markup file.

    The markup is first written uncompressed, then gzip-compressed into a
    sibling ``.gz`` file which finally replaces the uncompressed one, so the
    resulting file keeps the ``.html`` name while holding gzip data.

    :param file_html_path: path of the file to create
    :param entries: iterable of objects with ``headword`` and ``definition``
    """
    file_gz_path = file_html_path + u".gz"

    # write html file
    with io.open(file_html_path, "wb") as file_html_obj:
        file_html_obj.write(u"<?xml version=\"1.0\" encoding=\"utf-8\"?><html>".encode("utf-8"))
        for entry in entries:
            headword = entry.headword
            definition = entry.definition
            file_html_obj.write((u"<w><a name=\"%s\"/><div><b>%s</b><br/>%s</div></w>" % (headword, headword, definition)).encode("utf-8"))
        file_html_obj.write(u"</html>".encode("utf-8"))

    # compress in gz format
    with io.open(file_html_path, "rb") as file_html_obj, gzip.open(file_gz_path, "wb") as file_gz_obj:
        file_gz_obj.writelines(file_html_obj)

    # delete .html file
    delete_file(None, file_html_path)
    # rename .html.gz file into .html
    rename_file(file_gz_path, file_html_path)


def _build_words_file(keys, args, file_words_path):
    """
    Build the MARISA index of ``keys`` and save it to ``file_words_path``.

    The ``marisa_trie`` Python module is used when it can be imported;
    otherwise the ``MARISA_BUILD`` executable is run in a subprocess, looked
    up in ``args.marisa_bin_path`` when given and on ``$PATH`` otherwise.

    :param keys: list of headwords to index
    :param args: options object; ``marisa_bin_path`` is read
    :param file_words_path: path of the index file to create
    :return: ``True`` if a builder was run, ``False`` if the executable
             could not be started
    """
    try:
        import marisa_trie
    except ImportError:
        marisa_trie = None

    if marisa_trie is not None:
        trie = marisa_trie.Trie(keys)
        trie.save(file_words_path)
        return True

    # call MARISA with subprocess
    print_info("  MARISA cannot be imported as Python module. You might want to install it with:")
    print_info("  $ [sudo] pip install marisa_trie")
    marisa_build_path = MARISA_BUILD
    if args.marisa_bin_path is None:
        print_info("  Running '%s' from $PATH" % MARISA_BUILD)
    else:
        marisa_build_path = os.path.join(args.marisa_bin_path, MARISA_BUILD)
        print_info("  Running '%s' from '%s'" % (MARISA_BUILD, args.marisa_bin_path))
    # the executable receives one key per line on its standard input
    query = (u"\n".join(keys) + u"\n").encode("utf-8")

    try:
        proc = subprocess.Popen(
            [marisa_build_path, "-l", "-o", file_words_path],
            stdout=subprocess.PIPE,
            stdin=subprocess.PIPE,
            stderr=subprocess.PIPE
        )
        # the output of the executable is not used
        proc.communicate(input=query)
        return True
    except OSError:
        print_error("  Unable to run '%s' as '%s'" % (MARISA_BUILD, marisa_build_path))
        print_error("  Please make sure '%s':" % MARISA_BUILD)
        print_error("    1. is available on your $PATH or")
        print_error("    2. specify its path with --marisa-bin-path or")
        print_error("    3. install the marisa_trie Python module")
        return False


def write(dictionary, args, output_file_path):
    """
    Write ``dictionary`` to ``output_file_path`` as a zip archive.

    The dictionary is sorted by headword and grouped by prefix. Each group
    is written to a gzip-compressed ``<key>.html`` file, and a MARISA index
    of the headwords is saved as ``WORDS_FILE_NAME``. All these files are
    created inside a temporary directory and then stored in the output zip
    archive.

    The working directory is changed to the temporary directory while the
    function runs. It is restored, and the temporary directory is deleted
    unless ``args.keep`` is set, even when an exception is raised.

    :param dictionary: the dictionary to write; ``sort``, ``group`` and
                       ``entries_index`` are used
    :param args: options object; ``debug``, ``keep``, ``marisa_bin_path``
                 and the ``group_by_prefix_*`` values are read
    :param output_file_path: path of the zip archive to create
    :return: ``[output_file_path]`` on success, ``None`` on failure
    """
    # result to be returned
    result = None

    # get absolute path now, since the working directory changes below
    output_file_path_absolute = os.path.abspath(output_file_path)

    # create tmp directory
    cwd = os.getcwd()
    tmp_path = create_temp_directory()
    print_debug("Working in temp dir '%s'" % (tmp_path), args.debug)
    os.chdir(tmp_path)

    try:
        # sort by headword
        dictionary.sort(by_headword=True)

        # group by prefix
        files_to_compress = []
        prefix_length = int(args.group_by_prefix_length)
        special_group, group_keys, group_dict = dictionary.group(
            prefix_function=get_prefix_kobo,
            prefix_length=prefix_length,
            merge_min_size=int(args.group_by_prefix_merge_min_size),
            merge_across_first=args.group_by_prefix_merge_across_first
        )
        if special_group is not None:
            # the special group goes first, under a key made of "1" characters
            special_group_key = u"1" * prefix_length
            group_dict[special_group_key] = special_group
            group_keys = [special_group_key] + group_keys

        # write files
        for key in group_keys:
            file_html_path = key + u".html"
            _write_group_file(file_html_path, group_dict[key])
            files_to_compress.append(file_html_path)

        # write words
        file_words_path = WORDS_FILE_NAME
        keys = sorted(dictionary.entries_index.keys())
        if _build_words_file(keys, args, file_words_path):
            # add file_words_path to files to compress
            files_to_compress.append(file_words_path)
            # create output zip file
            try:
                print_debug("Writing to file '%s'..." % (output_file_path_absolute), args.debug)
                with zipfile.ZipFile(output_file_path_absolute, "w", zipfile.ZIP_DEFLATED) as file_zip_obj:
                    for file_to_compress in files_to_compress:
                        file_zip_obj.write(os.path.basename(file_to_compress))
                # only a fully written archive counts as success
                result = [output_file_path]
                print_debug("Writing to file '%s'... success" % (output_file_path_absolute), args.debug)
            except Exception:
                print_error("Writing to file '%s'... failure" % (output_file_path_absolute))
                result = None
    finally:
        # restore the working directory and delete the tmp directory
        os.chdir(cwd)
        if args.keep:
            print_info("Not deleting temp dir '%s'" % (tmp_path))
        else:
            delete_directory(tmp_path)
            print_debug("Deleted temp dir '%s'" % (tmp_path), args.debug)

    return result
Example #4
0
def write(dictionary, args, output_file_path):
    """
    Write ``dictionary`` to ``output_file_path`` as a zip archive.

    Steps, all performed inside a temporary working directory:

    1. sort the dictionary by headword and group it by prefix;
    2. write each group to a ``<key>.html`` file holding gzip-compressed
       markup;
    3. save a MARISA index of the headwords as ``WORDS_FILE_NAME``, using
       the ``marisa_trie`` module or, if it cannot be imported, the
       ``MARISA_BUILD`` executable;
    4. store all these files in the output zip archive.

    The original working directory is restored, and the temporary directory
    is removed unless ``args.keep`` is set, even if an exception is raised.

    :param dictionary: the dictionary to write; ``sort``, ``group`` and
                       ``entries_index`` are used
    :param args: options object; ``debug``, ``keep``, ``marisa_bin_path``
                 and the ``group_by_prefix_*`` values are read
    :param output_file_path: path of the zip archive to create
    :return: ``[output_file_path]`` on success, ``None`` on failure
    """
    # result to be returned
    result = None

    # get absolute path before leaving the current working directory
    output_file_path_absolute = os.path.abspath(output_file_path)

    # create tmp directory
    cwd = os.getcwd()
    tmp_path = create_temp_directory()
    print_debug("Working in temp dir '%s'" % (tmp_path), args.debug)
    os.chdir(tmp_path)

    try:
        # sort by headword
        dictionary.sort(by_headword=True)

        # group by prefix
        files_to_compress = []
        prefix_length = int(args.group_by_prefix_length)
        special_group, group_keys, group_dict = dictionary.group(
            prefix_function=get_prefix_kobo,
            prefix_length=prefix_length,
            merge_min_size=int(args.group_by_prefix_merge_min_size),
            merge_across_first=args.group_by_prefix_merge_across_first)
        if special_group is not None:
            # the special group is written first, under a key of "1"s
            special_group_key = u"1" * prefix_length
            group_dict[special_group_key] = special_group
            group_keys = [special_group_key] + group_keys

        # write files
        for key in group_keys:
            # write html file
            file_html_path = key + u".html"
            with io.open(file_html_path, "wb") as file_html_obj:
                file_html_obj.write(
                    u"<?xml version=\"1.0\" encoding=\"utf-8\"?><html>".encode(
                        "utf-8"))
                for entry in group_dict[key]:
                    headword = entry.headword
                    definition = entry.definition
                    file_html_obj.write(
                        (u"<w><a name=\"%s\"/><div><b>%s</b><br/>%s</div></w>"
                         % (headword, headword, definition)).encode("utf-8"))
                file_html_obj.write(u"</html>".encode("utf-8"))

            # compress in gz format
            file_gz_path = file_html_path + u".gz"
            with io.open(file_html_path, "rb") as file_html_obj:
                with gzip.open(file_gz_path, "wb") as file_gz_obj:
                    file_gz_obj.writelines(file_html_obj)

            # delete .html file
            delete_file(None, file_html_path)
            # rename .html.gz file into .html
            rename_file(file_gz_path, file_html_path)
            files_to_compress.append(file_html_path)

        # write words
        file_words_path = WORDS_FILE_NAME
        keys = sorted(dictionary.entries_index.keys())
        words_written = False
        try:
            import marisa_trie
            trie = marisa_trie.Trie(keys)
            trie.save(file_words_path)
            words_written = True
        except ImportError:
            # call MARISA with subprocess
            print_info(
                "  MARISA cannot be imported as Python module. You might want to install it with:"
            )
            print_info("  $ [sudo] pip install marisa_trie")
            marisa_build_path = MARISA_BUILD
            if args.marisa_bin_path is None:
                print_info("  Running '%s' from $PATH" % MARISA_BUILD)
            else:
                marisa_build_path = os.path.join(args.marisa_bin_path,
                                                 MARISA_BUILD)
                print_info("  Running '%s' from '%s'" %
                           (MARISA_BUILD, args.marisa_bin_path))
            # the executable receives one key per line on its standard input
            query = (u"\n".join(keys) + u"\n").encode("utf-8")

            try:
                proc = subprocess.Popen(
                    [marisa_build_path, "-l", "-o", file_words_path],
                    stdout=subprocess.PIPE,
                    stdin=subprocess.PIPE,
                    stderr=subprocess.PIPE)
                # the output of the executable is not used
                proc.communicate(input=query)
                words_written = True
            except OSError:
                print_error("  Unable to run '%s' as '%s'" %
                            (MARISA_BUILD, marisa_build_path))
                print_error("  Please make sure '%s':" % MARISA_BUILD)
                print_error("    1. is available on your $PATH or")
                print_error(
                    "    2. specify its path with --marisa-bin-path or")
                print_error("    3. install the marisa_trie Python module")

        if words_written:
            # add file_words_path to files to compress
            files_to_compress.append(file_words_path)
            # create output zip file
            try:
                print_debug(
                    "Writing to file '%s'..." % (output_file_path_absolute),
                    args.debug)
                file_zip_obj = zipfile.ZipFile(output_file_path_absolute, "w",
                                               zipfile.ZIP_DEFLATED)
                try:
                    for file_to_compress in files_to_compress:
                        file_zip_obj.write(
                            os.path.basename(file_to_compress))
                finally:
                    file_zip_obj.close()
                # set the result only once the archive is complete, so a
                # failure is reported to the caller as None
                result = [output_file_path]
                print_debug(
                    "Writing to file '%s'... success" %
                    (output_file_path_absolute), args.debug)
            except Exception:
                print_error("Writing to file '%s'... failure" %
                            (output_file_path_absolute))
    finally:
        # restore the working directory and delete the tmp directory
        os.chdir(cwd)
        if args.keep:
            print_info("Not deleting temp dir '%s'" % (tmp_path))
        else:
            delete_directory(tmp_path)
            print_debug("Deleted temp dir '%s'" % (tmp_path), args.debug)

    return result