Example #1
def search(queries_path, dictionary_path, postings_path, output_path):
    """ Searches dictionary and postings for patents that matches the queries """
    global patent_info, dictionary
    dictionary = read_dictionary(dictionary_path)
    patent_info = util.load_dictionary(PATENT_INFO_PATH)
    query = Query(queries_path, dictionary, patent_info)
    initial_ranked_docs = query.get_ranked_docs()
    expanded_query = ExpandedQuery(query, initial_ranked_docs, patent_info)
    result = ' '.join(expanded_query.get_ranked_docs())
    with codecs.open(output_path, 'w', encoding='utf-8') as o:
        o.write(result)
    return result
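A minimal usage sketch for the function above, assuming hypothetical file paths and that search is importable from the surrounding module:

# hypothetical paths; the real ones depend on the project layout
ranked = search('queries.txt', 'dictionary.txt', 'postings.txt', 'output.txt')
print(ranked.split()[:10])  # the result is a space-separated string of ranked patent ids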
Example #2
def idf():
    word_doc_freq = {}
    util.print_message('Start counting idf...', debug=True)
    tag_dict = util.load_dictionary(settings.DATA_PATH, settings.USER_DICT)
    file_names = util.get_file_list(settings.WORD_COUNT_FILE_PATH)
    for file_name in file_names:
        util.print_message('Processing all word count on {0}', arg=file_name)
        word_count_dict = util.file2dict(settings.WORD_COUNT_FILE_PATH, file_name)
        for key in word_count_dict:
            if key not in tag_dict:
                continue
            word_doc_freq[key] = word_doc_freq.get(key, 0) + 1
    util.save_sorted_dict(settings.DATA_PATH, settings.WDF_FILE, word_doc_freq)

    doc_number = len(file_names)
    inverse_doc_freq = {k: math.log(float(doc_number) / (1 + v))
                        for k, v in word_doc_freq.items()}

    util.save_sorted_dict(settings.DATA_PATH, settings.IDF_FILE, inverse_doc_freq)
    return inverse_doc_freq
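For reference, the dictionary comprehension above implements the smoothed formula idf(t) = log(N / (1 + df(t))), where N is the number of word-count files and df(t) is the number of files in which term t occurs. A tiny self-contained sketch with made-up counts:

import math

word_doc_freq = {'patent': 3, 'search': 1}  # hypothetical document frequencies
doc_number = 4                              # hypothetical number of files
inverse_doc_freq = {k: math.log(float(doc_number) / (1 + v))
                    for k, v in word_doc_freq.items()}
print(inverse_doc_freq)  # {'patent': 0.0, 'search': 0.693...}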
Example #3
def create_codes_by_fragment(codes, codeword_fragment, wordlist_directory):
    logger.info("Codeword fragment {}".format(codeword_fragment))
    logger.debug("Loading wordlist...")
    wordlist = util.load_dictionary(wordlist_directory)
    wordlist_sorted = [''.join(sorted(s)) for s in wordlist]
    logger.info("Wordlist size: {} words".format(len(wordlist)))

    codewords = word.find_codewords(codeword_fragment, wordlist)
    if len(codewords) == 0:
        return []
    elif len(codewords) > 10:
        logger.info("Possible codewords count: {}".format(len(codewords)))
    else:
        logger.info("Possible codewords: {}".format(", ".join(codewords)))

    logger.info("Creating codes based on codewords")
    process_count = min(len(codewords), os.cpu_count())
    input_queue = mp.Queue()
    output_queue = mp.Queue()
    worker_args = (codes, wordlist, wordlist_sorted, input_queue, output_queue)
    logger.info("Using {} subprocesses".format(process_count))
    processes = [
        mp.Process(target=worker, args=worker_args)
        for _ in range(process_count)
    ]

    for cw in codewords:
        input_queue.put(cw)
    for p in processes:
        p.start()
    results = []
    code_set = set()

    start_time = time.time()
    last_print = start_time
    start_words = len(codewords)
    while input_queue.qsize() > 0 or output_queue.qsize() > 0:
        try:
            result = output_queue.get(timeout=3)
            if result.code not in code_set:
                code_set.add(result.code)
                results.append(result)
        except queue.Empty:
            pass

        if time.time() - last_print > 3:
            last_print = time.time()
            words_left = input_queue.qsize()
            done_count = start_words - words_left
            uptime = max(1.0, time.time() - start_time)
            words_per_second = max(1.0, done_count / uptime)
            eta = round(words_left / words_per_second)
            if eta >= 60:
                time_left = "{:02}m {:02}s".format(round(eta / 60), eta % 60)
            else:
                time_left = "{:02}s".format(eta)
            logger.info(
                "Codewords left: {:6} | Rate: {:4} Words/s | ETA: {} | Codes found: {}"
                .format(words_left, round(words_per_second), time_left,
                        len(results)))
    logger.info("Finished code creation, waiting for subprocesses to stop...")
    for p in processes:
        p.join()
    logger.debug("All subprocesses stopped")
    return results
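The worker target passed to mp.Process above is not shown in this example. A hypothetical skeleton that matches how the queues are used might look like the following, where derive_codes is a made-up placeholder for the actual code-building logic:

import queue

def worker(codes, wordlist, wordlist_sorted, input_queue, output_queue):
    # Keep pulling codewords until the input queue stays empty, then return
    # so that p.join() in the parent process can complete.
    while True:
        try:
            codeword = input_queue.get(timeout=1)
        except queue.Empty:
            return
        for result in derive_codes(codeword, codes, wordlist, wordlist_sorted):  # hypothetical helper
            output_queue.put(result)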
Example #4
    print("model_dir: {}".format(model_dir))
    print("sent_file: {}".format(sent_file))
    print("conv_file: {}".format(conv_file))
    print("dict_file: {}".format(dict_file))
    print("word2vec_init: {}".format(word2vec_init))
    print("word2vec_model_file: {}".format(word2vec_model_file))
    print("min_freq: {}".format(min_freq))
    print("n_units: {}".format(n_units))
    print("epoch_size: {}".format(epoch_size))
    print("batch_size: {}".format(batch_size))
    print("dropout: {}".format(dropout))
    print("##############")

    # Dictionary: load it if it already exists, otherwise build and save it
    if os.path.exists(dict_file):
        dictionary = load_dictionary(dict_file)
    else:
        from util import create_dictionary
        dictionary = create_dictionary(
            [sent_file],
            min_freq=min_freq
        )
        dictionary.save(dict_file)

    # Prepare encoder RNN model
    dim = len(dictionary.keys())
    model_type = args.type
    if model_type == "relu":
        model = relu_rnn.Classifier(
            relu_rnn.ReLURNN(
                embed_dim=dim,
Example #5
    gpu_flag = args.gpu >= 0

    config_file = args.config_file
    parser_config = configparser.ConfigParser()
    parser_config.read(config_file)
    config = parser_config["CONFIG"]
    # config["SEPARATOR"] = bytes(
    #     config["DEFAULT"]["SEPARATOR"], "utf-8"
    # ).decode("unicode_escape")

    # params
    model_dir = config["model_dir"]
    n_units = int(config["n_units"])

    # load conversation sentences
    dictionary = load_dictionary(config["dict_file"])

    # Prepare encoder RNN model
    dim = len(dictionary.keys())
    model_type = args.type
    if model_type == "relu":
        import relu_rnn
        model = relu_rnn.Classifier(
            relu_rnn.ReLURNN(
                embed_dim=dim,
                n_units=int(config["n_units"]),
                gpu=args.gpu
            )
        )
    elif model_type == "lstm":
        import lstm
Example #6
    print("model_dir: {}".format(model_dir))
    print("sent_file: {}".format(sent_file))
    print("conv_file: {}".format(conv_file))
    print("dict_file: {}".format(dict_file))
    print("word2vec_init: {}".format(word2vec_init))
    print("word2vec_model_file: {}".format(word2vec_model_file))
    print("min_freq: {}".format(min_freq))
    print("n_units: {}".format(n_units))
    print("epoch_size: {}".format(epoch_size))
    print("batch_size: {}".format(batch_size))
    print("dropout: {}".format(dropout))
    print("##############")

    # Dictionary: load it if it already exists, otherwise build and save it
    if os.path.exists(dict_file):
        dictionary = load_dictionary(dict_file)
    else:
        from util import create_dictionary
        dictionary = create_dictionary([sent_file], min_freq=min_freq)
        dictionary.save(dict_file)

    # Prepare encoder RNN model
    dim = len(dictionary.keys())
    model_type = args.type
    if model_type == "relu":
        model = relu_rnn.Classifier(
            relu_rnn.ReLURNN(embed_dim=dim, n_units=n_units, gpu=args.gpu))
    elif model_type == "lstm":
        import lstm
        model = lstm.Classifier(
            lstm.LSTM(embed_dim=dim, n_units=n_units, gpu=args.gpu))