def search(queries_path, dictionary_path, postings_path, output_path):
    """Search the dictionary and postings for patents matching the queries.

    Performs an initial ranked retrieval, expands the query using those
    initial results, writes the space-separated ranked doc IDs to
    ``output_path`` (UTF-8) and returns that same string.
    """
    global patent_info, dictionary
    # Populate module-level state shared with the query classes.
    dictionary = read_dictionary(dictionary_path)
    patent_info = util.load_dictionary(PATENT_INFO_PATH)
    # First pass: rank documents for the raw query.
    base_query = Query(queries_path, dictionary, patent_info)
    # Second pass: expand the query from the initial ranking and re-rank.
    expanded = ExpandedQuery(base_query, base_query.get_ranked_docs(), patent_info)
    result = ' '.join(expanded.get_ranked_docs())
    with codecs.open(output_path, 'w', encoding='utf-8') as out_file:
        out_file.write(result)
    return result
def idf():
    """Compute the inverse document frequency of every tagged word.

    Counts, across all word-count files, in how many documents each word
    from the user dictionary appears, saves both the raw document
    frequencies and the IDF values via ``util.save_sorted_dict``, and
    returns the IDF mapping ``{word: log(N / (1 + df))}``.
    """
    word_doc_freq = {}
    util.print_message('Start counting idf...', debug=True)
    tag_dict = util.load_dictionary(settings.DATA_PATH, settings.USER_DICT)
    file_names = util.get_file_list(settings.WORD_COUNT_FILE_PATH)
    for file_name in file_names:
        util.print_message('Processing all word count on {0}', arg=file_name)
        word_count_dict = util.file2dict(settings.WORD_COUNT_FILE_PATH, file_name)
        # FIX: the original used dict.iterkeys()/dict.has_key(), which do not
        # exist in Python 3; plain iteration and `in` behave identically on
        # Python 2 and 3.
        for key in word_count_dict:
            if key not in tag_dict:
                continue
            word_doc_freq[key] = word_doc_freq.get(key, 0) + 1
    util.save_sorted_dict(settings.DATA_PATH, settings.WDF_FILE, word_doc_freq)
    doc_number = len(file_names)
    # The +1 in the denominator smooths the ratio and avoids log(inf) /
    # division issues for very rare words.
    inverse_doc_freq = {k: math.log(float(doc_number) / (1 + v))
                        for k, v in word_doc_freq.items()}
    util.save_sorted_dict(settings.DATA_PATH, settings.IDF_FILE, inverse_doc_freq)
    return inverse_doc_freq
def create_codes_by_fragment(codes, codeword_fragment, wordlist_directory):
    """Create codes for every codeword that matches ``codeword_fragment``.

    Loads the wordlist, finds candidate codewords, then fans the work out
    to worker subprocesses (one per codeword, capped at the CPU count),
    collecting de-duplicated results from an output queue while logging
    periodic progress/ETA lines.

    Returns a list of unique result objects (empty if no codeword matches).
    """
    logger.info("Codeword fragment {}".format(codeword_fragment))
    logger.debug("Loading wordlist...")
    wordlist = util.load_dictionary(wordlist_directory)
    # Pre-sorted letter signatures so workers can anagram-match quickly.
    wordlist_sorted = [''.join(sorted(s)) for s in wordlist]
    logger.info("Wordlist size: {} words".format(len(wordlist)))
    codewords = word.find_codewords(codeword_fragment, wordlist)
    if len(codewords) == 0:
        return []
    elif len(codewords) > 10:
        logger.info("Possible codewords count: {}".format(len(codewords)))
    else:
        logger.info("Possible codewords: {}".format(", ".join(codewords)))
    logger.info("Creating codes based on codewords")
    # FIX: os.cpu_count() may return None; fall back to one worker then.
    process_count = min(len(codewords), os.cpu_count() or 1)
    input_queue = mp.Queue()
    output_queue = mp.Queue()
    worker_args = (codes, wordlist, wordlist_sorted, input_queue, output_queue)
    logger.info("Using {} subprocesses".format(process_count))
    processes = [
        mp.Process(target=worker, args=worker_args)
        for _ in range(process_count)
    ]
    # Plain loops instead of side-effect list comprehensions.
    for codeword in codewords:
        input_queue.put(codeword)
    for process in processes:
        process.start()
    results = []
    code_set = set()
    start_time = time.time()
    last_print = start_time
    start_words = len(codewords)
    # NOTE(review): Queue.qsize() is approximate by documentation; the loop
    # can in principle exit while a worker still holds an item in flight.
    # Behaviour kept from the original implementation.
    while input_queue.qsize() > 0 or output_queue.qsize() > 0:
        try:
            result = output_queue.get(timeout=3)
            if result.code not in code_set:
                code_set.add(result.code)
                results.append(result)
        except queue.Empty:
            pass
        if time.time() - last_print > 3:
            last_print = time.time()
            words_left = input_queue.qsize()
            done_count = start_words - words_left
            uptime = max(1.0, time.time() - start_time)
            words_per_second = max(1.0, done_count / uptime)
            eta = round(words_left / words_per_second)
            if eta >= 60:
                # FIX: use floor division for whole minutes. round(eta / 60)
                # overstated the minutes whenever eta % 60 >= 30
                # (e.g. 119 s printed as "02m 59s" instead of "01m 59s").
                time_left = "{:02}m {:02}s".format(eta // 60, eta % 60)
            else:
                time_left = "{:02}s".format(eta)
            logger.info(
                "Codewords left: {:6} | Rate: {:4} Words/s | ETA: {} | Codes found: {}"
                .format(words_left, round(words_per_second), time_left,
                        len(results)))
    logger.info("Finished code creation, waiting for subprocesses to stop...")
    for process in processes:
        process.join()
    logger.debug("All subprocesses stopped")
    return results
# Dump the effective training configuration so a run can be reproduced
# from its console output alone.
print("model_dir: {}".format(model_dir))
print("sent_file: {}".format(sent_file))
print("conv_file: {}".format(conv_file))
print("dict_file: {}".format(dict_file))
print("word2vec_init: {}".format(word2vec_init))
print("word2vec_model_file: {}".format(word2vec_model_file))
print("min_freq: {}".format(min_freq))
print("n_units: {}".format(n_units))
print("epoch_size: {}".format(epoch_size))
print("batch_size: {}".format(batch_size))
print("dropout: {}".format(dropout))
print("##############")
# Dictionary: reuse the cached vocabulary when present; otherwise build it
# from the sentence file and persist it for later runs.
if os.path.exists(dict_file):
    dictionary = load_dictionary(dict_file)
else:
    from util import create_dictionary
    dictionary = create_dictionary(
        [sent_file],
        min_freq=min_freq
    )
    dictionary.save(dict_file)
# Prepare encoder RNN model
dim = len(dictionary.keys())  # vocabulary size = embedding input dimension
model_type = args.type
if model_type == "relu":
    # NOTE(review): this fragment is truncated here in the visible source;
    # the ReLURNN constructor call continues beyond this chunk.
    model = relu_rnn.Classifier(
        relu_rnn.ReLURNN(
            embed_dim=dim,
# GPU is enabled when a non-negative device id was passed on the CLI.
# NOTE(review): gpu_flag is not used within this visible fragment;
# presumably consumed later in the file -- confirm.
gpu_flag = True if args.gpu >= 0 else False
config_file = args.config_file
parser_config = configparser.ConfigParser()
parser_config.read(config_file)
config = parser_config["CONFIG"]
# config["SEPARATOR"] = bytes(
#     config["DEFAULT"]["SEPARATOR"], "utf-8"
# ).decode("unicode_escape")
# params
model_dir = config["model_dir"]
n_units = int(config["n_units"])
# load conversation sentences
dictionary = load_dictionary(config["dict_file"])
# Prepare encoder RNN model
dim = len(dictionary.keys())  # vocabulary size = embedding input dimension
model_type = args.type
if model_type == "relu":
    import relu_rnn
    model = relu_rnn.Classifier(
        relu_rnn.ReLURNN(
            embed_dim=dim,
            n_units=int(config["n_units"]),
            gpu=args.gpu
        )
    )
elif model_type == "lstm":
    # NOTE(review): this fragment is truncated here in the visible source;
    # the lstm branch continues beyond this chunk.
    import lstm
# Log the effective training configuration up front so a run can be
# reproduced from its console output alone.
print("model_dir: {}".format(model_dir))
print("sent_file: {}".format(sent_file))
print("conv_file: {}".format(conv_file))
print("dict_file: {}".format(dict_file))
print("word2vec_init: {}".format(word2vec_init))
print("word2vec_model_file: {}".format(word2vec_model_file))
print("min_freq: {}".format(min_freq))
print("n_units: {}".format(n_units))
print("epoch_size: {}".format(epoch_size))
print("batch_size: {}".format(batch_size))
print("dropout: {}".format(dropout))
print("##############")
# Dictionary: load the cached vocabulary if it exists; otherwise build one
# from the sentence file and save it for later runs.
if os.path.exists(dict_file):
    dictionary = load_dictionary(dict_file)
else:
    from util import create_dictionary
    dictionary = create_dictionary([sent_file], min_freq=min_freq)
    dictionary.save(dict_file)
# Prepare encoder RNN model
dim = len(dictionary.keys())  # vocabulary size = embedding input dimension
model_type = args.type
if model_type == "relu":
    # NOTE(review): relu_rnn has no local import here (unlike the lstm
    # branch) -- it must already be imported at module level; confirm.
    model = relu_rnn.Classifier(
        relu_rnn.ReLURNN(embed_dim=dim, n_units=n_units, gpu=args.gpu))
elif model_type == "lstm":
    import lstm
    model = lstm.Classifier(
        lstm.LSTM(embed_dim=dim, n_units=n_units, gpu=args.gpu))