import sys
import pickle as pkl

import torch
from torch.utils.data import DataLoader

# Dictionary, SNRM, Triplet and InMemoryInvertedIndex come from the
# surrounding project; args holds the parsed command-line arguments.


def retrieve():
    print('retrieve...', file=sys.stderr)

    # 1. read dictionary
    dictionary = Dictionary()
    dictionary.load_from_galago_dump(args.dict_file, args.dict_min_freq)

    # 2. make snrm instance & load weight
    device = torch.device('cpu')
    snrm = SNRM(args).to(device)
    snrm.load_state_dict(torch.load(args.model_file))  # load model
    snrm.eval()  # set inference mode

    # 3. read query data
    q_data = Triplet('query', args, dictionary)

    # 4. read index
    inverted_index = InMemoryInvertedIndex(args.conv3_channel)
    inverted_index.load(args.index_file)

    # 5. read data
    db_loader = DataLoader(dataset=q_data, batch_size=1, shuffle=False, num_workers=0)

    # 6. retrieve
    with torch.no_grad():
        result = dict()
        for k, (q_id, query) in enumerate(db_loader):
            query_repr = snrm(query.float()).numpy()
            retrieval_scores = dict()
            # accumulate a sparse dot product over the query's non-zero dimensions
            for i in range(len(query_repr[0])):
                if query_repr[0][i] > 0.:
                    for (did, weight) in inverted_index.index[i]:
                        docid = did[0]
                        if docid not in retrieval_scores:
                            retrieval_scores[docid] = 0.
                        retrieval_scores[docid] += query_repr[0][i] * weight
            if k % 10 == 0:
                print(k, ' queries retrieved \r', file=sys.stderr, end='')
            qid = q_id[0]
            # sort descending so the highest-scoring documents come first
            result[qid] = sorted(retrieval_scores.items(), key=lambda x: x[1], reverse=True)

    with open(args.retrieve_result_file, 'wb') as f:
        pkl.dump(result, f)
    print('>save result: ', args.retrieve_result_file, file=sys.stderr)
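# The accumulation in step 6 is just a sparse dot product: a document's score
# is sum_i q_i * d_i over the latent dimensions where both representations are
# non-zero. A minimal worked example with hypothetical 4-dimensional vectors:
import numpy as np

q = np.array([0., 0.3, 0., 0.7])   # hypothetical sparse query representation
d = np.array([0.5, 0.2, 0., 0.1])  # hypothetical sparse document representation
score = float(np.dot(q, d))        # 0.3 * 0.2 + 0.7 * 0.1 = 0.13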
def build_index():
    print('build index..', file=sys.stderr)

    # 1. read dictionary
    dictionary = Dictionary()
    dictionary.load_from_galago_dump(args.dict_file, args.dict_min_freq)

    # 2. make snrm instance & load weight
    device = torch.device('cpu')
    snrm = SNRM(args).to(device)
    snrm.load_state_dict(torch.load(args.model_file))  # load model
    snrm.eval()  # set inference mode

    # 3. read document data
    doc_data = Triplet('doc', args, dictionary)

    # 4. make index
    db_loader = DataLoader(dataset=doc_data, batch_size=1, shuffle=False, num_workers=0)
    inverted_index = InMemoryInvertedIndex(args.conv3_channel)  # last conv channel is the representation size

    with torch.no_grad():
        for i, (doc_id, doc) in enumerate(db_loader):
            doc_repr = snrm(doc.float())
            inverted_index.add(doc_id.numpy(), doc_repr.numpy())
            if i % 10 == 0:
                print(i, ' documents indexed \r', file=sys.stderr, end='')

    inverted_index.store(args.index_file)
    print('>save index: ', args.index_file, file=sys.stderr)
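# InMemoryInvertedIndex itself is not shown in this section. A minimal sketch
# that is consistent with how it is used above (add/store in build_index, load
# and .index[dim] lookups in retrieve); the project's real class may differ:
import pickle as pkl
from collections import defaultdict


class InMemoryInvertedIndex:
    """Posting lists keyed by latent dimension: each non-zero dimension of a
    document's sparse representation yields a (doc_id, weight) posting."""

    def __init__(self, vector_size):
        self.vector_size = vector_size
        self.index = defaultdict(list)  # dim -> [(doc_id, weight), ...]

    def add(self, doc_id, doc_repr):
        # doc_repr has shape (1, vector_size); index only non-zero dimensions
        for dim in range(self.vector_size):
            weight = doc_repr[0][dim]
            if weight > 0.:
                self.index[dim].append((doc_id, weight))

    def store(self, path):
        with open(path, 'wb') as f:
            pkl.dump(dict(self.index), f)

    def load(self, path):
        with open(path, 'rb') as f:
            # keep defaultdict semantics so empty dimensions yield []
            self.index = defaultdict(list, pkl.load(f))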
snrm = SNRM(dictionary=dictionary,
            pre_trained_embedding_file_name=FLAGS.base_path + FLAGS.pre_trained_embedding_file_name,
            batch_size=FLAGS.batch_size,
            max_q_len=FLAGS.max_q_len,
            max_doc_len=FLAGS.max_doc_len,
            emb_dim=FLAGS.emb_dim,
            layer_size=layer_size,
            dropout_parameter=FLAGS.dropout_parameter,
            regularization_term=FLAGS.regularization_term,
            learning_rate=FLAGS.learning_rate)

if not os.path.exists(FLAGS.base_path + FLAGS.model_path + FLAGS.run_name + "-inverted-index"):
    os.mkdir(FLAGS.base_path + FLAGS.model_path + FLAGS.run_name + "-inverted-index")

inverted_index = InMemoryInvertedIndex(layer_size[-1])

# wait until a GPU is free before starting inference
while not check_gpu_available():
    time.sleep(1)

batch_index_id = 0
with tf.Session(graph=snrm.graph) as session:
    session.run(snrm.init)
    print('Initialized')
    model_index = "68000"  # my trained "model/nladuo-snrm2000d44000.data-00000-of-00001"
    # restore all variables
    snrm.saver.restore(session, FLAGS.base_path + FLAGS.model_path + FLAGS.run_name + model_index)
    logging.info('Load model from {:s}'.format(
        FLAGS.base_path + FLAGS.model_path + FLAGS.run_name + model_index))
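# check_gpu_available() is defined elsewhere in the project. A minimal sketch,
# assuming it polls free GPU memory through nvidia-smi (the real helper may use
# a different criterion; min_free_mib is a hypothetical threshold):
import subprocess


def check_gpu_available(min_free_mib=2000):
    out = subprocess.check_output(
        ['nvidia-smi', '--query-gpu=memory.free',
         '--format=csv,noheader,nounits']).decode()
    # True if any GPU reports at least min_free_mib MiB of free memory
    return any(int(v) >= min_free_mib for v in out.split())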
print("creating SNRM model...") # The SNRM model. snrm = SNRM(dictionary=dictionary, pre_trained_embedding_file_name=FLAGS.base_path + FLAGS.pre_trained_embedding_file_name, batch_size=FLAGS.batch_size, max_q_len=FLAGS.max_q_len, max_doc_len=FLAGS.max_doc_len, emb_dim=FLAGS.emb_dim, layer_size=layer_size, dropout_parameter=FLAGS.dropout_parameter, regularization_term=FLAGS.regularization_term, learning_rate=FLAGS.learning_rate) inverted_index = InMemoryInvertedIndex(layer_size[-1]) inverted_index.load(FLAGS.base_path + FLAGS.model_path + FLAGS.run_name + '-inverted-index-20190812.pkl') with tf.Session(graph=snrm.graph) as session: session.run(snrm.init) print('Initialized') model_index = "9994000" # my trained "model/nladuo-snrm2000d44000.data-00000-of-00001" snrm.saver.restore(session, FLAGS.base_path + FLAGS.model_path + FLAGS.run_name + model_index) # restore all variables logging.info( 'Load model from {:s}'.format(FLAGS.base_path + FLAGS.model_path + FLAGS.run_name + model_index)) client = pymongo.MongoClient()
snrm = SNRM(dictionary=dictionary,
            pre_trained_embedding_file_name=FLAGS.base_path + FLAGS.pre_trained_embedding_file_name,
            batch_size=FLAGS.batch_size,
            max_q_len=FLAGS.max_q_len,
            max_doc_len=FLAGS.max_doc_len,
            emb_dim=FLAGS.emb_dim,
            layer_size=layer_size,
            dropout_parameter=FLAGS.dropout_parameter,
            regularization_term=FLAGS.regularization_term,
            learning_rate=FLAGS.learning_rate)

if not os.path.exists(FLAGS.base_path + FLAGS.model_path + FLAGS.run_name + "-inverted-index"):
    os.mkdir(FLAGS.base_path + FLAGS.model_path + FLAGS.run_name + "-inverted-index")

inverted_index = InMemoryInvertedIndex(layer_size[-1])

# while not check_gpu_available():
#     time.sleep(1)

batch_index_id = 0
index_id = 0
with tf.Session(graph=snrm.graph) as session:
    session.run(snrm.init)
    print('Initialized')
    model_index = "9994000"  # my trained "model/nladuo-snrm2000d44000.data-00000-of-00001"
    # restore all variables
    snrm.saver.restore(session, FLAGS.base_path + FLAGS.model_path + FLAGS.run_name + model_index)
    logging.info('Load model from {:s}'.format(
        FLAGS.base_path + FLAGS.model_path + FLAGS.run_name + model_index))