def __init__(self):
    """Load the DrQA retriever/reader, the answerability classifier and the document texts.

    All paths and model names (``corenlp_path``, ``model_path``, ``tfidf_path``,
    ``model_name``, ``load_name``, ``docs_json_path``) are resolved from the
    enclosing module, not from arguments.

    Side effects:
        Disables gradient tracking globally via ``torch.set_grad_enabled(False)``.
    """
    # Local import: only needed to parse the document dump at the end.
    import ast

    dk.set_default('corenlp_classpath', corenlp_path)
    dr.set_default('model', model_path)

    # DrQA retriever
    self.retriever = ret.get_class('tfidf')(tfidf_path=tfidf_path)

    # DrQA reader
    self.reader = dr.Predictor(model_path, "corenlp", normalize=True)

    # Answerability classifier
    self.tokenizer = BertTokenizer.from_pretrained(
        model_name, do_lower_case="uncased" in model_name)
    self.pretrained_model = Model()
    # map_location keeps every tensor on the storage it is deserialized into.
    checkpoint = torch.load(load_name, map_location=lambda storage, loc: storage)
    self.pretrained_model.load_state_dict(checkpoint['state_dict'])
    self.pretrained_model.zero_grad()
    self.pretrained_model.eval()
    self.pretrained_model.freeze()
    torch.set_grad_enabled(False)

    # Creates a map from document id to document text.
    self.docs_txt = {}
    with open(docs_json_path, encoding='utf-8') as docs_text:
        for line in docs_text:
            # SECURITY: this used to be eval(line), which executes arbitrary
            # code found in the file. literal_eval accepts the same dict
            # literals but refuses anything that is not a plain literal.
            # NOTE(review): if the dump is strict JSON (true/false/null),
            # json.loads is the right parser -- confirm the file format.
            record = ast.literal_eval(line)
            self.docs_txt[record["id"]] = record["text"]
def get_count_matrix(args, db, db_opts):
    """Build the sparse word-to-document count matrix (an inverted index).

    ``M[i, j]`` is the number of times word ``i`` appears in document ``j``.
    This variant counts in the current process: ``init`` is called once
    directly and ``count`` is applied with the builtin ``map``.

    Args:
        args: namespace providing ``tokenizer``, ``ngram`` and ``hash_size``.
        db: name resolved to a document-db class by ``retriever.get_class``.
        db_opts: keyword arguments used to construct that class.

    Returns:
        ``(count_matrix, (DOC2IDX, doc_ids))``; ``count_matrix`` is a CSR matrix
        of shape ``(args.hash_size, len(doc_ids))``. The module-level
        ``DOC2IDX`` is rebound as a side effect.
    """
    global DOC2IDX

    # Give every document id a column index.
    db_class = retriever.get_class(db)
    with db_class(**db_opts) as doc_db:
        doc_ids = doc_db.get_doc_ids()
    DOC2IDX = {doc_id: position for position, doc_id in enumerate(doc_ids)}

    # Initialise tokenizer/db state for the counting function.
    tok_class = tokenizers.get_class(args.tokenizer)
    init(tok_class, db_class, db_opts)

    # Count in steps of about a tenth of the corpus so progress can be logged.
    logger.info('Mapping...')
    total_docs = len(doc_ids)
    step = max(int(total_docs / 10), 1)
    batches = [doc_ids[start:start + step] for start in range(0, total_docs, step)]
    count_doc = partial(count, args.ngram, args.hash_size)
    banner = '-' * 25

    row, col, data = [], [], []
    for batch_no, batch in enumerate(batches, start=1):
        logger.info(banner + 'Batch %d/%d' % (batch_no, len(batches)) + banner)
        for doc_rows, doc_cols, doc_vals in map(count_doc, batch):
            row.extend(doc_rows)
            col.extend(doc_cols)
            data.extend(doc_vals)

    logger.info('Creating sparse matrix...')
    matrix_shape = (args.hash_size, total_docs)
    count_matrix = sp.csr_matrix((data, (row, col)), shape=matrix_shape)
    count_matrix.sum_duplicates()
    return count_matrix, (DOC2IDX, doc_ids)
def __init__(self, predictor, rankerPath, dbPath, ebdPath=None):
    """Wire up the predictor, TF-IDF ranker, SQLite cursor, text filter and context scorer.

    Args:
        predictor: object stored unchanged as ``self.predictor``.
        rankerPath: passed as ``tfidf_path`` to the 'tfidf' ranker class.
        dbPath: database path handed to ``sqlite3.connect``.
        ebdPath: forwarded to ``contextScore``; defaults to ``None``.
    """
    ranker_cls = retriever.get_class('tfidf')

    self.predictor = predictor
    self.ranker = ranker_cls(tfidf_path=rankerPath)
    # Only the cursor is stored; the connection itself is not kept on self.
    self.db = sqlite3.connect(dbPath).cursor()
    self.filter = filtText('drqa/features/map.txt')
    self.score = contextScore(ebdPath)
def __init__(self, db_path, model):
    """Open the document database and load the TF-IDF ranker.

    Args:
        db_path: path passed to ``retriever.DocDB`` as ``db_path``.
        model: tfidf model path.
    """
    tfidf_ranker_cls = retriever.get_class('tfidf')
    self.doc_db = retriever.DocDB(db_path=db_path)
    self.ranker = tfidf_ranker_cls(tfidf_path=model)
def rank(args):
    """Rank the corpus against every input document and append the results as JSON lines.

    Each line of ``args.data_path`` is parsed as a JSON object with ``id`` and
    ``text`` keys. The text is used as the query for the TF-IDF ranker and one
    JSON line ``{"doc_id", "rank_ids", "rank_scores"}`` is appended to
    ``<args.out_dir>/<basename of data_path>-<args.k>.rank``.

    Args:
        args: namespace providing ``model``, ``data_path``, ``out_dir`` and ``k``.
    """
    logger.info('Initializing ranker...')
    ranker = retriever.get_class('tfidf')(tfidf_path=args.model)

    basename = os.path.splitext(os.path.basename(args.data_path))[0]
    dump_path = os.path.join(args.out_dir, f'{basename}-{args.k}.rank')
    logger.info(f'Dumping rank jsons to {dump_path}')

    # Open the dump file once rather than once per input line. Append mode is
    # kept so an existing dump is extended, as before. UTF-8 is explicit
    # because ensure_ascii=False writes raw non-ASCII characters.
    with io.open(args.data_path, encoding='utf-8') as json_file, \
            io.open(dump_path, 'a', encoding='utf-8') as dump_file:
        for idx, line in enumerate(json_file):
            input_json = json.loads(line.strip('\n'))
            doc_id, doc = input_json['id'], input_json['text']

            doc_names, doc_scores = ranker.closest_docs(query=doc, k=args.k)
            dump_json = {
                'doc_id': doc_id,
                'rank_ids': list(doc_names),
                'rank_scores': list(doc_scores),
            }
            json_str = json.dumps(dump_json, ensure_ascii=False)
            dump_file.write(json_str + '\n')

            # Progress report every 1000 records (skipping record 0).
            if idx and idx % 1000 == 0:
                logger.info(f'\t{idx} finished...')
                logger.info(f'\tExample: {json_str}')
def get_count_matrix(args, db, db_opts):
    """Form a sparse word to document count matrix (inverted index).

    M[i, j] = # times word i appears in document j.

    In this variant the document ids come from an Elasticsearch scroll over
    the ``htts`` index instead of from the document db.

    Args:
        args: namespace providing ``tokenizer``, ``num_workers``, ``ngram``
            and ``hash_size``.
        db: name resolved to a document-db class by ``retriever.get_class``.
        db_opts: keyword arguments used to construct that class.

    Returns:
        ``(count_matrix, (DOC2IDX, doc_ids))``; ``count_matrix`` is a CSR matrix
        of shape ``(args.hash_size, len(doc_ids))``. The module-level
        ``DOC2IDX`` is rebound as a side effect.
    """
    # Map doc_ids to indexes
    global DOC2IDX
    db_class = retriever.get_class(db)
    # The db is still opened (and closed) as before, although its ids are no
    # longer read; the ids are collected from Elasticsearch instead.
    with db_class(**db_opts):
        res = es.search(
            index="htts",
            doc_type="htts",
            body={"size": 500, "query": {"match_all": {}}},
            scroll='10m',
        )
        # NOTE(review): the hits of this first page (res['hits']['hits']) are
        # never read here. Whether `scrollr` covers them cannot be determined
        # from this function -- confirm that the first 500 documents are not lost.
        scroll_id = res['_scroll_id']
        doc_ids = []
        for ref in scrollr(es, scroll_id, extract_references):
            print(ref)
            doc_ids.append(ref)
    DOC2IDX = {doc_id: i for i, doc_id in enumerate(doc_ids)}

    # Setup worker pool
    tok_class = tokenizers.get_class(args.tokenizer)
    workers = ProcessPool(
        args.num_workers,
        initializer=init,
        initargs=(tok_class, db_class, db_opts)
    )

    # Compute the count matrix in steps (to keep in memory)
    logger.info('Mapping...')
    row, col, data = [], [], []
    step = max(int(len(doc_ids) / 10), 1)
    batches = [doc_ids[i:i + step] for i in range(0, len(doc_ids), step)]
    _count = partial(count, args.ngram, args.hash_size)
    for i, batch in enumerate(batches):
        logger.info('-' * 25 + 'Batch %d/%d' % (i + 1, len(batches)) + '-' * 25)
        for b_row, b_col, b_data in workers.imap_unordered(_count, batch):
            row.extend(b_row)
            col.extend(b_col)
            data.extend(b_data)
    workers.close()
    workers.join()

    logger.info('Creating sparse matrix...')
    count_matrix = sp.csr_matrix(
        (data, (row, col)), shape=(args.hash_size, len(doc_ids))
    )
    count_matrix.sum_duplicates()
    return count_matrix, (DOC2IDX, doc_ids)
def init():
    """Load the ranker, topic-model artifacts and spaCy pipeline into module globals.

    Rebinds the globals ``ranker``, ``nlp``, ``df_topic_keywords``,
    ``lda_model`` and ``vectorizer``. Model files are read from
    ``ROOT_DIR / 'model'``.
    """
    global ranker, nlp, df_topic_keywords, lda_model, vectorizer

    print('Initializing app')
    ranker = retriever.get_class('tfidf')(tfidf_path=MODEL)
    print('ranker:', ranker)

    model_dir = ROOT_DIR / 'model'
    df_topic_keywords = pd.read_pickle(model_dir / 'df_topic_keywords.pkl')

    # Use context managers so the pickle files are closed deterministically;
    # the previous pickle.load(open(...)) form leaked both handles.
    # NOTE(review): unpickling runs arbitrary code from the file -- these
    # artifacts must only ever come from a trusted location.
    with open(model_dir / 'best_lda_model.pkl', 'rb') as lda_file:
        lda_model = pickle.load(lda_file)
    with open(model_dir / 'tm_features.pkl', 'rb') as vocab_file:
        vocabulary = pickle.load(vocab_file)

    vectorizer = CountVectorizer(decode_error='replace', vocabulary=vocabulary)
    nlp = spacy.load('en', disable=['parser', 'ner'])
def __init__(self, tfidf_path, tokenizer, use_stopwords=False, qclassifier=None):
    """Initialise the base ``Answerer`` and attach the TF-IDF ranker.

    Args:
        tfidf_path: passed as ``tfidf_path`` to the 'tfidf' ranker class.
        tokenizer: stored unchanged as ``self.tokenizer``.
        use_stopwords: stored unchanged as ``self.use_stopwords``.
        qclassifier: forwarded to ``Answerer.__init__``.
    """
    Answerer.__init__(self, qclassifier)

    ranker_cls = retriever.get_class('tfidf')
    self.tokenizer = tokenizer
    self.ranker = ranker_cls(tfidf_path=tfidf_path)
    # `stopwords` is not a parameter; it is resolved from the enclosing module.
    self.stopwords = stopwords
    self.use_stopwords = use_stopwords
def my_sample_fever():
    """Configure logging, load all models and return the FEVER web API.

    The configuration file is read from ``$CONFIG_PATH`` (default
    ``configs/system_config.json``). It supplies the Google retrieval
    settings, the TF-IDF index path, the CUDA device and the parameters of
    the ``page_model`` and ``state_model`` predictors.

    Returns:
        Whatever ``fever_web_api`` returns for the ``predict`` callback
        defined below.
    """
    logger = logging.getLogger()
    dictConfig({
        'version': 1,
        'formatters': {
            'default': {
                'format': '[%(asctime)s] %(levelname)s in %(module)s: %(message)s',
            }
        },
        'handlers': {
            'wsgi': {
                'class': 'logging.StreamHandler',
                'stream': 'ext://sys.stderr',
                'formatter': 'default'
            }
        },
        'root': {
            'level': 'INFO',
            'handlers': ['wsgi']
        },
        # Per-logger settings belong under 'loggers'. As a top-level key
        # 'allennlp' was ignored, so allennlp loggers created before this call
        # were disabled by the default disable_existing_loggers=True.
        'loggers': {
            'allennlp': {
                'level': 'INFO',
                'handlers': ['wsgi'],
                # The root logger uses the same handler; without this every
                # allennlp record would be printed twice.
                'propagate': False,
            },
        },
    })

    logger.info("Columbia FEVER application")

    config_path = os.getenv("CONFIG_PATH", "configs/system_config.json")
    # Close the config file deterministically instead of leaking the handle.
    with open(config_path) as config_file:
        config = json.load(config_file)

    ner_predictor = Predictor.from_path(
        "https://s3-us-west-2.amazonaws.com/allennlp/models/fine-grained-ner-model-elmo-2018.12.21.tar.gz"
    )
    google_config = GoogleConfig(**config['retrieval']['google'])
    ranker = retriever.get_class('tfidf')(
        tfidf_path=config['retrieval']['tfidf']['index'])

    predictors = {}
    for key in ('page_model', 'state_model'):
        # 'path' is popped so the remaining entries can be passed as kwargs.
        path = config[key].pop('path')
        predictors[key] = ColumbiaPredictor(path,
                                            config['cuda_device'],
                                            **config[key])

    # The prediction function that is passed to the web server for FEVER2.0
    def predict(instances):
        """Retrieve documents, then run the page model followed by the state model."""
        predictions = getDocsSingle(instances, google_config, ner_predictor, ranker)
        for key in ('page_model', 'state_model'):
            predictions = list(predictors[key].predict(predictions))
        return predictions

    return fever_web_api(predict)
def __init__(self, db, n_docs, n_sents, whole_docs, compat, model):
    """Store the retrieval settings, load the TF-IDF ranker and build the NER retriever.

    Args:
        db: forwarded to the parent constructor.
        n_docs: stored unchanged as ``self.n_docs``.
        n_sents: stored unchanged as ``self.n_sents``.
        whole_docs: stored unchanged as ``self.whole_docs``.
        compat: stored unchanged as ``self.compat``.
        model: passed as ``tfidf_path`` to the 'tfidf' ranker class.
    """
    super().__init__(db)

    self.n_docs, self.n_sents = n_docs, n_sents
    self.whole_docs, self.compat = whole_docs, compat

    ranker_cls = retriever.get_class('tfidf')
    self.ranker = ranker_cls(tfidf_path=model)
    self.onlineranker_args = self.RankArgs()

    # One title per document index known to the ranker, in index order.
    titles = []
    for doc_index in range(self.ranker.num_docs):
        titles.append(self.ranker.get_doc_id(doc_index))
    self.doc_titles = titles
    self.ner_retriever = NER_Retriever(self.doc_titles)
def __init__(self, name, retriever_model, num_threads):
    """Cap the thread count at the CPU count and prepare one ranker per thread.

    Args:
        name: forwarded to the parent constructor.
        retriever_model: passed as ``tfidf_path`` to each 'tfidf' ranker.
        num_threads: requested number of threads; the effective value is
            ``min(num_threads, multiprocessing.cpu_count())``.
    """
    super().__init__(name)

    available_cpus = multiprocessing.cpu_count()
    self.num_threads = min(num_threads, available_cpus)

    # initialize a ranker per thread
    self.arguments = []
    for thread_id in tqdm(range(self.num_threads)):
        thread_ranker = retriever.get_class("tfidf")(tfidf_path=retriever_model)
        self.arguments.append({"id": thread_id, "ranker": thread_ranker})
def __init__(self, db, model, max_page, max_sent):
    """Store the retrieval limits and load the TF-IDF ranker.

    Args:
        db: stored unchanged as ``self.db``.
        model: stored as ``self.model`` and passed as ``tfidf_path`` to the
            'tfidf' ranker class.
        max_page: stored as ``self.n_docs``.
        max_sent: stored as ``self.n_sents``.
    """
    self.db, self.model = db, model
    self.n_docs, self.n_sents = max_page, max_sent

    ranker_cls = retriever.get_class('tfidf')
    self.ranker = ranker_cls(tfidf_path=model)
    self.onlineranker_args = self.RankArgs()
def get_count_matrix(args, db, db_opts):
    """Form a sparse word to document count matrix (inverted index).

    M[i, j] = # times word i appears in document j.

    Returns ``(count_matrix, (DOC2IDX, doc_ids))`` and rebinds the
    module-level ``DOC2IDX``.
    """
    global DOC2IDX

    # Map doc_ids to indexes
    db_class = retriever.get_class(db)
    with db_class(**db_opts) as doc_db:
        doc_ids = doc_db.get_doc_ids()
    DOC2IDX = dict((doc_id, idx) for idx, doc_id in enumerate(doc_ids))

    # Setup worker pool (the counting is spread over parallel workers)
    tok_class = tokenizers.get_class(args.tokenizer)
    pool_init_args = (tok_class, db_class, db_opts)
    workers = ProcessPool(args.num_workers,
                          initializer=init,
                          initargs=pool_init_args)

    # Compute the count matrix in steps (to keep in memory)
    logger.info('Mapping...')
    row, col, data = [], [], []
    doc_total = len(doc_ids)
    step = max(int(doc_total / 10), 1)

    # The matrix entries are collected batch by batch.
    batches = [doc_ids[lo:lo + step] for lo in range(0, doc_total, step)]

    # partial: `count` needs every argument when it runs, but ngram and
    # hash_size are already known here. Binding them up front leaves a
    # callable that takes only the remaining argument, which is what
    # imap_unordered supplies.
    _count = partial(count, args.ngram, args.hash_size)

    batch_total = len(batches)
    for batch_index, batch in enumerate(batches):
        logger.info('-' * 25 + 'Batch %d/%d' % (batch_index + 1, batch_total) +
                    '-' * 25)
        for b_row, b_col, b_data in workers.imap_unordered(_count, batch):
            row.extend(b_row)
            col.extend(b_col)
            data.extend(b_data)
    workers.close()
    workers.join()

    logger.info('Creating sparse matrix...')
    count_matrix = sp.csr_matrix((data, (row, col)),
                                 shape=(args.hash_size, doc_total))
    count_matrix.sum_duplicates()
    return count_matrix, (DOC2IDX, doc_ids)
def get_count_matrix(args, db, db_opts):
    """Form a sparse word to document count matrix (inverted index).

    M[i, j] = # times word i appears in document j.

    Besides the per-batch banner, progress is logged after every 10000
    documents counted within a batch. Returns
    ``(count_matrix, (DOC2IDX, doc_ids))`` and rebinds the module-level
    ``DOC2IDX``.
    """
    global DOC2IDX

    # Map doc_ids to indexes
    db_class = retriever.get_class(db)
    with db_class(**db_opts) as doc_db:
        doc_ids = doc_db.get_doc_ids()
    DOC2IDX = {doc_id: index for index, doc_id in enumerate(doc_ids)}
    # 5075182
    logger.info('the number of docs is %s' % (len(DOC2IDX)))

    # Setup worker pool
    tok_class = tokenizers.get_class(args.tokenizer)
    workers = ProcessPool(
        args.num_workers,
        initializer=init,
        initargs=(tok_class, db_class, db_opts),
    )

    # Compute the count matrix in steps (to keep in memory)
    logger.info('Mapping......')
    row, col, data = [], [], []
    step = max(int(len(doc_ids) / 10), 1)
    batches = [doc_ids[begin:begin + step]
               for begin in range(0, len(doc_ids), step)]
    _count = partial(count, args.ngram, args.hash_size)

    rule = '-' * 24
    for batch_no, batch in enumerate(batches, start=1):
        logger.info(rule + 'Batch %d/%d' % (batch_no, len(batches)) + rule)
        # `done` restarts at 1 for every batch, so the progress lines are
        # relative to the current batch.
        counted = workers.imap_unordered(_count, batch)
        for done, (b_row, b_col, b_data) in enumerate(counted, start=1):
            row.extend(b_row)
            col.extend(b_col)
            data.extend(b_data)
            if done % 10000 == 0:
                logger.info('NO: %s is ......' % done)
    workers.close()
    workers.join()

    logger.info('Creating sparse matrix......')
    count_matrix = sp.csr_matrix((data, (row, col)),
                                 shape=(args.hash_size, len(doc_ids)))
    count_matrix.sum_duplicates()
    return count_matrix, (DOC2IDX, doc_ids)
def get_count_matrix(args, db, db_opts):
    """Form a sparse word to document count matrix (inverted index).

    M[i, j] = # times word i appears in document j.

    Args:
        args: namespace providing ``tokenizer``, ``num_workers``, ``ngram``
            and ``hash_size``.
        db: name resolved to a document-db class by ``retriever.get_class``.
        db_opts: keyword arguments used to construct that class.

    Returns:
        ``(count_matrix, (DOC2IDX, doc_ids))``. The module-level ``DOC2IDX``
        is rebound as a side effect.
    """
    global DOC2IDX

    # Document ids -> column indexes.
    db_class = retriever.get_class(db)
    with db_class(**db_opts) as doc_db:
        doc_ids = doc_db.get_doc_ids()
    DOC2IDX = {}
    for column, doc_id in enumerate(doc_ids):
        DOC2IDX[doc_id] = column

    # Worker pool; every worker is set up through `init`.
    tok_class = tokenizers.get_class(args.tokenizer)
    workers = ProcessPool(
        args.num_workers,
        initializer=init,
        initargs=(tok_class, db_class, db_opts)
    )

    # The corpus is counted in about ten slices so progress can be logged.
    logger.info('Mapping...')
    n_docs = len(doc_ids)
    step = max(int(n_docs / 10), 1)
    batches = []
    for offset in range(0, n_docs, step):
        batches.append(doc_ids[offset:offset + step])

    count_for_doc = partial(count, args.ngram, args.hash_size)
    row, col, data = [], [], []
    n_batches = len(batches)
    for number, batch in enumerate(batches, start=1):
        header = '-' * 25 + 'Batch %d/%d' % (number, n_batches) + '-' * 25
        logger.info(header)
        for part_rows, part_cols, part_data in workers.imap_unordered(count_for_doc, batch):
            row.extend(part_rows)
            col.extend(part_cols)
            data.extend(part_data)
    workers.close()
    workers.join()

    logger.info('Creating sparse matrix...')
    coordinates = (row, col)
    count_matrix = sp.csr_matrix(
        (data, coordinates), shape=(args.hash_size, n_docs)
    )
    count_matrix.sum_duplicates()
    return count_matrix, (DOC2IDX, doc_ids)
def get_count_matrix_sklearn(args, db, db_opts):
    """Form a sparse word to document count matrix (inverted index).

    M[i, j] = # times word i appears in document j.

    Documents are vectorised with a ``HashingVectorizer`` in chunks and the
    per-chunk matrices are stacked and transposed. The vectoriser settings
    are hard-coded; ``args`` is not consulted.

    Args:
        args: unused by this implementation.
        db: name resolved to a document-db class by ``retriever.get_class``.
        db_opts: keyword arguments used to construct that class.

    Returns:
        ``(count_matrix, (DOC2IDX, doc_ids))``. The module-level ``DOC2IDX``
        is rebound as a side effect.
    """
    # Map doc_ids to indexes
    global DOC2IDX
    db_class = retriever.get_class(db)

    # NOTE(review): `non_negative` is taken verbatim from the original call;
    # verify it is still accepted by the installed scikit-learn version.
    # NOTE(review): dtype=np.int8 presumably caps a stored count at 127 --
    # confirm this is intended.
    hashvec = HashingVectorizer(n_features=2**24, dtype=np.int8,
                                ngram_range=(1, 2), norm=None,
                                non_negative=True)
    chunk_size = 100000
    texts = []
    chunks = []

    # One context-managed handle serves both the id listing and the text
    # reads. The previous second handle was never closed and shadowed the
    # `db` parameter.
    with db_class(**db_opts) as doc_db:
        doc_ids = doc_db.get_doc_ids()
        DOC2IDX = {doc_id: i for i, doc_id in enumerate(doc_ids)}

        for i, doc_id in enumerate(doc_ids):
            texts.append(doc_db.get_doc_text(doc_id))
            # Flush at every multiple of chunk_size (except index 0) so the
            # raw texts never all sit in memory at once.
            if i > 0 and i % chunk_size == 0:
                print(i, 'fitting hashvec...')
                chunks.append(hashvec.transform(texts))
                del texts[:]

    # Whatever is left after the last flush forms the final chunk.
    chunks.append(hashvec.transform(texts))

    # Chunks are (documents x features); stack them in document order, then
    # transpose to (features x documents).
    count_matrix = sp.vstack(chunks)
    count_matrix = count_matrix.transpose()
    print(count_matrix.shape)
    print(count_matrix.dtype)
    return count_matrix, (DOC2IDX, doc_ids)
class MyTfidfDocRanker(retriever.get_class('tfidf')):
    """TF-IDF ranker whose ``text2spvec`` can also hand back the raw weights."""

    def text2spvec(self, query, data_val=False):
        """Create a sparse tfidf-weighted word vector from query.

        tfidf = log(tf + 1) * log((N - Nt + 0.5) / (Nt + 0.5))

        Args:
            query: text to vectorise.
            data_val: when true, return ``(weights, unique_word_ids)`` instead
                of the sparse row vector.

        Returns:
            A ``1 x hash_size`` CSR matrix, or the ``(weights, ids)`` pair when
            ``data_val`` is set. A query with no valid word yields an empty
            ``1 x hash_size`` matrix regardless of ``data_val``.

        Raises:
            RuntimeError: if the query has no valid word and ``self.strict``
                is set.
        """
        # Hash every parsed ngram into the vocabulary space.
        tokens = self.parse(utils.normalize(query))
        hashed_ids = [utils.hash(token, self.hash_size) for token in tokens]

        if not hashed_ids:
            if self.strict:
                raise RuntimeError('No valid word in: %s' % query)
            logger.warning('No valid word in: %s' % query)
            return sp.csr_matrix((1, self.hash_size))

        # Term frequencies: log(tf + 1) per distinct hashed id.
        unique_ids, occurrences = np.unique(hashed_ids, return_counts=True)
        tf_weights = np.log1p(occurrences)

        # Inverse document frequencies, floored at zero.
        doc_counts = self.doc_freqs[unique_ids]
        idf_weights = np.log((self.num_docs - doc_counts + 0.5) / (doc_counts + 0.5))
        idf_weights[idf_weights < 0] = 0

        weights = np.multiply(tf_weights, idf_weights)
        if data_val:
            return weights, unique_ids

        # A single CSR row: all entries lie between indptr[0] and indptr[1].
        row_bounds = np.array([0, len(unique_ids)])
        return sp.csr_matrix(
            (weights, unique_ids, row_bounds), shape=(1, self.hash_size)
        )
start = time.time() # read all the data and store it logger.info("Reading data ...") questions = [] answers = [] for line in open(args.dataset): data = json.loads(line) question = data["question"] answer = data["answer"] questions.append(question) answers.append(answer) # get the closest docs for each question. logger.info("Initializing ranker...") ranker = retriever.get_class("tfidf")(tfidf_path=args.model) logger.info("Ranking...") closest_docs = ranker.batch_closest_docs( questions, k=args.n_docs, num_workers=args.num_workers ) answers_docs = zip(answers, closest_docs) # define processes tok_class = tokenizers.get_class(args.tokenizer) tok_opts = {} db_class = retriever.DocDB db_opts = {"db_path": args.doc_db} processes = ProcessPool( processes=args.num_workers, initializer=init,
import time import sqlite3 from drqa import retriever import numpy as np from functools import partial from concurrent.futures import ThreadPoolExecutor from itertools import chain import pandas as pd from drqa.retriever import utils import os import pandas as pd db = "/home/giuseppe/Scrivania/HLT_Project/Retriver/Process_gnq/gnq_articles.db" connection = sqlite3.connect(db, check_same_thread=False) tfidf = "/home/giuseppe/Scrivania/HLT_Project/Retriver/DrQA/gnq_articles-tfidf-ngram=2-hash=16777216-tokenizer=simple.npz" ranker = retriever.get_class('tfidf')(tfidf_path=tfidf) qa_db = "/home/giuseppe/Scrivania/HLT_Project/Retriver/Process_gnq/gnq_qa.db" def get_doc_text(doc_id): """Fetch the raw text of the doc for 'doc_id'.""" cursor = connection.cursor() cursor.execute("SELECT text FROM documents WHERE id = ?", (utils.normalize(doc_id), )) result = cursor.fetchone() cursor.close() return result if result is None else result[0] def _split_doc(doc): """Given a doc, split it into chunks (by paragraph)."""
question = data['question'] answer = data['answer'] # Make sure the regex compiles if args.regex: try: re.compile(answer[0]) except BaseException: logger.warning('Regex failed to compile: %s' % answer) continue questions.append(question) answers.append(answer) # Get classes ranker_class = retriever.get_class(args.ranker) db_class = retriever.get_class(args.db) tokenizer_class = tokenizers.get_class(args.tokenizer) # Form options search_keys = ('regex', 'match_threshold', 'char_max', 'char_min', 'window_sz') opts = { 'ranker_class': retriever.get_class(args.ranker), 'tokenizer_class': tokenizers.get_class(args.tokenizer), 'db_class': retriever.get_class(args.db), 'search': {k: vars(args)[k] for k in search_keys}, } opts.update(vars(args)) # Process!
def get_count_matrix(args, db, db_opts):
    """Form a sparse word to document count matrix (inverted index).

    M[i, j] = # times word i appears in document j.

    Returns ``(count_matrix, (DOC2IDX, doc_ids))`` and rebinds the
    module-level ``DOC2IDX``.
    """
    global DOC2IDX

    # Resolve the document-db class and read every document id from it.
    db_class = retriever.get_class(db)
    with db_class(**db_opts) as doc_db:  # context manager closes the db
        doc_ids = doc_db.get_doc_ids()

    # enumerate() yields (index, value) pairs starting at index 0, e.g.
    # ['a', 'b', 'c'] -> (0, 'a'), (1, 'b'), (2, 'c'). The mapping below is
    # therefore doc id -> position in doc_ids.
    DOC2IDX = {doc_id: position for position, doc_id in enumerate(doc_ids)}

    # Setup worker pool
    tok_class = tokenizers.get_class(args.tokenizer)
    workers = ProcessPool(args.num_workers,
                          initializer=init,
                          initargs=(tok_class, db_class, db_opts))

    # Compute the count matrix in steps (to keep in memory)
    logger.info('Mapping...')
    row, col, data = [], [], []
    n_docs = len(doc_ids)
    step = max(int(n_docs / 10), 1)  # number of documents per batch
    # Consecutive slices of doc_ids, one per step.
    batches = [doc_ids[first:first + step] for first in range(0, n_docs, step)]

    # partial() pre-binds ngram and hash_size and returns a callable that
    # only needs the remaining argument.
    # refer link:http://www.wklken.me/posts/2013/08/18/python-extra-functools.html
    _count = partial(count, args.ngram, args.hash_size)

    for batch_idx, batch in enumerate(batches):
        logger.info('-' * 25 + 'Batch %d/%d' % (batch_idx + 1, len(batches)) +
                    '-' * 25)
        # All three lists grow with every result, batch after batch.
        # Per the original author's notes: row holds hash(n-gram) values,
        # col holds document indexes, data holds the n-gram counts.
        for b_row, b_col, b_data in workers.imap_unordered(_count, batch):
            row.extend(b_row)
            col.extend(b_col)
            data.extend(b_data)
    workers.close()
    workers.join()

    logger.info('Creating sparse matrix...')
    # Layout of the resulting sparse matrix:
    #   rows    -> hash(n-gram)
    #   columns -> index of the document
    #   element -> count of that hash in that document
    count_matrix = sp.csr_matrix((data, (row, col)),
                                 shape=(args.hash_size, n_docs))
    # Entries that share the same (row, col) coordinates are added together.
    count_matrix.sum_duplicates()
    # Return the matrix together with the id mappings.
    return count_matrix, (DOC2IDX, doc_ids)
question = data['question'] answer = data['answer'] # Make sure the regex compiles if args.regex: try: re.compile(answer[0]) except BaseException: logger.warning('Regex failed to compile: %s' % answer) continue questions.append(question) answers.append(answer) # Get classes ranker_class = retriever.get_class(args.ranker) db_class = retriever.get_class(args.db) tokenizer_class = tokenizers.get_class(args.tokenizer) # Form options search_keys = ('regex', 'match_threshold', 'char_max', 'char_min', 'window_sz') opts = { 'ranker_class': retriever.get_class(args.ranker), 'tokenizer_class': tokenizers.get_class(args.tokenizer), 'db_class': retriever.get_class(args.db), 'search': {k: vars(args)[k] for k in search_keys}, } opts.update(vars(args))
def eval_model(db: FeverDocDB, args) -> Model:
    """Interactively classify claims typed at the prompt.

    For every claim the five closest pages are retrieved with the TF-IDF
    ranker, their sentences are scored against the claim with
    ``tf_idf_sim``, the five best non-blank sentences are joined as evidence
    and the model's predicted label is printed. Entering ``q`` ends the loop.

    Args:
        db: document database providing ``get_doc_lines(page)``.
        args: namespace providing ``archive_file``, ``cuda_device``,
            ``overrides`` and ``model`` (the tfidf index path).

    Returns:
        Nothing is returned explicitly; the function ends when the user
        quits, despite the ``Model`` annotation.
    """
    archive = load_archive(args.archive_file,
                           cuda_device=args.cuda_device,
                           overrides=args.overrides)
    config = archive.config
    ds_params = config["dataset_reader"]

    model = archive.model
    model.eval()

    reader = FEVERReader(db,
                         sentence_level=ds_params.pop("sentence_level", False),
                         wiki_tokenizer=Tokenizer.from_params(
                             ds_params.pop('wiki_tokenizer', {})),
                         claim_tokenizer=Tokenizer.from_params(
                             ds_params.pop('claim_tokenizer', {})),
                         token_indexers=TokenIndexer.dict_from_params(
                             ds_params.pop('token_indexers', {})))

    # Build the ranker once: it does not depend on the claim, and rebuilding
    # it on every prompt reloaded the index each time.
    ranker = retriever.get_class('tfidf')(tfidf_path=args.model)

    while True:
        claim = input("enter claim (or q to quit) >>")
        if claim.lower() == "q":
            break

        p_lines = []
        pages, _ = ranker.closest_docs(claim, 5)

        for page in pages:
            sentences_of_page = []
            for line in db.get_doc_lines(page).split("\n"):
                # Each line is tab-separated with the sentence text in the
                # second field. Lines without a tab (e.g. a trailing blank
                # line) used to raise IndexError; they now count as empty.
                fields = line.split("\t")
                has_text = len(fields) > 1 and len(fields[1]) > 1
                sentences_of_page.append(fields[1] if has_text else "")
            # (sentence text, page, line number within the page)
            p_lines.extend(zip(sentences_of_page,
                               [page] * len(sentences_of_page),
                               range(len(sentences_of_page))))

        scores = tf_idf_sim(claim, [pl[0] for pl in p_lines])
        # (score, page, line number, sentence text)
        scores = list(
            zip(scores, [pl[1] for pl in p_lines], [pl[2] for pl in p_lines],
                [pl[0] for pl in p_lines]))
        # Drop blank sentences before ranking.
        scores = list(filter(lambda score: len(score[3].strip()), scores))
        sentences_l = list(
            sorted(scores, reverse=True, key=lambda elem: elem[0]))

        sentences = [s[3] for s in sentences_l[:5]]
        evidence = " ".join(sentences)

        print("Best pages: {0}".format(repr(pages)))

        print("Evidence:")
        for idx, sentence in enumerate(sentences_l[:5]):
            print("{0}\t{1}\t\t{2}\t{3}".format(idx + 1, sentence[0],
                                                sentence[1], sentence[3]))

        item = reader.text_to_instance(evidence, claim)

        prediction = model.forward_on_instance(item, args.cuda_device)
        cls = model.vocab._index_to_token["labels"][np.argmax(
            prediction["label_probs"])]
        print("PREDICTED: {0}".format(cls))
        print()
def __init__(self, saved_model_path):
    """Load the TF-IDF ranker from ``saved_model_path``.

    Args:
        saved_model_path: passed as ``tfidf_path`` to the 'tfidf' ranker class.
    """
    ranker_cls = retriever.get_class('tfidf')
    self.ranker = ranker_cls(tfidf_path=saved_model_path)
def process(ranker, query, k=1):
    """Return the names of the ``k`` documents the ranker places closest to ``query``.

    Args:
        ranker: object exposing ``closest_docs(query, k)`` that returns a
            ``(names, scores)`` pair.
        query: text to look up.
        k: number of documents requested.

    Returns:
        The first element of the ranker's result (the document names); the
        scores are discarded.
    """
    names, _scores = ranker.closest_docs(query, k)
    return names


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    for option in ('--in-file', '--out-file', '--index'):
        parser.add_argument(option, type=str)
    parser.add_argument('--count', type=int, default=1)
    args = parser.parse_args()

    k = args.count
    ranker = retriever.get_class('tfidf')(tfidf_path=args.index)

    # NOTE(review): the output file is opened (and truncated by "w+") but
    # nothing is written to it in the code visible here -- confirm against
    # the full script.
    with open(args.in_file) as f, open(args.out_file, "w+") as f2:
        for raw_line in tqdm(f.readlines()):
            line = json.loads(raw_line)
            if line["label"] != "NOT ENOUGH INFO":
                continue

            # Fill the evidence slots of claims without evidence with the
            # closest pages; evidence group i receives page i.
            pp = list(process(ranker, line['claim'], k=k))
            for idx, evidence_group in enumerate(line['evidence']):
                for evidence in evidence_group:
                    if idx < len(pp):
                        evidence[2] = pp[idx]
                        evidence[3] = -1
question = data["question"] answer = data["answer"] # Make sure the regex compiles if args.regex: try: re.compile(answer[0]) except BaseException: logger.warning("Regex failed to compile: %s" % answer) continue questions.append(question) answers.append(answer) # Get classes ranker_class = retriever.get_class(args.ranker) db_class = retriever.get_class(args.db) tokenizer_class = tokenizers.get_class(args.tokenizer) # Form options search_keys = ("regex", "match_threshold", "char_max", "char_min", "window_sz") opts = { "ranker_class": retriever.get_class(args.ranker), "tokenizer_class": tokenizers.get_class(args.tokenizer), "db_class": retriever.get_class(args.db), "search": {k: vars(args)[k] for k in search_keys}, } opts.update(vars(args))
def __init__(self, db, n_docs, n_sents, model):
    """Store the retrieval limits and load the TF-IDF ranker.

    Args:
        db: forwarded to the parent constructor.
        n_docs: stored unchanged as ``self.n_docs``.
        n_sents: stored unchanged as ``self.n_sents``.
        model: passed as ``tfidf_path`` to the 'tfidf' ranker class.
    """
    super().__init__(db)
    self.n_docs, self.n_sents = n_docs, n_sents

    ranker_cls = retriever.get_class('tfidf')
    self.ranker = ranker_cls(tfidf_path=model)
    self.onlineranker_args = self.RankArgs()
def __init__(self, database, index, n_docs, n_sents):
    """Store the retrieval limits and load the TF-IDF ranker from ``index``.

    Args:
        database: forwarded to the parent constructor.
        index: passed as ``tfidf_path`` to the 'tfidf' ranker class.
        n_docs: stored unchanged as ``self.n_docs``.
        n_sents: stored unchanged as ``self.n_sents``.
    """
    super().__init__(database)
    self.n_docs, self.n_sents = n_docs, n_sents

    tfidf_cls = retriever.get_class('tfidf')
    self.ranker = tfidf_cls(tfidf_path=index)
    self.onlineranker_args = self.RankArgs()
def __init__(self, db, k, model):
    """Store the settings and load the TF-IDF ranker.

    Args:
        db: stored unchanged as ``self.db``.
        k: stored unchanged as ``self.k``.
        model: stored as ``self.model`` and passed as ``tfidf_path`` to the
            'tfidf' ranker class.
    """
    self.db, self.k, self.model = db, k, model

    ranker_cls = retriever.get_class('tfidf')
    self.ranker = ranker_cls(tfidf_path=self.model)
# read all the data and store it
logger.info('Reading data ...')
questions = []
answers = []
# Close the dataset file deterministically; iterating over a bare open(...)
# left the handle to the garbage collector.
with open(args.dataset) as dataset_file:
    for line in dataset_file:
        # One JSON object per line with 'question' and 'answer' keys.
        data = json.loads(line)
        question = data['question']
        answer = data['answer']
        questions.append(question)
        answers.append(answer)

# get the closest docs for each question.
logger.info('Initializing ranker...')
ranker = retriever.get_class('tfidf')(tfidf_path=args.model)

logger.info('Ranking...')
closest_docs = ranker.batch_closest_docs(questions,
                                         k=args.n_docs,
                                         num_workers=args.num_workers)
# Rebinding the name drops this reference to the loaded ranker object.
ranker = []

tok_class = tokenizers.get_class(args.tokenizer)
tok_opts = {}
db_class = retriever.DocDB
db_opts = {'db_path': args.doc_db}

# Build the tokenizer and the document db, and register their cleanup
# callbacks (shutdown / close) with Finalize at exit priority 100.
PROCESS_TOK = tok_class(**tok_opts)
Finalize(PROCESS_TOK, PROCESS_TOK.shutdown, exitpriority=100)
PROCESS_DB = db_class(**db_opts)
Finalize(PROCESS_DB, PROCESS_DB.close, exitpriority=100)
import logging
from drqa import retriever

# Root logger at INFO with a timestamped console handler.
logger = logging.getLogger()
logger.setLevel(logging.INFO)
console = logging.StreamHandler()
console.setFormatter(
    logging.Formatter('%(asctime)s: [ %(message)s ]', '%m/%d/%Y %I:%M:%S %p')
)
logger.addHandler(console)

parser = argparse.ArgumentParser()
parser.add_argument('--model', type=str, default=None)
args = parser.parse_args()

logger.info('Initializing ranker...')
ranker_class = retriever.get_class('tfidf')
ranker = ranker_class(tfidf_path=args.model)


# ------------------------------------------------------------------------------
# Drop in to interactive
# ------------------------------------------------------------------------------


def process(query, k=1):
    """Print a table with the rank, id and score of the ``k`` closest documents.

    Args:
        query: text to look up with the module-level ``ranker``.
        k: number of documents requested.
    """
    doc_names, doc_scores = ranker.closest_docs(query, k)
    table = prettytable.PrettyTable(['Rank', 'Doc Id', 'Doc Score'])
    for position, doc_name in enumerate(doc_names):
        formatted_score = '%.5g' % doc_scores[position]
        table.add_row([position + 1, doc_name, formatted_score])
    print(table)
def get_count_matrix(args, db, db_opts):
    """Form a sparse word to document count matrix (inverted index).

    M[i, j] = # times word i appears in document j.

    Returns ``(count_matrix, (DOC2IDX, doc_ids))`` and rebinds the
    module-level ``DOC2IDX``.
    """
    global DOC2IDX

    # Map doc_ids to indexes
    # (class lookup: drqa/retriever/__init__.py --> doc_db.py, per the
    # original author's note)
    db_class = retriever.get_class(db)
    with db_class(**db_opts) as doc_db:
        # Fetch all ids of docs stored in the db.
        doc_ids = doc_db.get_doc_ids()
    # Stored as e.g. {'3255': 0, '8902': 1, ...}
    DOC2IDX = {doc_id: ordinal for ordinal, doc_id in enumerate(doc_ids)}

    # Setup worker pool
    # (e.g. 'corenlp': drqa/tokenizers/__init__.py --> corenlp_tokenizer.py,
    # per the original author's note)
    tok_class = tokenizers.get_class(args.tokenizer)
    workers = ProcessPool(args.num_workers,
                          initializer=init,
                          initargs=(tok_class, db_class, db_opts))

    # Compute the count matrix in steps (to keep in memory)
    logger.info('Mapping...')
    row, col, data = [], [], []
    corpus_size = len(doc_ids)
    step = max(int(corpus_size / 10), 1)
    # Roughly ten batches in total.
    batches = [doc_ids[head:head + step]
               for head in range(0, corpus_size, step)]
    # NOTE(review): the original comment gives the default of args.hash_size
    # as int(math.pow(2, 24)); not verifiable from this function.
    _count = partial(count, args.ngram, args.hash_size)

    separator = '-' * 25
    for seq, batch in enumerate(batches, start=1):
        logger.info(separator + 'Batch %d/%d' % (seq, len(batches)) + separator)
        for b_row, b_col, b_data in workers.imap_unordered(_count, batch):
            row.extend(b_row)
            col.extend(b_col)
            data.extend(b_data)
    workers.close()
    workers.join()

    logger.info('Creating sparse matrix...')
    # csr_matrix((data, (row_ind, col_ind)), [shape=(M, N)])
    #     where ``data``, ``row_ind`` and ``col_ind`` satisfy the
    #     relationship ``a[row_ind[k], col_ind[k]] = data[k]``.
    #
    # Examples:
    #     >>> row = np.array([0, 0, 1, 2, 2, 2])
    #     >>> col = np.array([0, 2, 2, 0, 1, 2])
    #     >>> data = np.array([1, 2, 3, 4, 5, 6])
    #     >>> csr_matrix((data, (row, col)), shape=(3, 3)).toarray()
    #     array([[1, 0, 2],
    #            [0, 0, 3],
    #            [4, 5, 6]])
    #
    # count_matrix: shape=(args.hash_size, len(doc_ids))
    #              doc_1  doc_2  ...  doc_m
    #     word_1  [[1,     0,    ...  2],
    #     word_2   [0,     0,    ...  3],
    #     ...      ...
    #     word_n   [4,     5,    ...  6]]
    # i.e., (word_1, doc_m) denotes word 'word_1' appear 2 times in doc 'doc_m'.
    #
    # Reference: https://towardsdatascience.com/machine-learning-to-big-data-scaling-inverted-indexing-with-solr-ba5b48833fb4
    count_matrix = sp.csr_matrix(  # import scipy.sparse as sp
        (data, (row, col)),
        shape=(args.hash_size, corpus_size))
    count_matrix.sum_duplicates()
    return count_matrix, (DOC2IDX, doc_ids)
def configurate_server(server, tfidf_path):
    """Attach a non-strict TF-IDF ranker to ``server.handler_params``.

    Args:
        server: object whose ``handler_params`` attribute is replaced by a
            one-entry dict ``{"ranker": <ranker>}``.
        tfidf_path: passed as ``tfidf_path`` to the 'tfidf' ranker class.
    """
    ranker_cls = get_class("tfidf")
    ranker = ranker_cls(tfidf_path=tfidf_path, strict=False)
    server.handler_params = {"ranker": ranker}