def gen_query(question_):
    normalized = utils.normalize(question_)
    tokenizer = tokenizers.get_class('simple')()
    tokens = tokenizer.tokenize(normalized)
    words = tokens.ngrams(n=1, uncased=True, filter_fn=utils.filter_ngram)
    query_ = ' '.join(words)
    return query_
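# A minimal usage sketch for gen_query. The question string is hypothetical;
# utils and tokenizers are assumed to be DrQA's retriever helpers, so the
# result is the filtered, uncased unigrams joined by single spaces.
query = gen_query('Who wrote the play Hamlet?')
print(query)  # e.g. "wrote play hamlet" after stopword/punctuation filtering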
def __init__(self, args, lines, freqs=None, strict=True):
    """
    Args:
        args: configuration carrying tokenizer, hash_size, and ngram settings
        lines: in-memory documents to index
        freqs: precomputed word-document frequencies (computed if None)
        strict: fail on empty queries or continue (and return empty result)
    """
    # Build the index in memory instead of loading a saved model from disk
    logger.info('Counting words...')
    count_matrix, doc_dict = get_count_matrix(
        args, 'memory', {'lines': lines}
    )
    logger.info('Making tfidf vectors...')
    tfidf = get_tfidf_matrix(count_matrix)
    if freqs is None:
        logger.info('Getting word-doc frequencies...')
        freqs = get_doc_freqs(count_matrix)
    metadata = {
        'doc_freqs': freqs,
        'tokenizer': args.tokenizer,
        'hash_size': args.hash_size,
        'ngram': args.ngram,
        'doc_dict': doc_dict
    }
    self.doc_mat = tfidf
    self.ngrams = metadata['ngram']
    self.hash_size = metadata['hash_size']
    self.tokenizer = tokenizers.get_class(metadata['tokenizer'])()
    self.doc_freqs = metadata['doc_freqs'].squeeze()
    self.doc_dict = metadata['doc_dict']
    self.num_docs = len(self.doc_dict[0])
    self.strict = strict
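# get_tfidf_matrix and get_doc_freqs are referenced above but not shown.
# For reference, DrQA's stock versions look roughly like this (a sketch;
# the in-memory variant above may differ):
import numpy as np
import scipy.sparse as sp


def get_doc_freqs(cnts):
    """Return word --> # of docs it appears in."""
    binary = (cnts > 0).astype(int)
    return np.array(binary.sum(1)).squeeze()


def get_tfidf_matrix(cnts):
    """Convert a word-count matrix into tfidf: log-tf times smoothed idf."""
    Ns = get_doc_freqs(cnts)
    idfs = np.log((cnts.shape[1] - Ns + 0.5) / (Ns + 0.5))
    idfs[idfs < 0] = 0
    tfs = cnts.log1p()
    return sp.diags(idfs, 0).dot(tfs)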
def get_count_matrix(args, db, db_opts):
    """Form a sparse word to document count matrix (inverted index).

    M[i, j] = # times word i appears in document j.
    """
    # Map doc_ids to indexes
    global DOC2IDX
    db_class = retriever.get_class(db)
    with db_class(**db_opts) as doc_db:
        doc_ids = doc_db.get_doc_ids()
    DOC2IDX = {doc_id: i for i, doc_id in enumerate(doc_ids)}

    # Setup tokenizer/db state (single process; counts are computed serially)
    tok_class = tokenizers.get_class(args.tokenizer)
    init(tok_class, db_class, db_opts)

    # Compute the count matrix in steps (to keep in memory)
    logger.info('Mapping...')
    row, col, data = [], [], []
    step = max(int(len(doc_ids) / 10), 1)
    batches = [doc_ids[i:i + step] for i in range(0, len(doc_ids), step)]
    _count = partial(count, args.ngram, args.hash_size)
    for i, batch in enumerate(batches):
        logger.info('-' * 25 + 'Batch %d/%d' % (i + 1, len(batches)) + '-' * 25)
        for b_row, b_col, b_data in map(_count, batch):
            row.extend(b_row)
            col.extend(b_col)
            data.extend(b_data)

    logger.info('Creating sparse matrix...')
    count_matrix = sp.csr_matrix(
        (data, (row, col)), shape=(args.hash_size, len(doc_ids))
    )
    count_matrix.sum_duplicates()
    return count_matrix, (DOC2IDX, doc_ids)
def get_count_matrix(args, db, db_opts):
    """Form a sparse word to document count matrix (inverted index).

    M[i, j] = # times word i appears in document j.
    """
    # Map doc_ids to indexes
    global DOC2IDX
    db_class = retriever.get_class(db)
    with db_class(**db_opts) as doc_db:
        # Pull document ids from Elasticsearch via the scroll API rather than
        # from the doc db itself.
        res = es.search(
            index="htts", doc_type="htts",
            body={"size": 500, "query": {"match_all": {}}},
            scroll='10m'
        )
        scroll_id = res['_scroll_id']
        doc_ids = []
        for ref in scrollr(es, scroll_id, extract_references):
            doc_ids.append(ref)
    DOC2IDX = {doc_id: i for i, doc_id in enumerate(doc_ids)}

    # Setup worker pool
    tok_class = tokenizers.get_class(args.tokenizer)
    workers = ProcessPool(
        args.num_workers,
        initializer=init,
        initargs=(tok_class, db_class, db_opts)
    )

    # Compute the count matrix in steps (to keep in memory)
    logger.info('Mapping...')
    row, col, data = [], [], []
    step = max(int(len(doc_ids) / 10), 1)
    batches = [doc_ids[i:i + step] for i in range(0, len(doc_ids), step)]
    _count = partial(count, args.ngram, args.hash_size)
    for i, batch in enumerate(batches):
        logger.info('-' * 25 + 'Batch %d/%d' % (i + 1, len(batches)) + '-' * 25)
        for b_row, b_col, b_data in workers.imap_unordered(_count, batch):
            row.extend(b_row)
            col.extend(b_col)
            data.extend(b_data)
    workers.close()
    workers.join()

    logger.info('Creating sparse matrix...')
    count_matrix = sp.csr_matrix(
        (data, (row, col)), shape=(args.hash_size, len(doc_ids))
    )
    count_matrix.sum_duplicates()
    return count_matrix, (DOC2IDX, doc_ids)
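# scrollr and extract_references are not defined in this snippet. A plausible
# sketch, assuming the standard elasticsearch-py scroll API and a hypothetical
# 'documentId' field in each hit's _source:
def extract_references(hit):
    return hit['_source']['documentId']


def scrollr(es_client, scroll_id, extract_fn, scroll='10m'):
    """Yield one extracted value per hit across the remaining scroll pages."""
    while True:
        page = es_client.scroll(scroll_id=scroll_id, scroll=scroll)
        hits = page['hits']['hits']
        if not hits:
            break
        for hit in hits:
            yield extract_fn(hit)
        scroll_id = page['_scroll_id']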
def process_dataset(data, tokenizer, workers=None):
    """Iterate processing (tokenize, parse, etc) dataset multithreaded."""
    print("tokenizing questions ...")
    tokenizer_class = tokenizers.get_class(tokenizer)
    make_pool = partial(Pool, workers, initializer=init)
    workers = make_pool(initargs=(tokenizer_class, {'annotators': {'lemma'}}))
    q_tokens = workers.map(tokenize, data['questions'])
    workers.close()
    workers.join()

    print("tokenizing contexts ...")
    workers = make_pool(
        initargs=(tokenizer_class, {'annotators': {'lemma', 'pos', 'ner'}})
    )
    c_tokens = workers.map(tokenize, data['contexts'])
    workers.close()
    workers.join()

    # Label each example 1 if the first answer string occurs verbatim in its
    # context, else 0.
    labels = []
    for p, a in zip(data['contexts'], data['answers']):
        a = a[0]['text']
        labels.append(1 if a in p else 0)

    for idx in range(len(data['qids'])):
        question = q_tokens[idx]['words']
        qlemma = q_tokens[idx]['lemma']
        document = c_tokens[data['qid2cid'][idx]]['words']
        offsets = c_tokens[data['qid2cid'][idx]]['offsets']
        lemma = c_tokens[data['qid2cid'][idx]]['lemma']
        pos = c_tokens[data['qid2cid'][idx]]['pos']
        ner = c_tokens[data['qid2cid'][idx]]['ner']
        ans_tokens = []
        if len(data['answers']) > 0:
            for ans in data['answers'][idx]:
                found = find_answer(offsets, ans['answer_start'],
                                    ans['answer_start'] + len(ans['text']))
                if found:
                    ans_tokens.append(found)
        yield {
            'id': data['qids'][idx],
            'question': question,
            'document': document,
            'offsets': offsets,
            'answers': ans_tokens,
            'qlemma': qlemma,
            'lemma': lemma,
            'pos': pos,
            'ner': ner,
            'label': labels[idx],
        }
def __init__(self, index_path=None):
    self.question = None
    self.index_path = index_path or DEFAULTS['lucene_index']
    self.tokenizer = tokenizers.get_class('simple')()
    self.env = lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    directory = SimpleFSDirectory(Paths.get(self.index_path))
    self.analyzer = StandardAnalyzer()
    # self.query_parser = MultiFieldQueryParser(["title", "text"], self.analyzer)
    self.searcher = IndexSearcher(DirectoryReader.open(directory))
def process_dataset(data, tokenizer, workers=None):
    """Iterate processing (tokenize, parse, etc) dataset multithreaded."""
    tokenizer_class = tokenizers.get_class(tokenizer)
    init(tokenizer_class, {'annotators': {'lemma'}})

    print("tokenizing questions ...")
    make_pool = partial(Pool, workers, initializer=init)
    workers1 = make_pool(initargs=(tokenizer_class, {'annotators': {'lemma'}}))
    q_tokens = workers1.map(tokenize, data['questions'])
    workers1.close()
    workers1.join()

    print("tokenizing contexts ...")
    workers2 = make_pool(initargs=(tokenizer_class, {
        'annotators': {'lemma', 'pos', 'ner'}
    }))
    c_tokens = workers2.map(tokenize, data['contexts'])
    workers2.close()
    workers2.join()

    assert len(q_tokens) == len(c_tokens)
    for idx in range(len(q_tokens)):
        if q_tokens[idx] is None or c_tokens[data['qid2cid'][idx]] is None:
            continue
        normal_question = q_tokens[idx]['normal_text']
        normal_context = c_tokens[data['qid2cid'][idx]]['normal_text']
        question = q_tokens[idx]['words']
        qlemma = q_tokens[idx]['lemma']
        document = c_tokens[data['qid2cid'][idx]]['words']
        offsets = c_tokens[data['qid2cid'][idx]]['offsets']
        lemma = c_tokens[data['qid2cid'][idx]]['lemma']
        pos = c_tokens[data['qid2cid'][idx]]['pos']
        ner = c_tokens[data['qid2cid'][idx]]['ner']

        ans_tokens = []
        ans = data['answers'][idx]  # the answer text
        ans_start = data['contexts'][idx].find(ans)
        ans_end = ans_start + len(ans)
        found = find_answer(offsets, ans_start, ans_end)
        if found:
            ans_tokens.append(found)
        yield {
            'id': data['qids'][idx],
            'question': question,
            'document': document,
            'offsets': offsets,
            'answers': ans_tokens,
            'qlemma': qlemma,
            'lemma': lemma,
            'pos': pos,
            'ner': ner,
            'normal_question': normal_question,
            'normal_context': normal_context,
            'text_answer': ans,
        }
def process_dataset(data, tokenizer, workers=None):
    """Iterate processing (tokenize, parse, etc) dataset multithreaded."""
    tokenizer_class = tokenizers.get_class(tokenizer)
    make_pool = partial(Pool, workers, initializer=init)
    try:
        workers = make_pool(initargs=(tokenizer_class, {
            'annotators': {'lemma'}
        }))
        q_tokens = workers.map(tokenize, data['questions'])
        workers.close()
        workers.join()
    except Exception as e:
        print(e)

    try:
        workers = make_pool(initargs=(tokenizer_class, {
            'annotators': {'lemma', 'pos', 'ner'}
        }))
        c_tokens = workers.map(tokenize, data['contexts'])
        workers.close()
        workers.join()
    except Exception as e:
        print(e)

    for idx in range(len(data['qids'])):
        question = q_tokens[idx]['words']
        qlemma = q_tokens[idx]['lemma']
        document = c_tokens[data['qid2cid'][idx]]['words']
        offsets = c_tokens[data['qid2cid'][idx]]['offsets']
        lemma = c_tokens[data['qid2cid'][idx]]['lemma']
        pos = c_tokens[data['qid2cid'][idx]]['pos']
        ner = c_tokens[data['qid2cid'][idx]]['ner']
        ans_tokens = []
        if len(data['answers']) > 0:
            for ans in data['answers'][idx]:
                found = find_answer(offsets, ans['answer_start'],
                                    ans['answer_start'] + len(ans['text']))
                if found:
                    ans_tokens.append(found)
        yield {
            'id': data['qids'][idx],
            'question': question,
            'document': document,
            'offsets': offsets,
            'answers': ans_tokens,
            'qlemma': qlemma,
            'lemma': lemma,
            'pos': pos,
            'ner': ner,
        }
def __init__(self, tfidf_path=None, strict=True):
    """
    Args:
        tfidf_path: path to saved model file
        strict: fail on empty queries or continue (and return empty result)
    """
    # Load from disk
    logger.info('Loading %s' % tfidf_path)
    matrix, metadata = utils.load_sparse_csr(tfidf_path)
    self.doc_mat = matrix
    self.ngrams = metadata['ngram']
    self.hash_size = metadata['hash_size']
    self.tokenizer = tokenizers.get_class(metadata['tokenizer'])()
    self.doc_freqs = metadata['doc_freqs'].squeeze()
    self.num_docs = self.doc_mat.shape[1] - 1
    self.strict = strict
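# Usage sketch for the disk-backed ranker. The path is a placeholder, the
# class name is assumed to be DrQA's TfidfDocRanker, and closest_docs is the
# query method that class exposes upstream:
ranker = TfidfDocRanker(tfidf_path='/path/to/saved-tfidf.npz', strict=False)
doc_names, doc_scores = ranker.closest_docs('who wrote hamlet', k=5)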
def live_count_matrix_t(args, cands):
    global PROCESS_TOK
    if PROCESS_TOK is None:
        PROCESS_TOK = tokenizers.get_class(args.tokenizer)()
    row, col, data = [], [], []
    for i, c in enumerate(cands):
        cur_row, cur_col, cur_data = count_text(args.ngram, args.hash_size, i, c)
        row += cur_row
        col += cur_col
        data += cur_data
    count_matrix = torch.sparse.FloatTensor(
        torch.LongTensor([row, col]),
        torch.FloatTensor(data),
        torch.Size([args.hash_size, len(cands)])
    ).coalesce()
    return count_matrix
def __init__(self, tokenizer='', ranker_config=None, db_config=None, n_doc=5,
             num_workers=1, convert_bs=48, ngram=2, distant=False, small=False):
    self.convert_bs = convert_bs
    self.small = small
    self.n_doc = n_doc
    self.tok_class = tokenizers.get_class(
        tokenizer) if tokenizer else DEFAULT_CONVERT_CONFIG['tokenizer']
    self.annotators = set()
    self.tok_opts = {'annotators': self.annotators}
    self.ngram = ngram
    self.tokenizer = self.tok_class(**self.tok_opts)

    self.ranker_config = ranker_config if ranker_config else {}
    self.ranker_class = self.ranker_config.get(
        'class', DEFAULT_CONVERT_CONFIG['ranker'])
    self.ranker_opt = self.ranker_config.get('ret_opt', {})
    logger.info('Loading ranker {}'.format(self.ranker_class.__name__))
    self.ranker = self.ranker_class(**self.ranker_opt)

    if hasattr(self.ranker, 'es'):
        # An Elasticsearch-backed ranker doubles as the document db.
        self.db_config = ranker_config
        self.db_class = self.ranker_class
        self.db_opts = self.ranker_opt
    else:
        self.db_config = db_config or {}
        self.db_class = self.db_config.get('class', DEFAULT_CONVERT_CONFIG['db'])
        self.db_opts = self.db_config.get('db_opt', {})

    logger.info('Initializing tokenizers and document retrievers...')
    self.num_workers = num_workers
    self.processes = ProcessPool(num_workers,
                                 initializer=init,
                                 initargs=(self.tok_class, self.tok_opts,
                                           self.db_class, self.db_opts))
    self.distant = distant
def live_count_matrix(args, cands):
    global PROCESS_TOK
    if PROCESS_TOK is None:
        PROCESS_TOK = tokenizers.get_class(args.tokenizer)()
    row, col, data = [], [], []
    for i, c in enumerate(cands):
        cur_row, cur_col, cur_data = count_text(args.ngram, args.hash_size, i, c)
        row += cur_row
        col += cur_col
        data += cur_data
    data, row, col = truncate(data, row, col)
    count_matrix = sp.csr_matrix((data, (row, col)),
                                 shape=(args.hash_size, len(cands)))
    count_matrix.sum_duplicates()
    return count_matrix
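# count_text is called above but not defined here. A sketch of what it
# plausibly does, mirroring DrQA's count() but taking raw text instead of a
# doc id; hash_ngram stands in for DrQA's murmurhash-based feature hashing:
from collections import Counter

from sklearn.utils import murmurhash3_32


def hash_ngram(gram, num_buckets):
    # Feature hashing: map each n-gram into a fixed-size bucket space.
    return murmurhash3_32(gram, positive=True) % num_buckets


def count_text(ngram, hash_size, col_idx, text):
    tokens = PROCESS_TOK.tokenize(utils.normalize(text))
    grams = tokens.ngrams(n=ngram, uncased=True, filter_fn=utils.filter_ngram)
    counts = Counter(hash_ngram(g, hash_size) for g in grams)
    return list(counts.keys()), [col_idx] * len(counts), list(counts.values())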
def get_count_matrix(args, db, db_opts):
    """Form a sparse word to document count matrix (inverted index).

    M[i, j] = # times word i appears in document j.
    """
    # Map doc_ids to indexes
    global DOC2IDX
    db_class = retriever.get_class(db)
    with db_class(**db_opts) as doc_db:
        doc_ids = doc_db.get_doc_ids()
    DOC2IDX = {doc_id: i for i, doc_id in enumerate(doc_ids)}

    # Setup worker pool (multiprocessing)
    tok_class = tokenizers.get_class(args.tokenizer)
    workers = ProcessPool(args.num_workers,
                          initializer=init,
                          initargs=(tok_class, db_class, db_opts))

    # Compute the count matrix in steps (to keep in memory)
    logger.info('Mapping...')
    row, col, data = [], [], []
    step = max(int(len(doc_ids) / 10), 1)
    # Write the matrix in batches
    batches = [doc_ids[i:i + step] for i in range(0, len(doc_ids), step)]
    # functools.partial: normally a function must be called with all of its
    # required arguments, but sometimes some of them are known before the call.
    # partial binds those known arguments in advance so the function can later
    # be invoked with fewer arguments.
    _count = partial(count, args.ngram, args.hash_size)
    for i, batch in enumerate(batches):
        logger.info('-' * 25 + 'Batch %d/%d' % (i + 1, len(batches)) + '-' * 25)
        for b_row, b_col, b_data in workers.imap_unordered(_count, batch):
            row.extend(b_row)
            col.extend(b_col)
            data.extend(b_data)
    workers.close()
    workers.join()

    logger.info('Creating sparse matrix...')
    count_matrix = sp.csr_matrix((data, (row, col)),
                                 shape=(args.hash_size, len(doc_ids)))
    count_matrix.sum_duplicates()
    return count_matrix, (DOC2IDX, doc_ids)
def get_count_matrix(args, db, db_opts):
    """Form a sparse word to document count matrix (inverted index).

    M[i, j] = # times word i appears in document j.
    """
    # Map doc_ids to indexes
    global DOC2IDX
    db_class = retriever.get_class(db)
    with db_class(**db_opts) as doc_db:
        doc_ids = doc_db.get_doc_ids()
    DOC2IDX = {doc_id: i for i, doc_id in enumerate(doc_ids)}
    # 5075182
    logger.info('the number of docs is %s' % (len(DOC2IDX)))

    # Setup worker pool
    tok_class = tokenizers.get_class(args.tokenizer)
    workers = ProcessPool(args.num_workers,
                          initializer=init,
                          initargs=(tok_class, db_class, db_opts))

    # Compute the count matrix in steps (to keep in memory)
    logger.info('Mapping...')
    row, col, data = [], [], []
    step = max(int(len(doc_ids) / 10), 1)
    batches = [doc_ids[i:i + step] for i in range(0, len(doc_ids), step)]
    _count = partial(count, args.ngram, args.hash_size)
    for i, batch in enumerate(batches):
        logger.info('-' * 24 + 'Batch %d/%d' % (i + 1, len(batches)) + '-' * 24)
        k = 0
        for b_row, b_col, b_data in workers.imap_unordered(_count, batch):
            row.extend(b_row)
            col.extend(b_col)
            data.extend(b_data)
            k += 1
            if k % 10000 == 0:
                logger.info('processed %s docs in current batch...' % k)
    workers.close()
    workers.join()

    logger.info('Creating sparse matrix...')
    count_matrix = sp.csr_matrix((data, (row, col)),
                                 shape=(args.hash_size, len(doc_ids)))
    count_matrix.sum_duplicates()
    return count_matrix, (DOC2IDX, doc_ids)
def get_count_matrix(args, db_opts):
    """Form a sparse word to document count matrix (inverted index).

    M[i, j] = # times word i appears in document j.
    """
    global MAX_SZ
    with DocDB(**db_opts) as doc_db:
        doc_ids = doc_db.get_doc_ids()

    # Setup worker pool
    tok_class = tokenizers.get_class(args.tokenizer)
    workers = ProcessPool(
        args.num_workers,
        initializer=init,
        initargs=(tok_class, db_opts)
    )

    # Compute the count matrix in steps (to keep in memory)
    logger.info('Mapping...')
    row, col, data = [], [], []
    step = max(int(len(doc_ids) / 10), 1)
    batches = [doc_ids[i:i + step] for i in range(0, len(doc_ids), step)]
    _count = partial(count, args.ngram, args.hash_size)
    for i, batch in enumerate(batches):
        logger.info('-' * 25 + 'Batch %d/%d' % (i + 1, len(batches)) + '-' * 25)
        for b_row, b_col, b_data in workers.imap_unordered(_count, batch):
            row.extend(b_row)
            col.extend(b_col)
            data.extend(b_data)
            if len(data) > MAX_SZ:
                break
        if len(data) > MAX_SZ:
            logger.info('Reached max indexable size, breaking.')
            break
    workers.close()
    workers.join()

    logger.info('Creating sparse matrix...')
    data, row, col = truncate(data, row, col)
    count_matrix = sp.csr_matrix(
        (data, (row, col)), shape=(args.hash_size, len(doc_ids) + 1)
    )
    count_matrix.sum_duplicates()
    return count_matrix
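# truncate is referenced above (and in live_count_matrix) but not defined in
# this excerpt. It presumably caps the COO triplets at MAX_SZ entries so the
# index stays within a fixed memory budget; a minimal sketch:
def truncate(data, row, col):
    if len(data) > MAX_SZ:
        data = data[:MAX_SZ]
        row = row[:MAX_SZ]
        col = col[:MAX_SZ]
    return data, row, col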
def process_dataset(data, tokenizer, workers=None):
    """Iterate processing (tokenize, parse, etc) dataset multithreaded."""
    tokenizer_class = tokenizers.get_class(tokenizer)
    make_pool = partial(Pool, workers, initializer=init)

    workers = make_pool(initargs=(tokenizer_class, {'annotators': {'lemma'}}))
    q_tokens = workers.map(tokenize, data['questions'])
    workers.close()
    workers.join()

    workers = make_pool(
        initargs=(tokenizer_class, {'annotators': {'lemma', 'pos', 'ner'}})
    )
    c_tokens = workers.map(tokenize, data['contexts'])
    workers.close()
    workers.join()

    for idx in range(len(data['qids'])):
        question = q_tokens[idx]['words']
        qlemma = q_tokens[idx]['lemma']
        document = c_tokens[data['qid2cid'][idx]]['words']
        offsets = c_tokens[data['qid2cid'][idx]]['offsets']
        lemma = c_tokens[data['qid2cid'][idx]]['lemma']
        pos = c_tokens[data['qid2cid'][idx]]['pos']
        ner = c_tokens[data['qid2cid'][idx]]['ner']
        ans_tokens = []
        if len(data['answers']) > 0:
            for ans in data['answers'][idx]:
                found = find_answer(offsets, ans['answer_start'],
                                    ans['answer_start'] + len(ans['text']))
                if found:
                    ans_tokens.append(found)
        yield {
            'id': data['qids'][idx],
            'question': question,
            'document': document,
            'offsets': offsets,
            'answers': ans_tokens,
            'qlemma': qlemma,
            'lemma': lemma,
            'pos': pos,
            'ner': ner,
        }
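# find_answer is used throughout these preprocessing variants but not shown.
# DrQA's version matches the answer's character span against token offsets,
# roughly as follows (a sketch of the upstream helper):
def find_answer(offsets, begin_offset, end_offset):
    """Match token offsets with the char begin/end offsets of the answer."""
    start = [i for i, tok in enumerate(offsets) if tok[0] == begin_offset]
    end = [i for i, tok in enumerate(offsets) if tok[1] == end_offset]
    assert len(start) <= 1
    assert len(end) <= 1
    if len(start) == 1 and len(end) == 1:
        return start[0], end[0]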
def process_dataset(data, tokenizer, workers=None):
    """Iterate processing (tokenize, parse, etc) dataset multithreaded."""
    tokenizer_class = tokenizers.get_class(tokenizer)
    make_pool = partial(Pool, workers, initializer=init)

    workers = make_pool(initargs=(tokenizer_class, {"annotators": {"lemma"}}))
    q_tokens = workers.map(tokenize, data["questions"])
    workers.close()
    workers.join()

    workers = make_pool(initargs=(tokenizer_class, {
        "annotators": {"lemma", "pos", "ner"}
    }))
    c_tokens = workers.map(tokenize, data["contexts"])
    workers.close()
    workers.join()

    for idx in range(len(data["qids"])):
        question = q_tokens[idx]["words"]
        qlemma = q_tokens[idx]["lemma"]
        document = c_tokens[data["qid2cid"][idx]]["words"]
        offsets = c_tokens[data["qid2cid"][idx]]["offsets"]
        lemma = c_tokens[data["qid2cid"][idx]]["lemma"]
        pos = c_tokens[data["qid2cid"][idx]]["pos"]
        ner = c_tokens[data["qid2cid"][idx]]["ner"]
        ans_tokens = []
        if len(data["answers"]) > 0:
            for ans in data["answers"][idx]:
                found = find_answer(offsets, ans["answer_start"],
                                    ans["answer_start"] + len(ans["text"]))
                if found:
                    ans_tokens.append(found)
        yield {
            "id": data["qids"][idx],
            "question": question,
            "document": document,
            "offsets": offsets,
            "answers": ans_tokens,
            "qlemma": qlemma,
            "lemma": lemma,
            "pos": pos,
            "ner": ner,
        }
def get_count_matrix(args, db, db_opts):
    """Form a sparse word to document count matrix (inverted index).

    M[i, j] = # times word i appears in document j.
    """
    # Map doc_ids to indexes
    global DOC2IDX
    db_class = retriever.get_class(db)
    with db_class(**db_opts) as doc_db:
        doc_ids = doc_db.get_doc_ids()
    DOC2IDX = {doc_id: i for i, doc_id in enumerate(doc_ids)}

    # Setup worker pool
    tok_class = tokenizers.get_class(args.tokenizer)
    workers = ProcessPool(
        args.num_workers,
        initializer=init,
        initargs=(tok_class, db_class, db_opts)
    )

    # Compute the count matrix in steps (to keep in memory)
    logger.info('Mapping...')
    row, col, data = [], [], []
    step = max(int(len(doc_ids) / 10), 1)
    batches = [doc_ids[i:i + step] for i in range(0, len(doc_ids), step)]
    _count = partial(count, args.ngram, args.hash_size)
    for i, batch in enumerate(batches):
        logger.info('-' * 25 + 'Batch %d/%d' % (i + 1, len(batches)) + '-' * 25)
        for b_row, b_col, b_data in workers.imap_unordered(_count, batch):
            row.extend(b_row)
            col.extend(b_col)
            data.extend(b_data)
    workers.close()
    workers.join()

    logger.info('Creating sparse matrix...')
    count_matrix = sp.csr_matrix(
        (data, (row, col)), shape=(args.hash_size, len(doc_ids))
    )
    count_matrix.sum_duplicates()
    return count_matrix, (DOC2IDX, doc_ids)
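# Invocation sketch for get_count_matrix. The argument values are placeholders;
# 'sqlite' and the db_path option mirror DrQA's build_tfidf.py defaults:
import math
from types import SimpleNamespace

args = SimpleNamespace(tokenizer='simple', ngram=2,
                       hash_size=int(math.pow(2, 24)), num_workers=4)
count_matrix, (doc2idx, doc_ids) = get_count_matrix(
    args, 'sqlite', {'db_path': '/path/to/docs.db'}
)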
def get_count_matrix_t(args, db_opts):
    """Form a sparse word to document count matrix (inverted index, torch ver).

    M[i, j] = # times word i appears in document j.
    """
    global MAX_SZ
    with DocDB(**db_opts) as doc_db:
        doc_ids = doc_db.get_doc_ids()

    # Setup worker pool
    tok_class = tokenizers.get_class(args.tokenizer)
    workers = ProcessPool(
        args.num_workers,
        initializer=init,
        initargs=(tok_class, db_opts)
    )

    # Compute the count matrix in steps (to keep in memory)
    logger.info('Mapping...')
    row, col, data = [], [], []
    step = max(int(len(doc_ids) / 10), 1)
    batches = [doc_ids[i:i + step] for i in range(0, len(doc_ids), step)]
    _count = partial(count, args.ngram, args.hash_size)
    for i, batch in enumerate(batches):
        logger.info('-' * 25 + 'Batch %d/%d' % (i + 1, len(batches)) + '-' * 25)
        for b_row, b_col, b_data in workers.imap_unordered(_count, batch):
            row.extend(b_row)
            col.extend(b_col)
            data.extend(b_data)
    workers.close()
    workers.join()

    logger.info('Creating sparse matrix...')
    count_matrix = torch.sparse.FloatTensor(
        torch.LongTensor([row, col]),
        torch.FloatTensor(data),
        torch.Size([args.hash_size, len(doc_ids) + 1])
    ).coalesce()
    return count_matrix
def __init__(self, lucene_path=None, index_path=None, sim_function='lm'):
    self.question = None
    self.lucene_path = lucene_path or DEFAULTS['lucene_path']
    self.index_path = index_path or DEFAULTS['lucene_index']
    self.sim_func = sim_function
    self.tokenizer = tokenizers.get_class('simple')()
def get_count_matrix(args, db, db_opts):
    """Form a sparse word to document count matrix (inverted index).

    M[i, j] = # times word i appears in document j.
    """
    # Map doc_ids to indexes
    global DOC2IDX
    db_class = retriever.get_class(db)  # get the specified db class used to fetch documents
    with db_class(**db_opts) as doc_db:  # context management
        doc_ids = doc_db.get_doc_ids()  # get all doc ids
    # enumerate(list) pairs each value with its index, e.g.
    #   list = ['a', 'b', 'c'] --> enumerate(list) yields (0, 'a'), (1, 'b'), (2, 'c'),
    # so iterating it returns two values: the index (from 0) and the original value.
    DOC2IDX = {doc_id: i for i, doc_id in enumerate(doc_ids)}  # map each doc id to its index

    # Setup worker pool
    tok_class = tokenizers.get_class(args.tokenizer)
    workers = ProcessPool(args.num_workers,
                          initializer=init,
                          initargs=(tok_class, db_class, db_opts))

    # Compute the count matrix in steps (to keep in memory)
    logger.info('Mapping...')
    row, col, data = [], [], []
    step = max(int(len(doc_ids) / 10), 1)  # size of each step
    batches = [doc_ids[i:i + step]
               for i in range(0, len(doc_ids), step)]  # batch range of each step
    # partial() redefines the function signature: it wraps a function object
    # with some default arguments bound in advance and returns a callable that
    # can later be invoked with fewer arguments.
    # See: http://www.wklken.me/posts/2013/08/18/python-extra-functools.html
    _count = partial(count, args.ngram, args.hash_size)
    for i, batch in enumerate(batches):
        logger.info('-' * 25 + 'Batch %d/%d' % (i + 1, len(batches)) + '-' * 25)
        for b_row, b_col, b_data in workers.imap_unordered(_count, batch):
            # the three lists grow with every step
            row.extend(b_row)    # hash(n-gram of a doc token)
            col.extend(b_col)    # index of the doc
            data.extend(b_data)  # count of the n-gram in the doc
    workers.close()
    workers.join()

    logger.info('Creating sparse matrix...')
    # Layout of the resulting sparse matrix:
    #   rows: hash(n-gram), cols: doc index, entry: count of that hash in that doc
    count_matrix = sp.csr_matrix((data, (row, col)),
                                 shape=(args.hash_size, len(doc_ids)))
    # Merge duplicate entries in the matrix by summing them
    count_matrix.sum_duplicates()
    # Return the matrix, plus the doc-id mappings
    return count_matrix, (DOC2IDX, doc_ids)
        # Make sure the regex compiles
        if args.regex:
            try:
                re.compile(answer[0])
            except BaseException:
                logger.warning('Regex failed to compile: %s' % answer)
                continue
        questions.append(question)
        answers.append(answer)

    # Get classes
    ranker_class = retriever.get_class(args.ranker)
    db_class = retriever.get_class(args.db)
    tokenizer_class = tokenizers.get_class(args.tokenizer)

    # Form options
    search_keys = ('regex', 'match_threshold', 'char_max', 'char_min',
                   'window_sz')
    opts = {
        'ranker_class': ranker_class,
        'tokenizer_class': tokenizer_class,
        'db_class': db_class,
        'search': {k: vars(args)[k] for k in search_keys},
    }
    opts.update(vars(args))

    # Process!
    outname = os.path.splitext(args.data_name)[0]
        # Make sure the regex compiles
        if args.regex:
            try:
                re.compile(answer[0])
            except BaseException:
                logger.warning('Regex failed to compile: %s' % answer)
                continue
        questions.append(question)
        answers.append(answer)

    # Get classes
    ranker_class = retriever.get_class(args.ranker)
    db_class = retriever.get_class(args.db)
    tokenizer_class = tokenizers.get_class(args.tokenizer)

    # Form options
    search_keys = ('regex', 'match_threshold', 'char_max', 'char_min',
                   'window_sz')
    opts = {
        'ranker_class': ranker_class,
        'tokenizer_class': tokenizer_class,
        'db_class': db_class,
        'search': {k: vars(args)[k] for k in search_keys},
    }
    opts.update(vars(args))

    # Process!
    outname = os.path.splitext(args.data_name)[0]
    outfile = os.path.join(args.out_dir, outname)
        # Make sure the regex compiles
        if args.regex:
            try:
                re.compile(answer[0])
            except BaseException:
                logger.warning("Regex failed to compile: %s" % answer)
                continue
        questions.append(question)
        answers.append(answer)

    # Get classes
    ranker_class = retriever.get_class(args.ranker)
    db_class = retriever.get_class(args.db)
    tokenizer_class = tokenizers.get_class(args.tokenizer)

    # Form options
    search_keys = ("regex", "match_threshold", "char_max", "char_min",
                   "window_sz")
    opts = {
        "ranker_class": ranker_class,
        "tokenizer_class": tokenizer_class,
        "db_class": db_class,
        "search": {k: vars(args)[k] for k in search_keys},
    }
    opts.update(vars(args))

    # Process!
    outname = os.path.splitext(args.data_name)[0]
def process_dataset(data, tokenizer, workers=None):
    """Iterate processing (tokenize, parse, etc) dataset multithreaded."""
    tokenizer_class = tokenizers.get_class(tokenizer)
    init(tokenizer_class, {'annotators': {'lemma'}})

    print("tokenizing questions ...")
    make_pool = partial(Pool, workers, initializer=init)
    workers1 = make_pool(initargs=(tokenizer_class, {'annotators': {'lemma'}}))
    q_tokens = workers1.map(tokenize, data['questions'])
    workers1.close()
    workers1.join()

    print("tokenizing contexts ...")
    workers2 = make_pool(
        initargs=(tokenizer_class, {'annotators': {'lemma', 'pos', 'ner'}})
    )
    c_tokens = workers2.map(tokenize, data['contexts'])
    workers2.close()
    workers2.join()

    assert len(q_tokens) == len(c_tokens)
    for idx in range(len(q_tokens)):  # for each question
        if q_tokens[idx] is None or c_tokens[data['qid2cid'][idx]] is None:
            continue
        normal_question = q_tokens[idx]['normal_text']
        normal_context = c_tokens[data['qid2cid'][idx]]['normal_text']
        question = q_tokens[idx]['words']
        qlemma = q_tokens[idx]['lemma']
        document = c_tokens[data['qid2cid'][idx]]['words']
        offsets = c_tokens[data['qid2cid'][idx]]['offsets']
        lemma = c_tokens[data['qid2cid'][idx]]['lemma']
        pos = c_tokens[data['qid2cid'][idx]]['pos']
        ner = c_tokens[data['qid2cid'][idx]]['ner']

        ans_tokens = []
        ans = data['answers'][idx]  # the answer text
        # Collect the character span of every occurrence of the answer in the
        # normalized context.
        ans_start = -2
        ans_end = 0
        ans_offsets = []
        while ans_start != -1:
            ans_start = normal_context.find(ans, ans_end)
            ans_end = ans_start + len(ans)
            if ans_start == -1:
                break
            ans_offsets.append((ans_start, ans_end))
        found = find_answer(offsets, ans_offsets)
        if found:
            ans_tokens.append(found)
            yield {
                'id': data['qids'][idx],
                'question': question,
                'document': document,
                'offsets': offsets,
                'answers': ans_tokens,
                'qlemma': qlemma,
                'lemma': lemma,
                'pos': pos,
                'ner': ner,
                'normal_question': normal_question,
                'normal_context': normal_context,
                'text_answer': ans,
            }
        else:
            print("Answer not found:----------------"
                  "\n{}\n{} {}({},{})".format(normal_question, ans,
                                              normal_context[ans_start:ans_end],
                                              ans_start, ans_end))
            print(', '.join(['{}({},{})'.format(normal_context[b:e], b, e)
                             for (b, e) in offsets]))
    return PROCESS_DB.get_doc_text(doc_id)


def tokenize_text(text):
    global PROCESS_TOK
    return PROCESS_TOK.tokenize(text)


def ids(toks, word_dict):
    return [str(word_dict.get(tok, 0)) for tok in toks]


processes = ProcessPool(
    30,
    initializer=init,
    initargs=(tokenizers.get_class('spacy'), {}, DocDB, {
        'db_path': "/users/sulixin/relate_work/DrQA/DrQA/data/wikipedia/docs_para.db"
    }))


def load_qa(fi):
    infp_qa = open(fi)
    questions = []
    answers = []
    for l in infp_qa:
        if not l.strip():
            continue
        data = json.loads(l)
        questions.append(data['question'])
        answers.append(data['answer'])
    answer = data['answer']
    questions.append(question)
    answers.append(answer)

# get the closest docs for each question.
logger.info('Initializing ranker...')
ranker = retriever.get_class('tfidf')(tfidf_path=args.model)

logger.info('Ranking...')
closest_docs = ranker.batch_closest_docs(
    questions, k=args.n_docs, num_workers=args.num_workers
)
answers_docs = zip(answers, closest_docs)

# define processes
tok_class = tokenizers.get_class(args.tokenizer)
tok_opts = {}
db_class = retriever.DocDB
db_opts = {'db_path': args.doc_db}
processes = ProcessPool(
    processes=args.num_workers,
    initializer=init,
    initargs=(tok_class, tok_opts, db_class, db_opts)
)

# compute the scores for each pair, and print the statistics
logger.info('Retrieving and computing scores...')
get_score_partial = partial(get_score, match=args.match)
scores = processes.map(get_score_partial, answers_docs)

filename = os.path.basename(args.dataset)
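# get_score is mapped over (answer, retrieved_docs) pairs above but not
# defined in this excerpt. In DrQA's retriever eval script it checks whether
# any of the top-k docs contains the answer, roughly:
def get_score(answer_doc, match):
    """Search through all the top docs to see if they have the answer."""
    answer, (doc_ids, doc_scores) = answer_doc
    for doc_id in doc_ids:
        if has_answer(answer, doc_id, match):
            return 1
    return 0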
def prova(risposta, doc_ids, match):
    for doc_id in doc_ids:
        if has_answer(risposta, doc_id, match):
            return 1
    return 0


risultato = prova(risposta=risposta, doc_ids=doc_titles, match=match)

for doc_id in doc_titles:
    if has_answer(risposta, doc_id, match):
        print(1)

####################################################

tok_class = tokenizers.get_class(tokenizer)
tok_opts = {}
db_class = retriever.DocDB
db_opts = {'db_path': doc_db}
processes = ProcessPool(processes=num_workers,
                        initializer=init,
                        initargs=(tok_class, tok_opts, db_class, db_opts))

# compute the scores for each pair, and print the statistics
get_score_partial = partial(get_score, match=match)
scores = processes.map(get_score_partial, answers_docs)

filename = os.path.basename(dataset)
stats = ("\n" + "-" * 50 + "\n" +
         "{filename}\n" +
         "Examples:\t\t\t{total}\n" +
         "Matches in top {k}:\t\t{m}\n" +
         "Match % in top {k}:\t\t{p:2.2f}\n" +
         "Total time:\t\t\t{t:2.4f} (s)\n").format(
    question = data['question']
    answer = data['answer']
    questions.append(question)
    answers.append(answer)

# get the closest docs for each question.
logger.info('Initializing ranker...')
ranker = retriever.get_class('tfidf')(tfidf_path=args.model)

logger.info('Ranking...')
closest_docs = ranker.batch_closest_docs(questions,
                                         k=args.n_docs,
                                         num_workers=args.num_workers)
ranker = []

tok_class = tokenizers.get_class(args.tokenizer)
tok_opts = {}
db_class = retriever.DocDB
db_opts = {'db_path': args.doc_db}

PROCESS_TOK = tok_class(**tok_opts)
Finalize(PROCESS_TOK, PROCESS_TOK.shutdown, exitpriority=100)
PROCESS_DB = db_class(**db_opts)
Finalize(PROCESS_DB, PROCESS_DB.close, exitpriority=100)

# answers_docs = rerankDocs(questions, answers, closest_docs, PROCESS_DB)
answers_docs = zip(answers, closest_docs, questions)

logger.info('Retrieving texts and computing scores...')
has_answers = []
tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
def setUp(self):
    reader_model = 'data/reader/multitask.mdl'
    reader = DocReader.load(reader_model, normalize=False)
    tok_class = tokenizers.get_class('simple')
    init_tokenizer(tok_class)
    self.selector_ = ChoiceSelector(reader.word_dict,
                                    reader.network.embedding)
def process_dataset(data, tokenizer, workers=None):
    """Iterate processing (tokenize, parse, etc) dataset multithreaded."""
    tokenizer_class = tokenizers.get_class(tokenizer)
    make_pool = partial(Pool, workers, initializer=init)

    workers = make_pool(initargs=(tokenizer_class, {
        'annotators': {'lemma'},
        'classpath': "/home/bhargavi/robust_nlp/invariance/DrQA/data/corenlp/*"
    }))
    q_tokens = workers.map(tokenize, data['questions'])
    workers.close()
    workers.join()

    workers = make_pool(initargs=(tokenizer_class, {
        'annotators': {'lemma', 'pos', 'ner'},
        'classpath': "/home/bhargavi/robust_nlp/invariance/DrQA/data/corenlp/*"
    }))
    c_tokens = workers.map(tokenize, data['contexts'])
    workers.close()
    workers.join()

    for idx in range(len(data['qids'])):
        question = q_tokens[idx]['words']
        qlemma = q_tokens[idx]['lemma']
        document = c_tokens[data['qid2cid'][idx]]['words']
        offsets = c_tokens[data['qid2cid'][idx]]['offsets']
        lemma = c_tokens[data['qid2cid'][idx]]['lemma']
        pos = c_tokens[data['qid2cid'][idx]]['pos']
        ner = c_tokens[data['qid2cid'][idx]]['ner']
        context_sentence_boundaries = \
            c_tokens[data['qid2cid'][idx]]['sentence_boundaries']

        ans_tokens = []
        if len(data['answers']) > 0:
            for ans in data['answers'][idx]:
                found = find_answer(offsets, ans['answer_start'],
                                    ans['answer_start'] + len(ans['text']))
                if found:
                    ans_tokens.append(found)

        # Map answer token spans to the sentences that contain them.
        ans_tokens_list = list(set(ans_tokens))
        sentences = []
        gold_sentence_ids = []
        for s_idx, tup in enumerate(context_sentence_boundaries):
            for a in ans_tokens_list:
                if a[0] >= tup[0] and a[1] < tup[1]:
                    # answer lies entirely inside this sentence
                    gold_sentence_ids.append(s_idx)
                elif a[0] >= tup[0] and a[0] < tup[1] and a[1] >= tup[1]:
                    # answer starts here but spills into the next sentence
                    gold_sentence_ids.append(s_idx)
                    gold_sentence_ids.append(s_idx + 1)
            sentence = document[tup[0]:tup[1]]
            sentences.append(sentence)
        gold_sentence_ids_set = list(set(gold_sentence_ids))
        if len(ans_tokens_list) == 0:
            print("No golden sentence available")

        yield {
            'id': data['qids'][idx],
            'question': question,
            'document': document,
            'offsets': offsets,
            'answers': ans_tokens,
            'qlemma': qlemma,
            'lemma': lemma,
            'pos': pos,
            'ner': ner,
            'sentences': sentences,
            'gold_sentence_ids': gold_sentence_ids_set,
        }
def process_dataset(data, tokenizer, workers=None):
    """Iterate processing (tokenize, parse, etc) dataset multithreaded."""
    tokenizer_class = tokenizers.get_class(tokenizer)
    make_pool = partial(Pool, workers, initializer=init)

    workers = make_pool(initargs=(tokenizer_class, {'annotators': {'lemma'}}))
    q_tokens = workers.map(tokenize, data['questions'])
    workers.close()
    workers.join()

    workers = make_pool(
        initargs=(tokenizer_class, {'annotators': {'lemma', 'pos', 'ner'}})
    )
    c_tokens = workers.map(tokenize, data['contexts'])
    workers.close()
    workers.join()

    for idx in range(len(data['qids'])):
        question = q_tokens[idx]['words']
        qlemma = q_tokens[idx]['lemma']
        document = c_tokens[data['qid2cid'][idx]]['words']
        offsets = c_tokens[data['qid2cid'][idx]]['offsets']
        sent_offsets = c_tokens[data['qid2cid'][idx]]['sent_offsets']
        lemma = c_tokens[data['qid2cid'][idx]]['lemma']
        pos = c_tokens[data['qid2cid'][idx]]['pos']
        ner = c_tokens[data['qid2cid'][idx]]['ner']
        ans_tokens = []

        # Deduplicate repeated sentence offsets (one entry per sentence).
        sent_offsets_distict = [sent_offsets[0]]
        for sent_offset in sent_offsets:
            if sent_offsets_distict[-1][0] < sent_offset[0]:
                sent_offsets_distict.append(sent_offset)
        sent_offsets_distict_tensor = torch.from_numpy(
            np.asarray(sent_offsets_distict))

        # Convert per-token sentence offsets into (start, end) token index
        # ranges, one per sentence.
        sent_index_offsets = []
        sent_index_offset_cur = [0]
        for i in range(len(sent_offsets) - 1):
            if sent_offsets[i] != sent_offsets[i + 1]:
                sent_index_offset_cur.append(i)
                sent_index_offsets.append(sent_index_offset_cur)
                sent_index_offset_cur = [i + 1]
        sent_index_offset_cur.append(len(sent_offsets) - 1)
        sent_index_offsets.append(sent_index_offset_cur)

        if len(data['answers']) > 0:
            for ans in data['answers'][idx]:
                # Sentence-level supervision: locate the sentence containing
                # the answer start instead of the exact token span.
                found = find_answer_sentence(sent_offsets_distict_tensor,
                                             ans['answer_start'])
                if found:
                    ans_tokens.append(found)
        yield {
            'id': data['qids'][idx],
            'question': question,
            'document': document,
            'sent_offsets': sent_index_offsets,  # sentence-level token ranges
            'offsets': offsets,
            'answers': ans_tokens,
            'qlemma': qlemma,
            'lemma': lemma,
            'pos': pos,
            'ner': ner,
        }
def get_count_matrix(args, db, db_opts):
    """Form a sparse word to document count matrix (inverted index).

    M[i, j] = # times word i appears in document j.
    """
    # Map doc_ids to indexes
    global DOC2IDX
    db_class = retriever.get_class(db)  # drqa/retriever/__init__.py --> doc_db.py
    with db_class(**db_opts) as doc_db:
        doc_ids = doc_db.get_doc_ids()  # Fetch all ids of docs stored in the db.
    DOC2IDX = {doc_id: i for i, doc_id in enumerate(doc_ids)}  # e.g. {'3255': 0, '8902': 1, ...}

    # Setup worker pool
    tok_class = tokenizers.get_class(
        args.tokenizer
    )  # 'corenlp', drqa/tokenizers/__init__.py --> corenlp_tokenizer.py
    workers = ProcessPool(args.num_workers,
                          initializer=init,
                          initargs=(tok_class, db_class, db_opts))

    # Compute the count matrix in steps (to keep in memory)
    logger.info('Mapping...')
    row, col, data = [], [], []
    step = max(int(len(doc_ids) / 10), 1)
    batches = [doc_ids[i:i + step]
               for i in range(0, len(doc_ids), step)]  # 10 batches in total
    _count = partial(count, args.ngram,
                     args.hash_size)  # args.hash_size --> default=int(math.pow(2, 24))
    for i, batch in enumerate(batches):
        logger.info('-' * 25 + 'Batch %d/%d' % (i + 1, len(batches)) + '-' * 25)
        for b_row, b_col, b_data in workers.imap_unordered(_count, batch):
            row.extend(b_row)
            col.extend(b_col)
            data.extend(b_data)
    workers.close()
    workers.join()

    logger.info('Creating sparse matrix...')
    """
    csr_matrix((data, (row_ind, col_ind)), [shape=(M, N)])
    where ``data``, ``row_ind`` and ``col_ind`` satisfy the relationship
    ``a[row_ind[k], col_ind[k]] = data[k]``.

    Examples:
    >>> row = np.array([0, 0, 1, 2, 2, 2])
    >>> col = np.array([0, 2, 2, 0, 1, 2])
    >>> data = np.array([1, 2, 3, 4, 5, 6])
    >>> csr_matrix((data, (row, col)), shape=(3, 3)).toarray()
    array([[1, 0, 2],
           [0, 0, 3],
           [4, 5, 6]])

    count_matrix: shape=(args.hash_size, len(doc_ids))
             doc_1 doc_2 ... doc_m
    word_1 [[1,    0,    ...  2],
    word_2  [0,    0,    ...  3],
    ...               ...
    word_n  [4,    5,    ...  6]]
    i.e., (word_1, doc_m) denotes that word 'word_1' appears 2 times in doc 'doc_m'.

    Reference:
    https://towardsdatascience.com/machine-learning-to-big-data-scaling-inverted-indexing-with-solr-ba5b48833fb4
    """
    count_matrix = sp.csr_matrix(  # import scipy.sparse as sp
        (data, (row, col)), shape=(args.hash_size, len(doc_ids)))
    count_matrix.sum_duplicates()
    return count_matrix, (DOC2IDX, doc_ids)
    for paragraph in article['paragraphs']:
        for qa in paragraph['qas']:
            qids.append(qa['id'])
            questions.append(qa['question'])

# ------------------------------------------------------------------------------
# Retrieve most relevant documents from dataset
# ------------------------------------------------------------------------------

ranker = retriever.get_class('tfidf')(tfidf_path=args.retriever_model)
retrieved_doc_ids = ranker.batch_closest_docs(questions,
                                              k=args.n_docs,
                                              num_workers=args.num_workers)

# define processes
tok_class = tokenizers.get_class(args.retriever_tokenizer)
tok_opts = {}
db_class = retriever.DocDB
db_opts = {'db_path': args.doc_db}
processes = ProcessPool(processes=args.num_workers,
                        initializer=init,
                        initargs=(tok_class, tok_opts, db_class, db_opts))

contexts = processes.map(retrieve_documents, retrieved_doc_ids)

examples = []
for i, question in enumerate(questions):
    context = contexts[i][0] if len(contexts[i]) > 0 else "_"
    examples.append((context, question))

# ------------------------------------------------------------------------------
# Read in dataset and make predictions.