def preprocess_squad(file_name, output_name, skip_answer=False):
    data = load_squad(file_name)

    # Tokenize questions (lemmas only) with a pool of spaCy workers.
    tokenizer_class = tokenizers.get_class('spacy')
    make_pool = partial(Pool, 8, initializer=init)
    workers = make_pool(initargs=(tokenizer_class, {'annotators': {'lemma'}}))
    q_tokens = workers.map(tokenize, data['questions'])
    workers.close()
    workers.join()

    # Tokenize contexts with the full annotator set (lemma, POS, NER).
    workers = make_pool(
        initargs=(tokenizer_class, {'annotators': {'lemma', 'pos', 'ner'}})
    )
    c_tokens = workers.map(tokenize, data['contexts'])
    workers.close()
    workers.join()

    examples = []
    for idx in range(len(data['qids'])):
        question = q_tokens[idx]['words']
        qlemma = q_tokens[idx]['lemma']
        document = c_tokens[data['qid2cid'][idx]]['words']
        offsets = c_tokens[data['qid2cid'][idx]]['offsets']
        lemma = c_tokens[data['qid2cid'][idx]]['lemma']
        pos = c_tokens[data['qid2cid'][idx]]['pos']
        ner = c_tokens[data['qid2cid'][idx]]['ner']

        # Map each answer's character span onto token indices in the context.
        ans_tokens = []
        if len(data['answers']) > 0:
            for ans in data['answers'][idx]:
                found = find_answer(offsets,
                                    ans['answer_start'],
                                    ans['answer_start'] + len(ans['text']))
                if found:
                    ans_tokens.append(found)

        examples.append({
            'id': data['qids'][idx],
            'question': question,
            'document': document,
            'offsets': offsets,
            'answers': ans_tokens,
            'qlemma': qlemma,
            'lemma': lemma,
            'pos': pos,
            'ner': ner,
        })

    with open(output_name, 'w') as f:
        json.dump(examples, f)
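# Usage sketch (hypothetical paths; the input must be a SQuAD-format JSON readable
# by load_squad, and the output holds one processed record per question):
#
#   preprocess_squad('data/datasets/SQuAD-v1.1-train.json',
#                    'data/datasets/SQuAD-v1.1-train-processed.json')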
def __init__(self, tfidf_path=None, strict=True):
    """
    Args:
        tfidf_path: path to saved model file
        strict: fail on empty queries or continue (and return empty result)
    """
    # Load the sparse tf-idf matrix and its metadata from disk.
    logger.info('Loading %s' % tfidf_path)
    matrix, metadata = utils.load_sparse_csr(tfidf_path)
    self.doc_mat = matrix
    self.ngrams = metadata['ngram']
    self.hash_size = metadata['hash_size']
    self.tokenizer = tokenizers.get_class(metadata['tokenizer'])()
    self.doc_freqs = metadata['doc_freqs'].squeeze()
    self.doc_dict = metadata['doc_dict']
    self.num_docs = len(self.doc_dict[0])
    self.strict = strict
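# Usage sketch. The enclosing class is not shown above, so `TfidfDocRanker` is an
# assumed name following the DrQA convention this code mirrors; the .npz path is a
# hypothetical artifact produced by the tf-idf build step.
#
#   ranker = TfidfDocRanker(tfidf_path='data/wiki-tfidf-ngram=2-hash=16777216.npz',
#                           strict=False)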
def get_count_matrix(args, db, db_opts):
    """Form a sparse word to document count matrix (inverted index).

    M[i, j] = # times word i appears in document j.
    """
    # Map doc_ids to indexes
    global DOC2IDX
    db_class = retriever.get_class(db)
    with db_class(**db_opts) as doc_db:
        doc_ids = doc_db.get_doc_ids()
    DOC2IDX = {doc_id: i for i, doc_id in enumerate(doc_ids)}

    # Setup worker pool
    tok_class = tokenizers.get_class(args.tokenizer)
    workers = ProcessPool(
        args.num_workers,
        initializer=init,
        initargs=(tok_class, db_class, db_opts)
    )

    # Compute the count matrix in steps (to keep in memory)
    logger.info('Mapping...')
    row, col, data = [], [], []
    step = max(int(len(doc_ids) / 10), 1)
    batches = [doc_ids[i:i + step] for i in range(0, len(doc_ids), step)]
    _count = partial(count, args.ngram, args.hash_size)
    for i, batch in enumerate(batches):
        logger.info('-' * 25 + 'Batch %d/%d' % (i + 1, len(batches)) + '-' * 25)
        for b_row, b_col, b_data in workers.imap_unordered(_count, batch):
            row.extend(b_row)
            col.extend(b_col)
            data.extend(b_data)
    workers.close()
    workers.join()

    logger.info('Creating sparse matrix...')
    count_matrix = sp.csr_matrix(
        (data, (row, col)), shape=(args.hash_size, len(doc_ids))
    )
    count_matrix.sum_duplicates()
    return count_matrix, (DOC2IDX, doc_ids)
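# Usage sketch. The argparse namespace and doc-DB options are assumptions about how
# the build script wires this function up (a sqlite-backed DocDB keyed by 'db_path',
# as in DrQA); the database path is hypothetical.
#
#   from argparse import Namespace
#   args = Namespace(ngram=2, hash_size=2 ** 24, tokenizer='simple', num_workers=4)
#   count_matrix, (doc2idx, doc_ids) = get_count_matrix(
#       args, 'sqlite', {'db_path': 'data/docs.db'})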
def __init__(self, model=None, tokenizer=None, embedding_file=None,
             num_workers=None, normalize=True):
    """
    Args:
        model: path to saved model file.
        tokenizer: option string to select tokenizer class.
        normalize: squash output score to 0-1 probabilities with a softmax.
        embedding_file: if provided, will expand dictionary to use all
            available pretrained vectors in this file.
        num_workers: number of CPU processes to use to preprocess batches.
    """
    logger.info('Initializing model...')
    self.model = DocReader.load(model or DEFAULTS['model'],
                                normalize=normalize)

    if embedding_file:
        logger.info('Expanding dictionary...')
        words = utils.index_embedding_words(embedding_file)
        added = self.model.expand_dictionary(words)
        self.model.load_embeddings(added, embedding_file)

    logger.info('Initializing tokenizer...')
    annotators = tokenizers.get_annotators_for_model(self.model)
    if not tokenizer:
        tokenizer_class = DEFAULTS['tokenizer']
    else:
        tokenizer_class = tokenizers.get_class(tokenizer)

    if num_workers is None or num_workers > 0:
        self.workers = ProcessPool(
            num_workers,
            initializer=init,
            initargs=(tokenizer_class, annotators),
        )
    else:
        self.workers = None
    self.tokenizer = tokenizer_class(annotators=annotators)
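# Usage sketch. `QAPipeline` is an assumed name for the enclosing class (not shown
# above); the model and embedding paths are hypothetical examples.
#
#   pipeline = QAPipeline(
#       model='data/reader/single.mdl',
#       tokenizer='corenlp',
#       embedding_file='data/embeddings/glove.840B.300d.txt',
#       num_workers=4,
#   )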
import sys
import json
import pickle
import string
import unicodedata

import regex as re
from tqdm import tqdm
from multiprocessing.util import Finalize

sys_dir = './'
sys.path.append(sys_dir)

import tokenizers

tokenizers.set_default('corenlp_classpath', sys_dir + '/data/corenlp/*')

# A single shared CoreNLP tokenizer process, shut down automatically at exit.
tok_class = tokenizers.get_class("corenlp")
tok_opts = {}
PROCESS_TOK = tok_class(**tok_opts)
Finalize(PROCESS_TOK, PROCESS_TOK.shutdown, exitpriority=100)


def tokenizer_text(my_str='', uncased=True):
    '''
    :param my_str: string
    :param uncased: bool, lowercase the tokens if True
    :return: list[str]
    '''
    text = unicodedata.normalize('NFD', my_str)
    answer = PROCESS_TOK.tokenize(text)
    # The original snippet is truncated here; the branch below is a straightforward
    # completion based on the docstring, assuming the DrQA-style Tokens API (words()).
    if uncased:
        return [w.lower() for w in answer.words()]
    return answer.words()
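# Usage sketch for tokenizer_text (the sentence is an arbitrary example; exact token
# boundaries depend on the CoreNLP tokenizer):
#
#   tokens = tokenizer_text('DrQA reads Wikipedia to answer open-domain questions.')
#   # -> a list of lowercased token strings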
if __name__ == '__main__':
    client = pymongo.MongoClient(host='192.168.1.145', port=27017)
    db = client['AttnReader']
    collection_train = db[args.collection_train_name]
    collection_val = db[args.collection_val_name]

    if args.raw_data:
        train = load_origin_data(collection_train, args,
                                 single_answer=args.single_answer_train)
        dev = load_origin_data(collection_val, args, single_answer=False)
    else:
        tokenizer = tokenizers.get_class(args.tokenizer)()
        train = load_data_tokenize(collection_train, tokenizer, args,
                                   single_answer=args.single_answer_train)
        dev = load_data_tokenize(collection_val, tokenizer, args,
                                 single_answer=False)
        if args.tokenizer == 'ltp':
            tokenizer.release()

    context_ents = list(train.netags) + list(dev.netags)
    context_tags = list(train.postags) + list(dev.postags)
    question_tokens = list(train.query_words) + list(dev.query_words)