Example #1
def gen_query(question_):
    normalized = utils.normalize(question_)
    tokenizer = tokenizers.get_class('simple')()
    tokens = tokenizer.tokenize(normalized)
    words = tokens.ngrams(n=1, uncased=True, filter_fn=utils.filter_ngram)
    query_ = ' '.join(words)
    return query_
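
gen_query above turns a natural-language question into a plain bag-of-unigrams query string. A small usage sketch (the question string is made up, and it assumes the drqa-style utils and tokenizers modules imported by the snippet are available):

question = "Who wrote the novel Moby-Dick?"
query = gen_query(question)
print(query)  # lowercased unigrams joined by spaces; filter_fn typically drops stopwords/punctuation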
Example #2
    def __init__(self, args, lines, freqs=None, strict=True):
        """
        Args:
            args: arguments carrying tokenizer, hash_size and ngram settings
            lines: in-memory documents to build the index from
            freqs: precomputed word-document frequencies (computed if None)
            strict: fail on empty queries or continue (and return empty result)
        """
        # Build the count matrix from the in-memory documents
        logger.info('Counting words...')
        count_matrix, doc_dict = get_count_matrix(
            args, 'memory', {'lines': lines}
        )

        logger.info('Making tfidf vectors...')
        tfidf = get_tfidf_matrix(count_matrix)

        if freqs is None:
            logger.info('Getting word-doc frequencies...')
            freqs = get_doc_freqs(count_matrix)

        metadata = {
            'doc_freqs': freqs,
            'tokenizer': args.tokenizer,
            'hash_size': args.hash_size,
            'ngram': args.ngram,
            'doc_dict': doc_dict
        }

        self.doc_mat = tfidf
        self.ngrams = metadata['ngram']
        self.hash_size = metadata['hash_size']
        self.tokenizer = tokenizers.get_class(metadata['tokenizer'])()
        self.doc_freqs = metadata['doc_freqs'].squeeze()
        self.doc_dict = metadata['doc_dict']
        self.num_docs = len(self.doc_dict[0])
        self.strict = strict
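
The helpers get_count_matrix, get_tfidf_matrix and get_doc_freqs used above come from elsewhere in the project and are not all shown on this page. As a rough, self-contained sketch of what a TF-IDF weighting over such a (hash_size x num_docs) count matrix can look like (an assumed formula for illustration, not necessarily the project's exact implementation):

import numpy as np
import scipy.sparse as sp

def doc_freqs(counts):
    """Number of documents each (hashed) word appears in."""
    binary = (counts > 0).astype(int)
    return np.array(binary.sum(axis=1)).squeeze()

def tfidf_matrix(counts):
    num_docs = counts.shape[1]
    freqs = doc_freqs(counts)
    idf = np.log((num_docs - freqs + 0.5) / (freqs + 0.5))
    idf[idf < 0] = 0
    tf = counts.astype(float)
    tf.data = np.log1p(tf.data)       # dampened term frequency
    return sp.diags(idf, 0).dot(tf)   # scale each row (word) by its idf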
Example #3
def get_count_matrix(args, db, db_opts):
    """Form a sparse word to document count matrix (inverted index).

    M[i, j] = # times word i appears in document j.
    """
    # Map doc_ids to indexes
    global DOC2IDX
    db_class = retriever.get_class(db)
    with db_class(**db_opts) as doc_db:
        doc_ids = doc_db.get_doc_ids()
    DOC2IDX = {doc_id: i for i, doc_id in enumerate(doc_ids)}

    # Set up the tokenizer and doc DB in this process (no worker pool here)
    tok_class = tokenizers.get_class(args.tokenizer)
    init(tok_class, db_class, db_opts)

    # Compute the count matrix in steps (to keep in memory)
    logger.info('Mapping...')
    row, col, data = [], [], []
    step = max(int(len(doc_ids) / 10), 1)
    batches = [doc_ids[i:i + step] for i in range(0, len(doc_ids), step)]
    _count = partial(count, args.ngram, args.hash_size)
    for i, batch in enumerate(batches):
        logger.info('-' * 25 + 'Batch %d/%d' % (i + 1, len(batches)) +
                    '-' * 25)
        for b_row, b_col, b_data in map(_count, batch):
            row.extend(b_row)
            col.extend(b_col)
            data.extend(b_data)

    logger.info('Creating sparse matrix...')
    count_matrix = sp.csr_matrix((data, (row, col)),
                                 shape=(args.hash_size, len(doc_ids)))
    count_matrix.sum_duplicates()
    return count_matrix, (DOC2IDX, doc_ids)
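
For reference, the (data, (row, col)) constructor used above is scipy's standard COO-style input to csr_matrix. A tiny stand-alone illustration with toy values:

import numpy as np
import scipy.sparse as sp

row = np.array([0, 0, 1, 2, 2, 2])   # hashed n-gram ids
col = np.array([0, 2, 2, 0, 1, 2])   # document indexes
data = np.array([1, 2, 3, 4, 5, 6])  # occurrence counts
m = sp.csr_matrix((data, (row, col)), shape=(3, 3))
m.sum_duplicates()  # sums any repeated (row, col) entries, as in the snippets above
print(m.toarray())
# [[1 0 2]
#  [0 0 3]
#  [4 5 6]]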
Example #4
def get_count_matrix(args, db, db_opts):
    """Form a sparse word to document count matrix (inverted index).

    M[i, j] = # times word i appears in document j.
    """
    # Map doc_ids to indexes
    global DOC2IDX
    db_class = retriever.get_class(db)
    # Document ids come from an Elasticsearch scroll instead of the doc DB.
    with db_class(**db_opts) as doc_db:
        # doc_ids = doc_db.get_doc_ids()
        doc_ids = []

    res = es.search(index="htts", doc_type="htts",
                    body={"size": 500, "query": {"match_all": {}}},
                    scroll='10m')
    scroll_id = res['_scroll_id']
    for ref in scrollr(es, scroll_id, extract_references):
        print(ref)
        doc_ids.append(ref)

    DOC2IDX = {doc_id: i for i, doc_id in enumerate(doc_ids)}

    # Setup worker pool
    tok_class = tokenizers.get_class(args.tokenizer)
    workers = ProcessPool(
        args.num_workers,
        initializer=init,
        initargs=(tok_class, db_class, db_opts)
    )

    # Compute the count matrix in steps (to keep in memory)
    logger.info('Mapping...')
    row, col, data = [], [], []
    step = max(int(len(doc_ids) / 10), 1)
    batches = [doc_ids[i:i + step] for i in range(0, len(doc_ids), step)]
    _count = partial(count, args.ngram, args.hash_size)
    for i, batch in enumerate(batches):
        logger.info('-' * 25 + 'Batch %d/%d' % (i + 1, len(batches)) + '-' * 25)
        for b_row, b_col, b_data in workers.imap_unordered(_count, batch):
            row.extend(b_row)
            col.extend(b_col)
            data.extend(b_data)
    workers.close()
    workers.join()

    logger.info('Creating sparse matrix...')
    count_matrix = sp.csr_matrix(
        (data, (row, col)), shape=(args.hash_size, len(doc_ids))
    )
    count_matrix.sum_duplicates()
    return count_matrix, (DOC2IDX, doc_ids)
Example #5
def process_dataset(data, tokenizer, workers=None):
    """Iterate processing (tokenize, parse, etc) dataset multithreaded."""
    print("tokenize questions ")
    tokenizer_class = tokenizers.get_class(tokenizer)
    #TOK = my_init(tokenizer_class, {'annotators': {'lemma'}})
    #q_tokens = [my_tokenize(TOK, x) for x in data['questions']]
    make_pool = partial(Pool, workers, initializer=init)
    workers = make_pool(initargs=(tokenizer_class, {'annotators': {'lemma'}}))
    q_tokens = workers.map(tokenize, data['questions'])
    workers.close()
    workers.join()
    print("tokenize contexts ")
    workers = make_pool(
        initargs=(tokenizer_class, {'annotators': {'lemma', 'pos', 'ner'}})
    )
    #TOK = my_init(tokenizer_class, {'annotators': {'lemma'}})
    #c_tokens = [my_tokenize(TOK, x) for x in data['contexts']]
    c_tokens = workers.map(tokenize, data['contexts'])
    workers.close()
    workers.join()

    labels = []
    for p, a in zip(data['contexts'], data['answers']):
        a = a[0]['text']
        if a in p:
            labels.append(1)
        else:
            labels.append(0)

    for idx in range(len(data['qids'])):
        question = q_tokens[idx]['words']
        qlemma = q_tokens[idx]['lemma']
        document = c_tokens[data['qid2cid'][idx]]['words']
        offsets = c_tokens[data['qid2cid'][idx]]['offsets']
        lemma = c_tokens[data['qid2cid'][idx]]['lemma']
        pos = c_tokens[data['qid2cid'][idx]]['pos']
        ner = c_tokens[data['qid2cid'][idx]]['ner']
        ans_tokens = []
        if len(data['answers']) > 0:
            for ans in data['answers'][idx]:
                found = find_answer(offsets,
                                    ans['answer_start'],
                                    ans['answer_start'] + len(ans['text']))
                #found = (0, 0)
                if found:
                    ans_tokens.append(found)
        yield {
            'id': data['qids'][idx],
            'question': question,
            'document': document,
            'offsets': offsets,
            'answers': ans_tokens,
            'qlemma': qlemma,
            'lemma': lemma,
            'pos': pos,
            'ner': ner,
            'label': labels[idx]
        }
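
The find_answer helper used by the process_dataset variants on this page is not itself shown here. A plausible minimal version (an assumption for illustration, not the project's exact code) maps a character span onto token indexes via each token's (start, end) character offsets:

def find_answer(offsets, begin_offset, end_offset):
    """Return the (start_token, end_token) pair matching a character span."""
    start = [i for i, tok in enumerate(offsets) if tok[0] == begin_offset]
    end = [i for i, tok in enumerate(offsets) if tok[1] == end_offset]
    if len(start) == 1 and len(end) == 1:
        return start[0], end[0]
    return None

offsets = [(0, 5), (6, 10), (11, 17)]  # toy token offsets
print(find_answer(offsets, 6, 17))     # (1, 2)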
Example #6
    def __init__(self, index_path=None):
        self.question = None
        self.index_path = index_path or DEFAULTS['lucene_index']
        self.tokenizer = tokenizers.get_class('simple')()
        self.env = lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        directory = SimpleFSDirectory(Paths.get(self.index_path))
        self.analyzer = StandardAnalyzer()
        # self.query_parser = MultiFieldQueryParser(["title", "text"], self.analyzer)

        self.searcher = IndexSearcher(DirectoryReader.open(directory))
Example #7
def process_dataset(data, tokenizer, workers=None):
    """Iterate processing (tokenize, parse, etc) dataset multithreaded."""
    tokenizer_class = tokenizers.get_class(tokenizer)
    init(tokenizer_class, {'annotators': {'lemma'}})
    q_tokens = []
    c_tokens = []
    print("tokenizing questions ...")
    make_pool = partial(Pool, workers, initializer=init)
    workers1 = make_pool(initargs=(tokenizer_class, {'annotators': {'lemma'}}))
    q_tokens = workers1.map(tokenize, data['questions'])
    workers1.close()
    workers1.join()

    print("tokenizing contexts ...")
    workers2 = make_pool(initargs=(tokenizer_class, {
        'annotators': {'lemma', 'pos', 'ner'}
    }))
    c_tokens = workers2.map(tokenize, data['contexts'])
    workers2.close()
    workers2.join()
    assert (len(q_tokens) == len(c_tokens))
    for idx in range(len(q_tokens)):
        if q_tokens[idx] is None or c_tokens[data['qid2cid'][idx]] is None:
            continue
        normal_question = q_tokens[idx]['normal_text']
        normal_context = c_tokens[data['qid2cid'][idx]]['normal_text']
        question = q_tokens[idx]['words']
        qlemma = q_tokens[idx]['lemma']
        document = c_tokens[data['qid2cid'][idx]]['words']
        offsets = c_tokens[data['qid2cid'][idx]]['offsets']
        lemma = c_tokens[data['qid2cid'][idx]]['lemma']
        pos = c_tokens[data['qid2cid'][idx]]['pos']
        ner = c_tokens[data['qid2cid'][idx]]['ner']
        ans_tokens = []
        ans = data['answers'][idx]  # the answer text
        ans_start = data['contexts'][idx].find(ans)
        ans_end = ans_start + len(ans)
        found = find_answer(offsets, ans_start, ans_end)
        if found:
            ans_tokens.append(found)
            yield {
                'id': data['qids'][idx],
                'question': question,
                'document': document,
                'offsets': offsets,
                'answers': ans_tokens,
                'qlemma': qlemma,
                'lemma': lemma,
                'pos': pos,
                'ner': ner,
                'normal_question': normal_question,
                'normal_context': normal_context,
                'text_answer': ans
            }
Example #8
def process_dataset(data, tokenizer, workers=None):
    """Iterate processing (tokenize, parse, etc) dataset multithreaded."""
    tokenizer_class = tokenizers.get_class(tokenizer)
    try:
        make_pool = partial(Pool, workers, initializer=init)
        workers = make_pool(initargs=(tokenizer_class, {
            'annotators': {'lemma'}
        }))
        q_tokens = workers.map(tokenize, data['questions'])
        workers.close()
        workers.join()
    except Exception as e:
        print(e)

    try:
        workers = make_pool(initargs=(tokenizer_class, {
            'annotators': {'lemma', 'pos', 'ner'}
        }))
        c_tokens = workers.map(tokenize, data['contexts'])
        workers.close()
        workers.join()
    except Exception as e:
        print(e)

    for idx in range(len(data['qids'])):
        question = q_tokens[idx]['words']
        qlemma = q_tokens[idx]['lemma']
        document = c_tokens[data['qid2cid'][idx]]['words']
        offsets = c_tokens[data['qid2cid'][idx]]['offsets']
        lemma = c_tokens[data['qid2cid'][idx]]['lemma']
        pos = c_tokens[data['qid2cid'][idx]]['pos']
        ner = c_tokens[data['qid2cid'][idx]]['ner']
        ans_tokens = []
        if len(data['answers']) > 0:
            for ans in data['answers'][idx]:
                found = find_answer(offsets, ans['answer_start'],
                                    ans['answer_start'] + len(ans['text']))
                if found:
                    ans_tokens.append(found)
        yield {
            'id': data['qids'][idx],
            'question': question,
            'document': document,
            'offsets': offsets,
            'answers': ans_tokens,
            'qlemma': qlemma,
            'lemma': lemma,
            'pos': pos,
            'ner': ner,
        }
Example #9
    def __init__(self, tfidf_path=None, strict=True):
        """
        Args:
            tfidf_path: path to saved model file
            strict: fail on empty queries or continue (and return empty result)
        """
        # Load from disk
        logger.info('Loading %s' % tfidf_path)
        matrix, metadata = utils.load_sparse_csr(tfidf_path)
        self.doc_mat = matrix
        self.ngrams = metadata['ngram']
        self.hash_size = metadata['hash_size']
        self.tokenizer = tokenizers.get_class(metadata['tokenizer'])()
        self.doc_freqs = metadata['doc_freqs'].squeeze()
        self.num_docs = self.doc_mat.shape[1] - 1
        self.strict = strict
Example #10
def live_count_matrix_t(args, cands):
    global PROCESS_TOK
    if PROCESS_TOK is None:
        PROCESS_TOK = tokenizers.get_class(args.tokenizer)()
    row, col, data = [], [], []
    for i, c in enumerate(cands):
        cur_row, cur_col, cur_data = count_text(args.ngram, args.hash_size, i,
                                                c)
        row += cur_row
        col += cur_col
        data += cur_data

    count_matrix = torch.sparse.FloatTensor(
        torch.LongTensor([row, col]), torch.FloatTensor(data),
        torch.Size([args.hash_size, len(cands)])).coalesce()
    return count_matrix
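
torch.sparse.FloatTensor used above is PyTorch's legacy sparse COO constructor; newer releases spell the same construction torch.sparse_coo_tensor. A tiny stand-alone illustration of the indices/values/size layout and of coalesce():

import torch

indices = torch.LongTensor([[0, 0, 2],   # row ids (note the duplicate (0, 1) entry)
                            [1, 1, 0]])  # column ids
values = torch.FloatTensor([1.0, 2.0, 3.0])
m = torch.sparse_coo_tensor(indices, values, (3, 2)).coalesce()
print(m.to_dense())  # duplicates are summed: [[0., 3.], [0., 0.], [3., 0.]]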
Example #11
    def __init__(self,
                 tokenizer='',
                 ranker_config=None,
                 db_config=None,
                 n_doc=5,
                 num_workers=1,
                 convert_bs=48,
                 ngram=2,
                 distant=False,
                 small=False):

        self.convert_bs = convert_bs
        self.small = small
        self.n_doc = n_doc
        self.tok_class = tokenizers.get_class(
            tokenizer) if tokenizer else DEFAULT_CONVERT_CONFIG['tokenizer']
        self.annotators = set()
        self.tok_opts = {'annotators': self.annotators}
        self.ngram = ngram
        self.tokenizer = self.tok_class(**self.tok_opts)

        self.ranker_config = ranker_config if ranker_config else {}
        self.ranker_class = self.ranker_config.get(
            'class', DEFAULT_CONVERT_CONFIG['ranker'])
        self.ranker_opt = self.ranker_config.get('ret_opt', {})
        logger.info('Loading ranker {}'.format(self.ranker_class.__name__))
        self.ranker = self.ranker_class(**self.ranker_opt)

        if hasattr(self.ranker, 'es'):
            self.db_config = ranker_config
            self.db_class = self.ranker_class
            self.db_opts = self.ranker_opt
        else:
            self.db_config = db_config or {}
            self.db_class = self.db_config.get('class',
                                               DEFAULT_CONVERT_CONFIG['db'])
            self.db_opts = self.db_config.get('db_opt', {})

        logger.info('Initializing tokenizers and document retrievers...')
        self.num_workers = num_workers
        self.processes = ProcessPool(num_workers,
                                     initializer=init,
                                     initargs=(self.tok_class, self.tok_opts,
                                               self.db_class, self.db_opts))

        self.distant = distant
Example #12
def live_count_matrix(args, cands):
    global PROCESS_TOK
    if PROCESS_TOK is None:
        PROCESS_TOK = tokenizers.get_class(args.tokenizer)()
    row, col, data = [], [], []
    for i, c in enumerate(cands):
        cur_row, cur_col, cur_data = count_text(args.ngram, args.hash_size, i,
                                                c)
        row += cur_row
        col += cur_col
        data += cur_data

    data, row, col = truncate(data, row, col)
    count_matrix = sp.csr_matrix((data, (row, col)),
                                 shape=(args.hash_size, len(cands)))
    count_matrix.sum_duplicates()
    return count_matrix
Example #13
def get_count_matrix(args, db, db_opts):
    """Form a sparse word to document count matrix (inverted index).

    M[i, j] = # times word i appears in document j.
    """
    # Map doc_ids to indexes
    global DOC2IDX
    db_class = retriever.get_class(db)
    with db_class(**db_opts) as doc_db:
        doc_ids = doc_db.get_doc_ids()
    DOC2IDX = {doc_id: i for i, doc_id in enumerate(doc_ids)}

    # Setup worker pool
    tok_class = tokenizers.get_class(args.tokenizer)
    # Build the multiprocessing worker pool
    workers = ProcessPool(args.num_workers,
                          initializer=init,
                          initargs=(tok_class, db_class, db_opts))
    # Compute the count matrix in steps (to keep in memory)
    logger.info('Mapping...')
    row, col, data = [], [], []
    step = max(int(len(doc_ids) / 10), 1)
    # Fill the matrix in batches
    batches = [doc_ids[i:i + step] for i in range(0, len(doc_ids), step)]
    # functools.partial: partial function application.
    # A function normally has to be called with all of its required arguments,
    # but sometimes some of them are known ahead of time. partial fixes those
    # arguments in advance so the function can later be called with fewer ones.
    _count = partial(count, args.ngram, args.hash_size)
    for i, batch in enumerate(batches):
        logger.info('-' * 25 + 'Batch %d/%d' % (i + 1, len(batches)) +
                    '-' * 25)
        for b_row, b_col, b_data in workers.imap_unordered(_count, batch):
            row.extend(b_row)
            col.extend(b_col)
            data.extend(b_data)
    workers.close()
    workers.join()

    logger.info('Creating sparse matrix...')
    count_matrix = sp.csr_matrix((data, (row, col)),
                                 shape=(args.hash_size, len(doc_ids)))
    count_matrix.sum_duplicates()
    return count_matrix, (DOC2IDX, doc_ids)
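
The comment block in Example #13 describes functools.partial. A tiny stand-alone illustration of the pattern behind _count = partial(count, args.ngram, args.hash_size) (operator.mul is only a stand-in for the real count function):

import operator
from functools import partial

double = partial(operator.mul, 2)  # fixes the first argument of mul to 2
print(double(21))                  # 42 -- called with one fewer argument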
Example #14
def get_count_matrix(args, db, db_opts):
    """Form a sparse word to document count matrix (inverted index).

    M[i, j] = # times word i appears in document j.
    """
    # Map doc_ids to indexes
    global DOC2IDX
    db_class = retriever.get_class(db)
    with db_class(**db_opts) as doc_db:
        doc_ids = doc_db.get_doc_ids()
    DOC2IDX = {doc_id: i for i, doc_id in enumerate(doc_ids)}
    # 5075182
    logger.info('the number of docs is %s' % (len(DOC2IDX)))

    # Setup worker pool
    tok_class = tokenizers.get_class(args.tokenizer)
    workers = ProcessPool(args.num_workers,
                          initializer=init,
                          initargs=(tok_class, db_class, db_opts))

    # Compute the count matrix in steps (to keep in memory)
    logger.info('Mapping......')
    row, col, data = [], [], []
    step = max(int(len(doc_ids) / 10), 1)
    batches = [doc_ids[i:i + step] for i in range(0, len(doc_ids), step)]
    _count = partial(count, args.ngram, args.hash_size)
    for i, batch in enumerate(batches):
        logger.info('-' * 24 + 'Batch %d/%d' % (i + 1, len(batches)) +
                    '-' * 24)
        k = 0
        for b_row, b_col, b_data in workers.imap_unordered(_count, batch):
            row.extend(b_row)
            col.extend(b_col)
            data.extend(b_data)
            k += 1
            if k % 10000 == 0:
                logger.info('Processed %d documents in this batch...' % k)
    workers.close()
    workers.join()

    logger.info('Creating sparse matrix......')
    count_matrix = sp.csr_matrix((data, (row, col)),
                                 shape=(args.hash_size, len(doc_ids)))
    count_matrix.sum_duplicates()
    return count_matrix, (DOC2IDX, doc_ids)
Example #15
def get_count_matrix(args, db_opts):
    """Form a sparse word to document count matrix (inverted index).

    M[i, j] = # times word i appears in document j.
    """
    global MAX_SZ
    with DocDB(**db_opts) as doc_db:
        doc_ids = doc_db.get_doc_ids()

    # Setup worker pool
    tok_class = tokenizers.get_class(args.tokenizer)
    workers = ProcessPool(
        args.num_workers,
        initializer=init,
        initargs=(tok_class, db_opts)
    )

    # Compute the count matrix in steps (to keep in memory)
    logger.info('Mapping...')
    row, col, data = [], [], []
    step = max(int(len(doc_ids) / 10), 1)
    batches = [doc_ids[i:i + step] for i in range(0, len(doc_ids), step)]
    _count = partial(count, args.ngram, args.hash_size)
    for i, batch in enumerate(batches):
        logger.info('-' * 25 + 'Batch %d/%d' % (i + 1, len(batches)) + '-' * 25)
        for b_row, b_col, b_data in workers.imap_unordered(_count, batch):
            row.extend(b_row)
            col.extend(b_col)
            data.extend(b_data)
            if len(data) > MAX_SZ:
                break
        if len(data) > MAX_SZ:
            logger.info('Reached max indexable size, breaking.')
            break
    workers.close()
    workers.join()

    logger.info('Creating sparse matrix...')
    data, row, col = truncate(data, row, col)

    count_matrix = sp.csr_matrix(
        (data, (row, col)), shape=(args.hash_size, len(doc_ids) + 1)
    )
    count_matrix.sum_duplicates()
    return count_matrix
Example #16
def process_dataset(data, tokenizer, workers=None):
    """Iterate processing (tokenize, parse, etc) dataset multithreaded."""
    tokenizer_class = tokenizers.get_class(tokenizer)
    make_pool = partial(Pool, workers, initializer=init)
    workers = make_pool(initargs=(tokenizer_class, {'annotators': {'lemma'}}))
    q_tokens = workers.map(tokenize, data['questions'])
    workers.close()
    workers.join()

    workers = make_pool(
        initargs=(tokenizer_class, {'annotators': {'lemma', 'pos', 'ner'}})
    )
    c_tokens = workers.map(tokenize, data['contexts'])
    workers.close()
    workers.join()

    for idx in range(len(data['qids'])):
        question = q_tokens[idx]['words']
        qlemma = q_tokens[idx]['lemma']
        document = c_tokens[data['qid2cid'][idx]]['words']
        offsets = c_tokens[data['qid2cid'][idx]]['offsets']
        lemma = c_tokens[data['qid2cid'][idx]]['lemma']
        pos = c_tokens[data['qid2cid'][idx]]['pos']
        ner = c_tokens[data['qid2cid'][idx]]['ner']
        ans_tokens = []
        if len(data['answers']) > 0:
            for ans in data['answers'][idx]:
                found = find_answer(offsets,
                                    ans['answer_start'],
                                    ans['answer_start'] + len(ans['text']))
                if found:
                    ans_tokens.append(found)
        yield {
            'id': data['qids'][idx],
            'question': question,
            'document': document,
            'offsets': offsets,
            'answers': ans_tokens,
            'qlemma': qlemma,
            'lemma': lemma,
            'pos': pos,
            'ner': ner,
        }
Example #17
def process_dataset(data, tokenizer, workers=None):
    """Iterate processing (tokenize, parse, etc) dataset multithreaded."""
    tokenizer_class = tokenizers.get_class(tokenizer)
    make_pool = partial(Pool, workers, initializer=init)
    workers = make_pool(initargs=(tokenizer_class, {"annotators": {"lemma"}}))
    q_tokens = workers.map(tokenize, data["questions"])
    workers.close()
    workers.join()

    workers = make_pool(initargs=(tokenizer_class, {
        "annotators": {"lemma", "pos", "ner"}
    }))
    c_tokens = workers.map(tokenize, data["contexts"])
    workers.close()
    workers.join()

    for idx in range(len(data["qids"])):
        question = q_tokens[idx]["words"]
        qlemma = q_tokens[idx]["lemma"]
        document = c_tokens[data["qid2cid"][idx]]["words"]
        offsets = c_tokens[data["qid2cid"][idx]]["offsets"]
        lemma = c_tokens[data["qid2cid"][idx]]["lemma"]
        pos = c_tokens[data["qid2cid"][idx]]["pos"]
        ner = c_tokens[data["qid2cid"][idx]]["ner"]
        ans_tokens = []
        if len(data["answers"]) > 0:
            for ans in data["answers"][idx]:
                found = find_answer(offsets, ans["answer_start"],
                                    ans["answer_start"] + len(ans["text"]))
                if found:
                    ans_tokens.append(found)
        yield {
            "id": data["qids"][idx],
            "question": question,
            "document": document,
            "offsets": offsets,
            "answers": ans_tokens,
            "qlemma": qlemma,
            "lemma": lemma,
            "pos": pos,
            "ner": ner,
        }
Example #18
def get_count_matrix(args, db, db_opts):
    """Form a sparse word to document count matrix (inverted index).

    M[i, j] = # times word i appears in document j.
    """
    # Map doc_ids to indexes
    global DOC2IDX
    db_class = retriever.get_class(db)
    with db_class(**db_opts) as doc_db:
        doc_ids = doc_db.get_doc_ids()
    DOC2IDX = {doc_id: i for i, doc_id in enumerate(doc_ids)}

    # Setup worker pool
    tok_class = tokenizers.get_class(args.tokenizer)
    workers = ProcessPool(
        args.num_workers,
        initializer=init,
        initargs=(tok_class, db_class, db_opts)
    )

    # Compute the count matrix in steps (to keep in memory)
    logger.info('Mapping...')
    row, col, data = [], [], []
    step = max(int(len(doc_ids) / 10), 1)
    batches = [doc_ids[i:i + step] for i in range(0, len(doc_ids), step)]
    _count = partial(count, args.ngram, args.hash_size)
    for i, batch in enumerate(batches):
        logger.info('-' * 25 + 'Batch %d/%d' % (i + 1, len(batches)) + '-' * 25)
        for b_row, b_col, b_data in workers.imap_unordered(_count, batch):
            row.extend(b_row)
            col.extend(b_col)
            data.extend(b_data)
    workers.close()
    workers.join()

    logger.info('Creating sparse matrix...')
    count_matrix = sp.csr_matrix(
        (data, (row, col)), shape=(args.hash_size, len(doc_ids))
    )
    count_matrix.sum_duplicates()
    return count_matrix, (DOC2IDX, doc_ids)
Example #19
def get_count_matrix_t(args, db_opts):
    """Form a sparse word to document count matrix (inverted index, torch ver).

    M[i, j] = # times word i appears in document j.
    """
    global MAX_SZ
    with DocDB(**db_opts) as doc_db:
        doc_ids = doc_db.get_doc_ids()

    # Setup worker pool
    tok_class = tokenizers.get_class(args.tokenizer)
    workers = ProcessPool(
        args.num_workers,
        initializer=init,
        initargs=(tok_class, db_opts)
    )

    # Compute the count matrix in steps (to keep in memory)
    logger.info('Mapping...')
    row, col, data = [], [], []
    step = max(int(len(doc_ids) / 10), 1)
    batches = [doc_ids[i:i + step] for i in range(0, len(doc_ids), step)]
    _count = partial(count, args.ngram, args.hash_size)
    for i, batch in enumerate(batches):
        logger.info('-' * 25 + 'Batch %d/%d' % (i + 1, len(batches)) + '-' * 25)
        for b_row, b_col, b_data in workers.imap_unordered(_count, batch):
            row.extend(b_row)
            col.extend(b_col)
            data.extend(b_data)
    workers.close()
    workers.join()

    logger.info('Creating sparse matrix...')
    count_matrix = torch.sparse.FloatTensor(
        torch.LongTensor([row, col]), torch.FloatTensor(data),
        torch.Size([args.hash_size, len(doc_ids) + 1])
    ).coalesce()
    return count_matrix
Example #20
    def __init__(self, lucene_path=None, index_path=None, sim_function='lm'):
        self.question = None
        self.lucene_path = lucene_path or DEFAULTS['lucene_path']
        self.index_path = index_path or DEFAULTS['lucene_index']
        self.sim_func = sim_function
        self.tokenizer = tokenizers.get_class('simple')()
Example #21
def get_count_matrix(args, db, db_opts):
    """Form a sparse word to document count matrix (inverted index).

    M[i, j] = # times word i appears in document j.
    """
    # Map doc_ids to indexes
    global DOC2IDX
    db_class = retriever.get_class(db)  # look up the db class used to fetch documents
    with db_class(**db_opts) as doc_db:  # context manager handles open/close
        doc_ids = doc_db.get_doc_ids()  # fetch all doc ids
    # enumerate(doc_ids) yields (index, doc_id) pairs starting at index 0; the
    # dict comprehension below builds the doc_id -> index mapping from them.
    DOC2IDX = {doc_id: i for i, doc_id in enumerate(doc_ids)}

    # Setup worker pool
    tok_class = tokenizers.get_class(args.tokenizer)
    workers = ProcessPool(args.num_workers,
                          initializer=init,
                          initargs=(tok_class, db_class, db_opts))

    # Compute the count matrix in steps (to keep in memory)
    logger.info('Mapping...')
    row, col, data = [], [], []
    step = max(int(len(doc_ids) / 10), 1)  # batch size (about a tenth of the corpus)
    batches = [doc_ids[i:i + step]
               for i in range(0, len(doc_ids), step)]  # split doc_ids into batches

    # functools.partial fixes some arguments of a function and returns a new
    # callable; see http://www.wklken.me/posts/2013/08/18/python-extra-functools.html
    _count = partial(count, args.ngram, args.hash_size)

    for i, batch in enumerate(batches):
        logger.info('-' * 25 + 'Batch %d/%d' % (i + 1, len(batches)) +
                    '-' * 25)
        for b_row, b_col, b_data in workers.imap_unordered(_count, batch):
            # extend the three parallel lists with this batch's results
            row.extend(b_row)    # hashed n-gram ids
            col.extend(b_col)    # document indexes
            data.extend(b_data)  # per-document n-gram counts
    workers.close()
    workers.join()

    logger.info('Creating sparse matrix...')
    # Layout of the resulting sparse matrix: rows are hashed n-grams, columns
    # are document indexes, and each entry is the count of that n-gram in that
    # document.
    count_matrix = sp.csr_matrix((data, (row, col)),
                                 shape=(args.hash_size, len(doc_ids)))

    # Merge duplicate (row, col) entries by summing them
    count_matrix.sum_duplicates()
    # Return the matrix together with the doc-id/index mapping
    return count_matrix, (DOC2IDX, doc_ids)
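
To make the DOC2IDX construction in Example #21 concrete: enumerate yields (index, value) pairs, and the dict comprehension turns them into a doc_id -> index mapping. A quick stand-alone check:

doc_ids = ['a', 'b', 'c']
print(list(enumerate(doc_ids)))                         # [(0, 'a'), (1, 'b'), (2, 'c')]
print({doc_id: i for i, doc_id in enumerate(doc_ids)})  # {'a': 0, 'b': 1, 'c': 2}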
Example #22
        # Make sure the regex compiles
        if args.regex:
            try:
                re.compile(answer[0])
            except BaseException:
                logger.warning('Regex failed to compile: %s' % answer)
                continue

        questions.append(question)
        answers.append(answer)

    # Get classes
    ranker_class = retriever.get_class(args.ranker)
    db_class = retriever.get_class(args.db)
    tokenizer_class = tokenizers.get_class(args.tokenizer)

    # Form options
    search_keys = ('regex', 'match_threshold', 'char_max', 'char_min',
                   'window_sz')
    opts = {
        'ranker_class': retriever.get_class(args.ranker),
        'tokenizer_class': tokenizers.get_class(args.tokenizer),
        'db_class': retriever.get_class(args.db),
        'search': {k: vars(args)[k]
                   for k in search_keys},
    }
    opts.update(vars(args))

    # Process!
    outname = os.path.splitext(args.data_name)[0]
Example #23
        # Make sure the regex compiles
        if args.regex:
            try:
                re.compile(answer[0])
            except BaseException:
                logger.warning('Regex failed to compile: %s' % answer)
                continue

        questions.append(question)
        answers.append(answer)

    # Get classes
    ranker_class = retriever.get_class(args.ranker)
    db_class = retriever.get_class(args.db)
    tokenizer_class = tokenizers.get_class(args.tokenizer)

    # Form options
    search_keys = ('regex', 'match_threshold', 'char_max',
                   'char_min', 'window_sz')
    opts = {
        'ranker_class': retriever.get_class(args.ranker),
        'tokenizer_class': tokenizers.get_class(args.tokenizer),
        'db_class': retriever.get_class(args.db),
        'search': {k: vars(args)[k] for k in search_keys},
    }
    opts.update(vars(args))

    # Process!
    outname = os.path.splitext(args.data_name)[0]
    outfile = os.path.join(args.out_dir, outname)
Example #24
        # Make sure the regex compiles
        if args.regex:
            try:
                re.compile(answer[0])
            except BaseException:
                logger.warning("Regex failed to compile: %s" % answer)
                continue

        questions.append(question)
        answers.append(answer)

    # Get classes
    ranker_class = retriever.get_class(args.ranker)
    db_class = retriever.get_class(args.db)
    tokenizer_class = tokenizers.get_class(args.tokenizer)

    # Form options
    search_keys = ("regex", "match_threshold", "char_max", "char_min",
                   "window_sz")
    opts = {
        "ranker_class": retriever.get_class(args.ranker),
        "tokenizer_class": tokenizers.get_class(args.tokenizer),
        "db_class": retriever.get_class(args.db),
        "search": {k: vars(args)[k]
                   for k in search_keys},
    }
    opts.update(vars(args))

    # Process!
    outname = os.path.splitext(args.data_name)[0]
Example #25
def process_dataset(data, tokenizer, workers=None):
    """Iterate processing (tokenize, parse, etc) dataset multithreaded."""
    tokenizer_class = tokenizers.get_class(tokenizer)
    init(tokenizer_class,  {'annotators': {'lemma'}})
    q_tokens = []
    c_tokens = []
    print("tokenizing questions ...")
    make_pool = partial(Pool, workers, initializer=init)
    workers1 = make_pool(initargs=(tokenizer_class, {'annotators': {'lemma'}}))
    q_tokens = workers1.map(tokenize, data['questions'])
    workers1.close()
    workers1.join()

    print("tokenizing contexts ...")
    workers2 = make_pool(
        initargs=(tokenizer_class, {'annotators': {'lemma', 'pos', 'ner'}})
    )
    c_tokens = workers2.map(tokenize, data['contexts'])
    workers2.close()
    workers2.join()
    assert(len(q_tokens) == len(c_tokens))
    for idx in range(len(q_tokens)):  # for each question
        if q_tokens[idx] is None or c_tokens[data['qid2cid'][idx]] is None:
            continue
        normal_question = q_tokens[idx]['normal_text']
        normal_context = c_tokens[data['qid2cid'][idx]]['normal_text']
        question = q_tokens[idx]['words']
        qlemma = q_tokens[idx]['lemma']
        document = c_tokens[data['qid2cid'][idx]]['words']
        offsets = c_tokens[data['qid2cid'][idx]]['offsets']
        lemma = c_tokens[data['qid2cid'][idx]]['lemma']
        pos = c_tokens[data['qid2cid'][idx]]['pos']
        ner = c_tokens[data['qid2cid'][idx]]['ner']
        ans_tokens = []
        ans = data['answers'][idx]  # the answer text
        ans_start = -2
        ans_end = 0
        ans_offsets = []
        # collect every character span where the answer text occurs
        while ans_start != -1:
            ans_start = normal_context.find(ans, ans_end)
            ans_end = ans_start + len(ans)
            if ans_start == -1:
                break
            ans_offsets.append((ans_start, ans_end))
        found = find_answer(offsets, ans_offsets)
        if found:
            ans_tokens.append(found)
            yield {
              'id': data['qids'][idx],
              'question': question,
              'document': document,
              'offsets': offsets,
              'answers': ans_tokens,
              'qlemma': qlemma,
              'lemma': lemma,
              'pos': pos,
              'ner': ner,
              'normal_question': normal_question,
              'normal_context': normal_context,
              'text_answer': ans
            }
        else:
            print("Answer not found: ----------------\n"
                  "{}\n{} {}({},{})".format(normal_question, ans,
                                            normal_context[ans_start:ans_end],
                                            ans_start, ans_end))
            print(', '.join(['{}({},{})'.format(normal_context[b:e], b, e)
                             for (b, e) in offsets]))
Example #26
    return PROCESS_DB.get_doc_text(doc_id)


def tokenize_text(text):
    global PROCESS_TOK
    return PROCESS_TOK.tokenize(text)


def ids(toks, word_dict):
    return [str(word_dict.get(tok, 0)) for tok in toks]


processes = ProcessPool(
    30,
    initializer=init,
    initargs=(tokenizers.get_class('spacy'), {}, DocDB, {
        'db_path':
        "/users/sulixin/relate_work/DrQA/DrQA/data/wikipedia/docs_para.db"
    }))


def load_qa(fi):
    infp_qa = open(fi)
    questions = []
    answers = []
    for l in infp_qa:
        if not l.strip():
            continue
        data = json.loads(l)
        questions.append(data['question'])
        answers.append(data['answer'])
Example #27
        answer = data['answer']
        questions.append(question)
        answers.append(answer)

    # get the closest docs for each question.
    logger.info('Initializing ranker...')
    ranker = retriever.get_class('tfidf')(tfidf_path=args.model)

    logger.info('Ranking...')
    closest_docs = ranker.batch_closest_docs(
        questions, k=args.n_docs, num_workers=args.num_workers
    )
    answers_docs = zip(answers, closest_docs)

    # define processes
    tok_class = tokenizers.get_class(args.tokenizer)
    tok_opts = {}
    db_class = retriever.DocDB
    db_opts = {'db_path': args.doc_db}
    processes = ProcessPool(
        processes=args.num_workers,
        initializer=init,
        initargs=(tok_class, tok_opts, db_class, db_opts)
    )

    # compute the scores for each pair, and print the statistics
    logger.info('Retrieving and computing scores...')
    get_score_partial = partial(get_score, match=args.match)
    scores = processes.map(get_score_partial, answers_docs)

    filename = os.path.basename(args.dataset)
Example #28
File: eval.py  Project: giuid/HLT
def prova(risposta, doc_ids, match):
    for doc_id in doc_ids:
        if has_answer(risposta, doc_id, match):
            return 1
    return 0


risultato = prova(risposta=risposta, doc_ids=doc_titles, match=match)

for doc_id in doc_titles:
    if has_answer(risposta, doc_id, match):
        print(1)

####################################################
tok_class = tokenizers.get_class(tokenizer)
tok_opts = {}
db_class = retriever.DocDB
db_opts = {'db_path': doc_db}
processes = ProcessPool(processes=num_workers,
                        initializer=init,
                        initargs=(tok_class, tok_opts, db_class, db_opts))

# compute the scores for each pair, and print the statistics
get_score_partial = partial(get_score, match=match)
scores = processes.map(get_score_partial, answers_docs)

filename = os.path.basename(dataset)
stats = ("\n" + "-" * 50 + "\n" + "{filename}\n" + "Examples:\t\t\t{total}\n" +
         "Matches in top {k}:\t\t{m}\n" + "Match % in top {k}:\t\t{p:2.2f}\n" +
         "Total time:\t\t\t{t:2.4f} (s)\n").format(
Example #29
        question = data['question']
        answer = data['answer']
        questions.append(question)
        answers.append(answer)

    # get the closest docs for each question.
    logger.info('Initializing ranker...')
    ranker = retriever.get_class('tfidf')(tfidf_path=args.model)

    logger.info('Ranking...')
    closest_docs = ranker.batch_closest_docs(questions,
                                             k=args.n_docs,
                                             num_workers=args.num_workers)
    ranker = []

    tok_class = tokenizers.get_class(args.tokenizer)
    tok_opts = {}
    db_class = retriever.DocDB
    db_opts = {'db_path': args.doc_db}
    PROCESS_TOK = tok_class(**tok_opts)
    Finalize(PROCESS_TOK, PROCESS_TOK.shutdown, exitpriority=100)
    PROCESS_DB = db_class(**db_opts)
    Finalize(PROCESS_DB, PROCESS_DB.close, exitpriority=100)

    #answers_docs = rerankDocs(questions, answers, closest_docs, PROCESS_DB)
    answers_docs = zip(answers, closest_docs, questions)

    logger.info('Retrieving texts and computing scores...')
    has_answers = []

    tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
Example #30
    def setUp(self):
        reader_model = 'data/reader/multitask.mdl'
        reader = DocReader.load(reader_model, normalize=False)
        tok_class = tokenizers.get_class('simple')
        init_tokenizer(tok_class)
        self.selector_ = ChoiceSelector(reader.word_dict, reader.network.embedding)
Example #31
def process_dataset(data, tokenizer, workers=None):
    """Iterate processing (tokenize, parse, etc) dataset multithreaded."""
    tokenizer_class = tokenizers.get_class(tokenizer)
    make_pool = partial(Pool, workers, initializer=init)

    workers = make_pool(initargs=(tokenizer_class, {
        'annotators': {'lemma'},
        'classpath':
        "/home/bhargavi/robust_nlp/invariance/DrQA/data/corenlp/*"
    }))
    #workers = make_pool(initargs=(tokenizer_class, {'annotators': {'lemma'}}))
    q_tokens = workers.map(tokenize, data['questions'])
    workers.close()
    workers.join()

    workers = make_pool(
        initargs=(tokenizer_class, {
            'annotators': {'lemma', 'pos', 'ner'},
            'classpath':
            "/home/bhargavi/robust_nlp/invariance/DrQA/data/corenlp/*"
        })
        # initargs=(tokenizer_class, {'annotators': {'lemma', 'pos', 'ner'}})
    )
    c_tokens = workers.map(tokenize, data['contexts'])
    workers.close()
    workers.join()

    ## code to override Pool
    # init(tokenizer_class, {'annotators': {'lemma'}, 'classpath' : "/home/bhargavi/robust_nlp/invariance/DrQA/data/corenlp/*"})
    # q_tokens = []
    # for idx in range(len(data['questions'])):
    #     q_tokens.append(tokenize(data['questions'][idx]))
    # c_tokens = []
    # for idx in range(len(data['contexts'])):
    #     c_tokens.append(tokenize(data['contexts'][idx]))

    for idx in range(len(data['qids'])):
        question = q_tokens[idx]['words']
        qlemma = q_tokens[idx]['lemma']
        document = c_tokens[data['qid2cid'][idx]]['words']
        offsets = c_tokens[data['qid2cid'][idx]]['offsets']
        lemma = c_tokens[data['qid2cid'][idx]]['lemma']
        pos = c_tokens[data['qid2cid'][idx]]['pos']
        ner = c_tokens[data['qid2cid'][idx]]['ner']
        context_sentence_boundaries = c_tokens[data['qid2cid']
                                               [idx]]['sentence_boundaries']
        ans_tokens = []
        if len(data['answers']) > 0:
            for ans in data['answers'][idx]:
                found = find_answer(offsets, ans['answer_start'],
                                    ans['answer_start'] + len(ans['text']))
                if found:
                    ans_tokens.append(found)
        ## sentences
        ans_tokens_list = list(set(ans_tokens))
        sentences = []
        gold_sentence_ids = []
        for s_idx, tup in enumerate(context_sentence_boundaries):
            for a in ans_tokens_list:
                if a[0] >= tup[0] and a[1] < tup[1]:
                    gold_sentence_ids.append(s_idx)
                elif a[0] >= tup[0] and a[0] < tup[1] and a[1] >= tup[1]:
                    gold_sentence_ids.append(s_idx)
                    gold_sentence_ids.append(s_idx + 1)
            sentence = document[tup[0]:tup[1]]
            sentences.append(sentence)
        gold_sentence_ids_set = list(set(gold_sentence_ids))
        if len(ans_tokens_list) == 0:
            print("No golden sentence available")
        ## gold_sentence_id
        yield {
            'id': data['qids'][idx],
            'question': question,
            'document': document,
            'offsets': offsets,
            'answers': ans_tokens,
            'qlemma': qlemma,
            'lemma': lemma,
            'pos': pos,
            'ner': ner,
            'sentences': sentences,
            'gold_sentence_ids': gold_sentence_ids_set,
        }
Example #32
def process_dataset(data, tokenizer, workers=None):
    """Iterate processing (tokenize, parse, etc) dataset multithreaded."""
    tokenizer_class = tokenizers.get_class(tokenizer)
    make_pool = partial(Pool, workers, initializer=init)
    workers = make_pool(initargs=(tokenizer_class, {'annotators': {'lemma'}}))
    # NOTE: this first pass over the contexts uses the lemma-only pool and is
    # overwritten below, where the contexts are re-tokenized with 'pos'/'ner'.
    c_tokens = workers.map(tokenize, data['contexts'])
    q_tokens = workers.map(tokenize, data['questions'])
    workers.close()
    workers.join()

    workers = make_pool(
        initargs=(tokenizer_class, {'annotators': {'lemma', 'pos', 'ner'}})
    )
    c_tokens = workers.map(tokenize, data['contexts'])
    workers.close()
    workers.join()

    #debug
    #print('len of data[qids]: ' + str(len(data['qids'])))

    for idx in range(len(data['qids'])):
        question = q_tokens[idx]['words']
        qlemma = q_tokens[idx]['lemma']
        document = c_tokens[data['qid2cid'][idx]]['words']
        offsets = c_tokens[data['qid2cid'][idx]]['offsets']
        sent_offsets = c_tokens[data['qid2cid'][idx]]['sent_offsets']#add jyu
        lemma = c_tokens[data['qid2cid'][idx]]['lemma']
        pos = c_tokens[data['qid2cid'][idx]]['pos']
        ner = c_tokens[data['qid2cid'][idx]]['ner']
        ans_tokens = []
        # debug
        #print('contexts: ' + str(len(str(data['contexts']))))
        #print('document: ' + str(document))
        #document_sentences = c_tokens[data['qid2cid'][idx]]['sentences']
        #print('document_sentences: ' + document_sentences)
        #sys.exit()

        sent_offsets_distict = []  # torch.IntTensor(1)
        sent_offsets_distict.append(sent_offsets[0])
        for sent_offset in sent_offsets:
            #print(str(sent_offset))
            #print(str(sent_offsets_distict[-1]))
            if sent_offsets_distict[-1][0] < sent_offset[0]:
                sent_offsets_distict.append(sent_offset)
        # sent_offsets_distict = torch.from_numpy(sent_offsets_distict)
        sent_offsets_distict_tensor = torch.from_numpy(np.asarray(sent_offsets_distict))
        #print(type(offsets))
        #print(type(sent_offsets_distict_tensor))
        #print(type(sent_offsets_distict))
        #sent_offsets_distict_tensor =  torch.IntTensor(sent_offsets_distict_tensor)

        sent_index_offsets = []
        sent_index_offset_cur = []
        sent_index_offset_cur.append(0)
        for i in range(len(sent_offsets)-1):
            if sent_offsets[i] != sent_offsets[i+1]:
                sent_index_offset_cur.append(i)
                sent_index_offsets.append(sent_index_offset_cur)
                sent_index_offset_cur = []
                sent_index_offset_cur.append(i+1)
        sent_index_offset_cur.append(len(sent_offsets) - 1)
        sent_index_offsets.append(sent_index_offset_cur)

        if len(data['answers']) > 0:
            for ans in data['answers'][idx]:
                found = find_answer_sentence(sent_offsets_distict_tensor, ans['answer_start'])
                '''
                found = find_answer(offsets,
                                    ans['answer_start'],
                                    ans['answer_start'] + len(ans['text']))
                '''
                if found:
                    ans_tokens.append(found)
        yield {
            'id': data['qids'][idx],
            'question': question,
            'document': document,
            #'sent_offsets_duplicates': sent_offsets,
            'sent_offsets': sent_index_offsets,  # added to switch to sentence level (jyu)
            'offsets': offsets,
            'answers': ans_tokens,
            'qlemma': qlemma,
            'lemma': lemma,
            'pos': pos,
            'ner': ner,
        }
Example #33
def get_count_matrix(args, db, db_opts):
    """Form a sparse word to document count matrix (inverted index).

    M[i, j] = # times word i appears in document j.
    """
    # Map doc_ids to indexes
    global DOC2IDX
    db_class = retriever.get_class(
        db)  # drqa/retriever/__init__.py --> doc_db.py
    with db_class(**db_opts) as doc_db:
        doc_ids = doc_db.get_doc_ids(
        )  # Fetch all ids of docs stored in the db.
    DOC2IDX = {doc_id: i
               for i, doc_id in enumerate(doc_ids)
               }  # store in {'3255': 0, '8902': 1, ...}

    # Setup worker pool
    tok_class = tokenizers.get_class(
        args.tokenizer
    )  # 'corenlp', drqa/tokenizers/__init__.py --> corenlp_tokenizer.py
    workers = ProcessPool(args.num_workers,
                          initializer=init,
                          initargs=(tok_class, db_class, db_opts))

    # Compute the count matrix in steps (to keep in memory)
    logger.info('Mapping...')
    row, col, data = [], [], []
    step = max(int(len(doc_ids) / 10), 1)
    batches = [doc_ids[i:i + step]
               for i in range(0, len(doc_ids), step)]  # total 10 batches
    _count = partial(
        count, args.ngram,
        args.hash_size)  # args.hash_size --> default=int(math.pow(2, 24))
    for i, batch in enumerate(batches):
        logger.info('-' * 25 + 'Batch %d/%d' % (i + 1, len(batches)) +
                    '-' * 25)
        for b_row, b_col, b_data in workers.imap_unordered(_count, batch):
            row.extend(b_row)
            col.extend(b_col)
            data.extend(b_data)
    workers.close()
    workers.join()

    logger.info('Creating sparse matrix...')
    """
    csr_matrix((data, (row_ind, col_ind)), [shape=(M, N)])
            where ``data``, ``row_ind`` and ``col_ind`` satisfy the
            relationship ``a[row_ind[k], col_ind[k]] = data[k]``.
    
    Examples:
        >>> row = np.array([0, 0, 1, 2, 2, 2])
        >>> col = np.array([0, 2, 2, 0, 1, 2])
        >>> data = np.array([1, 2, 3, 4, 5, 6])
        >>> csr_matrix((data, (row, col)), shape=(3, 3)).toarray()
        array([[1, 0, 2],
               [0, 0, 3],
               [4, 5, 6]])
    
    count_matrix: shape=(args.hash_size, len(doc_ids))
    
              doc_1   doc_2  ...   doc_m
    word_1    [[1,      0,   ...    2],
    word_2     [0,      0,   ...    3],
     ...                ...
    word_n     [4,      5,   ...    6]]
    
    i.e., (word_1, doc_m) denotes word 'word_1' appear 2 times in doc 'doc_m'.
    
    Reference: https://towardsdatascience.com/machine-learning-to-big-data-scaling-inverted-indexing-with-solr-ba5b48833fb4
    """
    count_matrix = sp.csr_matrix(  # import scipy.sparse as sp
        (data, (row, col)),
        shape=(args.hash_size, len(doc_ids)))
    count_matrix.sum_duplicates()
    return count_matrix, (DOC2IDX, doc_ids)
Example #34
            for paragraph in article['paragraphs']:
                for qa in paragraph['qas']:
                    qids.append(qa['id'])
                    questions.append(qa['question'])

    # ------------------------------------------------------------------------------
    # Retrieve most relevant documents from dataset
    # ------------------------------------------------------------------------------

    ranker = retriever.get_class('tfidf')(tfidf_path=args.retriever_model)
    retrieved_doc_ids = ranker.batch_closest_docs(questions,
                                                  k=args.n_docs,
                                                  num_workers=args.num_workers)

    # define processes
    tok_class = tokenizers.get_class(args.retriever_tokenizer)
    tok_opts = {}
    db_class = retriever.DocDB
    db_opts = {'db_path': args.doc_db}
    processes = ProcessPool(processes=args.num_workers,
                            initializer=init,
                            initargs=(tok_class, tok_opts, db_class, db_opts))

    contexts = processes.map(retrieve_documents, retrieved_doc_ids)
    examples = []
    for i, question in enumerate(questions):
        context = contexts[i][0] if len(contexts[i]) > 0 else "_"
        examples.append((context, question))

    # ------------------------------------------------------------------------------
    # Read in dataset and make predictions.