Example #1
    def __init__(self):
        # Initialize the pyltp tokenizer with NER and POS annotators
        annotators = {'ner', 'pos'}
        ltp_tokenizer = tokenizers.get_class('ltp')
        self.tokenizer = ltp_tokenizer(annotators=annotators)
        # Regex pattern for date detection; needs improvement.
        self.date_pattern = r'\d{2,4}(年|月|日|世纪|年代)'
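As a quick, illustrative check of the pattern above (not part of the original snippet): it only catches numbers of two to four digits, and because regex alternation is ordered, 年 wins over 年代, which may be part of what the "needs improvement" comment refers to.

import re

date_pattern = r'\d{2,4}(年|月|日|世纪|年代)'
for text in ['2008年5月12日', '20世纪90年代', '无日期']:
    # group(0) is the full match, e.g. '2008年'
    print([m.group(0) for m in re.finditer(date_pattern, text)])
# ['2008年', '12日']   (the single-digit '5月' is missed)
# ['20世纪', '90年']   ('90年代' is truncated to '90年')
# []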
    def __init__(self, tfidf_path=None, strict=True):
        """
        Args:
            tfidf_path: path to saved model file
            strict: fail on empty queries or continue (and return empty result)
        """
        # Load from disk
        tfidf_path = tfidf_path or DEFAULTS['tfidf_path']
        logger.info('Loading %s' % tfidf_path)
        matrix, metadata = utils.load_sparse_csr(tfidf_path)
        self.doc_mat = matrix
        self.ngrams = metadata['ngram']
        self.hash_size = metadata['hash_size']
        self.tokenizer = tokenizers.get_class(metadata['tokenizer'])()
        self.doc_freqs = metadata['doc_freqs'].squeeze()
        self.doc_dict = metadata['doc_dict']
        self.num_docs = len(self.doc_dict[0])
        self.strict = strict

        self.csc_matrix = None
        self.unigrams = metadata['unigrams']
        self.bigrams = metadata['bigrams']
        self.hash2gram = metadata['hash2gram']
        self.title_tfidf = metadata['title_tfidf']

        self.titles_tokens = []
        self.title_csc_matrix = None
        self.titles_lens = None
Example #3
def get_count_matrix(args, db, db_opts):
    """Form a sparse word to document count matrix (inverted index).

    M[i, j] = # times word i appears in document j.
    """
    # Map doc_ids to indexes
    global DOC2IDX
    db_class = retriever.get_class(db)
    with db_class(**db_opts) as doc_db:
        doc_ids = doc_db.get_doc_ids()[:args.num_docs]
    DOC2IDX = {doc_id: i for i, doc_id in enumerate(doc_ids)}

    # Setup worker pool
    tok_class = tokenizers.get_class(args.tokenizer)
    workers = ProcessPool(args.num_workers,
                          initializer=init,
                          initargs=(tok_class, db_class, db_opts))

    # Compute the count matrix in steps (to keep in memory)
    logger.info('Mapping...')
    row, col, data = [], [], []
    unigrams, bigrams = [], []
    hash2gram = {}
    step = max(int(len(doc_ids) / 10), 1)
    batches = [doc_ids[i:i + step] for i in range(0, len(doc_ids), step)]
    _count = partial(count, args.ngram, args.hash_size)
    for i, batch in enumerate(batches):
        logger.info('-' * 25 + 'Batch %d/%d' % (i + 1, len(batches)) +
                    '-' * 25)
        for b_row, b_col, b_data, b_unigrams, b_bigrams, b_hash2gram in workers.imap_unordered(
                _count, batch):
            row.extend(b_row)
            col.extend(b_col)
            data.extend(b_data)
            unigrams.extend(b_unigrams)
            bigrams.extend(b_bigrams)
            hash2gram.update(b_hash2gram)
    workers.close()
    workers.join()

    unigrams = list(set(unigrams))
    bigrams = list(set(bigrams))

    logger.info('Creating sparse matrix...')

    count_matrix = None
    if args.matrix_type == 'csr':
        count_matrix = sp.csr_matrix((data, (row, col)),
                                     shape=(args.hash_size, len(doc_ids)))
        count_matrix.sum_duplicates()
    elif args.matrix_type == 'csc':
        count_matrix = sp.csc_matrix((data, (row, col)),
                                     shape=(args.hash_size, len(doc_ids)))
        count_matrix.sum_duplicates()
    return count_matrix, (DOC2IDX, doc_ids), (unigrams, bigrams, hash2gram)
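The doc_freqs array that Examples #1 and #6 consume (the number of documents containing each hashed n-gram) can be derived directly from this count matrix. A minimal sketch, assuming a helper named get_doc_freqs (the name and exact placement are not shown in the listing):

import numpy as np

def get_doc_freqs(count_matrix):
    """Binarize the counts, then sum over the document axis so that
    freqs[h] = number of documents whose text hashes into bucket h."""
    binary = (count_matrix > 0).astype(int)
    freqs = np.array(binary.sum(axis=1)).squeeze()
    return freqs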
Example #4
def build_simhash(args, source='db'):
    title2text = {}
    titles = []
    # retrieve docs from db
    if source == 'db':
        title2text = read_docs_from_db(args, args.doc_db, args.db_opts)
        titles = list(title2text.keys())
    # retrieve docs from json
    elif source == 'json':
        title2text = read_drqa_format_dataset_as_dict(args.json_path)
        titles = list(title2text.keys())
        # limit the number of docs when testing the code
        if args.num_docs > 0:
            titles = titles[:args.num_docs]
            title2text = {title: title2text[title] for title in titles}

    logger.info('Mapping...')
    title2hash = []
    tok_class = tokenizers.get_class(args.tokenizer)
    # multiprocessing
    if args.work_type == 'multi':
        # Setup worker pool
        workers = ProcessPool(args.num_workers,
                              initializer=init,
                              initargs=(tok_class,
                                        retriever.get_class(args.doc_db), {
                                            'db_path': args.doc_db
                                        }))
        step = max(int(len(title2text) / 10), 1)
        batches = [titles[i:i + step] for i in range(0, len(titles), step)]
        _convert = partial(title2text_dic_2_title2hash_dic, title2text)

        # map doc text to simhash using multiprocess

        for i, batch in enumerate(batches):
            logger.info('-' * 25 + 'Batch %d/%d' % (i + 1, len(batches)) +
                        '-' * 25)
            for title, simhash in workers.imap_unordered(_convert, batch):
                title2hash.append((title, simhash))
        workers.close()
        workers.join()

    # single processing
    elif args.work_type == 'single':
        with tqdm(total=len(title2text)) as pbar:
            for (k, v) in title2text.items():
                title2hash.append(
                    title2text_dic_2_title2hash_dic(title2text, k))
                pbar.update()
    return title2hash
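The (title, simhash) pairs built here plug into the SimhashIndex imported in Example #8, assuming the stored values are Simhash objects as that import suggests. A minimal near-duplicate query sketch; the query title is illustrative and k is the tolerated Hamming distance between fingerprints:

from simhash import SimhashIndex

title2hash = build_simhash(args, source='json')
index = SimhashIndex(title2hash, k=3)             # objs are (title, Simhash) pairs
query_hash = dict(title2hash)['工商管理']           # illustrative title
near_duplicates = index.get_near_dups(query_hash)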
Example #5
def process_dataset(data, tokenizer, workers=None):
    """Iterate processing (tokenize, parse, etc) dataset multithreaded."""
    tokenizer_class = tokenizers.get_class(tokenizer)
    make_pool = partial(Pool, workers, initializer=init)
    workers = make_pool(initargs=(tokenizer_class, {'annotators': {'lemma'}}))
    q_tokens = workers.map(tokenize, data['questions'])
    workers.close()
    workers.join()

    workers = make_pool(initargs=(tokenizer_class, {
        'annotators': {'lemma', 'pos', 'ner'}
    }))
    c_tokens = workers.map(tokenize, data['contexts'])
    workers.close()
    workers.join()

    for idx in range(len(data['qids'])):
        question = q_tokens[idx]['words']
        qlemma = q_tokens[idx]['lemma']
        document = c_tokens[data['qid2cid'][idx]]['words']
        offsets = c_tokens[data['qid2cid'][idx]]['offsets']
        lemma = c_tokens[data['qid2cid'][idx]]['lemma']
        pos = c_tokens[data['qid2cid'][idx]]['pos']
        ner = c_tokens[data['qid2cid'][idx]]['ner']
        ans_tokens = []
        if len(data['answers']) > 0:
            for ans in data['answers'][idx]:
                found = find_answer(offsets, ans['answer_start'],
                                    ans['answer_start'] + len(ans['text']))
                if found:
                    ans_tokens.append(found)
        yield {
            'id': data['qids'][idx],
            'question': question,
            'document': document,
            'offsets': offsets,
            'answers': ans_tokens,
            'qlemma': qlemma,
            'lemma': lemma,
            'pos': pos,
            'ner': ner,
        }
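A typical way to consume this generator is to serialize each processed example as one JSON line; the output path below and the name of the loaded data dict are illustrative.

import json

# `dataset` is the loaded dict with 'qids', 'questions', 'contexts', 'qid2cid', 'answers'.
with open('processed.jsonl', 'w', encoding='utf8') as f:   # illustrative path
    for ex in process_dataset(dataset, tokenizer='ltp', workers=8):
        f.write(json.dumps(ex, ensure_ascii=False) + '\n')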
Example #6
def get_title_tfidf_matrix(args, db, db_opts, doc_freqs):
    """"""
    db_class = retriever.get_class(db)
    with db_class(**db_opts) as doc_db:
        doc_ids = doc_db.get_doc_ids()[:args.num_docs]
    tokenizer = tokenizers.get_class(args.tokenizer)()
    DOC2IDX = {doc_id: i for i, doc_id in enumerate(doc_ids)}

    rows, cols, datas = [], [], []
    for doc_id in doc_ids:
        words = tokenizer.tokenize(doc_id).ngrams(
            n=args.ngram, uncased=True, filter_fn=retriever.utils.filter_ngram)
        wids = [retriever.utils.hash(w, args.hash_size) for w in words]

        if len(wids) == 0:
            logger.warning('No valid word in: %s' % doc_id)
            continue

        # Count TF
        wids_unique, wids_counts = np.unique(wids, return_counts=True)
        tfs = np.log1p(wids_counts)

        # Count IDF
        Ns = doc_freqs[wids_unique]
        idfs = np.log((len(doc_ids) - Ns + 0.5) / (Ns + 0.5))
        idfs[idfs < 0] = 0

        # TF-IDF
        data = np.multiply(tfs, idfs)

        # add row num, col num and data
        rows.extend(wids_unique)
        cols.extend([DOC2IDX[doc_id]] * len(data))
        datas.extend(data)

    # build scipy sparse csr_matrix
    tfidf_matrix = sp.csr_matrix((datas, (rows, cols)),
                                 shape=(args.hash_size, len(doc_ids)))
    tfidf_matrix.sum_duplicates()

    return tfidf_matrix
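To make the weighting concrete: with len(doc_ids) = 1000 documents and a hashed term that occurs 3 times in the current title and in 10 titles overall, the entry is log1p(3) * log((1000 - 10 + 0.5) / (10 + 0.5)). A quick numeric check:

import numpy as np

num_docs = 1000                 # len(doc_ids), illustrative
tf = np.log1p(3)                # term occurs 3 times in this title
Ns = 10                         # doc_freqs value: term occurs in 10 titles
idf = max(np.log((num_docs - Ns + 0.5) / (Ns + 0.5)), 0)
print(tf * idf)                 # ~6.30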
"""Document retriever based on bm25 for comparision with default weight-tfidf model."""

import sys
sys.path.append('/home/zrx/projects/MbaQA/')

from tqdm import tqdm
from gensim import corpora
from gensim.summarization import bm25

from mbaqa import retriever, tokenizers

docdb = retriever.get_class('sqlite')()
tokenizer = tokenizers.get_class('ltp')()

titles = docdb.get_doc_ids()[:]
IDX2TITLE = {idx: titles[idx] for idx in range(len(titles))}

stop_words_path = '../../data/stopwords/stopwords.txt'
stopwords = []
with open(stop_words_path, encoding='utf8') as file:
    for line in file:
        stopwords.append(line.replace('\n', '').strip())


corpus = []
with tqdm(total=len(titles)) as pbar:
    for title in titles:
        # Tokenize
        tokens = tokenizer.tokenize(retriever.utils.normalize(docdb.get_doc_text(title)))

        # Get ngrams from tokens, with stopword/punctuation filtering.
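        # --- The listing is truncated here. The rest of this loop and the BM25
        # --- construction below are an assumed completion, modeled on the
        # --- ngrams() call in Example #6; the choice of n=1 is illustrative.
        words = tokens.ngrams(n=1, uncased=True,
                              filter_fn=retriever.utils.filter_ngram)
        corpus.append([w for w in words if w not in stopwords])
        pbar.update()

# Build the BM25 model over the tokenized corpus.
# Note: gensim.summarization.bm25 exists only in gensim 3.x, and older 3.x
# releases also require an average_idf argument to get_scores().
bm25_model = bm25.BM25(corpus)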
Example #8
import logging
from functools import partial
from multiprocessing.util import Finalize

from simhash import Simhash, SimhashIndex

from mbaqa import tokenizers

logger = logging.getLogger()
logger.setLevel(logging.INFO)
fmt = logging.Formatter('%(asctime)s: [ %(message)s ]', '%m/%d/%Y %I:%M:%S %p')
console = logging.StreamHandler()
console.setFormatter(fmt)
logger.addHandler(console)

# ------------------------------------------------------------------------------
# Multiprocessing functions
# ------------------------------------------------------------------------------

DOC2IDX = None
PROCESS_TOK = tokenizers.get_class('ltp')()
PROCESS_DB = None


def init(tokenizer_class, db_class, db_opts):
    global PROCESS_TOK, PROCESS_DB
    PROCESS_TOK = tokenizer_class()
    Finalize(PROCESS_TOK, PROCESS_TOK.shutdown, exitpriority=100)
    PROCESS_DB = db_class(**db_opts)
    Finalize(PROCESS_DB, PROCESS_DB.close, exitpriority=100)


def fetch_text(doc_id):
    global PROCESS_DB
    return PROCESS_DB.get_doc_text(doc_id)
Example #9
    logger.info('Ranking...')
    closest_docs = ranker.batch_closest_docs(questions,
                                             k=args.n_docs,
                                             title_weight=args.title_weight,
                                             num_workers=args.num_workers)

    # closest_docs = []
    # with tqdm(total=len(questions)) as pbar:
    #     for question in questions:
    #         closest_docs.append(ranker.closest_docs_by_content_and_title(question, title_weight=args.title_weight, k=5))
    #         pbar.update()

    answers_docs = zip(answers, closest_docs)

    # define processes
    tok_class = tokenizers.get_class(args.tokenizer)
    tok_opts = {}
    db_class = retriever.DocDB
    db_opts = {'db_path': args.doc_db}
    processes = ProcessPool(processes=args.num_workers,
                            initializer=init,
                            initargs=(tok_class, tok_opts, db_class, db_opts))

    # compute the scores for each pair, and print the statistics
    logger.info('Retrieving and computing scores...')
    get_score_partial = partial(get_score, match=args.match)
    scores = processes.map(get_score_partial, answers_docs)

    # get failing questions
    failing_questions = [(questions[i], answers[i][0], closest_docs[i])
                         for i in range(len(scores)) if scores[i] == 0]
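    # --- The listing ends here; the summary below is an assumed sketch of the
    # --- statistics the script goes on to print (top-k retrieval accuracy),
    # --- assuming each score is 1 when the answer was found and 0 otherwise.
    total = len(scores)
    matched = sum(scores)
    logger.info('Examples: %d' % total)
    logger.info('Matches in top %d: %d (%.2f%%)' %
                (args.n_docs, matched, 100.0 * matched / total))
    logger.info('Failing questions: %d' % len(failing_questions))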