Python get_class Examples, drqa.retriever.get_class Python Examples

Example #1

0

Show file

File: qa.py Project: adamviola/piazza-qa

    def __init__(self):
        """Load the retriever, reader, answerability classifier and document texts.

        Every path and model name used here (``corenlp_path``, ``model_path``,
        ``tfidf_path``, ``model_name``, ``load_name``, ``docs_json_path``) is
        read from names defined outside this method.

        Side effects: sets DrQA defaults through ``dk``/``dr`` and disables
        gradient tracking globally via ``torch.set_grad_enabled(False)``.
        """
        import ast

        dk.set_default('corenlp_classpath', corenlp_path)
        dr.set_default('model', model_path)

        # DrQA retriever
        self.retriever = ret.get_class('tfidf')(tfidf_path=tfidf_path)

        # DrQA reader
        self.reader = dr.Predictor(model_path, "corenlp", normalize=True)

        # Answerability classifier: lower-casing is enabled only when the
        # model name contains "uncased".
        self.tokenizer = BertTokenizer.from_pretrained(
            model_name,
            do_lower_case="uncased" in model_name,
        )  # , cache_dir=cache_directory)
        self.pretrained_model = Model()

        # The map_location callable returns the storage it is given unchanged.
        checkpoint = torch.load(load_name,
                                map_location=lambda storage, loc: storage)
        self.pretrained_model.load_state_dict(checkpoint['state_dict'])

        # Inference only: clear gradients, switch to eval mode, freeze weights.
        self.pretrained_model.zero_grad()
        self.pretrained_model.eval()
        self.pretrained_model.freeze()
        torch.set_grad_enabled(False)

        # Creates a map from document id to document text.
        # Each line holds one dict literal with "id" and "text" keys.
        # ast.literal_eval parses literals only, so a crafted line cannot
        # execute arbitrary code the way eval() would.
        self.docs_txt = {}
        with open(docs_json_path, encoding='utf-8') as docs_text:
            for line in docs_text:
                record = ast.literal_eval(line)
                self.docs_txt[record["id"]] = record["text"]

Example #2

0

Show file

def get_count_matrix(args, db, db_opts):
    """Build the sparse word-by-document count matrix (inverted index).

    Entry ``M[i, j]`` is the number of times word ``i`` occurs in
    document ``j``.

    Returns:
        ``(count_matrix, (DOC2IDX, doc_ids))``; ``DOC2IDX`` is also stored
        in the module-level global of the same name.
    """
    global DOC2IDX

    # Read every document id and assign each one its position as index.
    db_class = retriever.get_class(db)
    with db_class(**db_opts) as doc_db:
        doc_ids = doc_db.get_doc_ids()
    DOC2IDX = {doc_id: position for position, doc_id in enumerate(doc_ids)}

    # ``init`` is called directly here rather than as a pool initializer.
    tok_class = tokenizers.get_class(args.tokenizer)
    init(tok_class, db_class, db_opts)

    # Count in roughly ten batches so progress can be logged.
    logger.info('Mapping...')
    rows, cols, values = [], [], []
    n_docs = len(doc_ids)
    step = max(int(n_docs / 10), 1)
    batches = [doc_ids[lo:lo + step] for lo in range(0, n_docs, step)]
    count_doc = partial(count, args.ngram, args.hash_size)
    rule = '-' * 25
    for batch_no, batch in enumerate(batches, 1):
        logger.info(rule + 'Batch %d/%d' % (batch_no, len(batches)) + rule)
        for doc_rows, doc_cols, doc_values in map(count_doc, batch):
            rows += doc_rows
            cols += doc_cols
            values += doc_values

    logger.info('Creating sparse matrix...')
    count_matrix = sp.csr_matrix(
        (values, (rows, cols)), shape=(args.hash_size, n_docs)
    )
    count_matrix.sum_duplicates()
    return count_matrix, (DOC2IDX, doc_ids)

Example #3

0

Show file

File: simpleDrQA.py Project: zxsted/DrQA_cn

 def __init__(self, predictor, rankerPath, dbPath, ebdPath=None):
     """Wire together the predictor, TF-IDF ranker, SQLite cursor and scorers.

     Args:
         predictor: stored unchanged as ``self.predictor``.
         rankerPath: passed to the TF-IDF ranker as ``tfidf_path``.
         dbPath: SQLite database path; a cursor on it is kept as ``self.db``.
         ebdPath: passed to ``contextScore`` (defaults to ``None``).
     """
     self.predictor = predictor

     ranker_cls = retriever.get_class('tfidf')
     self.ranker = ranker_cls(tfidf_path=rankerPath)

     # Only the cursor is stored; the connection object itself is not kept
     # as an attribute.
     self.db = sqlite3.connect(dbPath).cursor()

     self.filter = filtText('drqa/features/map.txt')
     self.score = contextScore(ebdPath)

Example #4

0

Show file

 def __init__(self, db_path, model):
     """Open the document database and load the TF-IDF ranker.

     Args:
         db_path: passed to ``retriever.DocDB`` as ``db_path``.
         model: tfidf model path
     """
     self.doc_db = retriever.DocDB(db_path=db_path)
     ranker_cls = retriever.get_class('tfidf')
     self.ranker = ranker_cls(tfidf_path=model)

Example #5

0

Show file

File: build_rank.py Project: yumoxu/DrQA

def rank(args):
    """Rank documents against the TF-IDF index and dump results as JSON lines.

    Every line of ``args.data_path`` is parsed as a JSON object with ``id``
    and ``text`` keys. The text is used as the query, and the ``args.k``
    closest documents are appended to
    ``<args.out_dir>/<basename>-<args.k>.rank`` as one JSON object per line
    with the keys ``doc_id``, ``rank_ids`` and ``rank_scores``.

    Args:
        args: namespace providing ``model``, ``data_path``, ``out_dir``
            and ``k``.
    """
    logger.info('Initializing ranker...')
    ranker = retriever.get_class('tfidf')(tfidf_path=args.model)

    basename = os.path.splitext(os.path.basename(args.data_path))[0]
    dump_path = os.path.join(args.out_dir, f'{basename}-{args.k}.rank')
    logger.info(f'Dumping rank jsons to {dump_path}')

    # The dump file is opened once (still in append mode) instead of once per
    # input line. UTF-8 is explicit because ensure_ascii=False can emit
    # non-ASCII text that a platform default codec may be unable to encode.
    with io.open(args.data_path, encoding='utf-8') as json_file, \
            open(dump_path, 'a', encoding='utf-8') as dump_file:
        for idx, line in enumerate(json_file):
            input_json = json.loads(line.strip('\n'))
            doc_id, doc = input_json['id'], input_json['text']

            doc_names, doc_scores = ranker.closest_docs(query=doc, k=args.k)

            dump_json = {
                'doc_id': doc_id,
                'rank_ids': list(doc_names),
                'rank_scores': list(doc_scores),
            }
            json_str = json.dumps(dump_json, ensure_ascii=False)
            dump_file.write(json_str + '\n')

            # Progress report every 1000 lines (skipping line 0).
            if idx and idx % 1000 == 0:
                logger.info(f'\t{idx} finished...')
                logger.info(f'\tExample: {json_str}')

Example #6

0

Show file

File: build_elastic_tfidf.py Project: samdash/DrQA-Elastic

def get_count_matrix(args, db, db_opts):
    """Form a sparse word to document count matrix (inverted index).

    M[i, j] = # times word i appears in document j.

    Document ids are collected from an Elasticsearch scroll over the
    ``htts`` index (via the ``scrollr`` / ``extract_references`` helpers)
    rather than from the document DB.

    Returns:
        ``(count_matrix, (DOC2IDX, doc_ids))``; ``DOC2IDX`` is also stored
        in the module-level global of the same name.
    """
    global DOC2IDX
    db_class = retriever.get_class(db)
    with db_class(**db_opts):
        # NOTE(review): the DB is opened and closed without being read;
        # presumably a leftover from the DocDB-based id lookup. Confirm
        # whether this open can be dropped.
        pass

    # Start a scrolling match-all search and collect the ids it yields.
    res = es.search(index="htts", doc_type="htts",
                    body={"size": 500, "query": {"match_all": {}}},
                    scroll='10m')
    scroll_id = res['_scroll_id']
    doc_ids = []
    for ref in scrollr(es, scroll_id, extract_references):
        print(ref)
        doc_ids.append(ref)

    # Map doc_ids to indexes
    DOC2IDX = {doc_id: i for i, doc_id in enumerate(doc_ids)}

    # Setup worker pool
    tok_class = tokenizers.get_class(args.tokenizer)
    workers = ProcessPool(
        args.num_workers,
        initializer=init,
        initargs=(tok_class, db_class, db_opts)
    )

    # Compute the count matrix in steps (to keep in memory)
    logger.info('Mapping...')
    row, col, data = [], [], []
    step = max(int(len(doc_ids) / 10), 1)
    batches = [doc_ids[i:i + step] for i in range(0, len(doc_ids), step)]
    _count = partial(count, args.ngram, args.hash_size)
    for i, batch in enumerate(batches):
        logger.info('-' * 25 + 'Batch %d/%d' % (i + 1, len(batches)) + '-' * 25)
        for b_row, b_col, b_data in workers.imap_unordered(_count, batch):
            row.extend(b_row)
            col.extend(b_col)
            data.extend(b_data)
    workers.close()
    workers.join()

    logger.info('Creating sparse matrix...')
    count_matrix = sp.csr_matrix(
        (data, (row, col)), shape=(args.hash_size, len(doc_ids))
    )
    count_matrix.sum_duplicates()
    return count_matrix, (DOC2IDX, doc_ids)

Example #7

0

Show file

def init():
    """Load the ranker, topic-model artefacts and spaCy pipeline into globals.

    Populates the module-level names ``ranker``, ``nlp``,
    ``df_topic_keywords``, ``lda_model`` and ``vectorizer``. Model files are
    read from ``ROOT_DIR / 'model'``.
    """
    global ranker, nlp, df_topic_keywords, lda_model, vectorizer
    print('Initializing app')
    ranker = retriever.get_class('tfidf')(tfidf_path=MODEL)
    print('ranker:', ranker)

    model_dir = ROOT_DIR / 'model'
    df_topic_keywords = pd.read_pickle(model_dir / 'df_topic_keywords.pkl')

    # NOTE: unpickling can execute arbitrary code; these files must come from
    # a trusted source. The handles are closed as soon as loading finishes.
    with open(model_dir / 'best_lda_model.pkl', 'rb') as model_file:
        lda_model = pickle.load(model_file)
    with open(model_dir / 'tm_features.pkl', 'rb') as vocab_file:
        vocabulary = pickle.load(vocab_file)

    vectorizer = CountVectorizer(decode_error='replace', vocabulary=vocabulary)
    nlp = spacy.load('en', disable=['parser', 'ner'])

Example #8

0

Show file

 def __init__(self, tfidf_path, tokenizer, use_stopwords=False,
              qclassifier=None):
     """Set up the answerer with a tokenizer and a TF-IDF ranker.

     Args:
         tfidf_path: passed to the TF-IDF ranker as ``tfidf_path``.
         tokenizer: stored unchanged as ``self.tokenizer``.
         use_stopwords: stored unchanged as ``self.use_stopwords``.
         qclassifier: forwarded to ``Answerer.__init__``.
     """
     Answerer.__init__(self, qclassifier)
     self.tokenizer = tokenizer

     tfidf_ranker_cls = retriever.get_class('tfidf')
     self.ranker = tfidf_ranker_cls(tfidf_path=tfidf_path)

     # ``stopwords`` is not a parameter; it resolves to a name defined
     # outside this method.
     self.stopwords = stopwords
     self.use_stopwords = use_stopwords

Example #9

0

Show file

File: sample_application.py Project: tuhinjubcse/DeSePtion-ACL2020

def my_sample_fever():
    """Configure logging, load the models and start the FEVER web API.

    The system configuration is read from the JSON file named by the
    ``CONFIG_PATH`` environment variable (default
    ``configs/system_config.json``).

    Returns:
        Whatever ``fever_web_api(predict)`` returns.
    """
    logger = logging.getLogger()
    dictConfig({
        'version': 1,
        'formatters': {
            'default': {
                'format':
                '[%(asctime)s] %(levelname)s in %(module)s: %(message)s',
            }
        },
        'handlers': {
            'wsgi': {
                'class': 'logging.StreamHandler',
                'stream': 'ext://sys.stderr',
                'formatter': 'default'
            }
        },
        'root': {
            'level': 'INFO',
            'handlers': ['wsgi']
        },
        # NOTE(review): this entry sits at the top level instead of under a
        # 'loggers' key, so it is probably not applied as a logger
        # configuration. Confirm the intent before moving it (moving it
        # without 'propagate': False would duplicate messages).
        'allennlp': {
            'level': 'INFO',
            'handlers': ['wsgi']
        },
    })

    logger.info("Columbia FEVER application")
    config_path = os.getenv("CONFIG_PATH", "configs/system_config.json")
    # Context manager so the config file handle is closed after parsing.
    with open(config_path) as config_file:
        config = json.load(config_file)

    ner_predictor = Predictor.from_path(
        "https://s3-us-west-2.amazonaws.com/allennlp/models/fine-grained-ner-model-elmo-2018.12.21.tar.gz"
    )
    google_config = GoogleConfig(**config['retrieval']['google'])
    ranker = retriever.get_class('tfidf')(
        tfidf_path=config['retrieval']['tfidf']['index'])

    # 'path' is popped so the remaining entries can be forwarded as keyword
    # arguments to ColumbiaPredictor.
    predictors = {}
    for key in ('page_model', 'state_model'):
        path = config[key].pop('path')
        predictors[key] = ColumbiaPredictor(path, config['cuda_device'],
                                            **config[key])

    # The prediction function that is passed to the web server for FEVER2.0
    def predict(instances):
        predictions = getDocsSingle(instances, google_config, ner_predictor,
                                    ranker)
        # The page model runs first and its output feeds the state model.
        for key in ('page_model', 'state_model'):
            predictions = list(predictors[key].predict(predictions))
        return predictions

    return fever_web_api(predict)

Example #10

0

Show file

File: top_n.py Project: deltonmyalil/ML_Lab_Project

    def __init__(self, db, n_docs, n_sents, whole_docs, compat, model):
        """Store the retrieval settings and build the TF-IDF and NER retrievers.

        Args:
            db: forwarded to the parent constructor.
            n_docs, n_sents, whole_docs, compat: stored unchanged as
                attributes of the same names.
            model: passed to the TF-IDF ranker as ``tfidf_path``.
        """
        super().__init__(db)
        self.n_docs, self.n_sents = n_docs, n_sents
        self.whole_docs, self.compat = whole_docs, compat

        ranker_cls = retriever.get_class('tfidf')
        self.ranker = ranker_cls(tfidf_path=model)
        self.onlineranker_args = self.RankArgs()

        # One title per index in the ranker, in index order.
        titles = []
        for doc_index in range(self.ranker.num_docs):
            titles.append(self.ranker.get_doc_id(doc_index))
        self.doc_titles = titles
        self.ner_retriever = NER_Retriever(self.doc_titles)

Example #11

0

Show file

    def __init__(self, name, retriever_model, num_threads):
        """Create one TF-IDF ranker per worker thread.

        Args:
            name: forwarded to the parent constructor.
            retriever_model: passed to each ranker as ``tfidf_path``.
            num_threads: requested thread count, capped at the CPU count.
        """
        super().__init__(name)

        cpu_total = int(multiprocessing.cpu_count())
        self.num_threads = min(num_threads, cpu_total)

        # initialize a ranker per thread
        self.arguments = []
        for thread_id in tqdm(range(self.num_threads)):
            thread_ranker = retriever.get_class("tfidf")(
                tfidf_path=retriever_model)
            self.arguments.append({"id": thread_id, "ranker": thread_ranker})

Example #12

0

Show file

    def __init__(self, db, model, max_page, max_sent):
        """Store the retrieval limits and load the TF-IDF ranker.

        Args:
            db: stored unchanged as ``self.db``.
            model: stored as ``self.model`` and passed to the ranker as
                ``tfidf_path``.
            max_page: stored as ``self.n_docs``.
            max_sent: stored as ``self.n_sents``.
        """
        self.db = db
        self.n_docs, self.n_sents = max_page, max_sent
        self.model = model

        tfidf_ranker_cls = retriever.get_class('tfidf')
        self.ranker = tfidf_ranker_cls(tfidf_path=model)
        self.onlineranker_args = self.RankArgs()

Example #13

0

Show file

File: build_tfidf.py Project: Inistlwq/SQuAD-summary

def get_count_matrix(args, db, db_opts):
    """Create the sparse word/document count matrix (inverted index).

    ``M[i, j]`` is the number of times word ``i`` appears in document ``j``.

    Returns:
        ``(count_matrix, (DOC2IDX, doc_ids))``; ``DOC2IDX`` is also stored
        in the module-level global of the same name.
    """
    global DOC2IDX

    # Map doc_ids to indexes
    db_class = retriever.get_class(db)
    with db_class(**db_opts) as doc_db:
        doc_ids = doc_db.get_doc_ids()
    DOC2IDX = {doc_id: position for position, doc_id in enumerate(doc_ids)}

    # Setup worker pool (parallel workers; ``init`` is the pool initializer)
    tok_class = tokenizers.get_class(args.tokenizer)
    pool_init_args = (tok_class, db_class, db_opts)
    workers = ProcessPool(args.num_workers,
                          initializer=init,
                          initargs=pool_init_args)

    # Compute the count matrix in steps (to keep in memory)
    logger.info('Mapping...')
    row, col, data = [], [], []
    total = len(doc_ids)
    step = max(int(total / 10), 1)
    # Feed the matrix in batches of ``step`` ids.
    batches = [doc_ids[lo:lo + step] for lo in range(0, total, step)]
    # partial: a function normally needs all of its arguments at call time,
    # but some are known in advance. Binding those here (ngram, hash_size)
    # lets the pool call the function with fewer arguments.
    _count = partial(count, args.ngram, args.hash_size)
    n_batches = len(batches)
    for batch_index, batch in enumerate(batches):
        logger.info('-' * 25 + 'Batch %d/%d' % (batch_index + 1, n_batches) +
                    '-' * 25)
        for b_row, b_col, b_data in workers.imap_unordered(_count, batch):
            row += b_row
            col += b_col
            data += b_data
    workers.close()
    workers.join()

    logger.info('Creating sparse matrix...')
    matrix_shape = (args.hash_size, total)
    count_matrix = sp.csr_matrix((data, (row, col)), shape=matrix_shape)
    count_matrix.sum_duplicates()
    return count_matrix, (DOC2IDX, doc_ids)

Example #14

0

Show file

def get_count_matrix(args, db, db_opts):
    """Assemble the sparse word-to-document count matrix (inverted index).

    ``M[i, j]`` counts how often word ``i`` appears in document ``j``.
    Progress is logged once per batch and after every 10000 documents
    within a batch.

    Returns:
        ``(count_matrix, (DOC2IDX, doc_ids))``; ``DOC2IDX`` is also stored
        in the module-level global of the same name.
    """
    global DOC2IDX

    # Map doc_ids to indexes
    db_class = retriever.get_class(db)
    with db_class(**db_opts) as doc_db:
        doc_ids = doc_db.get_doc_ids()
    DOC2IDX = dict((doc_id, idx) for idx, doc_id in enumerate(doc_ids))
    # Original author's note: 5075182 (presumably the document count seen on
    # their corpus).
    logger.info('the number of docs is %s' % (len(DOC2IDX)))

    # Setup worker pool
    tok_class = tokenizers.get_class(args.tokenizer)
    workers = ProcessPool(args.num_workers,
                          initializer=init,
                          initargs=(tok_class, db_class, db_opts))

    # Compute the count matrix in steps (to keep in memory)
    logger.info('Mapping......')
    row, col, data = [], [], []
    step = max(int(len(doc_ids) / 10), 1)
    batches = [
        doc_ids[start:start + step]
        for start in range(0, len(doc_ids), step)
    ]
    _count = partial(count, args.ngram, args.hash_size)
    separator = '-' * 24
    for batch_no, batch in enumerate(batches, 1):
        logger.info(separator + 'Batch %d/%d' % (batch_no, len(batches)) +
                    separator)
        results = workers.imap_unordered(_count, batch)
        # ``done`` restarts at 1 for every batch.
        for done, (b_row, b_col, b_data) in enumerate(results, 1):
            row.extend(b_row)
            col.extend(b_col)
            data.extend(b_data)
            if done % 10000 == 0:
                logger.info('NO: %s is ......' % done)
    workers.close()
    workers.join()

    logger.info('Creating sparse matrix......')
    count_matrix = sp.csr_matrix((data, (row, col)),
                                 shape=(args.hash_size, len(doc_ids)))
    count_matrix.sum_duplicates()
    return count_matrix, (DOC2IDX, doc_ids)

Example #15

0

Show file

File: build_tfidf.py Project: athiwatp/DrQA

def get_count_matrix(args, db, db_opts):
    """Compute the sparse word-by-document count matrix (inverted index).

    ``M[i, j]`` is the number of occurrences of word ``i`` in document ``j``.

    Returns:
        ``(count_matrix, (DOC2IDX, doc_ids))``; ``DOC2IDX`` is also stored
        in the module-level global of the same name.
    """
    global DOC2IDX

    # Fetch the ids, then index them by position.
    db_class = retriever.get_class(db)
    with db_class(**db_opts) as doc_db:
        doc_ids = doc_db.get_doc_ids()
    DOC2IDX = {}
    for index, doc_id in enumerate(doc_ids):
        DOC2IDX[doc_id] = index

    # Pool of workers, each set up through ``init``.
    tok_class = tokenizers.get_class(args.tokenizer)
    workers = ProcessPool(
        args.num_workers,
        initializer=init,
        initargs=(tok_class, db_class, db_opts),
    )

    # Work through the ids in about ten slices to bound memory use.
    logger.info('Mapping...')
    row, col, data = [], [], []
    step = max(int(len(doc_ids) / 10), 1)
    batches = []
    for start in range(0, len(doc_ids), step):
        batches.append(doc_ids[start:start + step])

    _count = partial(count, args.ngram, args.hash_size)
    for number, batch in enumerate(batches, 1):
        header = 'Batch %d/%d' % (number, len(batches))
        logger.info('-' * 25 + header + '-' * 25)
        for triple in workers.imap_unordered(_count, batch):
            b_row, b_col, b_data = triple
            row.extend(b_row)
            col.extend(b_col)
            data.extend(b_data)
    workers.close()
    workers.join()

    logger.info('Creating sparse matrix...')
    coordinates = (row, col)
    count_matrix = sp.csr_matrix(
        (data, coordinates), shape=(args.hash_size, len(doc_ids))
    )
    count_matrix.sum_duplicates()
    return count_matrix, (DOC2IDX, doc_ids)

Example #16

0

Show file

def get_count_matrix_sklearn(args, db, db_opts):
    """Form a sparse word to document count matrix (inverted index).

    M[i, j] = # times word i appears in document j.

    Counts come from a ``HashingVectorizer`` over unigrams and bigrams.
    Texts are vectorized in chunks of about 100000 documents; the chunks are
    stacked and transposed so that rows are hashed features and columns are
    documents.

    Args:
        args: not read by this function.
        db: name resolved to a document DB class via ``retriever.get_class``.
        db_opts: keyword arguments for constructing that class.

    Returns:
        ``(count_matrix, (DOC2IDX, doc_ids))``; ``DOC2IDX`` is also stored
        in the module-level global of the same name.
    """
    # Map doc_ids to indexes
    global DOC2IDX
    db_class = retriever.get_class(db)
    with db_class(**db_opts) as doc_db:
        doc_ids = doc_db.get_doc_ids()
    DOC2IDX = {doc_id: i for i, doc_id in enumerate(doc_ids)}

    # NOTE(review): n_features is fixed at 2**24 rather than taken from
    # args, and int8 counts may overflow for very frequent terms. Confirm
    # both are intended.
    hashvec = HashingVectorizer(n_features=2**24,
                                dtype=np.int8,
                                ngram_range=(1, 2),
                                norm=None,
                                non_negative=True)
    chunk_size = 100000

    texts = []
    chunks = []
    # The text DB handle is now closed when the loop finishes (it used to be
    # left open) and no longer shadows the ``db`` parameter.
    with db_class(**db_opts) as text_db:
        for i, doc_id in enumerate(doc_ids):
            texts.append(text_db.get_doc_text(doc_id))
            # Flush the buffered texts at every chunk boundary except i == 0.
            if i > 0 and i % chunk_size == 0:
                print(i, 'fitting hashvec...')
                chunks.append(hashvec.transform(texts))
                del texts[:]
    # Vectorize whatever is left after the last full chunk.
    chunks.append(hashvec.transform(texts))

    # Stacked chunks are (documents x features); transpose to
    # (features x documents).
    count_matrix = sp.vstack(chunks)
    count_matrix = count_matrix.transpose()

    print(count_matrix.shape)
    print(count_matrix.dtype)

    return count_matrix, (DOC2IDX, doc_ids)

Example #17

0

Show file

class MyTfidfDocRanker(retriever.get_class('tfidf')):
    """TF-IDF ranker whose ``text2spvec`` can also return the raw weights."""

    def text2spvec(self, query, data_val=False):
        """Turn ``query`` into a sparse tfidf-weighted word vector.

        tfidf = log(tf + 1) * log((N - Nt + 0.5) / (Nt + 0.5))

        Args:
            query: text to vectorize.
            data_val: when true, return ``(weights, unique_word_ids)``
                instead of a sparse matrix.

        Returns:
            A ``1 x hash_size`` CSR matrix, or the ``(weights, ids)`` pair
            when ``data_val`` is true.

        Raises:
            RuntimeError: if no word survives parsing and ``self.strict``
                is set.
        """
        # Hash every parsed ngram into the feature space.
        tokens = self.parse(utils.normalize(query))
        hashed_ids = [utils.hash(token, self.hash_size) for token in tokens]

        if not hashed_ids:
            if self.strict:
                raise RuntimeError('No valid word in: %s' % query)
            # Non-strict mode: warn and hand back an all-zero vector.
            logger.warning('No valid word in: %s' % query)
            return sp.csr_matrix((1, self.hash_size))

        # Term frequency: log(1 + count) per distinct id.
        unique_ids, id_counts = np.unique(hashed_ids, return_counts=True)
        term_freqs = np.log1p(id_counts)

        # Inverse document frequency, clamped at zero.
        doc_counts = self.doc_freqs[unique_ids]
        inverse_freqs = np.log(
            (self.num_docs - doc_counts + 0.5) / (doc_counts + 0.5)
        )
        inverse_freqs[inverse_freqs < 0] = 0

        # TF-IDF
        weights = np.multiply(term_freqs, inverse_freqs)

        if data_val:
            return weights, unique_ids

        # One row, sparse csr matrix
        row_pointer = np.array([0, len(unique_ids)])
        return sp.csr_matrix(
            (weights, unique_ids, row_pointer), shape=(1, self.hash_size)
        )

Example #18

0

Show file

File: eval.py Project: rahular/ellipsis-baselines

    start = time.time()

    # read all the data and store it
    logger.info("Reading data ...")
    questions = []
    answers = []
    for line in open(args.dataset):
        data = json.loads(line)
        question = data["question"]
        answer = data["answer"]
        questions.append(question)
        answers.append(answer)

    # get the closest docs for each question.
    logger.info("Initializing ranker...")
    ranker = retriever.get_class("tfidf")(tfidf_path=args.model)

    logger.info("Ranking...")
    closest_docs = ranker.batch_closest_docs(
        questions, k=args.n_docs, num_workers=args.num_workers
    )
    answers_docs = zip(answers, closest_docs)

    # define processes
    tok_class = tokenizers.get_class(args.tokenizer)
    tok_opts = {}
    db_class = retriever.DocDB
    db_opts = {"db_path": args.doc_db}
    processes = ProcessPool(
        processes=args.num_workers,
        initializer=init,

Example #19

0

Show file

import time
import sqlite3
from drqa import retriever
import numpy as np
from functools import partial
from concurrent.futures import ThreadPoolExecutor
from itertools import chain
import pandas as pd
from drqa.retriever import utils
import os
import pandas as pd

# Path to the SQLite database of articles queried by get_doc_text().
db = "/home/giuseppe/Scrivania/HLT_Project/Retriver/Process_gnq/gnq_articles.db"
# Shared connection; check_same_thread=False allows use from threads other
# than the one that created it.
connection = sqlite3.connect(db, check_same_thread=False)
# Path to the prebuilt TF-IDF index (.npz) handed to the ranker below.
tfidf = "/home/giuseppe/Scrivania/HLT_Project/Retriver/DrQA/gnq_articles-tfidf-ngram=2-hash=16777216-tokenizer=simple.npz"
# TF-IDF ranker instance built from the index above.
ranker = retriever.get_class('tfidf')(tfidf_path=tfidf)
# Path to a second SQLite database (name suggests question/answer data; not
# used in the code visible here).
qa_db = "/home/giuseppe/Scrivania/HLT_Project/Retriver/Process_gnq/gnq_qa.db"


def get_doc_text(doc_id):
    """Return the stored text for ``doc_id``, or ``None`` if no row matches.

    The id is normalized with ``utils.normalize`` before the lookup, and the
    query runs on the module-level ``connection``.
    """
    cur = connection.cursor()
    cur.execute("SELECT text FROM documents WHERE id = ?",
                (utils.normalize(doc_id), ))
    row = cur.fetchone()
    cur.close()
    if row is None:
        return None
    return row[0]


def _split_doc(doc):
    """Given a doc, split it into chunks (by paragraph)."""

Example #20

0

Show file

File: generate.py Project: athiwatp/DrQA

        question = data['question']
        answer = data['answer']

        # Make sure the regex compiles
        if args.regex:
            try:
                re.compile(answer[0])
            except BaseException:
                logger.warning('Regex failed to compile: %s' % answer)
                continue

        questions.append(question)
        answers.append(answer)

    # Get classes
    ranker_class = retriever.get_class(args.ranker)
    db_class = retriever.get_class(args.db)
    tokenizer_class = tokenizers.get_class(args.tokenizer)

    # Form options
    search_keys = ('regex', 'match_threshold', 'char_max',
                   'char_min', 'window_sz')
    opts = {
        'ranker_class': retriever.get_class(args.ranker),
        'tokenizer_class': tokenizers.get_class(args.tokenizer),
        'db_class': retriever.get_class(args.db),
        'search': {k: vars(args)[k] for k in search_keys},
    }
    opts.update(vars(args))

    # Process!

Example #21

0

Show file

def get_count_matrix(args, db, db_opts):
    """Form a sparse word to document count matrix (inverted index).

    M[i, j] = # times word i appears in document j.

    Returns:
        ``(count_matrix, (DOC2IDX, doc_ids))``; ``DOC2IDX`` is also stored
        in the module-level global of the same name.
    """
    global DOC2IDX

    # Resolve the DB class by name and read all document ids from it.
    db_class = retriever.get_class(db)
    with db_class(**db_opts) as doc_db:  # closed again on leaving the block
        doc_ids = doc_db.get_doc_ids()

    # enumerate() yields (index, value) pairs starting at index 0, e.g.
    # ['a', 'b', 'c'] -> (0, 'a'), (1, 'b'), (2, 'c'). The pairs are flipped
    # here to build the doc_id -> index mapping.
    DOC2IDX = {doc_id: index for index, doc_id in enumerate(doc_ids)}

    # Setup worker pool
    tok_class = tokenizers.get_class(args.tokenizer)
    workers = ProcessPool(args.num_workers,
                          initializer=init,
                          initargs=(tok_class, db_class, db_opts))

    # Compute the count matrix in steps (to keep in memory)
    logger.info('Mapping...')
    row, col, data = [], [], []
    doc_total = len(doc_ids)
    step = max(int(doc_total / 10), 1)  # batch size: a tenth, at least 1
    batches = [doc_ids[lo:lo + step] for lo in range(0, doc_total, step)]

    # partial() pre-binds the leading arguments (ngram, hash_size) and
    # returns a callable that takes the remaining ones.
    # See: http://www.wklken.me/posts/2013/08/18/python-extra-functools.html
    _count = partial(count, args.ngram, args.hash_size)

    batch_total = len(batches)
    for batch_index, batch in enumerate(batches):
        logger.info('-' * 25 + 'Batch %d/%d' % (batch_index + 1, batch_total) +
                    '-' * 25)
        # All three lists grow in lock-step, batch after batch.
        for b_row, b_col, b_data in workers.imap_unordered(_count, batch):
            row += b_row    # per the original notes: hash of each n-gram
            col += b_col    # per the original notes: index of the document
            data += b_data  # per the original notes: count of that n-gram
    workers.close()
    workers.join()

    logger.info('Creating sparse matrix...')
    # Layout of the generated sparse matrix: hash_size rows by len(doc_ids)
    # columns; each stored element is a count.
    count_matrix = sp.csr_matrix((data, (row, col)),
                                 shape=(args.hash_size, doc_total))

    # Add together entries that share the same (row, col) coordinates.
    count_matrix.sum_duplicates()
    # Return the matrix together with the id mapping and id list.
    return count_matrix, (DOC2IDX, doc_ids)

Example #22

0

Show file

File: generate.py Project: to314as/Question-answering-with-Wikipedia---NLP-project-2019

        question = data['question']
        answer = data['answer']

        # Make sure the regex compiles
        if args.regex:
            try:
                re.compile(answer[0])
            except BaseException:
                logger.warning('Regex failed to compile: %s' % answer)
                continue

        questions.append(question)
        answers.append(answer)

    # Get classes
    ranker_class = retriever.get_class(args.ranker)
    db_class = retriever.get_class(args.db)
    tokenizer_class = tokenizers.get_class(args.tokenizer)

    # Form options
    search_keys = ('regex', 'match_threshold', 'char_max', 'char_min',
                   'window_sz')
    opts = {
        'ranker_class': retriever.get_class(args.ranker),
        'tokenizer_class': tokenizers.get_class(args.tokenizer),
        'db_class': retriever.get_class(args.db),
        'search': {k: vars(args)[k]
                   for k in search_keys},
    }
    opts.update(vars(args))

Example #23

0

Show file

def eval_model(db: FeverDocDB, args) -> Model:
    """Interactively classify claims typed at the prompt.

    For each claim, the five closest pages are retrieved with the TF-IDF
    ranker and their lines are scored against the claim with ``tf_idf_sim``.
    The five best non-empty sentences are joined into the evidence passed to
    the model, and the predicted label is printed. The loop ends when the
    user enters ``q``; nothing is returned explicitly.

    Args:
        db: document database providing ``get_doc_lines(page)``.
        args: namespace providing ``archive_file``, ``cuda_device``,
            ``overrides`` and ``model`` (the TF-IDF index path).
    """
    archive = load_archive(args.archive_file,
                           cuda_device=args.cuda_device,
                           overrides=args.overrides)

    config = archive.config
    ds_params = config["dataset_reader"]

    model = archive.model
    model.eval()

    reader = FEVERReader(db,
                         sentence_level=ds_params.pop("sentence_level", False),
                         wiki_tokenizer=Tokenizer.from_params(
                             ds_params.pop('wiki_tokenizer', {})),
                         claim_tokenizer=Tokenizer.from_params(
                             ds_params.pop('claim_tokenizer', {})),
                         token_indexers=TokenIndexer.dict_from_params(
                             ds_params.pop('token_indexers', {})))

    # Built once up front: it depends only on args.model, so rebuilding it
    # for every claim just repeats the same index load.
    ranker = retriever.get_class('tfidf')(tfidf_path=args.model)

    while True:

        claim = input("enter claim (or q to quit) >>")
        if claim.lower() == "q":
            break

        p_lines = []
        pages, _ = ranker.closest_docs(claim, 5)

        for page in pages:
            lines = []
            for raw_line in db.get_doc_lines(page).split("\n"):
                # The sentence text is the second tab-separated field. Lines
                # without that field (e.g. blank lines) are treated as empty
                # instead of raising IndexError.
                fields = raw_line.split("\t")
                text = fields[1] if len(fields) > 1 else ""
                lines.append(text if len(text) > 1 else "")

            p_lines.extend(zip(lines, [page] * len(lines), range(len(lines))))

        # Each entry becomes (score, page, line number, sentence text);
        # entries whose text is blank are dropped before ranking.
        scores = tf_idf_sim(claim, [pl[0] for pl in p_lines])
        scores = [
            (score, page, line_no, text)
            for score, (text, page, line_no) in zip(scores, p_lines)
            if len(text.strip())
        ]
        sentences_l = sorted(scores, reverse=True, key=lambda elem: elem[0])

        sentences = [s[3] for s in sentences_l[:5]]
        evidence = " ".join(sentences)

        print("Best pages: {0}".format(repr(pages)))

        print("Evidence:")
        for idx, sentence in enumerate(sentences_l[:5]):
            print("{0}\t{1}\t\t{2}\t{3}".format(idx + 1, sentence[0],
                                                sentence[1], sentence[3]))

        item = reader.text_to_instance(evidence, claim)

        prediction = model.forward_on_instance(item, args.cuda_device)
        # Pick the label with the highest predicted probability.
        cls = model.vocab._index_to_token["labels"][np.argmax(
            prediction["label_probs"])]
        print("PREDICTED: {0}".format(cls))
        print()

Example #24

0

Show file

 def __init__(self, saved_model_path):
     """Load the TF-IDF ranker stored at ``saved_model_path``."""
     ranker_cls = retriever.get_class('tfidf')
     self.ranker = ranker_cls(tfidf_path=saved_model_path)

Example #25

0

Show file

File: negative_sample_nearest.py Project: jorgeecardona/fever-allennlp

def process(ranker, query, k=1):
    """Return the names of the ``k`` documents closest to ``query``.

    The scores returned by ``ranker.closest_docs`` are discarded.
    """
    names, _scores = ranker.closest_docs(query, k)
    return names


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--in-file', type=str)
    parser.add_argument('--out-file', type=str)
    parser.add_argument('--index', type=str)
    parser.add_argument('--count', type=int, default=1)
    args = parser.parse_args()

    k = args.count
    ranker = retriever.get_class('tfidf')(tfidf_path=args.index)

    with open(args.in_file) as f:
        with open(args.out_file, "w+") as f2:
            for line in tqdm(f.readlines()):
                line = json.loads(line)

                if line["label"] == "NOT ENOUGH INFO":
                    pages = process(ranker, line['claim'], k=k)
                    pp = list(pages)

                    for idx, evidence_group in enumerate(line['evidence']):
                        for evidence in evidence_group:
                            if idx < len(pp):
                                evidence[2] = pp[idx]
                                evidence[3] = -1

Example #26

0

Show file

File: generate.py Project: rahular/ellipsis-baselines

        question = data["question"]
        answer = data["answer"]

        # Make sure the regex compiles
        if args.regex:
            try:
                re.compile(answer[0])
            except BaseException:
                logger.warning("Regex failed to compile: %s" % answer)
                continue

        questions.append(question)
        answers.append(answer)

    # Get classes
    ranker_class = retriever.get_class(args.ranker)
    db_class = retriever.get_class(args.db)
    tokenizer_class = tokenizers.get_class(args.tokenizer)

    # Form options
    search_keys = ("regex", "match_threshold", "char_max", "char_min",
                   "window_sz")
    opts = {
        "ranker_class": retriever.get_class(args.ranker),
        "tokenizer_class": tokenizers.get_class(args.tokenizer),
        "db_class": retriever.get_class(args.db),
        "search": {k: vars(args)[k]
                   for k in search_keys},
    }
    opts.update(vars(args))

Example #27

0

Show file

File: top_n.py Project: neverneverendup/fever-naacl-2018

 def __init__(self, db, n_docs, n_sents, model):
     """Store the retrieval limits and load the TF-IDF ranker.

     Args:
         db: forwarded to the parent constructor.
         n_docs: stored unchanged as ``self.n_docs``.
         n_sents: stored unchanged as ``self.n_sents``.
         model: passed to the TF-IDF ranker as ``tfidf_path``.
     """
     super().__init__(db)
     self.n_docs, self.n_sents = n_docs, n_sents
     tfidf_ranker_cls = retriever.get_class('tfidf')
     self.ranker = tfidf_ranker_cls(tfidf_path=model)
     self.onlineranker_args = self.RankArgs()

Example #28

0

Show file

 def __init__(self, database, index, n_docs, n_sents):
     """Store the retrieval limits and load the TF-IDF ranker from ``index``.

     Args:
         database: forwarded to the parent constructor.
         index: passed to the TF-IDF ranker as ``tfidf_path``.
         n_docs: stored unchanged as ``self.n_docs``.
         n_sents: stored unchanged as ``self.n_sents``.
     """
     super().__init__(database)
     self.n_docs, self.n_sents = n_docs, n_sents
     ranker_factory = retriever.get_class('tfidf')
     self.ranker = ranker_factory(tfidf_path=index)
     self.onlineranker_args = self.RankArgs()

Example #29

0

Show file

File: predict_pages_drqa.py Project: ryparmar/master-thesis

 def __init__(self, db, k, model):
     """Keep the DB handle, ``k`` and model path, and load the TF-IDF ranker.

     Args:
         db: stored unchanged as ``self.db``.
         k: stored unchanged as ``self.k``.
         model: stored as ``self.model`` and passed to the ranker as
             ``tfidf_path``.
     """
     self.db, self.k, self.model = db, k, model
     ranker_cls = retriever.get_class('tfidf')
     self.ranker = ranker_cls(tfidf_path=self.model)

Example #30

0

Show file

    # read all the data and store it
    logger.info('Reading data ...')
    questions = []
    answers = []

    for line in open(args.dataset):
        data = json.loads(line)
        question = data['question']
        answer = data['answer']
        questions.append(question)
        answers.append(answer)

    # get the closest docs for each question.
    logger.info('Initializing ranker...')
    ranker = retriever.get_class('tfidf')(tfidf_path=args.model)

    logger.info('Ranking...')
    closest_docs = ranker.batch_closest_docs(questions,
                                             k=args.n_docs,
                                             num_workers=args.num_workers)
    ranker = []

    tok_class = tokenizers.get_class(args.tokenizer)
    tok_opts = {}
    db_class = retriever.DocDB
    db_opts = {'db_path': args.doc_db}
    PROCESS_TOK = tok_class(**tok_opts)
    Finalize(PROCESS_TOK, PROCESS_TOK.shutdown, exitpriority=100)
    PROCESS_DB = db_class(**db_opts)
    Finalize(PROCESS_DB, PROCESS_DB.close, exitpriority=100)

Example #31

0

Show file

File: interactive.py Project: athiwatp/DrQA

import logging
from drqa import retriever

# Configure the root logger: INFO level, with a stream handler whose records
# are formatted as "<timestamp>: [ <message> ]".
logger = logging.getLogger()
logger.setLevel(logging.INFO)
fmt = logging.Formatter('%(asctime)s: [ %(message)s ]', '%m/%d/%Y %I:%M:%S %p')
console = logging.StreamHandler()
console.setFormatter(fmt)
logger.addHandler(console)

# Command line: a single optional --model string (None when omitted).
parser = argparse.ArgumentParser()
parser.add_argument('--model', type=str, default=None)
args = parser.parse_args()

# Instantiate the class registered under 'tfidf', passing the --model value as
# its tfidf_path. The resulting module-level ``ranker`` is what process() uses.
logger.info('Initializing ranker...')
ranker = retriever.get_class('tfidf')(tfidf_path=args.model)


# ------------------------------------------------------------------------------
# Drop in to interactive
# ------------------------------------------------------------------------------


def process(query, k=1):
    """Print a ranked table of the documents closest to ``query``.

    Asks the module-level ``ranker`` for the ``k`` closest documents and
    prints one row per returned document: its 1-based rank, its doc id, and
    its score formatted with ``%.5g``.

    Args:
        query: forwarded to ``ranker.closest_docs``.
        k: number of documents requested from the ranker (default 1).
    """
    doc_names, doc_scores = ranker.closest_docs(query, k)
    header = ['Rank', 'Doc Id', 'Doc Score']
    table = prettytable.PrettyTable(header)
    for position, name in enumerate(doc_names):
        score_text = '%.5g' % doc_scores[position]
        table.add_row([position + 1, name, score_text])
    print(table)

Example #32

0

Show file

File: build_tfidf.py Project: yongbowin/DrQA_annotation

def get_count_matrix(args, db, db_opts):
    """Form a sparse word to document count matrix (inverted index).

    M[i, j] = # times word i appears in document j.

    Args:
        args: namespace providing ``tokenizer``, ``num_workers``, ``ngram``
            and ``hash_size``.
        db: name handed to ``retriever.get_class`` to resolve the document-db
            class.
        db_opts: keyword arguments used to construct the document db (also
            passed to each worker through ``init``).

    Returns:
        A tuple ``(count_matrix, (DOC2IDX, doc_ids))`` where ``count_matrix``
        is a CSR matrix of shape ``(args.hash_size, len(doc_ids))``.

    Side effects:
        Rebinds the module-level ``DOC2IDX`` mapping.
    """
    # Map doc_ids to indexes
    global DOC2IDX
    # drqa/retriever/__init__.py --> doc_db.py
    db_class = retriever.get_class(db)
    with db_class(**db_opts) as doc_db:
        # Fetch all ids of docs stored in the db.
        doc_ids = doc_db.get_doc_ids()
    # Stored as e.g. {'3255': 0, '8902': 1, ...}
    DOC2IDX = {doc_id: i for i, doc_id in enumerate(doc_ids)}

    # Setup worker pool
    # e.g. 'corenlp', drqa/tokenizers/__init__.py --> corenlp_tokenizer.py
    tok_class = tokenizers.get_class(args.tokenizer)
    workers = ProcessPool(args.num_workers,
                          initializer=init,
                          initargs=(tok_class, db_class, db_opts))

    # Compute the count matrix in steps (to keep in memory)
    logger.info('Mapping...')
    row, col, data = [], [], []
    step = max(int(len(doc_ids) / 10), 1)
    # Roughly 10 batches: because ``step`` is rounded down, the count can
    # exceed 10 (e.g. 25 docs -> step 2 -> 13 batches).
    batches = [doc_ids[i:i + step]
               for i in range(0, len(doc_ids), step)]
    # Original note: args.hash_size --> default=int(math.pow(2, 24))
    _count = partial(count, args.ngram, args.hash_size)
    try:
        for i, batch in enumerate(batches):
            logger.info('-' * 25 + 'Batch %d/%d' % (i + 1, len(batches)) +
                        '-' * 25)
            for b_row, b_col, b_data in workers.imap_unordered(_count, batch):
                row.extend(b_row)
                col.extend(b_col)
                data.extend(b_data)
    finally:
        # Always release the pool, even when a worker task raises; otherwise
        # the worker processes would be left running.
        workers.close()
        workers.join()

    logger.info('Creating sparse matrix...')
    # csr_matrix((data, (row_ind, col_ind)), [shape=(M, N)])
    #         where ``data``, ``row_ind`` and ``col_ind`` satisfy the
    #         relationship ``a[row_ind[k], col_ind[k]] = data[k]``.
    #
    # Examples:
    #     >>> row = np.array([0, 0, 1, 2, 2, 2])
    #     >>> col = np.array([0, 2, 2, 0, 1, 2])
    #     >>> data = np.array([1, 2, 3, 4, 5, 6])
    #     >>> csr_matrix((data, (row, col)), shape=(3, 3)).toarray()
    #     array([[1, 0, 2],
    #            [0, 0, 3],
    #            [4, 5, 6]])
    #
    # count_matrix: shape=(args.hash_size, len(doc_ids))
    #
    #           doc_1   doc_2  ...   doc_m
    # word_1    [[1,      0,   ...    2],
    # word_2     [0,      0,   ...    3],
    #  ...                ...
    # word_n     [4,      5,   ...    6]]
    #
    # i.e., (word_1, doc_m) denotes that word 'word_1' appears 2 times in
    # doc 'doc_m'.
    #
    # Reference: https://towardsdatascience.com/machine-learning-to-big-data-scaling-inverted-indexing-with-solr-ba5b48833fb4
    count_matrix = sp.csr_matrix(  # import scipy.sparse as sp
        (data, (row, col)),
        shape=(args.hash_size, len(doc_ids)))
    # Entries repeated at the same (row, col) position are summed in place.
    count_matrix.sum_duplicates()
    return count_matrix, (DOC2IDX, doc_ids)

Example #33

0

Show file

def configurate_server(server, tfidf_path):
    server.handler_params = {
        "ranker": get_class("tfidf")(tfidf_path=tfidf_path, strict=False)
    }