Esempio n. 1
0
 def __init__(self, tokenizer, reader_model=None, batch_size=64,
              qclassifier=None, cuda=False):
     """Initialise the answerer state and build its DrQA pipeline.

     Args:
         tokenizer: Only printed here; the pipeline below is always
             created with the "spacy" tokenizer.
         reader_model: Forwarded to ``pipeline.DrQA`` as ``reader_model``.
         batch_size: Stored on the instance and forwarded to the pipeline.
         qclassifier: Passed on to ``Answerer.__init__``.
         cuda: Forwarded to the pipeline as ``cuda``.
     """
     print("Tokenizer", tokenizer)
     Answerer.__init__(self, qclassifier)

     # Fixed retrieval settings used by this answerer.
     self.batch_size = batch_size
     self.n_docs, self.top_n = 5, 1
     self.ts = TextSimilarity()

     print("Reader model", reader_model, cuda)

     # No retriever index and no document DB are configured (both paths
     # are None); only the reader-related settings vary per instance.
     settings = {
         'reader_model': reader_model,
         'fixed_candidates': None,
         'embedding_file': None,
         'tokenizer': "spacy",
         'batch_size': batch_size,
         'cuda': cuda,
         'data_parallel': False,
         'ranker_config': {'options': {'tfidf_path': None, 'strict': False}},
         'db_config': {'options': {'db_path': None}},
         'num_workers': 1,
     }
     self.drqa = pipeline.DrQA(**settings)
Esempio n. 2
0
 def __init__(self, reader, retriever, doc_db):
     """Create the DrQA pipeline and keep it on ``self.DrQA``.

     Args:
         reader: Passed to ``pipeline.DrQA`` as ``reader_model``.
         retriever: Used as the ranker's ``tfidf_path`` option.
         doc_db: Used as the document DB's ``db_path`` option.
     """
     ranker_settings = {'options': {'tfidf_path': retriever}}
     db_settings = {'options': {'db_path': doc_db}}
     # CUDA is always requested here; no fixed candidates, default tokenizer.
     self.DrQA = pipeline.DrQA(
         reader_model=reader,
         ranker_config=ranker_settings,
         db_config=db_settings,
         fixed_candidates=None,
         tokenizer=None,
         cuda=True,
     )
Esempio n. 3
0
    def __init__(self, wiki_path, tfidf_path):
        """Register the data paths, set up console logging, build the pipeline.

        Args:
            wiki_path: Stored in ``DEFAULTS['db_path']``.
            tfidf_path: Stored in ``DEFAULTS['tfidf_path']``.
        """
        for key, value in (('db_path', wiki_path), ('tfidf_path', tfidf_path)):
            DEFAULTS[key] = value

        # Log output: attach a timestamped console handler to the root logger.
        console = logging.StreamHandler()
        console.setFormatter(
            logging.Formatter('%(asctime)s: [ %(message)s ]',
                              '%m/%d/%Y %I:%M:%S %p'))
        root_logger = logging.getLogger()
        root_logger.setLevel(logging.INFO)
        root_logger.addHandler(console)

        # The pipeline is created with no explicit arguments.
        self.qa = pipeline.DrQA()
Esempio n. 4
0
# Optional closed set of answer candidates, one per line of the file.
candidates = None
if args.candidate_file:
    logger.info('Loading candidates from %s' % args.candidate_file)
    with open(args.candidate_file) as f:
        # Normalise and lowercase every stripped line; the set removes
        # duplicates.
        candidates = {utils.normalize(raw.strip()).lower() for raw in f}
    logger.info('Loaded %d candidates.' % len(candidates))

logger.info('Initializing pipeline...')
# Custom ranker and document DB classes are plugged in through the
# 'class' entries of the two config dicts.
DrQA = pipeline.DrQA(
    reader_model=args.reader_model,
    tokenizer=args.tokenizer,
    fixed_candidates=candidates,
    ranker_config={'class': OnsDocRanker},
    db_config={'class': OnsSearchDB},
    cuda=args.cuda,
)

# ------------------------------------------------------------------------------
# Drop in to interactive mode
# ------------------------------------------------------------------------------


def process(question, candidates=None, top_n=1, n_docs=100):
    predictions = DrQA.process(question,
                               candidates,
                               top_n,
                               n_docs,
                               return_context=True)
    table = prettytable.PrettyTable(
Esempio n. 5
0
    candidates = set()
    with open(args.candidate_file) as f:
        for line in f:
            line = utils.normalize(line.strip()).lower()
            candidates.add(line)
    logger.info('Loaded %d candidates......' % len(candidates))
else:
    candidates = None

logger.info('Initializing pipeline......')
# Retriever index and document DB locations come from the parsed arguments.
DrQA = pipeline.DrQA(
    reader_model=args.reader_model,
    tokenizer=args.tokenizer,
    fixed_candidates=candidates,
    ranker_config=dict(options=dict(tfidf_path=args.retriever_model)),
    db_config=dict(options=dict(db_path=args.doc_db)),
    cuda=args.cuda,
)

# ------------------------------------------------------------------------------
# Drop in to interactive mode
# ------------------------------------------------------------------------------


def process(question, candidates=None, top_n=1, n_docs=5):
    predictions = DrQA.process(question,
                               candidates,
                               top_n,
Esempio n. 6
0
}

# Use the GPU only when one is available and the config does not set
# 'no-cuda'.
cuda = torch.cuda.is_available() and not config.get('no-cuda', False)
if not cuda:
    logger.info('Running on CPU only.')
else:
    # The device id defaults to 0 when the config has no 'gpu' entry.
    torch.cuda.set_device(config.get('gpu', 0))
    logger.info('CUDA enabled (GPU %d)' % config.get('gpu', 0))

logger.info('Initializing pipeline...')
# These config keys are read with [] and are therefore required.
DrQA = pipeline.DrQA(
    reader_model=config['reader-model'],
    embedding_file=config['embedding-file'],
    tokenizer=config['tokenizer'],
    ranker_config=dict(options=dict(tfidf_path=config['retriever-model'])),
    db_config=dict(options=dict(db_path=config['doc-db'])),
    cuda=cuda,
)


def process(question, candidates=None, top_n=1, n_docs=5):
    predictions = DrQA.process(question,
                               candidates,
                               top_n,
                               n_docs,
                               return_context=True)
    answers = []
    for i, p in enumerate(predictions, 1):
Esempio n. 7
0
    logger.info('Loaded %d candidates.' % len(candidates))
else:
    candidates = None

logger.info('Initializing pipeline...')
# A ranker configuration keyed on 'index_path' instead of 'tfidf_path' was
# used here at some point and is kept for reference:
#   ranker_config={'options': {'index_path': args.retriever_model,
#                              'strict': False}}
DrQA = pipeline.DrQA(
    cuda=args.cuda,
    data_parallel=args.parallel,
    num_workers=args.num_workers,
    batch_size=args.batch_size,
    tokenizer=args.tokenizer,
    embedding_file=args.embedding_file,
    reader_model=args.reader_model,
    fixed_candidates=candidates,
    ranker_config=dict(
        options=dict(tfidf_path=args.retriever_model, strict=False)),
    db_config=dict(options=dict(db_path=args.doc_db)),
)

# ------------------------------------------------------------------------------
# Read in dataset and make predictions
# ------------------------------------------------------------------------------
Esempio n. 8
0
    with open(args.candidate_file) as f:
        for line in f:
            line = utils.normalize(line.strip()).lower()
            candidates.add(line)
    logger.info('Loaded %d candidates.' % len(candidates))
else:
    candidates = None

'''


@app.route('/get_query', methods=['GET'])
def process():
    """Answer the question given in the ``query`` URL parameter.

    Asks the module-level ``DrQA`` pipeline for a single prediction over
    three documents, prints the answer span and returns it as the response.
    """
    question = request.args['query']
    top_prediction = DrQA.process(question,
                                  candidates=None,
                                  top_n=1,
                                  n_docs=3,
                                  return_context=True)[0]
    answer_span = top_prediction['span']
    print(answer_span)
    return answer_span


# Script entry point: build the pipeline, then start the web server.
if __name__ == '__main__':

    # Bind the module-level DrQA name that the process() route reads; it has
    # to exist before the server starts handling requests.
    DrQA = pipeline.DrQA()
    # NOTE(review): host='0.0.0.0' listens on every interface and debug=True
    # turns on Flask's debug mode (interactive debugger, and presumably the
    # reloader, which would build the pipeline a second time) -- confirm this
    # is never run on a reachable machine.
    app.run(host='0.0.0.0', debug=True)
Esempio n. 9
0
def main():
    """Run every query topic through a DrQA pipeline and append JSON results.

    Reads the command line, loads the topics from ``corpus_json`` (the
    'topic' entry of each record), optionally skips ahead to a given qid and
    optionally replaces each topic title with a 'desc' query from another
    file. Each topic title is then passed to ``DrQA.process`` and one JSON
    object per topic is appended to ``output_json``.

    Raises:
        KeyError: if ``--use-desc-topics`` is given and a topic's qid is
            missing from that file (plain dict lookup below).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--reader-model',
                        type=str,
                        default=None,
                        help='Path to trained Document Reader model')
    # NOTE(review): --retriever-model is parsed but not read anywhere in this
    # function; the ranker below is RetrievedDocRanker fed with the topics.
    parser.add_argument('--retriever-model',
                        type=str,
                        default=None,
                        help='Path to Document Retriever model (tfidf)')
    parser.add_argument('--doc-db',
                        type=str,
                        default=None,
                        help='Path to Document DB')
    parser.add_argument(
        '--tokenizer',
        type=str,
        default=None,
        help="String option specifying tokenizer type to use (e.g. 'corenlp')")
    parser.add_argument(
        '--candidate-file',
        type=str,
        default=None,
        help=
        "List of candidates to restrict predictions to, one candidate per line"
    )
    parser.add_argument('--no-cuda', action='store_true', help="Use CPU only")
    parser.add_argument('--gpu',
                        type=int,
                        default=-1,
                        help="Specify GPU device id to use")
    parser.add_argument(
        '--skip-to',
        metavar='QID',
        help='Start from topic QID and skip over all the previous ones')
    parser.add_argument('--use-desc-topics',
                        metavar='FILE',
                        help='Use desc queries pulled from FILE instead')
    # Positional arguments: input topics file and output file (appended to).
    parser.add_argument('corpus_json')
    parser.add_argument('output_json')
    args = parser.parse_args()

    # CUDA is used only when not disabled on the command line and available.
    args.cuda = not args.no_cuda and torch.cuda.is_available()
    if args.cuda:
        torch.cuda.set_device(args.gpu)
        logger.info('CUDA enabled (GPU %d)' % args.gpu)
    else:
        logger.info('Running on CPU only.')

    # Optional fixed candidate set: one normalised, lowercased entry per line.
    if args.candidate_file:
        logger.info('Loading candidates from %s' % args.candidate_file)
        candidates = set()
        with open(args.candidate_file) as f:
            for line in f:
                line = utils.normalize(line.strip()).lower()
                candidates.add(line)
        logger.info('Loaded %d candidates.' % len(candidates))
    else:
        candidates = None

    logger.info('Loading query topics from %s (may take a while)' %
                args.corpus_json)
    # NOTE(review): the handle returned by smart_open() is never closed here.
    topics = [rl['topic'] for rl in json.load(smart_open(args.corpus_json))]
    logger.info('Loaded %d topics.' % len(topics))

    # --skip-to: keep the topic with the requested qid and everything after
    # it; if no topic has that qid, nothing is processed at all.
    if args.skip_to:
        found = None
        for i in range(len(topics)):
            if topics[i]['qid'] == args.skip_to:
                found = i
                break
        if found is None:
            topics = []
        else:
            topics = topics[found:]

    # --use-desc-topics: overwrite each topic's 'title' with the 'desc' text
    # of the entry with the same qid in the given file.
    if args.use_desc_topics:
        logger.info('Loading desc topics (to override title queries)')
        desc_topics = {
            t['qid']: t
            for t in json.load(smart_open(args.use_desc_topics))
        }
        for i in range(len(topics)):
            qid = topics[i]['qid']

            logger.info('%s: %s => %s' %
                        (qid, topics[i]['title'], desc_topics[qid]['desc']))
            topics[i]['title'] = desc_topics[qid]['desc']

    logger.info('Initializing pipeline...')
    # The ranker receives the (possibly trimmed / rewritten) topics as its
    # options; worker and loader counts are hard-coded.
    DrQA = pipeline.DrQA(
        cuda=args.cuda,
        fixed_candidates=candidates,
        reader_model=args.reader_model,
        ranker_config={
            'class': RetrievedDocRanker,
            'options': {
                'topics': topics
            }
        },
        db_config={'options': {
            'db_path': args.doc_db
        }},
        tokenizer=args.tokenizer,
        num_workers=16,
        max_loaders=2,
    )

    # ------------------------------------------------------------------------------
    # Process every topic and append one JSON line per topic to the output
    # ------------------------------------------------------------------------------

    # NOTE(review): title_queries is not used in the code visible here.
    title_queries = [topic['title'] for topic in topics]
    # Opened in append mode, so earlier results in the file are kept.
    # NOTE(review): output is never closed in the code visible here.
    output = smart_open(args.output_json, 'a')
    # NOTE(review): ranked_lists only accumulates results; it is not read in
    # the code visible here.
    ranked_lists = []

    for topic in topics:
        predictions = DrQA.process(topic['title'],
                                   None,
                                   top_n=100,
                                   n_docs=100,
                                   return_context=True)
        passages = {}
        psg_scores = {}
        for p in predictions:
            # Keep the part of doc_id after its first '.'; when there is no
            # '.', find() gives -1 and the whole id is kept (assumes doc_id
            # is a string -- TODO confirm).
            docno = p['doc_id'][p['doc_id'].find('.') + 1:]
            # Later predictions with the same docno overwrite earlier ones.
            passages[docno] = p['context']['text']
            psg_scores[docno] = p['span_score']

        res = {
            'qid': topic['qid'],
            'title': topic['title'],
            'scores': topic['scores'],
            'psg_scores': psg_scores,
            'passages': passages
        }
        ranked_lists.append(res)
        logger.info('qid %s, %d passage scores returned' %
                    (res['qid'], len(res['psg_scores'])))
        # One JSON object per line.
        payload = json.dumps(res)
        print(payload, file=output)
Esempio n. 10
0
            candidates.add(line)
    logger.info("Loaded %d candidates." % len(candidates))
else:
    candidates = None

logger.info("Initializing pipeline...")
# Every pipeline setting is taken from the parsed command-line arguments;
# the ranker runs with strict=False.
DrQA = pipeline.DrQA(
    cuda=args.cuda,
    data_parallel=args.parallel,
    num_workers=args.num_workers,
    batch_size=args.batch_size,
    tokenizer=args.tokenizer,
    embedding_file=args.embedding_file,
    reader_model=args.reader_model,
    fixed_candidates=candidates,
    ranker_config=dict(
        options=dict(tfidf_path=args.retriever_model, strict=False)),
    db_config=dict(options=dict(db_path=args.doc_db)),
)

# ------------------------------------------------------------------------------
# Read in dataset and make predictions
# ------------------------------------------------------------------------------

logger.info("Loading queries from %s" % args.dataset)
Esempio n. 11
0
import bottle
from drqa import pipeline
import json
import pandas as pd
# Web application object and the module-level state shared with the request
# handlers (answer() declares query and response as globals).
app = bottle.Bottle()
query = []
response = ""

# CPU-only pipeline built from fixed, machine-specific model, index and
# database paths.
DrQA = pipeline.DrQA(
    reader_model="/ml/mfe4ml/raghuvan/nlp/code/DrQA/data/reader/single.mdl",
    ranker_config=dict(options=dict(
        tfidf_path="/ml/mfe4ml/raghuvan/nlp/code/DrQA/data/datasets/helpbot/mpp/mpp-tfidf-ngram=2-hash=16777216-tokenizer=simple.npz",
    )),
    db_config=dict(options=dict(
        db_path="/ml/mfe4ml/raghuvan/nlp/code/DrQA/data/datasets/helpbot/mpp/mpp.db",
    )),
    cuda=False,
)

@app.get("/")
def home():
    """Serve the demo page: return the text of the HTML file at a fixed path."""
    page_path = '/ml/mfe4ml/raghuvan/nlp/code/DrQA/scripts/pipeline/demo.html'
    with open(page_path, 'r') as page:
        return page.read()

@app.post('/answer')
def answer():
    question = bottle.request.json['question']
    print("received question: {}".format(question))

    global  query, response
    predictions = DrQA.process(
        question, candidates=None, top_n=2, n_docs=5, return_context=True
    )
    dfr = pd.DataFrame(predictions)
    print("[info] RESULTS DF: ")
Esempio n. 12
0
import code
from drqa import retriever
from drqa import pipeline
from drqa.retriever import utils

# Directory holding the document database; the trailing slash matters
# because file names are appended by plain string concatenation.
database_path = './rough/DrQA/data/wikipedia/'

# Direct SQLite access to the same docs.db the pipeline is pointed at.
conn = sqlite3.connect(database_path + 'docs.db')
cursor = conn.cursor()

print("setting up DrQA")
# NOTE(review): tfidf_path starts at the filesystem root rather than under
# database_path -- confirm this is the intended location.
DrQA = pipeline.DrQA(
    reader_model=None,
    tokenizer=None,
    fixed_candidates=None,
    ranker_config=dict(options=dict(
        tfidf_path='/docs-tfidf-ngram=2-hash=16777216-tokenizer=simple.npz',
    )),
    db_config=dict(options=dict(db_path=database_path + 'docs.db')),
    cuda=True,
)
# Stand-alone ranker, currently disabled:
#ranker = retriever.get_class('tfidf')(tfidf_path='/docs-tfidf-ngram=2-hash=16777216-tokenizer=simple.npz')
print('ranker loaded')


def get_docs(query, k=1):
    """Print and return the ids of the ``k`` documents closest to ``query``.

    Args:
        query: Query passed to ``ranker.closest_docs``.
        k: Number of documents requested from the ranker.

    Returns:
        The document ids returned by ``ranker.closest_docs``.

    One line ``"<doc id> <score>"`` is printed per returned score.
    """
    # NOTE(review): relies on a module-level ``ranker``; the only assignment
    # visible near this function is commented out -- confirm it is defined.
    doc_id, doc_score = ranker.closest_docs(query, k)
    for i, score in enumerate(doc_score):
        # Print the score that belongs to this document, not the whole
        # score sequence on every line.
        print(str(doc_id[i]) + ' ' + str(score))
    return doc_id
Esempio n. 13
0
    logger.info('Running on CPU only.')

# Pick the retriever from the first letter of --ranker (case-insensitive):
# 's' -> sql, 'l' -> lucene, anything else -> tfidf.
if args.ranker.lower().startswith('s'):
    ranker_cls = retriever.get_class('sql')
    ranker = ranker_cls(db_path=args.db_path)
elif args.ranker.lower().startswith('l'):
    ranker_cls = retriever.get_class('lucene')
    # The lucene ranker receives the same path, as its index location.
    ranker = ranker_cls(index_path=args.db_path)
else:
    ranker_cls = retriever.get_class('tfidf')
    ranker = ranker_cls(tfidf_path=args.retriever_model,
                        db_path=args.db_path)

logger.info('Initializing pipeline...')
# The ready-made ranker instance is handed to the pipeline directly.
DrQA = pipeline.DrQA(
    ranker=ranker,
    reader_model=args.reader_model,
    et_model=args.et_model,
    et_threshold=args.et_threshold,
    normalize=args.normalize,
    tokenizer=args.tokenizer,
    batch_size=args.batch_size,
    num_workers=args.num_workers,
    data_parallel=args.parallel,
    cuda=args.cuda,
)

# ------------------------------------------------------------------------------
# Read in dataset and make predictions
# ------------------------------------------------------------------------------

logger.info('Loading queries from %s' % args.dataset)
# The dataset holds one JSON object per line; only its 'question' field is
# kept. The file is opened in a with-block so the handle is closed as soon
# as reading finishes (it was previously left open).
queries = []
with open(args.dataset) as f:
    for line in f:
        data = json.loads(line)
        queries.append(data['question'])
Esempio n. 14
0
                    default=None,
                    help='Path to Document DB or index')

args = parser.parse_args()

# Use CUDA only when it is not disabled on the command line and available.
args.cuda = not args.no_cuda and torch.cuda.is_available()
if not args.cuda:
    logger.info('Running on CPU only.')
else:
    torch.cuda.set_device(args.gpu)
    logger.info('CUDA enabled (GPU %d)' % args.gpu)

logger.info('Initializing pipeline...')
# LuceneRanker itself (not an instance created here) is passed as the ranker.
DrQA = pipeline.DrQA(
    ranker=LuceneRanker,
    reader_model=args.reader_model,
    normalize=args.normalize,
    tokenizer=args.tokenizer,
    cuda=args.cuda,
)

# ------------------------------------------------------------------------------
# Drop in to interactive mode
# ------------------------------------------------------------------------------


def process(question, top_n=1, n_docs=5):
    predictions = DrQA.process_single(question,
                                      top_n,
                                      n_docs,
                                      return_context=True)
    table = prettytable.PrettyTable(
        ['Rank', 'Answer', 'Doc', 'Answer Score', 'Doc Score'])
Esempio n. 15
0
    def process(self, question, candidates=None, top_n=3, n_docs=10):
        """Answer ``question`` with DrQA and return the top predictions.

        The pipeline is created on the first call (while ``self.DrQA`` is
        None) and reused afterwards. The predictions and their highlighted
        contexts are also printed.

        Args:
            question: Question text handed to ``DrQA.process``.
            candidates: Overwritten before use: replaced by the set read
                from ``self.candidate_file_arg`` when that is set, else by
                None.
                NOTE(review): a value passed by the caller is therefore
                ignored -- confirm this is intended.
            top_n: Number of predictions requested from the pipeline.
            n_docs: Number of documents requested from the pipeline.

        Returns:
            A list with one dict per prediction, with the keys 'answer'
            (span), 'docid', 'docscore' (document score), 'answerscore'
            (span score) and 'doc' (context text with the answer
            highlighted through ``colored``).
        """
        print('retriever_model_arg:', self.retriever_model_arg)

        # Modification 09/03/2018: point the tokenizers at the local CoreNLP
        # jars through drqa's own defaults.
        import drqa.tokenizers
        drqa.tokenizers.set_default('corenlp_classpath', '/home/ubuntu/spacework/DrQA/data/corenlp/*')

        logger = logging.getLogger()
        logger.setLevel(logging.INFO)
        # Attach the console handler only once. Adding a fresh handler on
        # every call made each log line appear once per earlier call.
        if not any(getattr(h, '_drqa_process_console', False)
                   for h in logger.handlers):
            fmt = logging.Formatter('%(asctime)s: [ %(message)s ]', '%m/%d/%Y %I:%M:%S %p')
            console = logging.StreamHandler()
            console.setFormatter(fmt)
            console._drqa_process_console = True
            logger.addHandler(console)

        # Fall back to the bundled GDPR index / database when none was given.
        if self.retriever_model_arg is None:
            self.retriever_model_arg = '/home/ubuntu/spacework/DrQA/data/gdpr/gdpr_all_en_articles-tfidf-ngram=2-hash=16777216-tokenizer=simple.npz'

        if self.doc_db_arg is None:
            self.doc_db_arg = '/home/ubuntu/spacework/DrQA/data/gdpr/gdpr_all_en_articles.db'

        # Modification 09/03/2018: settings come from instance attributes
        # instead of command-line arguments.
        cuda_arg = not self.no_cuda_arg and torch.cuda.is_available()
        if cuda_arg:
            torch.cuda.set_device(self.gpu_arg)
            logger.info('CUDA enabled (GPU %d)' % self.gpu_arg)
        else:
            logger.info('Running on CPU only.')

        if self.candidate_file_arg:
            logger.info('Loading candidates from %s' % self.candidate_file_arg)
            candidates = set()
            with open(self.candidate_file_arg) as f:
                for line in f:
                    line = utils.normalize(line.strip()).lower()
                    candidates.add(line)
            logger.info('Loaded %d candidates.' % len(candidates))
        else:
            candidates = None

        print('DrQA:', self.DrQA)
        if self.DrQA is None:
            logger.info('Initializing pipeline...')

            self.DrQA = pipeline.DrQA(
                cuda=cuda_arg,
                fixed_candidates=candidates,
                reader_model=self.reader_model_arg,
                ranker_config={'options': {'tfidf_path': self.retriever_model_arg}},
                db_config={'options': {'db_path': self.doc_db_arg}},
                tokenizer=self.tokenizer_arg
            )

        predictions = self.DrQA.process(question, candidates, top_n, n_docs, return_context=True)

        table = prettytable.PrettyTable(['Rank', 'Answer', 'Doc', 'Answer Score', 'Doc Score'])

        dico_result_list = []

        for i, p in enumerate(predictions, 1):
            table.add_row([i, p['span'], p['doc_id'], '%.5g' % p['span_score'], '%.5g' % p['doc_score']])

            # Context text with the answer span highlighted.
            text = p['context']['text']
            start = p['context']['start']
            end = p['context']['end']
            output = (text[:start] +
                      colored(text[start: end], 'green', attrs=['bold']) +
                      text[end:])

            dico_result_list.append({
                'answer': p['span'],
                'docid': p['doc_id'],
                # The two scores used to be stored under each other's key.
                'docscore': p['doc_score'],
                'answerscore': p['span_score'],
                'doc': output,
            })

        print('Top Predictions:')
        print(table)
        print('\nContexts:')
        # Reuse the highlighted text built above instead of rebuilding it.
        for p, dico in zip(predictions, dico_result_list):
            print('[ Doc = %s ]' % p['doc_id'])
            print(dico['doc'] + '\n')

        for dico in dico_result_list:
            print(dico)

        return dico_result_list

# banner = """
# Interactive DrQA
# >> process(question, candidates=None, top_n=1, n_docs=5)
# >> usage()
# """


# def usage():
#     print(banner)


# code.interact(banner=banner, local=locals())
Esempio n. 16
0
import prettytable
import logging
import os

from termcolor import colored
from drqa import pipeline
from drqa.retriever import utils

print("import done!")
# Append the CoreNLP jars to CLASSPATH for this process and its children.
# The previous os.system("export ...") call only changed the environment of
# a throw-away subshell and had no effect here. As with the shell form, an
# unset CLASSPATH yields a value that starts with ':'.
# NOTE(review): if drqa reads CLASSPATH while it is being imported, this
# assignment has to move above the drqa import -- confirm.
os.environ['CLASSPATH'] = (os.environ.get('CLASSPATH', '') +
                           ':/home/shellphish/DrQA/data/corenlp/*')
#logger.info('Initializing pipeline...')
# Every setting is left as None here, including cuda.
DrQA = pipeline.DrQA(cuda=None,
                     fixed_candidates=None,
                     reader_model=None,
                     ranker_config={'options': {
                         'tfidf_path': None
                     }},
                     db_config={'options': {
                         'db_path': None
                     }},
                     tokenizer=None)

print("Pipeline ready")

# ------------------------------------------------------------------------------
# Drop in to interactive mode
# ------------------------------------------------------------------------------


def process(question, candidates=None, top_n=1, n_docs=10):
    print("Processing")
    predictions = DrQA.process(question,
Esempio n. 17
0
    candidates = set()
    with open(args.candidate_file) as f:
        for line in f:
            line = utils.normalize(line.strip()).lower()
            candidates.add(line)
    logger.info('Loaded %d candidates.' % len(candidates))
else:
    candidates = None

logger.info('Initializing pipeline...')
# One worker and one loader, with a fixed embedding file; the remaining
# settings come from the parsed arguments.
DrQA = pipeline.DrQA(
    reader_model=args.reader_model,
    embedding_file='data/vector/zh200.vec',
    tokenizer=args.tokenizer,
    fixed_candidates=candidates,
    ranker_config=dict(options=dict(tfidf_path=args.retriever_model)),
    db_config=dict(options=dict(db_path=args.doc_db)),
    num_workers=1,
    max_loaders=1,
    cuda=args.cuda,
)


# ------------------------------------------------------------------------------
# Drop in to interactive mode
# ------------------------------------------------------------------------------


def process(question, candidates=None, top_n=1, n_docs=5):
    predictions = DrQA.process(
        question, candidates, top_n, n_docs, return_context=True
Esempio n. 18
0
        #print(output + '\n')
    return answers

# # # # # FUNCOES # # # # # 

# Instanciando DrQA com a base de dados e modelo Wikipedia
#drqaDir = '../DrQA'
#reader_model = drqaDir + '/data/reader/multitask.mdl'
#retriever_model = drqaDir + '/data/wikipedia/docs-tfidf-ngram=2-hash=16777216-tokenizer=simple.npz'
#doc_db = drqaDir + '/data/wikipedia/docs.db'
#tokenizer = 'corenlp'
 
# Load the QA model and the Wikipedia database, but only in the process
# whose WERKZEUG_RUN_MAIN environment variable is "true".
# NOTE(review): in any other process DrQA is never assigned by this block --
# confirm the app is always started that way.
if os.environ.get("WERKZEUG_RUN_MAIN") == "true":
    print('Carregando modelo de QA e base Wikipedia/2016...', end='')
    has_cuda = torch.cuda.is_available()
    DrQA = pipeline.DrQA(cuda=has_cuda)
    print(' Ok!')
# Explicit configuration kept for reference (disabled):
# DrQA = pipeline.DrQA(
#     cuda = torch.cuda.is_available(), # CUDA availability (parallel processing)
#     fixed_candidates = None,
#     reader_model = reader_model,
#     ranker_config = {'options': {'tfidf_path': retriever_model}},
#     db_config = {'options': {'db_path': doc_db}},
#     tokenizer = tokenizer
# )

app = Flask(__name__)

# for CORS
@app.after_request