Example #1
    def initialize(self, resources: Resources, configs: Config):
        super().initialize(resources, configs)

        if configs.tag_formalism not in MODEL2URL:
            raise ProcessorConfigError('Incorrect value for tag_formalism')
        if configs.tag_formalism == 'stanford':
            self.predictor = {
                'stanford': Predictor.from_path(MODEL2URL['stanford'])}
        if 'srl' in configs.processors:
            self.predictor = {
                'stanford': Predictor.from_path(MODEL2URL['stanford']),
                'srl': Predictor.from_path(MODEL2URL['srl'])}

        if configs.overwrite_entries:
            logger.warning("`overwrite_entries` is set to True, this means "
                           "that the entries of the same type as produced by "
                           "this processor will be overwritten if found.")
            if configs.allow_parallel_entries:
                logger.warning('Both `overwrite_entries` (whether to overwrite '
                               'the entries of the same type as produced by '
                               'this processor) and '
                               '`allow_parallel_entries` (whether to allow '
                               'similar new entries when they already exist) '
                               'are True, all existing conflicting entries '
                               'will be deleted.')
        else:
            if not configs.allow_parallel_entries:
                logger.warning('Both `overwrite_entries` (whether to overwrite '
                               'the entries of the same type as produced by '
                               'this processor) and '
                               '`allow_parallel_entries` (whether to allow '
                               'similar new entries when they already exist) '
                               'are False, processor will only run if there '
                               'are no existing conflicting entries.')
Example #2
    def __init__(self,
                 wd=None,
                 doc=None,
                 debug=False,
                 use_v=False,
                 vfile=None,
                 cfile=None,
                 efile=None):
        if os.path.exists('/homes/du113/scratch/pretrained'):
            print('models already downloaded')
            self.srl_predictor = Predictor.from_path(
                '/homes/du113/scratch/pretrained/srl-model-2018.05.25.tar.gz')
            self.coref_predictor = Predictor.from_path(
                '/homes/du113/scratch/pretrained/coref-model-2018.02.05.tar.gz'
            )
        else:
            print('downloading models...')
            self.srl_predictor = Predictor.from_path(
                "https://s3-us-west-2.amazonaws.com/allennlp/models/srl-model-2018.05.25.tar.gz"
            )
            self.coref_predictor = Predictor.from_path(
                "https://s3-us-west-2.amazonaws.com/allennlp/models/coref-model-2018.02.05.tar.gz"
            )

        self.word_dict = wd
        self.doc = doc
        self.debug = debug
        self.use_v = use_v
Example #3
    def initialize(self, resources: Resources, configs: Config):
        super().initialize(resources, configs)
        if ("pos" in configs.processors or "depparse" in configs.processors
                or "depparse" in configs.processors):
            if "tokenize" not in self.configs.processors:
                raise ProcessorConfigError("tokenize is necessary in "
                                           "configs.processors for "
                                           "pos, depparse or srl")
        cuda_devices = itertools.cycle(configs["cuda_devices"])
        if configs.tag_formalism not in MODEL2URL:
            raise ProcessorConfigError("Incorrect value for tag_formalism")
        if configs.tag_formalism == "stanford":
            self.predictor = {
                "stanford":
                Predictor.from_path(configs["stanford_url"],
                                    cuda_device=next(cuda_devices))
            }
        if "srl" in configs.processors:
            self.predictor = {
                "stanford":
                Predictor.from_path(configs["stanford_url"],
                                    cuda_device=next(cuda_devices)),
                "srl":
                Predictor.from_path(configs["srl_url"],
                                    cuda_device=next(cuda_devices)),
            }

        if configs.overwrite_entries:
            logger.warning("`overwrite_entries` is set to True, this means "
                           "that the entries of the same type as produced by "
                           "this processor will be overwritten if found.")
            if configs.allow_parallel_entries:
                logger.warning("Both `overwrite_entries` (whether to overwrite"
                               " the entries of the same type as produced by "
                               "this processor) and "
                               "`allow_parallel_entries` (whether to allow "
                               "similar new entries when they already exist) "
                               "are True, all existing conflicting entries "
                               "will be deleted.")
        else:
            if not configs.allow_parallel_entries:
                logger.warning("Both `overwrite_entries` (whether to overwrite"
                               " the entries of the same type as produced by "
                               "this processor) and "
                               "`allow_parallel_entries` (whether to allow "
                               "similar new entries when they already exist) "
                               "are False, processor will only run if there "
                               "are no existing conflicting entries.")
Example #4
    def initialize(self, resource: Resources, configs: HParams):
        self.processors = configs.processors
        if self.processors is None or self.processors == "":
            self.processors = self.default_configs()['processors']

        if configs.output_format not in MODEL2URL:
            raise ProcessorConfigError('Incorrect value for output_format')
        model_url = MODEL2URL[configs.output_format]
        self.predictor = Predictor.from_path(model_url)

        self.overwrite_entries = configs.overwrite_entries
        self.allow_parallel_entries = configs.allow_parallel_entries
        if self.overwrite_entries:
            logger.warning("`overwrite_entries` is set to True, this means "
                           "that the entries of the same type as produced by "
                           "this processor will be overwritten if found.")
            if self.allow_parallel_entries:
                logger.warning(
                    'Both `overwrite_entries` (whether to overwrite '
                    'the entries of the same type as produced by '
                    'this processor) and '
                    '`allow_parallel_entries` (whether to allow '
                    'similar new entries when they already exist) '
                    'are True, all existing conflicting entries '
                    'will be deleted.')
        else:
            if not self.allow_parallel_entries:
                logger.warning(
                    'Both `overwrite_entries` (whether to overwrite '
                    'the entries of the same type as produced by '
                    'this processor) and '
                    '`allow_parallel_entries` (whether to allow '
                    'similar new entries when they already exist) '
                    'are False, processor will only run if there '
                    'are no existing conflicting entries.')
Example #5
def allennlp(ques, embd_technique):
    text = ''
    if embd_technique == 'word2vec':
        word2vec_wv = model_manager.load_model(model='word2vec')

        text = ' '.join([s.strip() for s in main.find_documents_word2vec(ques, word2vec_wv, basic_search=False)])
    elif embd_technique == 'doc2vec':
        docvec_model = model_manager.load_model(model='doc2vec')

        text = ' '.join([s.strip() for s in main.find_documents_doc2vec(ques, docvec_model, basic_search=False)])
    elif embd_technique == 'fasttext':
        fasttext_wv = model_manager.load_model(model='fasttext')

        text = ' '.join([s.strip() for s in main.find_documents_fasttext(ques, fasttext_wv, basic_search=False)])
    elif embd_technique == 'tfidf':
        text = ' '.join([s.strip() for s in main.find_documents_tfidf(ques, basic_search=False)])
    elif embd_technique == 'glove':
        text = ' '.join([s.strip() for s in main.find_documents_glove(ques, basic_search=False)])

    print('Text:', text)

    predictor = Predictor.from_path(
        "https://storage.googleapis.com/allennlp-public-models/bidaf-elmo-model-2020.03.19.tar.gz")
    prediction = predictor.predict(passage=text, question=ques)['best_span_str']

    print('Answer:', prediction)
    
    return prediction
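
Reloading the BiDAF archive on every call to `allennlp` is expensive. A minimal sketch of a module-level cache (the `_bidaf_predictor` helper is hypothetical, not part of the original):

from functools import lru_cache

from allennlp.predictors.predictor import Predictor

@lru_cache(maxsize=1)
def _bidaf_predictor():
    # Load the archive once; subsequent calls return the cached instance.
    return Predictor.from_path(
        "https://storage.googleapis.com/allennlp-public-models/bidaf-elmo-model-2020.03.19.tar.gz")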
Example #6
    def test_constituency_parsing(self):
        predictor = Predictor.from_path(
            self.pretrained_models["structured-prediction-constituency-parser"].archive_file
        )

        sentence = """Pierre Vinken died aged 81; immortalised aged 61."""

        result = predictor.predict_json({"sentence": sentence})

        assert result["tokens"] == [
            "Pierre",
            "Vinken",
            "died",
            "aged",
            "81",
            ";",
            "immortalised",
            "aged",
            "61",
            ".",
        ]
        assert (
            result["trees"]
            == "(S (NP (NNP Pierre) (NNP Vinken)) (VP (VP (VBD died) (NP (JJ aged) (CD 81))) (, ;) (VP (VBN immortalised) (S (ADJP (JJ aged) (CD 61))))) (. .))"
        )
Example #7
    def test_textual_entailment(self):
        predictor = Predictor.from_path(
            self.pretrained_models["pair-classification-decomposable-attention-elmo"].archive_file
        )

        result = predictor.predict_json(
            {
                "premise": "An interplanetary spacecraft is in orbit around a gas giant's icy moon.",
                "hypothesis": "The spacecraft has the ability to travel between planets.",
            }
        )

        assert result["label_probs"][0] > 0.7  # entailment

        result = predictor.predict_json(
            {
                "premise": "Two women are wandering along the shore drinking iced tea.",
                "hypothesis": "Two women are sitting on a blanket near some rocks talking about politics.",
            }
        )

        assert result["label_probs"][1] > 0.8  # contradiction

        result = predictor.predict_json(
            {
                "premise": "A large, gray elephant walked beside a herd of zebras.",
                "hypothesis": "The elephant was lost.",
            }
        )

        assert result["label_probs"][2] > 0.6  # neutral
Example #8
    def __init__(self, ner_model: str, database_db: str, cuda_device: Union[List[int], int] = -1):
        self._ner_predictor = Predictor.from_path(ner_model)

        if cuda_device != -1:
            self._ner_predictor._model = self._ner_predictor._model.to(cuda_device)
        self._database_db = database_db
        self._cuda_device = cuda_device
Example #9
def _convert_deft_folder(input_path: Path,
                         output_file: TextIO,
                         cuda_device: int,
                         with_spacy: bool = True,
                         with_coref: bool = True) -> None:
    """Convert all files in the given folder."""
    if with_spacy:
        spacy_pipeline = spacy.load('en_core_web_lg')
        spacy_matcher = Matcher(spacy_pipeline.vocab)
        for pattern in SPACY_PATTERNS:
            flag = pattern['pattern_flag']
            Token.set_extension(flag, default=False, force=True)
            patterns = pattern['patterns']
            callback_fn = _get_extension_labeling_fn(flag)
            spacy_matcher.add(flag, callback_fn, *patterns)
    else:
        spacy_pipeline = None
        spacy_matcher = None
    if with_coref:
        coref_predictor = Predictor.from_path(
            archive_path="https://s3-us-west-2.amazonaws.com/allennlp/models/coref-model-2018.02.05.tar.gz",
            cuda_device=cuda_device
        )
        # Fix issues with characters tokens smaller than the biggest convolution size
        coref_predictor._dataset_reader._token_indexers['token_characters']._min_padding_length = 5
    else:
        coref_predictor = None
    for input_file in tqdm(input_path.iterdir()):
        examples = _convert_deft_file(input_file,
                                      spacy_pipeline=spacy_pipeline,
                                      spacy_matcher=spacy_matcher,
                                      coref_predictor=coref_predictor)
        for example in examples:
            output_file.write(json.dumps(example) + '\n')
Example #10
    def __init__(self, model_dir_path, cuda_device=-1):
        SimpleAllenNLPClassifier.__init__(self, model_dir_path, cuda_device)

        self._model = Predictor.from_path(os.path.join(self.model_dir_path,
                                                       'model.tar.gz'),
                                          predictor_name='textual-entailment',
                                          cuda_device=self._cuda_device)
Example #11
def bidaf(tkt):
    sp3 = """EXEC [dbo].[SPCRM]  @TicketNumber=?"""
    sp5 = """EXEC [dbo].[SPInvVref_hs_v2]  @Inv=?"""
    params = tkt
    cursor.execute(sp3, params)
    column_names_list = [x[0] for x in cursor.description]
    result_dicts = [
        dict(zip(column_names_list, row)) for row in cursor.fetchall()
    ]
    bidaf_df = pd.DataFrame(result_dicts)
    model = Predictor.from_path("bidaf.tar.gz")
    question1 = "what is the invoice number?"
    #question2 = "what is the customer number?"
    #question3 = "what is the reference number?"
    frames = []
    for i, row in bidaf_df.iterrows():
        # Pull the invoice number out of the ticket description with BiDAF.
        inv_no1 = model.predict(question1, row['description'])["best_span_str"]
        cursor.execute(sp5, inv_no1)
        column_names_list = [x[0] for x in cursor.description]
        result_dicts = [
            dict(zip(column_names_list, row)) for row in cursor.fetchall()
        ]
        frames.append(pd.DataFrame(result_dicts))
    # Accumulate the reference lookups for every ticket row.
    invref = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    return invref
Example #12
    def setUp(self):
        self.allens = {
            # TODO: Current download model is wrong on Allennlp.
            # 'universal': Predictor.from_path(MODEL2URL['universal']),
            'stanford': Predictor.from_path(MODEL2URL['stanford'])
        }

        self.results = {}
        for k in self.allens:
            self.results[k] = {}

        sentences = [
            "This tool is called Forte.",
            "The goal of this project to help you build NLP pipelines.",
            "NLP has never been made this easy before.",
            "Forte is named Forte because it is designed for text."
        ]
        self.document = ' '.join(sentences)

        for k in self.allens:
            self.results[k]['tokens'] = []
            self.results[k]['pos'] = []
            self.results[k]['dep_types'] = []
            self.results[k]['dep_heads'] = []

        for dep_type in self.allens.keys():
            for sent in sentences:
                results = self.allens[dep_type].predict(sentence=sent)

                self.results[dep_type]['tokens'].append(results['words'])
                self.results[dep_type]['pos'].append(results['pos'])
                self.results[dep_type]['dep_types'].append(
                    results['predicted_dependencies'])
                self.results[dep_type]['dep_heads'].append(
                    results['predicted_heads'])
Example #13
    def setUp(self):
        self.allens = {
            # TODO: Current download model is wrong on Allennlp.
            # 'universal': Predictor.from_path(MODEL2URL['universal']),
            "stanford": Predictor.from_path(MODEL2URL["stanford"])
        }

        self.results = {}
        for k in self.allens:
            self.results[k] = {}
        self.results["srl"] = {}

        sentences = [
            "This tool is called Forte.",
            "The goal of this project is to help you build NLP pipelines.",
            "NLP has never been made this easy before.",
            "Forte is named Forte because it is designed for text.",
        ]
        self.document = " ".join(sentences)

        for k in self.allens:
            self.results[k]["tokens"] = []
            self.results[k]["pos"] = []
            self.results[k]["dep_types"] = []
            self.results[k]["dep_heads"] = []
        self.results["srl"]["verbs"] = []
        self.results["srl"]["srl_tags"] = []

        # Load the SRL model once, rather than once per sentence.
        srl_predictor = Predictor.from_path(MODEL2URL["srl"])
        for sent in sentences:
            for dep_type in self.allens.keys():
                results = self.allens[dep_type].predict(  # type: ignore
                    sentence=sent
                )
                self.results[dep_type]["tokens"].append(results["words"])
                self.results[dep_type]["pos"].append(results["pos"])
                self.results[dep_type]["dep_types"].append(
                    results["predicted_dependencies"]
                )
                self.results[dep_type]["dep_heads"].append(
                    results["predicted_heads"]
                )
            srl_results = parse_allennlp_srl_results(
                srl_predictor.predict(sentence=sent)["verbs"]
            )
            self.results["srl"]["verbs"].append(srl_results["verbs"])
            self.results["srl"]["srl_tags"].append(srl_results["srl_tags"])
Example #14
File: relation.py Project: KabyleAI/Search
    def __init__(self, model_path):
        # Note: SciBERT is imported but unused. This is because the import has
        # a side-effect of registering the SciBERT model, which we use later on.
        import scibert  # NOQA
        from allennlp.predictors import Predictor

        self.model_ = Predictor.from_path(model_path,
                                          predictor_name="text_classifier")
Example #15
def _get_predictor(args: argparse.Namespace) -> Predictor:
    check_for_gpu(args.cuda_device)
    return Predictor.from_path(
        args.archive_path,
        predictor_name=args.predictor,
        cuda_device=args.cuda_device,
        overrides=args.overrides,
    )
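
A minimal usage sketch for the helper above; the Namespace fields mirror what `_get_predictor` reads, and the archive path and predictor name here are hypothetical:

import argparse

args = argparse.Namespace(
    archive_path="model.tar.gz",   # hypothetical archive
    predictor="text_classifier",
    cuda_device=-1,                # -1 keeps the model on CPU
    overrides="",
)
predictor = _get_predictor(args)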
Example #16
    def __init__(self, database_path, add_claim=False, k_wiki_results=None):
        self.db = FeverDocDB(database_path)
        self.add_claim = add_claim
        self.k_wiki_results = k_wiki_results
        self.proter_stemm = nltk.PorterStemmer()
        self.tokenizer = nltk.word_tokenize
        self.predictor = Predictor.from_path(
            "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo-constituency-parser-2018.03.14.tar.gz")
Example #17
    def setUp(self):
        self.allens = {
            'universal_dependencies':
            Predictor.from_path(MODEL2URL['universal_dependencies']),
            'stanford_dependencies':
            Predictor.from_path(MODEL2URL['stanford_dependencies'])
        }

        # Reuse the predictors created above rather than loading each model twice.
        univ = self.allens['universal_dependencies']
        stan = self.allens['stanford_dependencies']

        sentences = [
            "This tool is called Forte.",
            "The goal of this project to help you build NLP pipelines.",
            "NLP has never been made this easy before."
        ]
        self.document = ' '.join(sentences)

        self.tokens = []
        self.pos = {'stanford_dependencies': [], 'universal_dependencies': []}
        self.dep_types = {
            'stanford_dependencies': [],
            'universal_dependencies': []
        }
        self.dep_heads = {
            'stanford_dependencies': [],
            'universal_dependencies': []
        }

        for sent in sentences:
            univ_results = univ.predict(sentence=sent)
            stan_results = stan.predict(sentence=sent)
            self.tokens.append(univ_results['words'])

            self.pos['universal_dependencies'].append(univ_results['pos'])
            self.dep_types['universal_dependencies'].append(
                univ_results['predicted_dependencies'])
            self.dep_heads['universal_dependencies'].append(
                univ_results['predicted_heads'])

            self.pos['stanford_dependencies'].append(stan_results['pos'])
            self.dep_types['stanford_dependencies'].append(
                stan_results['predicted_dependencies'])
            self.dep_heads['stanford_dependencies'].append(
                stan_results['predicted_heads'])
Example #18
    def __init__(self):
        # Use the pre-trained SRL model from AllenNLP for simplicity.
        self.predictor = Predictor.from_path(
            "https://s3-us-west-2.amazonaws.com/allennlp/models/srl-model-2018.05.25.tar.gz"
        )
        # Move the model to GPU when one is available; otherwise it stays on CPU.
        if torch.cuda.is_available():
            self.predictor._model = self.predictor._model.cuda()
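
Rather than reaching into the private `_model` attribute, `Predictor.from_path` accepts a `cuda_device` argument directly; a minimal equivalent sketch:

import torch
from allennlp.predictors.predictor import Predictor

predictor = Predictor.from_path(
    "https://s3-us-west-2.amazonaws.com/allennlp/models/srl-model-2018.05.25.tar.gz",
    cuda_device=0 if torch.cuda.is_available() else -1,  # -1 means CPU
)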
Example #19
    @classmethod
    def make(cls, path, gpu=False):
        try:
            from allennlp_models.rc.models.bidaf import BidirectionalAttentionFlow
        except ImportError:
            # Fall back to importing the package, which still registers the model.
            import allennlp_models.rc
        cuda_device = 0 if gpu else -1
        return cls("BiDAF",
                   Predictor.from_path(path, cuda_device=cuda_device),
                   gpu=gpu)
Example #20
    def test_openie(self):
        predictor = Predictor.from_path(
            self.pretrained_models["structured-prediction-srl"].archive_file
        )
        result = predictor.predict_json(
            {"sentence": "I'm against picketing, but I don't know how to show it."}
        )
        assert "verbs" in result
        assert "words" in result
Example #21
    def __init__(self, model_dir_path, cuda_device=-1):
        self._model_path = os.path.join(model_dir_path, 'segmenter_neural', 'model.tar.gz')
        self._cuda_device = cuda_device
        self.predictor = Predictor.from_path(self._model_path, cuda_device=self._cuda_device)
        self.predictor._tokenizer = WhitespaceTokenizer()
        self._separator = 'U-S'
        self._threshold = 0.5
        self._use_logits = False
        self._symbol_map = SYMBOL_MAP
Example #22
    def __init__(self):
        # AllenNLP coreference pre-trained model
        pretrained_coref_path = './allennlp_pretrained/allennlp_coref-model-2018.02.05.tar.gz'

        if not os.path.exists(pretrained_coref_path):
            coref_url = "https://s3-us-west-2.amazonaws.com/allennlp/models/coref-model-2018.02.05.tar.gz"
            # makedirs with exist_ok avoids crashing when the folder already exists.
            os.makedirs('allennlp_pretrained', exist_ok=True)
            wget.download(coref_url, pretrained_coref_path)

        self.predictor = Predictor.from_path(pretrained_coref_path)
Example #23
def load_model(slug):
    """Return an AllenNLP Predictor for the trained model for `slug`."""
    assert is_trained(slug), "We haven't trained a model to load yet"
    model_file_name = os.path.join(SERIALIZATION_DIR, slug, "model.tar.gz")
    PREDICTOR_NAMES = {
        "pos-tagging": "sentence-tagger",
        "translation": "seq2seq",
        "classification": "text_classifier",
    }
    name = PREDICTOR_NAMES[slug]
    return Predictor.from_path(model_file_name, predictor_name=name)
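
A minimal usage sketch for `load_model`, assuming a model has already been trained for the "classification" slug (the input sentence is made up):

predictor = load_model("classification")
result = predictor.predict_json({"sentence": "AllenNLP makes prototyping easy."})
print(result)  # available keys (e.g. "probs", "label") depend on the model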
Example #24
class SRL:
    # predictor = Predictor.from_path("/root/.allennlp/models/srl-model-2018.05.25.tar.gz")
    predictor = Predictor.from_path("./models/srl-model-2018.05.25.tar.gz")

    @staticmethod
    def get_srl(document):
        if SRL.validate_doc(document):
            return SRL.predictor.predict(document)

    @staticmethod
    def validate_doc(document):
        return True
Example #25
    def start_bundle(self):
        if self.predictor is not None:
            return
        model_dir = self.prepare_model()
        # These local imports are deliberate bad practice: importing at module
        # level makes Beam try to serialize allennlp, and deserialization then
        # breaks on Dataflow.
        from scibert.models import text_classifier
        from scibert.predictors.predictor import ScibertPredictor
        from allennlp.predictors import Predictor
        import scibert
        self.predictor = Predictor.from_path(model_dir,
                                             predictor_name="text_classifier")
Example #26
class SRL:
    predictor = Predictor.from_path("./models/srl-model-2018.05.25.tar.gz")

    @staticmethod
    def get_srl(document):
        if SRL.validate_doc(document):
            return SRL.predictor.predict(document)

    @staticmethod
    def validate_doc(document):
        """ Any validations should be carried out here """
        return True
Example #27
def my_sample_fever():
    logger = logging.getLogger()
    dictConfig({
        'version': 1,
        'formatters': {
            'default': {
                'format':
                '[%(asctime)s] %(levelname)s in %(module)s: %(message)s',
            }
        },
        'handlers': {
            'wsgi': {
                'class': 'logging.StreamHandler',
                'stream': 'ext://sys.stderr',
                'formatter': 'default'
            }
        },
        'root': {
            'level': 'INFO',
            'handlers': ['wsgi']
        },
        # Logger-specific settings must be nested under 'loggers' for
        # dictConfig to apply them.
        'loggers': {
            'allennlp': {
                'level': 'INFO',
                'handlers': ['wsgi']
            }
        },
    })

    logger.info("Columbia FEVER application")
    config = json.load(
        open(os.getenv("CONFIG_PATH", "configs/system_config.json")))

    ner_predictor = Predictor.from_path(
        "https://s3-us-west-2.amazonaws.com/allennlp/models/fine-grained-ner-model-elmo-2018.12.21.tar.gz"
    )
    google_config = GoogleConfig(**config['retrieval']['google'])
    ranker = retriever.get_class('tfidf')(
        tfidf_path=config['retrieval']['tfidf']['index'])

    predictors = {}
    for key in ('page_model', 'state_model'):
        path = config[key].pop('path')
        predictors[key] = ColumbiaPredictor(path, config['cuda_device'],
                                            **config[key])

    # The prediction function that is passed to the web server for FEVER2.0
    def predict(instances):
        predictions = getDocsSingle(instances, google_config, ner_predictor,
                                    ranker)
        for key in ('page_model', 'state_model'):
            predictions = list(predictors[key].predict(predictions))
        return predictions

    return fever_web_api(predict)
Example #28
def setup_route(app, route, model):
    allennlp_model = Predictor.from_path(model)

    @app.route(f'/{route}', methods=['POST'])
    def serve_route():
        data = request.get_json()
        if data is None:
            return jsonify({})
        text = data.get('text', None)
        if text is None:
            return jsonify({})
        return jsonify({'results': allennlp_model.predict(sentence=text)})
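
A minimal sketch of wiring the route into a Flask app, assuming `setup_route` above is importable (the route name and model path are hypothetical):

from flask import Flask

app = Flask(__name__)
setup_route(app, 'srl', './models/srl-model-2018.05.25.tar.gz')

# POST JSON such as {"text": "Forte builds NLP pipelines."} to /srl
# to receive the model's prediction as JSON.
app.run()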
Example #29
    def test_machine_comprehension(self):
        predictor = Predictor.from_path(self.pretrained_models["rc-bidaf"].archive_file)

        passage = """The Matrix is a 1999 science fiction action film written and directed by The Wachowskis, starring Keanu Reeves, Laurence Fishburne, Carrie-Anne Moss, Hugo Weaving, and Joe Pantoliano. It depicts a dystopian future in which reality as perceived by most humans is actually a simulated reality called "the Matrix", created by sentient machines to subdue the human population, while their bodies' heat and electrical activity are used as an energy source. Computer programmer Neo" learns this truth and is drawn into a rebellion against the machines, which involves other people who have been freed from the "dream world". """
        question = "Who stars in The Matrix?"

        result = predictor.predict_json({"passage": passage, "question": question})

        correct = (
            "Keanu Reeves, Laurence Fishburne, Carrie-Anne Moss, Hugo Weaving, and Joe Pantoliano"
        )

        assert correct == result["best_span_str"]
Example #30
    def __init__(self, database_path, add_claim=False, k_wiki_results=None):
        self.add_claim = add_claim
        self.k_wiki_results = k_wiki_results
        self.proter_stemm = nltk.PorterStemmer()
        self.tokenizer = nltk.word_tokenize
        self.predictor = Predictor.from_path(
            "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo-constituency-parser-2018.03.14.tar.gz")

        self.db = {}
        for file in os.listdir(database_path):
            with open(os.path.join(database_path, file), 'r', encoding='utf-8') as f:
                for line in f:
                    line_json = json.loads(line.strip())
                    self.db[line_json['id']] = line_json['lines']