Example No. 1
def modelsConfig_qa(model):
    ## Question Answering:
    if model == "ELMo-BiDAF (Trained on SQuAD)":
        model_selected = Predictor.from_path(
            "https://storage.googleapis.com/allennlp-public-models/bidaf-elmo-model-2020.03.19.tar.gz"
        )

    elif model == "BiDAG (Trained on SQuAD)":
        model_selected = Predictor.from_path(
            "https://storage.googleapis.com/allennlp-public-models/bidaf-model-2020.03.19.tar.gz"
        )

    elif model == "Transformer QA (Trained on SQuAD)":
        model_selected = Predictor.from_path(
            "https://storage.googleapis.com/allennlp-public-models/transformer-qa-2020-05-26.tar.gz"
        )

    elif model == "distilbert-base-cased-distilled-squad":
        model_selected = qa_pipeline("question-answering", model=f"{model}")

    elif model == "bert-large-uncased-whole-word-masking-finetuned-squad":
        model_selected = qa_pipeline("question-answering", model=f"{model}")

    # Multilingual:
    elif model == "mrm8488/bert-multi-cased-finetuned-xquadv1 [multilingual]":
        model = "mrm8488/bert-multi-cased-finetuned-xquadv1"
        model_selected = qa_pipeline("question-answering", model=f"{model}")

    else:
        raise Exception("Not a valid model")
    return model_selected
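
Since modelsConfig_qa can return either an AllenNLP Predictor or a Hugging Face pipeline, callers have to branch on the type. A minimal usage sketch, assuming qa_pipeline above is the transformers.pipeline factory imported under that alias:

from allennlp.predictors.predictor import Predictor

selected = modelsConfig_qa("Transformer QA (Trained on SQuAD)")

context = "AllenNLP is an open-source NLP research library built on PyTorch."
question = "What is AllenNLP built on?"

if isinstance(selected, Predictor):
    # AllenNLP reading-comprehension predictors take passage/question keywords.
    answer = selected.predict(passage=context, question=question)["best_span_str"]
else:
    # Hugging Face QA pipelines take question/context keywords instead.
    answer = selected(question=question, context=context)["answer"]
print(answer)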
Example No. 2
    def __init__(self, ner=None, coref=None, dep=None):

        self.ner = ner if ner else Predictor.from_path(
            "https://s3-us-west-2.amazonaws.com/allennlp/models/ner-model-2018.12.18.tar.gz")

        self.coref = coref if coref else Predictor.from_path(
            "https://s3-us-west-2.amazonaws.com/allennlp/models/coref-model-2018.02.05.tar.gz")
        self.dep = dep if dep else Predictor.from_path(
            "https://s3-us-west-2.amazonaws.com/allennlp/models/biaffine-dependency-parser-ptb-2018.08.23.tar.gz")
Example No. 3
 def load(self, name):
     if name == 'machine_comprehension' and not self.is_loaded(name):
         self.models[name].model = Predictor.from_path(
             os.path.join(self.models_path,
                          "bidaf-model-2017.09.15-charpad.tar.gz"))
     if name == 'named_entity_recognition' and not self.is_loaded(name):
         self.models[name].model = Predictor.from_path(
             os.path.join(self.models_path, "ner-model-2018.12.18.tar.gz"))
     if name == 'textual_entailment' and not self.is_loaded(name):
         self.models[name].model = Predictor.from_path(
             os.path.join(self.models_path,
                          "decomposable-attention-elmo-2018.02.19.tar.gz"))
     if name == 'coreference_resolution' and not self.is_loaded(name):
         self.models[name].model = Predictor.from_path(
             os.path.join(self.models_path,
                          "coref-model-2018.02.05.tar.gz"))
     if name == 'semantic_role_labeling' and not self.is_loaded(name):
         self.models[name].model = Predictor.from_path(
             os.path.join(self.models_path, "srl-model-2018.05.25.tar.gz"))
     if name == 'constituency_parsing' and not self.is_loaded(name):
         self.models[name].model = Predictor.from_path(
             os.path.join(self.models_path,
                          "elmo-constituency-parser-2018.03.14.tar.gz"))
     if name == 'dependency_parsing' and not self.is_loaded(name):
         self.models[name].model = Predictor.from_path(
             os.path.join(
                 self.models_path,
                 "biaffine-dependency-parser-ptb-2018.08.23.tar.gz"))
     if name == 'open_information_extraction' and not self.is_loaded(name):
         self.models[name].model = Predictor.from_path(
             os.path.join(self.models_path,
                          "openie-model.2018-08-20.tar.gz"))
     if name == 'event2mind' and not self.is_loaded(name):
         self.models[name].model = Predictor.from_path(
             os.path.join(self.models_path, "event2mind-2018.09.17.tar.gz"))
Example No. 4
 def __init__(
     self,
     model_id: str,
 ):
     try:
         self.predictor = Predictor.from_path("hf://" + model_id)
     except (IOError, OSError):
         nltk = os.getenv("NLTK_DATA")
         if nltk is None:
             raise
         directory = os.path.join(nltk, "corpora")
         shutil.rmtree(directory)
         self.predictor = Predictor.from_path("hf://" + model_id)
    def run(
        self, paragraphs: Dict[str, str], coref_model_path: str, batch_size=8
    ) -> Dict[str, str]:
        """Run SRL extraction
        
        Args:
            paragraphs (Dict[str,str]): id: {paragraph}
        
        Returns:
            Dict[str, str]: return output id: paragraph with coreference resolution
        """

        if torch.cuda.is_available():
            logger.info("GPU found")
            logger.info("Initializing Coreference predictor with GPU")
            predictor = Predictor.from_path(coref_model_path, cuda_device=0)
        else:
            logger.info("Initializing Coreference predictor with CPU")
            predictor = Predictor.from_path(coref_model_path)

        logger.info(f"Batch_size = {batch_size}")
        batches = create_dict_chunks(paragraphs, batch_size)

        resolved_values = {}
        for batch in tqdm(
            batches,
            desc="Running coreference resolution",
            total=math.ceil(len(paragraphs) / batch_size),
        ):
            resolved_values = {
                **resolved_values,
                **{
                    id: val
                    for id, val in zip(
                        batch.keys(),
                        predictor.predict_batch_json(
                            [{"document": paragraph} for paragraph in batch.values()]
                        ),
                    )
                },
            }
        logger.success("Coreference resolution successful")

        logger.info("Resolving Coreference")
        results = {}

        for key, res in resolved_values.items():
            new_paragraph = self.coref_sub(res["document"], res["clusters"])
            results[key] = " ".join(new_paragraph)

        return results
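
The core of run() is a predict_batch_json call over {"document": ...} inputs. A standalone sketch of that call; the public SpanBERT coreference archive below is a stand-in assumption for coref_model_path:

from allennlp.predictors.predictor import Predictor

predictor = Predictor.from_path(
    "https://storage.googleapis.com/allennlp-public-models/coref-spanbert-large-2020.02.27.tar.gz"
)

paragraphs = {"p1": "Alice met Bob. She greeted him.",
              "p2": "The cat chased the mouse until it escaped."}
outputs = predictor.predict_batch_json(
    [{"document": text} for text in paragraphs.values()])
for pid, out in zip(paragraphs, outputs):
    print(pid, out["clusters"])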
def main():
    if not (args.input and args.output):
        raise RuntimeError(
            'You have to define both input and output file paths')

    with open(args.input) as json_file:
        coref_model = Predictor.from_path(
            "https://storage.googleapis.com/allennlp-public-models/coref-spanbert-large-2020.02.27.tar.gz"
        )
        entity_resol_model = Predictor.from_path(
            "https://storage.googleapis.com/allennlp-public-models/ner-model-2020.02.10.tar.gz"
        )
        pos_tagging_model = spacy.load("en_core_web_lg")
        window = 4  # Coreference resolution on up to 5 previous queries
        data = json.load(json_file)
        out = ""
        # print(data)
        for p in data:
            # print(p)
            # desc = p['description']
            current_conv = p['number']
            # print("Description: " + desc + '\n')
            print("Current conv: " + str(current_conv) + '\n')
            # out = out + '\n' + 'Number: ' + str(current_conv) + '\n' + 'Description: ' + desc + '\n'
            u = [p['turn'][0]['raw_utterance']]
            out = out + str(current_conv) + '_1' + '\t' + u[0] + '\n'
            for t in p['turn'][1:]:
                current_turn = t['number']
                if len(u) < window:
                    u.append(t['raw_utterance'])
                else:
                    u.pop(0)
                    u.append(t['raw_utterance'])
                # print(str(current_conv) + '_' + str(current_turn) + ": " + current_utterance)
                print("Input: " + " ".join(u))
                # indexes = indexer(u)
                resolved, indexes = coref_resol(coref_model,
                                                entity_resol_model,
                                                pos_tagging_model, u)
                u = update_utterances(resolved, indexes)
                print("Output: " + str(u))
                print("---")
                out = out + str(current_conv) + '_' + str(
                    current_turn) + '\t' + u[-1] + '\n'
            print('\n --- --- --- \n')
        print(out)

    with open(args.output, "w") as output:
        output.write(out)
def dependency_parsing(resolved_corefs, updated_bios):
    predictor = Predictor.from_path(
        "https://storage.googleapis.com/allennlp-public-models/biaffine-dependency-parser-ptb-2020.04.06.tar.gz",
        cuda_device=0
    )
    dps = []
    logging.info("Performing dependency parsing...")
    for paragraph_index, evidences in tqdm(enumerate(resolved_corefs)):
        paragraph_bio = updated_bios[paragraph_index]
        dp = []
        bio_index = 0
        for ev_index, evidence in enumerate(evidences):
            pred = predictor.predict(
                sentence=evidence
            )

            nodes = []
            for i in range(len(pred['words'])):
                try:
                    bio = paragraph_bio[bio_index + i]
                except IndexError:
                    logging.warning("Mismatch between bio data length and paragraph length")
                    bio = 0
                nodes.append({
                    'head': pred['predicted_heads'][i] - 1,
                    'pos': pred['pos'][i],
                    'dep': pred['predicted_dependencies'][i],
                    'word': pred['words'][i],
                    'ans':  bio
                })
            bio_index += len(pred['words'])
            dp.append(nodes)
        dps.append(dp)
    return dps
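
For context, dependency_parsing() takes one list of sentence strings per paragraph plus a parallel flat list of per-token answer labels, and it requests cuda_device=0, so it assumes a GPU. A hypothetical call with made-up data:

resolved_corefs = [["The dog barked .", "It ran away ."]]
updated_bios = [[0, 0, 0, 0, 0, 1, 1, 0]]

dps = dependency_parsing(resolved_corefs, updated_bios)
print(dps[0][0][1])  # node dict ('head', 'pos', 'dep', 'word', 'ans') for the token "dog"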
Example No. 8
    def __init__(self):
        self.predictor = AllenNLPPredictor.from_path(
            "https://storage.googleapis.com/allennlp-public-models/bidaf-elmo-model-2020.03.19.tar.gz",
            cuda_device=torch.cuda.current_device()
        )

        #         print('file location={}'.format((os.path.dirname(os.path.abspath(__file__)))))
        filepath = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                'data',
                                '1000453_1997-11-14_CREDIT AGREEMENT.txt')
        #         print('cwd={}'.format(filepath))
        with open(filepath, 'r') as credit_doc:
            content = credit_doc.read()

        lines = content.split('\n')

        line_array = []
        max_lines = 1000
        i = 0
        for line in lines:
            if len(line) > 0:
                line_array.append(line)
                i += 1
#                 if i > max_lines:
#                     break

        self.corpus = '\n'.join(line_array)
Example No. 9
    def __init__(self, model_path=None, cuda_device=1):
        # model_path = model_path or LSTM_MODEL_PATH
        model_path = model_path or ROBERTA_MODEL_PATH
        self.predictor = Predictor.from_path(model_path,
                                             cuda_device=cuda_device)

        _tokenizer = PretrainedTransformerTokenizer(
            model_name="roberta-base", max_length=TRANSFORMER_WORDPIECE_LIMIT)
        class_name_mapper = {"0": "Negative", "1": "Positive"}
        _model = self.predictor._model
        _label_namespace = _model._label_namespace
        class_names = [
            class_name_mapper[_model.vocab.get_index_to_token_vocabulary(
                _label_namespace).get(0)],
            class_name_mapper[_model.vocab.get_index_to_token_vocabulary(
                _label_namespace).get(1)]
        ]
        # reset the tokenizer to remove separators
        self.tokenizer = lambda s: [
            t.text.replace("Ġ", "").replace('Ċ', '').replace('ĉ', "")
            for t in _tokenizer.tokenize(s)
        ][1:-1]
        self.explainer_lime = LimeTextExplainer(
            class_names=class_names, split_expression=self.tokenizer)
        self.explainer_integrate = IntegratedGradient(self.predictor)
        self.explainer_simple = SimpleGradient(self.predictor)
Example No. 10
def perform_srl(responses, prompt=None):
    """ Perform semantic role labeling on a list of responses, given a prompt.

    Args:
        responses: a list of responses (or full sentences if prompt = None)
        prompt: the prompt that should be added as a prefix to the responses

    Returns: the output of the AllenNLP SRL model

    """

    predictor = Predictor.from_path(
        "https://s3-us-west-2.amazonaws.com/allennlp/models/srl-model-2018.05.25.tar.gz"
    )

    if prompt:
        sentences = [{
            "sentence": prompt + " " + response
        } for response in responses]
    else:
        sentences = [{"sentence": response} for response in responses]
    output = predictor.predict_batch_json(sentences)

    full_output = [{
        "sentence": prompt + " " + response if prompt else response,
        "response": response,
        "srl": srl
    } for (response, srl) in zip(responses, output)]

    return full_output
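
A hedged usage sketch for perform_srl(); the "verbs" / "description" / "tags" keys follow the AllenNLP SRL predictor's JSON output:

responses = ["ate the sandwich quickly", "went straight home"]
results = perform_srl(responses, prompt="The hungry student")

for item in results:
    for verb in item["srl"]["verbs"]:
        print(item["response"], "->", verb["description"])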
def pos_analyze(sentence_data):
    # pdb.set_trace()
    predictor = Predictor.from_path("https://storage.googleapis.com/allennlp-public-models/elmo-constituency-parser-2020.02.10.tar.gz")
    blank_documents = {}
    for i,document in tqdm(enumerate(sentence_data)):
        blank_sentence = {}
        for sentence in document:
            sentence = simplify(sentence,predictor)
            blank_sentence[sentence] = []
            sentence_labels = predictor.predict(sentence=sentence)
            hierplane_tree = sentence_labels['hierplane_tree']
            ADV_result = []
            find_labels(hierplane_tree['root'],"RB","ADVP",ADV_result)
            # blank_sentence[sentence]["ADV"] = []
            for item in ADV_result:
                new_sentence = sentence.replace(item,"[BLANK]")
                blank_sentence[sentence].append(tuple([new_sentence,item,"ADV"]))
            PP_result = []
            find_labels(hierplane_tree['root'],"IGNORE","PP",PP_result)
            # blank_sentence[sentence]["PP"] = []
            for item in PP_result:
                new_sentence = sentence.replace(item,"[BLANK]")
                blank_sentence[sentence].append(tuple([new_sentence,item,"PP"]))
            # print(blank_sentence[sentence])
        blank_documents[i] = blank_sentence
    
    with open("sentences_with_blank.json",'w',encoding="utf8") as f:
        json.dump(blank_documents,f,indent=2,ensure_ascii = False)
    return blank_documents
    
    
Example No. 12
 def __init__(self,
              name: str,
              model_path: str = None,
              model_online_path: str = None,
              description: str = '',
              model_type: str = None) -> None:
     """A class specifically created for wrapping the predictors from 
     Allennlp: https://allenai.github.io/allennlp-docs/api/allennlp.predictors.html
     
     Parameters
     ----------
      name : str
          The name of the predictor.
      model_path : str, optional
          A local model path if you are using local models, by default None.
          This and ``model_online_path`` cannot both be None.
      model_online_path : str, optional
          An online model path, by default None.
      description : str, optional
          A sentence describing the predictor, by default ''.
      model_type : str, optional
          The model type as used in Allennlp, by default None.
     
     Returns
     -------
     None
     """
     model = None
     if model_path:
         archive = load_archive(model_path)
         model = AllenPredictor.from_archive(archive, model_type)
     elif model_online_path:
         model = AllenPredictor.from_path(model_online_path, model_type)
     self.predictor = model
     Predictor.__init__(self, name, description, model, ['accuracy'])
Example No. 13
def run_srl(ecb_path: str, data_loader: IDataLoader):
    documents = data_loader.read_data_from_corpus_folder(ecb_path)
    predictor = Predictor.from_path(
        "https://s3-us-west-2.amazonaws.com/allennlp/models/srl-model-2018.05.25.tar.gz"
    )

    sentences = Doc.to_sentences(documents)
    all_sentence_verbs = list()
    for sentence in sentences:
        srl_sentence = SRLSentence(sentence.doc_id, sentence.sent_id)
        sentence_word = sentence.get_sentence_words()
        prediction = predictor.predict_tokenized(
            tokenized_sentence=sentence_word)
        verbs = prediction['verbs']
        words = prediction['words']
        for verb in verbs:
            srl_verb = SRLVerb()
            tags = verb['tags']
            srl_verb.add_var(tags, words)
            srl_sentence.add_srl_vrb(srl_verb)

        all_sentence_verbs.append(srl_sentence)
        print('Done with sentence from doc-' + sentence.doc_id + ', with id-' +
              str(sentence.sent_id))

    return all_sentence_verbs
Example No. 14
    def __init__(self, configuration):
        self.max_nodes = configuration["task"]["max_nodes"]
        self.max_query_size = configuration["task"]["max_query_size"]
        self.max_candidates = configuration["task"]["max_candidates"]
        dataset_location = configuration["task"]["dataset_folder"]

        in_file = dataset_location + "wikihop/train.json"
        with open(in_file, 'r') as f:
            self.raw_train_data = json.load(f)

        in_file = dataset_location + "wikihop/dev.json"
        with open(in_file, 'r') as f:
            self.raw_dev_data = json.load(f)

        in_file = dataset_location + "wikihop/dev.json"
        with open(in_file, 'r') as f:
            self.raw_test_data = json.load(f)

        self.nlp = English()
        self.glove_embedder = GloveEmbedder(configuration["preprocessing"]["glove_embeddings"]["file"])
        # self.predictor = Predictor.from_path(
        #    "https://allennlp.s3.amazonaws.com/models/coref-model-2020.02.10.tar.gz")

        # self.predictor = pretrained.load_predictor("coref")
        self.predictor = Predictor.from_path(
            "https://s3-us-west-2.amazonaws.com/allennlp/models/coref-model-2018.02.05.tar.gz")
        # self.predictor = pretrained.load_predictor("coref-spanbert")
        # self.predictor = Predictor.from_path(
        #    "https://storage.googleapis.com/allennlp-public-models/coref-spanbert-base-2020.02.27.tar.gz")

        self.evaluator = ChoiceEvaluator()

        AbstractTestProblem.__init__(self, configuration)
Example No. 15
def _predictor_server(path, qi, qo):
    # Using os.fork() (called from multiprocessing) and allowing cleanups to be run like
    # normal is dangerous because some filesystem-related cleanups might be called
    # twice. That's why we remove them first, without executing them in this process.
    atexit._clear()

    from allennlp.predictors.predictor import Predictor
    from allennlp.predictors.semantic_role_labeler import SemanticRoleLabelerPredictor
    from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter

    # Use en_core_web_md for tokenizer instead
    # TODO: make optional
    SemanticRoleLabelerPredictor.__init__ = Predictor.__init__

    predictor = Predictor.from_path(path)
    predictor._tokenizer = SpacyWordSplitter(language="en_core_web_md",
                                             pos_tags=True)
    while True:
        s = qi.get()
        if s is None:
            break
        qo.put_nowait(predictor.predict(s))

    # We need to manually call atexit callbacks here because the multiprocessing module
    # doesn't call them:
    # https://stackoverflow.com/a/34507557/
    # https://github.com/python/cpython/blob/49fd6dd887df6ea18dbb1a3c0f599239ccd1cb42/Lib/multiprocessing/popen_fork.py#L75
    # But if we don't call them, allennlp leaves extracted archives in the $TMPDIR:
    # https://github.com/allenai/allennlp/blob/fefc439035df87e3d2484eb2f53ca921c4c2e2fe/allennlp/models/archival.py#L176-L178
    logger.debug("atexit should call %d callbacks", atexit._ncallbacks())
    atexit._run_exitfuncs()
Example No. 16
def init():
    actionList = [
        "activate", "activates", "activated", "inhibit", "inhibits",
        "inhibited", "bind", "binds", "require", "requires", "required",
        "prevent", "prevents", "prevented"
    ]
    constPredictor = Predictor.from_path(
        "https://storage.googleapis.com/allennlp-public-models/elmo-constituency-parser-2020.02.10.tar.gz"
    )

    text = getInput('CS372_HW4_input_20170441.csv')
    outputFile = open('CS372_HW4_output_20170441.csv',
                      'w',
                      newline='',
                      encoding='utf-8-sig')
    wr = csv.writer(outputFile)
    wr.writerow(
        ["Marked", "Sentence", "Actual Triples", "System Triples", "Cited"])
    for line in text:
        cited = line[0]
        sentence = line[1]
        triples = line[2].split(", ")
        marked = line[3]
        systemTriples = findSVO(sentence, actionList, constPredictor)
        if marked == "Training":
            wr.writerow(
                ["Training Sentence", sentence, triples, systemTriples, cited])
        else:
            wr.writerow(
                ["Test Sentence", sentence, triples, systemTriples, cited])
    outputFile.close()
Example No. 17
    def __init__(self, data_folder='data/src_data/rest/'):
        self.negative_words_list = [
            'doesn\'t', 'don\'t', 'didn\'t', 'no', 'did not', 'do not',
            'does not', 'not yet', 'not', 'none', 'no one', 'nobody',
            'nothing', 'neither', 'nowhere', 'never', 'hardly', 'scarcely',
            'barely'
        ]
        self.negative_words_list = sorted(self.negative_words_list,
                                          key=lambda s: len(s),
                                          reverse=True)
        self.degree_word_list = [
            'absolutely', 'awfully', 'badly', 'barely', 'completely',
            'decidedly', 'deeply', 'enormously', 'entirely', 'extremely',
            'fairly', 'fully', 'greatly', 'highly', 'incredibly', 'indeed',
            'very', 'really'
        ]
        text = self.read_text([
            os.path.join(data_folder, 'train_sent.json'),
            os.path.join(data_folder, 'dev_sent.json'),
            os.path.join(data_folder, 'test_sent.json'),
        ])

        self.word2idx = self.get_word2id(text)
        self.predictor = Predictor.from_path(
            "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo-constituency-parser-2018.03.14.tar.gz"
        )
        self.nlp = stanza.Pipeline(lang='en',
                                   processors='tokenize,mwt,pos',
                                   tokenize_pretokenized=True)
Example No. 18
def entityRetrieval(query):
    predicts = Predictor.from_path(
        "https://s3-us-west-2.amazonaws.com/allennlp/models/ner-model-2018.12.18.tar.gz"
    )
    results = predicts.predict_json({"sentence": query})
    entity = []
    wordList = []
    flag = False
    for index, tag in enumerate(results['tags']):
        if len(str(tag)) > 1:
            wordList.append(process().lemmatize(
                results['words'][index].lower()))
            if str(tag).startswith('B-'):
                phrase = process().lemmatize(results['words'][index].lower())
                flag = True
            elif flag:
                phrase += " " + process().lemmatize(
                    results['words'][index].lower())
                if str(tag).startswith('L-'):
                    flag = False
                    entity.append(phrase)
            else:
                entity.append(process().lemmatize(
                    results['words'][index].lower()))
    return entity, wordList
Example No. 19
 def __init__(self):
     if SRLParser.__instance is not None:
         raise Exception("This class is a Singleton!")
     else:
         self.logger = LoggerFactory(self).getLogger()
         self.predictor = Predictor.from_path("libs/allennlp-SRL.tar.gz")
         SRLParser.__instance = self
Example No. 20
def get_predictors():
    print("current device: {}".format(torch.cuda.current_device()))
    srl_predictor = Predictor.from_path(
        "https://storage.googleapis.com/allennlp-public-models/bert-base-srl-2020.03.24.tar.gz",
        cuda_device=torch.cuda.current_device())

    return srl_predictor
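
A possible follow-up: the BERT SRL predictor returned by get_predictors() yields one entry per detected verb, each with BIO-style argument tags. Note that the snippet pins the model to torch.cuda.current_device(), so it assumes CUDA is available.

srl_predictor = get_predictors()
out = srl_predictor.predict(sentence="The keeper fed the penguins at noon.")
for verb in out["verbs"]:
    print(verb["verb"], verb["tags"])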
Example No. 21
def extract_dependency_parse(
    data_file,
    out_file,
    dep_parser_path='/Users/liming/nltk_data/stanford-parser-full-2018-10-17/edu/stanford/nlp/models/parser/nndep/english_UD.gz'
):
    # Load the dependency parser
    dep_parser = Predictor.from_path(
        "https://storage.googleapis.com/allennlp-public-models/biaffine-dependency-parser-ptb-2020.04.06.tar.gz"
    )  # stanford.StanfordDependencyParser(path_to_models_jar=dep_parser_path)

    with open(data_file, 'r') as f_in,\
         open('{}.json'.format(out_file), 'w') as f_out:
        for ex, line in enumerate(f_in):
            # if ex > 30: # XXX
            #   break
            print('\rExample {}'.format(ex), end='')
            data_dict = json.loads(line)
            sent = data_dict['tokens']

            # Parse the sentence
            sent_len = len(sent)
            parsed_sent = dep_parser.predict(' '.join(sent))
            predicted_labels = parsed_sent['predicted_dependencies']
            predicted_heads = parsed_sent['predicted_heads']
            dep_parse_dict = {
                'predicted_dependencies': predicted_labels,
                'predicted_heads': predicted_heads
            }

            # Save the parse info into the data dict
            data_dict['dep_parse'] = dep_parse_dict
            f_out.write('{}\n'.format(json.dumps(data_dict)))
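
A hypothetical input/output for extract_dependency_parse(): each input line is a JSON object with a "tokens" list, and each output line gains a "dep_parse" field. The file names here are placeholders, not from the original code.

import json

with open("sentences.jsonl", "w") as f:
    f.write(json.dumps({"tokens": ["The", "dog", "barked", "."]}) + "\n")

extract_dependency_parse("sentences.jsonl", "sentences_parsed")
# -> writes sentences_parsed.json with predicted_heads / predicted_dependencies per line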
Example No. 22
def Bidaf(context, question):
    predictor = Predictor.from_path(
        "https://storage.googleapis.com/allennlp-public-models/bidaf-model-2020.03.19.tar.gz"
    )
    answer = predictor.predict(passage=context,
                               question=question)["best_span_str"]
    return answer
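
Example 22 reloads the archive on every call, which is slow. A sketch of an optional caching variant (not the original code) that keeps one predictor instance around:

from functools import lru_cache

from allennlp.predictors.predictor import Predictor


@lru_cache(maxsize=1)
def _bidaf_predictor():
    return Predictor.from_path(
        "https://storage.googleapis.com/allennlp-public-models/bidaf-model-2020.03.19.tar.gz"
    )


def bidaf_cached(context, question):
    return _bidaf_predictor().predict(passage=context,
                                      question=question)["best_span_str"]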
Example No. 23
def main():
  parser = argparse.ArgumentParser()
  parser.add_argument('ref_caption_file')
  parser.add_argument('out_file')
  parser.add_argument('--cuda_device', default=-1, type=int)
  opts = parser.parse_args()

  predictor = Predictor.from_path("https://s3-us-west-2.amazonaws.com/allennlp/models/bert-base-srl-2019.06.17.tar.gz", cuda_device=opts.cuda_device)

  ref_caps = json.load(open(opts.ref_caption_file))
  uniq_sents = set()
  for key, sents in ref_caps.items():
    for sent in sents:
      uniq_sents.add(sent)
  uniq_sents = list(uniq_sents)
  print('unique sents', len(uniq_sents))

  outs = {}
  if os.path.exists(opts.out_file):
    outs = json.load(open(opts.out_file))
  for i, sent in enumerate(uniq_sents):
    if sent in outs:
      continue
    try:
      out = predictor.predict_tokenized(sent.split())
    except KeyboardInterrupt:
      break
    except:
      continue
    outs[sent] = out
    if i % 1000 == 0:
      print('finish %d / %d = %.2f%%' % (i, len(uniq_sents), i / len(uniq_sents) * 100))

  with open(opts.out_file, 'w') as f:
    json.dump(outs, f)
Example No. 24
def transformer_qna(context, question):
    predictor = Predictor.from_path(
        "https://storage.googleapis.com/allennlp-public-models/transformer-qa-2020-05-26.tar.gz"
    )
    answer = predictor.predict(passage=context,
                               question=question)["best_span_str"]
    return answer
Example No. 25
def QA(text, question):
    path_to_model = "/data/bidaf-elmo-model-2020.03.19"
    predictor = Predictor.from_path(path_to_model)
    result = predictor.predict(passage=text, question=question)
    return result['best_span_str']
Example No. 26
def NAQANet(context, question):
    predictor = Predictor.from_path(
        "https://storage.googleapis.com/allennlp-public-models/naqanet-2020.02.19.tar.gz"
    )
    answer = predictor.predict(passage=context,
                               question=question)["answer"]["value"]
    return answer
Example No. 27
def main():
    args = parse_args()

    predictor = Predictor.from_path(args.model_path)

    data = [('Restaurants_Train_v2.xml', 'Restaurants_Test_Gold.xml'),
            ('Laptop_Train_v2.xml', 'Laptops_Test_Gold.xml')]
    for train_file, test_file in data:
        # xml -> txt
        xml2txt(os.path.join(args.data_path, train_file))
        xml2txt(os.path.join(args.data_path, test_file))

        # txt -> json
        train_sentences = get_dependencies(
            os.path.join(args.data_path,
                         train_file.replace('.xml', '_text.txt')), predictor)
        test_sentences = get_dependencies(
            os.path.join(args.data_path,
                         test_file.replace('.xml', '_text.txt')), predictor)

        print(len(train_sentences), len(test_sentences))

        syntaxInfo2json(train_sentences,
                        os.path.join(args.data_path, train_file))
        syntaxInfo2json(test_sentences, os.path.join(args.data_path,
                                                     test_file))
Example No. 28
def loadCorefPredictor():
    print("loading predictor...")
    predictor = Predictor.from_path(
        "https://s3-us-west-2.amazonaws.com/allennlp/models/coref-model-2018.02.05.tar.gz"
    )
    print("successfully loaded the predictor")
    return predictor
Example No. 29
def main():

    filename = "../data/result.tsv"
    predictor = Predictor.from_path(
        "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo-constituency-parser-2018.03.14.tar.gz"
    )
    write(filename, predictor)
Example No. 30
def extract_org_allenNLP(join_sentences, def_tag):

    predictor = Predictor.from_path(
        "https://storage.googleapis.com/allennlp-public-models/ner-model-2020.02.10.tar.gz"
    )
    pre = predictor.predict(sentence=join_sentences)

    _persons_ = [''] * len(pre['tags'])
    pre_tag = pre['tags']
    pre_words = pre['words']
    for i in range(len(pre_tag)):
        tag = pre_tag[i]
        if def_tag in tag:
            _persons_[i] = pre_words[i]

    _persons_ = [list(g) for k, g in groupby(_persons_, key=bool) if k]
    names = []
    for i in range(len(_persons_)):
        names.append(' '.join(_persons_[i]))

    # Remove duplicate names while preserving order (single-word names included)
    temp = []
    for i in range(len(names)):
        if names[i] not in temp:
            temp.append(names[i])
    names = temp
    # Remove duplicate names irrespective of the case
    wordset = set(names)
    names = [
        item for item in wordset
        if item.istitle() or item.title() not in wordset
    ]

    return names
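
A hedged usage of extract_org_allenNLP(): def_tag is matched as a substring of the BIO tags, so "ORG" collects organisations and "PER" collects people.

text = ("Sundar Pichai is the chief executive of Google, "
        "which is headquartered in Mountain View.")
print(extract_org_allenNLP(text, "ORG"))   # e.g. ['Google']
print(extract_org_allenNLP(text, "PER"))   # e.g. ['Sundar Pichai']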