Example #1
    def test_constituency_parsing(self):
        predictor = pretrained.span_based_constituency_parsing_with_elmo_joshi_2018()

        sentence = """Pierre Vinken died aged 81; immortalised aged 61."""

        result = predictor.predict_json({"sentence": sentence})

        assert result["tokens"] == ["Pierre", "Vinken", "died", "aged", "81", ";", "immortalised", "aged", "61", "."]
        assert result["trees"] == "(S (NP (NNP Pierre) (NNP Vinken)) (VP (VP (VBD died) (NP (JJ aged) (CD 81))) (, ;) (VP (VBD immortalised) (S (ADJP (VBN aged) (NP (CD 61)))))) (. .))"
Example #2
    def __init__(self, config):

        super().__init__()
        self.config = config
        self.data_dir = config.data_dir
        self.save_data_dir = config.save_data_dir
        # Heuristic weather conditions
        self.weather_list = ['rainy', 'sunny', 'daytime', 'day', 'night']
        self.difficult_pronouns = ["other", "it"]
        self.nlp = spacy.load('en_core_web_sm')
        # self.coref_model = pretrained.neural_coreference_resolution_lee_2017()
        # Constituency parser for getting ellipsis
        self.const_parser = pretrained.span_based_constituency_parsing_with_elmo_joshi_2018()
        self.heuristic_root_cp = ["S", "SQ", "SBARQ", "SINV"]
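
heuristic_root_cp lists the clause-level constituent labels; a fragment whose parse root is not one of them is a candidate ellipsis. A hypothetical helper sketching that check (the function name is an assumption; predict(sentence=...) is the AllenNLP 0.x predictor call):

HEURISTIC_ROOT_CP = {"S", "SQ", "SBARQ", "SINV"}

def looks_elliptical(const_parser, sentence):
    # The root label sits right after the first "(",
    # e.g. "(FRAG (NP ..." -> "FRAG".
    tree = const_parser.predict(sentence=sentence)["trees"]
    root_label = tree.split()[0].lstrip("(")
    return root_label not in HEURISTIC_ROOT_CP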
Example #3
    def __init__(self, config):

        super().__init__()
        self.config = config
        self.data_dir = config.data_dir
        self.save_data_dir = config.save_data_dir
        # Heuristic weather conditions
        self.weather_list = ['rainy', 'sunny', 'daytime', 'day', 'night']
        self.difficult_pronouns = ["other", "it"]

        if config.create_stats_dic:
            self.nlp = spacy.load('en_core_web_sm')
            # self.coref_model = pretrained.neural_coreference_resolution_lee_2017()
            # Constituency parser for getting ellipsis
            self.const_parser = pretrained.span_based_constituency_parsing_with_elmo_joshi_2018()
            self.heuristic_root_cp = ["S", "SQ", "SBARQ", "SINV"]

        self.non_agreement_images = [128889, 525402, 270717, 562489, 204402, 431203, 566063, 391624, 34733, 357189, 194408, 290919, 179535, 321201, 434695, 422086, 389671, 219511, 288804, 135635, 506795, 565265, 76825, 113894, 569745, 454202, 648, 188270, 62886, 67435, 301586, 274049, 139477, 372849, 418086, 520026, 358062, 24025, 531916, 129793, 165015, 491809, 58794, 77166, 177397, 130307, 469621, 249431, 133036, 542075, 240603, 330121, 526149, 472987, 456541, 498784, 216234, 30786, 51931, 512332, 31706, 220643, 39771, 170070, 386874, 100528, 231110, 154609, 139962, 145204, 567307, 174821, 468028, 114981, 284112, 406280, 528160, 185561, 313049, 306269, 325335, 332510, 546153, 389417, 489182, 174023, 95844, 116883, 452505, 53772, 87395, 290313, 225029, 439314, 494256, 84393, 118025, 417343, 57931, 190947, 509900, 60776, 235054, 307886, 544849, 70689, 51184, 225737, 127730, 227148, 240892, 167060, 49132, 300446, 467899, 123729, 402930, 567184, 185894, 333125, 323557, 1872, 573045, 353753, 142963, 268723, 327133, 161055, 185565, 574189, 29737, 99643, 295627, 485732, 546554, 286929, 175345, 223379, 146821, 358981, 427711, 212259, 347890, 297528, 83797, 369360, 538790, 219444, 101666, 201002]
        self.hist_info_images = [257366, 425477, 191097, 552399, 12468, 458949, 109735, 311793, 437200, 355853, 98849, 57743, 83289, 488471, 446567, 196905, 308846, 328336, 289233, 52156, 366462, 511748, 457675, 518811, 413085, 432039, 531270, 430580, 293582, 544148, 80366, 179366, 150236, 400960, 10424, 451398, 498340, 268914, 384171, 172461, 387266, 214227, 555578, 181772, 149373, 251385, 407878, 574545, 544827, 120559, 19299, 73638, 496822, 204195, 97073, 209447, 53433, 403234, 524006, 178300, 376460, 570468, 292100, 227006, 170315, 456824, 525726, 179064, 98879, 558975, 193521, 377823, 449230, 44468, 573552, 288308, 237956, 69538, 250654, 439842, 146314, 458818, 122826, 33976, 322815, 239030, 209271, 560666, 361734, 225491, 27366, 29060, 191186, 394073, 120870, 580183, 111013]
Example #4

                    'The other dataset', row[0], 'GEOQA')
                res.append(geoaq)
                line_count += 1
        print(f'Processed {line_count} lines.')
        return res


PRONOUN = {
    'where': '1',
    'what': '2',
    'which': '3',
    'when': '4',
    'who': '6',
    'why': '7'
}
model = pretrained.span_based_constituency_parsing_with_elmo_joshi_2018()

fpt = 'data/place_type/type-set'
factv = 'data/verb/action_verb.txt'
fstav = 'data/verb/stative_verb.txt'

pt_set, pt_dict = load_pt(fpt)
actv = load_word(factv)
stav = load_word(fstav)

# loading ELMo pretrained word embedding model
options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json"
weight_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"

elmo = Elmo(options_file, weight_file, 2, dropout=0)
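
Elmo here is allennlp.modules.elmo.Elmo, and the third argument requests two output representations. A minimal sketch of computing embeddings with it (the sentence is illustrative):

from allennlp.modules.elmo import batch_to_ids

sentences = [["Where", "is", "the", "museum", "?"]]
character_ids = batch_to_ids(sentences)   # shape: (batch, tokens, 50)
output = elmo(character_ids)
# Two representations were requested, each of shape (batch, tokens, 1024)
# for this 2x4096_512_2048cnn_2xhighway model.
layer_0, layer_1 = output["elmo_representations"]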
Example #5
        try:
            number = int(word)
        except ValueError:
            try:
                number = float(word)
            except ValueError:
                number = None
    return number
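
For context, a self-contained version of the coercion above; the snippet is truncated at the top, so the name and signature below are assumptions. int() is tried first, then float(), and anything non-numeric falls through to None:

def parse_number(word):  # hypothetical name; the original signature is cut off
    try:
        number = int(word)        # "81"  -> 81
    except ValueError:
        try:
            number = float(word)  # "3.5" -> 3.5
        except ValueError:
            number = None         # "aged" -> None
    return number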


tokenizer = BertDropTokenizer(pretrained_model="bert-base-uncased")
number_tokenizer = WordTokenizer()
words_splitter = WordTokenizer()
sentences_splitter = SpacySentenceSplitter()
ner_tagger = fine_grained_named_entity_recognition_with_elmo_peters_2018()
pos_tagger = span_based_constituency_parsing_with_elmo_joshi_2018()
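
The constituency predictor's output also carries per-token POS tags, which is presumably why it doubles as pos_tagger above. A minimal sketch of reading them, assuming the AllenNLP 0.x predictor API:

output = pos_tagger.predict(sentence="Pierre Vinken died aged 81.")
for token, tag in zip(output["tokens"], output["pos_tags"]):
    print(token, tag)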


def extract_letters_frequency(passage, sentence_idx=None):
    """
    :param passage:
    :param sentence_idx: None for whole passage, else per sentence (index 0)..
    :return:
    """
    if sentence_idx is None:
        return dict(filter(lambda k: k[0].isalpha(), Counter(passage).items()))
    else:
        sentences = extract_sentences(passage)
        sen = sentences[sentence_idx]
        return dict(filter(lambda k: k[0].isalpha(), Counter(sen).items()))
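
A quick usage check, worked out by hand; spaces and punctuation are dropped by the isalpha() filter:

freqs = extract_letters_frequency("abc abc!")
# -> {'a': 2, 'b': 2, 'c': 2}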