Esempio n. 1
0
    def __init__(self, etk):
        """Initialise the module and its spaCy rule extractor.

        Args:
            etk: Forwarded unchanged to ``ETKModule.__init__``; read back
                afterwards through ``self.etk`` (presumably set by the base
                initialiser).
        """
        ETKModule.__init__(self, etk)

        # The rule file is loaded before the NLP pipeline is touched, which
        # keeps the same evaluation order as a direct nested call would not.
        rules_path = "./extraction_modules/resources/sample_rules.json"
        rules = self.etk.load_spacy_rule(rules_path)
        nlp = self.etk.default_nlp

        self.sample_rule_extractor = SpacyRuleExtractor(
            nlp, rules, "test_extractor")
Esempio n. 2
0
 def __init__(self, etk):
     """Initialise the module with a date extractor and a CauseEx decoder.

     Args:
         etk: Forwarded unchanged to ``ETKModule.__init__``; read back
             afterwards through ``self.etk`` (presumably set by the base
             initialiser).
     """
     ETKModule.__init__(self, etk)

     date_parser = DateExtractor(self.etk, 'gtd_date_parser')
     self.date_extractor = date_parser

     # Decoder built from the module-level event mapping with
     # default_action="delete".
     type_decoder = DecodingValueExtractor(
         event_to_clauseex_class_mapping,
         'CauseEx Type',
         default_action="delete",
     )
     self.causeex_decoder = type_decoder
Esempio n. 3
0
 def __init__(self, etk):
     """Initialise the module and a glossary extractor over ``./names.txt``.

     Args:
         etk: Forwarded unchanged to ``ETKModule.__init__``; read back
             afterwards through ``self.etk`` (presumably set by the base
             initialiser).
     """
     ETKModule.__init__(self, etk)

     # Glossary first, tokenizer second: same order as the argument list.
     names = self.etk.load_glossary("./names.txt")
     tokenizer = self.etk.default_tokenizer
     self.name_extractor = GlossaryExtractor(
         names, "name_extractor", tokenizer, case_sensitive=False, ngrams=1)
Esempio n. 4
0
 def __init__(self, etk):
     """Initialise the module, its Gdelt mapping and its date extractor.

     Loads ``ODP-Mappings-V3.1.json`` from the current working directory,
     wraps the parsed JSON in a ``GdeltMapping``, and fills
     ``self.header_translation_table`` with one ``"COL<index>"`` entry per
     name in ``self.header_fields``.

     Args:
         etk: Forwarded unchanged to ``ETKModule.__init__``; read back
             afterwards through ``self.etk`` (presumably set by the base
             initialiser).

     Raises:
         OSError: If the mappings file cannot be opened.
         json.JSONDecodeError: If the mappings file is not valid JSON.
     """
     ETKModule.__init__(self, etk)

     # Use a context manager so the file handle is closed even when JSON
     # parsing fails, instead of leaking it until garbage collection.
     with open("ODP-Mappings-V3.1.json") as mappings_file:
         mappings = json.load(mappings_file)
     self.mapping = GdeltMapping(mappings)

     # As our input files have no header, create a translation table to go
     # from names to indices.
     for index, field in enumerate(self.header_fields):
         self.header_translation_table[field] = "COL" + str(index)

     # Extractors
     self.date_extractor = DateExtractor(self.etk, "Date Extractor")
Esempio n. 5
0
 def __init__(self, etk):
     """Initialise the module and the list of extractors it holds.

     Args:
         etk: Forwarded unchanged to ``ETKModule.__init__``.
     """
     ETKModule.__init__(self, etk)

     # Constructed in list order: bitcoin address, CVE, cryptographic hash,
     # hostname, IP address, URL.
     self.e_list = [
         BitcoinAddressExtractor(),
         CVEExtractor(),
         CryptographicHashExtractor(),
         HostnameExtractor(),
         IPAddressExtractor(),
         URLExtractor(True),
     ]
Esempio n. 6
0
 def __init__(self, etk):
     """Initialise the module and one decoder per code table.

     Each decoder is a ``DecodingValueExtractor`` built from a table read
     off ``self`` (the tables are not assigned here, so they are
     presumably class-level attributes).

     Args:
         etk: Forwarded unchanged to ``ETKModule.__init__``.
     """
     ETKModule.__init__(self, etk)

     make_decoder = DecodingValueExtractor

     self.actor_type_decoder = make_decoder(
         self.actor_codes, 'Actor Code Decoder')
     self.known_group_decoder = make_decoder(
         self.known_group_codes, 'Known Groups Decoder')
     # NOTE(review): this decoder is built from ``known_group_codes``, the
     # same table as ``known_group_decoder`` above. Confirm whether a
     # separate ethnic-group table was intended.
     self.ethnic_group_decoder = make_decoder(
         self.known_group_codes, 'Ethnic Groups Decoder')
     self.religion_decoder = make_decoder(
         self.religion_codes, 'Religion Decoder')
     self.country_decoder = make_decoder(
         self.country_codes, 'Country Decoder')
Esempio n. 7
0
 def __init__(self, etk):
     """Initialise the module, load the IFP table and pre-process it.

     Reads ``new_ifps.jl`` from the current working directory, one JSON
     object per line, and maps each record's ``['ifp']['id']`` to its
     ``['ifp']['name']`` in ``self.new_ifps``. It then loads the
     ``en_core_web_lg`` spaCy model and calls ``self.preprocess_ifps()``.

     Args:
         etk: Forwarded unchanged to ``ETKModule.__init__``; read back
             afterwards through ``self.etk`` (presumably set by the base
             initialiser).

     Raises:
         OSError: If ``new_ifps.jl`` cannot be opened.
         json.JSONDecodeError: If a non-blank line is not valid JSON.
         KeyError: If a record lacks ``ifp``, ``ifp.id`` or ``ifp.name``.
     """
     ETKModule.__init__(self, etk)
     self.date_extractor = DateExtractor(self.etk, 'ifp_date_parser')

     self.new_ifps = dict()
     # Stream the file inside a context manager: the handle is always
     # closed and the whole file is never held in memory at once.
     with open('new_ifps.jl') as ifp_file:
         for line in ifp_file:
             # Tolerate blank lines (e.g. a trailing newline at EOF), which
             # would otherwise make json.loads raise.
             if not line.strip():
                 continue
             record = json.loads(line)
             self.new_ifps[record['ifp']['id']] = record['ifp']['name']

     self.parsed_ifps = dict()
     self.threshold = 0.86
     self.nlp = spacy.load('en_core_web_lg')
     # Runs after the attributes above are assigned and before
     # ranking_criteria is set; keep this ordering.
     self.preprocess_ifps()
     self.ranking_criteria = 'SENTENCE'
Esempio n. 8
0
 def __init__(self, etk):
     """Initialise the module, a document selector and its decoders.

     Each decoder is a ``DecodingValueExtractor`` built from a table read
     off ``self`` (the tables are not assigned here, so they are
     presumably class-level attributes).

     Args:
         etk: Forwarded unchanged to ``ETKModule.__init__``.
     """
     ETKModule.__init__(self, etk)
     self.doc_selector = DefaultDocumentSelector()

     # Decoders that use the extractor's default action.
     self.incomp_decoder = DecodingValueExtractor(
         self.incomp_type, 'Incomp Decoder')
     self.int_decoder = DecodingValueExtractor(
         self.int_event_type, 'Int Decoder')
     self.int_fatalities_decoder = DecodingValueExtractor(
         self.int_fatalities, 'Int Fatalities Decoder')
     self.int_fatalities_size_lower_decoder = DecodingValueExtractor(
         self.int_fatalities_size_lower,
         'Int Fatalities Lower Bound Size Decoder',
     )

     # Decoders configured with default_action="delete".
     self.int_fatalities_size_upper_decoder = DecodingValueExtractor(
         self.int_fatalities_size_upper,
         'Int Fatalities Upper Bound Size Decoder',
         default_action="delete",
     )
     self.int_causeex_decoder = DecodingValueExtractor(
         self.int_causeex_type,
         'Int CauseEx Type',
         default_action="delete",
     )
Esempio n. 9
0
    def __init__(self, etk):
        """Initialise the module with name and student glossary extractors.

        Args:
            etk: Forwarded unchanged to ``ETKModule.__init__``; read back
                afterwards through ``self.etk`` (presumably set by the base
                initialiser).
        """
        ETKModule.__init__(self, etk)

        # For each extractor the glossary is loaded first and the tokenizer
        # fetched second, matching the order of the constructor arguments.
        names = self.etk.load_glossary(
            "./extraction_modules/resources/names.txt")
        names_tokenizer = self.etk.default_tokenizer
        self.name_extractor = GlossaryExtractor(
            names,
            "name_extractor",
            names_tokenizer,
            case_sensitive=False,
            ngrams=1,
        )

        students = self.etk.load_glossary(
            "./extraction_modules/resources/student.txt")
        students_tokenizer = self.etk.default_tokenizer
        self.student_extractor = GlossaryExtractor(
            students,
            "student_extractor",
            students_tokenizer,
            case_sensitive=False,
            ngrams=1,
        )
Esempio n. 10
0
    def __init__(self, etk):
        """Initialise the module, a table extractor and a city glossary.

        Sets ``self.etk.parser`` to ``jex.parse``, loads the city dataset
        from JSON, and builds a glossary extractor over the dataset's keys.

        Args:
            etk: Forwarded unchanged to ``ETKModule.__init__``; its
                ``default_tokenizer`` is handed to the glossary extractor.

        Raises:
            OSError: If the city dataset file cannot be opened.
            json.JSONDecodeError: If the city dataset is not valid JSON.
        """
        ETKModule.__init__(self, etk)
        self.my_table_extractor = TableExtractor()
        self.etk.parser = jex.parse

        # NOTE(review): open() does not expand ``${GLOSSARY_PATH}``;
        # presumably the placeholder is substituted before this module
        # runs. Confirm.
        file_name = '${GLOSSARY_PATH}/cities_ppl_25000.json'
        # The context manager closes the handle even if reading or parsing
        # raises; the previous manual close() was skipped in that case.
        with open(file_name, 'r') as cities_file:
            self.city_dataset = json.load(cities_file)
        self.city_list = list(self.city_dataset.keys())

        self.my_glossary_extractor = GlossaryExtractor(
            glossary=self.city_list,
            extractor_name='tutorial_glossary',
            tokenizer=etk.default_tokenizer,
            ngrams=3,
            case_sensitive=False)
Esempio n. 11
0
 def __init__(self, etk):
     """Initialise the module with HTML, date and glossary extractors.

     Args:
         etk: Forwarded unchanged to ``ETKModule.__init__``; read back
             afterwards through ``self.etk`` (presumably set by the base
             initialiser).
     """
     ETKModule.__init__(self, etk)

     # HTML and date extractors.
     self.metadata_extractor = HTMLMetadataExtractor()
     self.content_extractor = HTMLContentExtractor()
     self.date_extractor = DateExtractor(self.etk, 'demo_date_parser')

     # Glossary extractors: load the glossary, then fetch the tokenizer,
     # matching the order of the constructor arguments.
     countries = self.etk.load_glossary("${GLOSSARY_PATH}/countries.txt")
     countries_tokenizer = self.etk.default_tokenizer
     self.country_extractor = GlossaryExtractor(
         countries,
         "country_extractor",
         countries_tokenizer,
         case_sensitive=False,
         ngrams=3,
     )

     cities = self.etk.load_glossary("${GLOSSARY_PATH}/cities.txt")
     cities_tokenizer = self.etk.default_tokenizer
     self.cities_extractor = GlossaryExtractor(
         cities,
         "cities_extractor",
         cities_tokenizer,
         case_sensitive=False,
         ngrams=3,
     )
Esempio n. 12
0
 def __init__(self, etk):
     """Initialise the module by delegating to ``ETKModule.__init__``.

     Args:
         etk: Forwarded unchanged to ``ETKModule.__init__``.
     """
     ETKModule.__init__(self, etk)
Esempio n. 13
0
 def __init__(self, etk):
     """Initialise the module and its weapon-type decoder.

     Args:
         etk: Forwarded unchanged to ``ETKModule.__init__``.
     """
     ETKModule.__init__(self, etk)
     # Decoder built from the module-level weapons mapping with
     # default_action='delete'.
     self.weapon_decoder = DecodingValueExtractor(
         weapons_to_clauseex_class_mapping,
         'Causeex Weapon Type',
         default_action='delete')
Esempio n. 14
0
 def __init__(self, etk):
     """Initialise the module and its spaCy rule extractor.

     Args:
         etk: Forwarded unchanged to ``ETKModule.__init__``; read back
             afterwards through ``self.etk`` (presumably set by the base
             initialiser).
     """
     ETKModule.__init__(self, etk)

     # The pipeline is fetched before the rules are loaded, matching the
     # left-to-right order of the constructor arguments.
     nlp = self.etk.default_nlp
     rules = self.etk.load_spacy_rule("sample_rules.json")
     self.rule_extractor = SpacyRuleExtractor(nlp, rules, "test_extractor")
Esempio n. 15
0
 def __init__(self, etk):
     """Initialise the module and an Inferlink extractor.

     Args:
         etk: Forwarded unchanged to ``ETKModule.__init__``.
     """
     ETKModule.__init__(self, etk)

     # Unrolled innermost-first: load the rules file, wrap it in a rule
     # set, then hand the rule set to the extractor.
     rules_path = '../html_basic/sample_inferlink_rules.json'
     loaded_rules = InferlinkRuleSet.load_rules_file(rules_path)
     rule_set = InferlinkRuleSet(loaded_rules)
     self.inferlink_extractor = InferlinkExtractor(rule_set)
Esempio n. 16
0
 def __init__(self, etk):
     """Initialise the module and record the URI prefix it uses.

     Args:
         etk: Forwarded unchanged to ``ETKModule.__init__``.
     """
     ETKModule.__init__(self, etk)
     # Fixed prefix stored for later use; how it is consumed is not visible
     # in this method.
     self.uri_prefix = "http://spaceaware.isi.edu/data"
Esempio n. 17
0
 def __init__(self, etk):
     """Initialise the module with HTML metadata and content extractors.

     Args:
         etk: Forwarded unchanged to ``ETKModule.__init__``.
     """
     ETKModule.__init__(self, etk)
     self.metadata_extractor = HTMLMetadataExtractor()
     self.content_extractor = HTMLContentExtractor()
 def __init__(self, etk: ETK):
     """Initialise the module and its sentence extractor.

     Args:
         etk: Forwarded unchanged to ``ETKModule.__init__``.
     """
     ETKModule.__init__(self, etk)
     self.sentence_extractor = SentenceExtractor(
         name="My sentence splitter")
Esempio n. 19
0
 def __init__(self, etk):
     """Initialise the module with its extractors, CSV processor and decoder.

     Builds a date extractor, three glossary extractors (countries, states,
     cities) loaded with ``read_json=True``, a ``CsvProcessor`` with
     ``heading_row=1``, and a case-sensitive decoder over the interaction
     code table defined below.

     Args:
         etk: Forwarded unchanged to ``ETKModule.__init__`` and to
             ``CsvProcessor``; also read back through ``self.etk``
             (presumably set by the base initialiser).
     """
     ETKModule.__init__(self, etk)
     self.date_extractor = DateExtractor(self.etk, 'acled_date_parser')

     def build_glossary_extractor(glossary_path, extractor_name):
         # Load the glossary first, then fetch the tokenizer, so the
         # evaluation order matches a direct constructor call.
         entries = self.etk.load_glossary(glossary_path, read_json=True)
         tokenizer = self.etk.default_tokenizer
         return GlossaryExtractor(
             entries,
             extractor_name,
             tokenizer,
             case_sensitive=False,
             ngrams=3,
         )

     self.country_extractor = build_glossary_extractor(
         "${GLOSSARY_PATH}/countries.json.gz", "country_extractor")
     self.states_extractor = build_glossary_extractor(
         "${GLOSSARY_PATH}/states_usa_canada.json.gz", "states_extractor")
     self.cities_extractor = build_glossary_extractor(
         "${GLOSSARY_PATH}/cities.json.gz", "cities_extractor")

     self.csv_processor = CsvProcessor(etk=etk, heading_row=1)

     # Two-character interaction codes mapped to their readable labels.
     self.interaction_decoding_dict = {
         "10": "Sole Military Action",
         "11": "Military Versus Military",
         "12": "Military Versus Rebels",
         "13": "Military Versus Political Militia",
         "14": "Military Versus Communal Militia",
         "15": "Military Versus Rioters",
         "16": "Military Versus Protesters",
         "17": "Military Versus Civilians",
         "18": "Military Versus Other",
         "20": "Sole Rebel Action",
         "22": "Rebels Versus Rebels",
         "23": "Rebels Versus Political Militia",
         "24": "Rebels Versus Communal Militia",
         "25": "Rebels Versus Rioters",
         "26": "Rebels Versus Protesters",
         "27": "Rebels Versus Civilians",
         "28": "Rebels Versus Other",
         "30": "Sole Political Militia Action",
         "33": "Political Militia Versus Political Militia",
         "34": "Political Militia Versus Communal Militia",
         "35": "Political Militia Versus Rioters",
         "36": "Political Militia Versus Protesters",
         "37": "Political Militia Versus Civilians",
         "38": "Political Militia Versus Other",
         "40": "Sole Communal Militia Action",
         "44": "Communal Militia Versus Communal Militia",
         "45": "Communal Militia Versus Rioters",
         "46": "Communal Militia Versus Protesters",
         "47": "Communal Militia Versus Civilians",
         "48": "Communal Militia Versus Other",
         "50": "Sole Rioter Action",
         "55": "Rioters Versus Rioters",
         "56": "Rioters Versus Protesters",
         "57": "Rioters Versus Civilians",
         "58": "Rioters Versus Other",
         "60": "Sole Protester Action",
         "66": "Protesters Versus Protesters",
         "68": "Protesters Versus Other",
         "78": "Other Actor Versus Civilians",
         "80": "Sole Other Action",
     }
     self.interaction_decoder = DecodingValueExtractor(
         self.interaction_decoding_dict,
         'default_decoding',
         case_sensitive=True,
     )
Esempio n. 20
0
 def __init__(self, etk):
     """Initialise the module and its table extractor.

     Args:
         etk: Forwarded unchanged to ``ETKModule.__init__``.
     """
     ETKModule.__init__(self, etk)
     self.table_extractor = TableExtractor()
Esempio n. 21
0
 def __init__(self, etk):
     """Initialise the module and its date extractor.

     Args:
         etk: Forwarded unchanged to ``ETKModule.__init__``; read back
             afterwards through ``self.etk`` (presumably set by the base
             initialiser).
     """
     ETKModule.__init__(self, etk)
     self.date_extractor = DateExtractor(self.etk, 'acled_date_parser')
 def __init__(self, etk):
     """Initialise the module and its Excel extractor.

     Args:
         etk: Forwarded unchanged to ``ETKModule.__init__``.
     """
     ETKModule.__init__(self, etk)
     self.ee = ExcelExtractor()