def __init__(self, name: str = None, custom_nlp: type = None) -> None:
    """Initialize the sentence extractor.

    Args:
        name: optional extractor name; defaults to "Sentence extractor".
        custom_nlp: optional spacy pipeline to take a "parser" pipe from.
            When it lacks a usable parser (or expected attributes), the
            parser is loaded from en_core_web_sm instead.
    """
    Extractor.__init__(self,
                       input_type=InputType.TEXT,
                       category="Text extractor",
                       name=name if name else "Sentence extractor")
    load_parser = False
    if custom_nlp:
        try:
            # Deep-copy so stripping pipes does not mutate the caller's pipeline.
            custom_pipeline = copy.deepcopy(custom_nlp)
            pipe_names = custom_pipeline.pipe_names
            for pipe in pipe_names:
                if pipe != "parser":
                    custom_pipeline.remove_pipe(pipe)
            # Explicit check instead of assert/AssertionError control flow:
            # `assert` is stripped under `python -O`, which would silently
            # accept a parser-less pipeline.
            if "parser" in custom_pipeline.pipe_names:
                self._parser = custom_pipeline
            else:
                print("Note: custom_pipeline does not have a parser. \n"
                      "Loading parser from en_core_web_sm... ")
                load_parser = True
        except AttributeError as e:
            print("Note: custom_pipeline does not have expected "
                  "attributes.")
            print(e)
            print("Loading parser from en_core_web_sm...")
            load_parser = True
    else:
        load_parser = True
    if load_parser:
        # Only the parser is needed; drop tagger/ner for speed.
        self._parser = spacy.load("en_core_web_sm",
                                  disable=["tagger", "ner"])
def __init__(self, nlp, rules: Dict, extractor_name: str) -> None:
    """Store the rule configuration and compile spacy matcher rules.

    Args:
        nlp: spacy language pipeline (deep-copied before use).
        rules: dict holding "rules" (a list of rule specs) and "field_name".
        extractor_name: name reported by this extractor.
    """
    Extractor.__init__(self,
                       input_type=InputType.TEXT,
                       category="spacy_rule_extractor",
                       name=extractor_name)
    self.rules = rules["rules"]
    # Copy the pipeline so local modifications stay local.
    self.nlp = copy.deepcopy(nlp)
    self.tokenizer = Tokenizer(self.nlp)
    self.matcher = Matcher(self.nlp.vocab)
    self.field_name = rules["field_name"]
    # Compile every rule spec into a Rule object up front.
    self.rule_lst = [Rule(spec, self.nlp) for spec in self.rules]
def __init__(self, ignore_future_dates: bool = True, ignore_past_years: int = 40) -> None:
    """Initialize the date parser.

    Args:
        ignore_future_dates: drop extracted dates that lie in the future.
        ignore_past_years: how many years back extracted dates may reach.
    """
    Extractor.__init__(self,
                       input_type=InputType.TEXT,
                       category="data extractor",
                       name="date parser")
    self.ignore_future_dates = ignore_future_dates
    self.ignore_past_years = ignore_past_years
def __init__(self, nlp, rules: Dict, extractor_name: str) -> None:
    """Store the rule configuration and compile spacy matcher rules.

    Args:
        nlp: spacy language pipeline (deep-copied before use).
        rules (Dict): spacy rules; must hold "rules", may hold "field_name".
        extractor_name: name reported by this extractor, also the
            fallback field name when "field_name" is absent.
    """
    Extractor.__init__(self,
                       input_type=InputType.TEXT,
                       category="spacy_rule_extractor",
                       name=extractor_name)
    self.rules = rules["rules"]
    self.nlp = copy.deepcopy(nlp)
    self.tokenizer = Tokenizer(self.nlp)
    self.matcher = Matcher(self.nlp.vocab)
    # Idiomatic dict.get replaces the membership-test-plus-subscript form.
    self.field_name = rules.get("field_name", extractor_name)
    self.rule_lst = {}
    self.hash_map = {}
    for idx, a_rule in enumerate(self.rules):
        this_rule = Rule(a_rule, self.nlp)
        # Key combines the rule identifier with its position so duplicate
        # identifiers cannot collide.
        self.rule_lst[this_rule.identifier + "rule_id##" + str(idx)] = this_rule
def __init__(self, etk: ETK = None, extractor_name: str = 'excel extractor') -> None:
    """Initialize the excel extractor.

    Args:
        etk: owning ETK instance, kept for later use (may be None).
        extractor_name: name reported by this extractor.
    """
    Extractor.__init__(self,
                       input_type=InputType.TEXT,
                       category="data extractor",
                       name=extractor_name)
    # Keep a handle on the ETK context.
    self.etk = etk
def __init__(self, extractor_name: str, search_url: str, get_attr=False,
             get_attr_url="http://dbpedia.org/sparql"):
    """Initialize the DBpedia-backed extractor.

    Args:
        extractor_name: name reported by this extractor.
        search_url: endpoint used for lookups.
        get_attr: whether to fetch attributes for matched entities.
        get_attr_url: SPARQL endpoint used when get_attr is enabled.
    """
    Extractor.__init__(self,
                       input_type=InputType.TEXT,
                       category="built_in_extractor",
                       name=extractor_name)
    self._search_url = search_url
    self._get_attr = get_attr
    self._get_attr_url = get_attr_url
def __init__(self, glossary: List[str], extractor_name: str,
             tokenizer: Tokenizer, ngrams: int = 2, case_sensitive=False) -> None:
    """Initialize the glossary extractor.

    Args:
        glossary: list of phrases to match; loaded into a trie.
        extractor_name: name reported by this extractor.
        tokenizer: tokenizer used to split glossary entries and input.
        ngrams: maximum n-gram size to consider.
        case_sensitive: whether matching respects letter case.
    """
    Extractor.__init__(self,
                       input_type=InputType.TOKENS,
                       category="glossary",
                       name=extractor_name)
    self.ngrams = ngrams
    self.case_sensitive = case_sensitive
    self.default_tokenizer = tokenizer
    # Separator used when re-joining matched token spans.
    self.joiner = " "
    self.glossary = self.populate_trie(glossary)
def __init__(self, etk: ETK = None, extractor_name: str = 'date extractor') -> None:
    """Initialize the date extractor and build its combined regex.

    Args:
        etk: owning ETK instance (may be None).
        extractor_name: name reported by this extractor.
    """
    Extractor.__init__(self,
                       input_type=InputType.TEXT,
                       category="data extractor",
                       name=extractor_name)
    # 'final_regex' and 'symbol_list' come from 'DateRegexGenerator'.
    # If the single regexes change or new patterns are added, regenerate
    # them via DateRegexGenerator rather than editing by hand.
    generator = DateRegexGenerator(singleton_regex, units)
    self._final_regex = generator.final_regex
    self._symbol_list = generator.symbol_list
    self._settings = {}
    self._last_original_resolution = None
    self._etk = etk
    # Default language for date parsing.
    self._lan = 'en'
def __init__(self):
    """Set up regex sub-extractors for md5/sha1/sha256 hex digests.

    Fix: the sha1 pattern previously matched lowercase hex only
    ([0-9a-f]) while md5 and sha256 accepted either case, so uppercase
    SHA-1 digests were missed. All three patterns now share the same
    case-insensitive hex class.
    """
    e_name = 'cryptographic hash extractor'
    # (tag, hex-digit length) for each supported digest.
    hash_specs = [('md5', 32), ('sha1', 40), ('sha256', 64)]
    self._regex_extractors = [
        RegexExtractor(r"(\b[a-fA-F0-9]{%d}\b)" % length,
                       tag + ' ' + e_name,
                       general_tag=tag)
        for tag, length in hash_specs
    ]
    Extractor.__init__(self,
                       input_type=InputType.TEXT,
                       category="regex",
                       name=e_name)
def __init__(self, nlp, tokenizer, extractor_name: str) -> None:
    """Initialize the extractor with a private spacy pipeline and matcher.

    Args:
        nlp: spacy language pipeline (deep-copied before use).
        tokenizer: Tokenizer used on input text.
        extractor_name: name reported by this extractor.
    """
    Extractor.__init__(self,
                       input_type=InputType.TEXT,
                       category="build_in_extractor",
                       name=extractor_name)
    # Private copy so matcher setup cannot affect the caller's pipeline.
    self._nlp = copy.deepcopy(nlp)
    self._like_email_matcher = Matcher(self._nlp.vocab)
    self._tokenizer = tokenizer
def __init__(
        self,
        decoding_dict: dict,
        extractor_name: str,
        default_action: str = 'delete',
        case_sensitive: bool = False,
        strip_key: bool = True,
        strip_value: bool = False,
) -> None:
    """
    Args:
        decoding_dict: dict -> a python dictionary for decoding values
        extractor_name: str -> extractor name
        default_action: enum['delete'] -> what if the value not matched in dictionary
        case_sensitive: bool -> matching the key and value strictly or ignore cases
        strip_key: bool -> strip key and value for matching or not
        strip_value: bool -> return the striped value if matched or the original value
    """
    Extractor.__init__(self,
                       input_type=InputType.TEXT,
                       category="dictionary",
                       name=extractor_name)
    if case_sensitive and not strip_key:
        # Keys are usable exactly as given; no normalization pass needed.
        self.decoding_dict = decoding_dict
    else:
        def normalize(key):
            # Lower-case when matching ignores case; strip when requested.
            if not case_sensitive:
                key = key.lower()
            if strip_key:
                key = key.strip()
            return key

        self.decoding_dict = {normalize(k): v
                              for k, v in decoding_dict.items()}
    self.case_sensitive = case_sensitive
    self.default_action = default_action
    self.strip_key = strip_key
    self.strip_value = strip_value
    self.joiner = " "
def __init__(self, email_url: str, mailing_list_name: str,
             extractor_name: str) -> None:
    """Store mailing-list and message location information.

    Args:
        email_url: URL of the email/message source.
        mailing_list_name: name of the mailing list being processed.
        extractor_name: name reported by this extractor.
    """
    Extractor.__init__(self,
                       input_type=InputType.TEXT,
                       category="build_in_extractor",
                       name=extractor_name)
    self.email_url = email_url
    self.mailing_list_name = mailing_list_name
def __init__(self, pattern: str, extractor_name: str = 'regex extractor',
             flags=0, general_tag: str = None) -> None:
    """Compile the pattern and map each match mode to its regex call.

    Args:
        pattern: regular expression to compile.
        extractor_name: name reported by this extractor.
        flags: flags forwarded to re.compile.
        general_tag: optional tag attached to every extraction.
    """
    Extractor.__init__(self,
                       input_type=InputType.TEXT,
                       category="regex",
                       name=extractor_name)
    compiled = re.compile(pattern, flags)
    self._compiled_regex = compiled
    self._general_tag = general_tag
    # Dispatch table from match mode to the bound regex method.
    self._match_functions = {
        MatchMode.MATCH: compiled.match,
        MatchMode.SEARCH: compiled.search,
        MatchMode.FINDALL: compiled.finditer,
        MatchMode.SPLIT: compiled.split,
    }
def __init__(self, glossary: List[str], extractor_name: str,
             tokenizer: Tokenizer, ngrams: int = 2, case_sensitive=False) -> None:
    """Initialize the glossary extractor and build its trie.

    Args:
        glossary: list of phrases to match.
        extractor_name: name reported by this extractor.
        tokenizer: tokenizer used to split glossary entries and input.
        ngrams: maximum n-gram size; 0/None means "fit the longest
            glossary entry". Always capped at 5.
        case_sensitive: whether matching respects letter case.
    """
    Extractor.__init__(self,
                       input_type=InputType.TOKENS,
                       category="glossary",
                       name=extractor_name)
    self._case_sensitive = case_sensitive
    self._default_tokenizer = tokenizer
    if not ngrams:
        # Auto-size: use the token length of the longest glossary entry.
        ngrams = max(
            (len(self._default_tokenizer.tokenize(entry)) for entry in glossary),
            default=0)
    # Cap n-gram width at 5 to bound matching cost.
    self._ngrams = min(ngrams, 5)
    self._joiner = " "
    self._glossary = self._populate_trie(glossary)
def __init__(self, extractor_name: str, tokenizer: None, ngrams: int = 2,
             case_sensitive=False, redis_host="localhost", redis_port=6379,
             redis_key_prefix="") -> None:
    """Initialize a glossary extractor backed by a Redis store.

    Args:
        extractor_name: name reported by this extractor.
        tokenizer: tokenizer for input text; pass None to fall back to
            regex-based tokenization, which speeds up extraction.
        ngrams: maximum n-gram size, capped at 5.
        case_sensitive: whether matching respects letter case.
        redis_host: Redis server host.
        redis_port: Redis server port.
        redis_key_prefix: prefix applied to all glossary keys in Redis.
    """
    Extractor.__init__(self,
                       input_type=InputType.TOKENS,
                       category="glossary",
                       name=extractor_name)
    self._case_sensitive = case_sensitive
    self._default_tokenizer = tokenizer
    self._ngrams = min(ngrams, 5)
    self._joiner = " "
    # decode_responses=True so Redis returns str rather than bytes.
    self._redisconn = redis.StrictRedis(host=redis_host,
                                        port=int(redis_port),
                                        decode_responses=True)
    self._key_prefix = redis_key_prefix
def __init__(self, extractor_name: str, nlp=None):
    """Initialize the extractor with a spacy pipeline.

    Args:
        extractor_name: name reported by this extractor.
        nlp: optional pre-loaded spacy pipeline; when omitted,
            en_core_web_sm is loaded at construction time.

    Fix: the previous `nlp=spacy.load('en_core_web_sm')` default was
    evaluated once at import time — paying the model-load cost even if
    the class was never used, and sharing one pipeline instance across
    every construction. A None sentinel loads lazily instead.
    """
    Extractor.__init__(self,
                       input_type=InputType.TEXT,
                       category="built_in_extractor",
                       name=extractor_name)
    self.__nlp = nlp if nlp is not None else spacy.load('en_core_web_sm')
def __init__(self, extractor_name: str) -> None:
    """Register this data extractor under the given name.

    Args:
        extractor_name: name reported by this extractor.
    """
    Extractor.__init__(self,
                       input_type=InputType.TEXT,
                       category="data extractor",
                       name=extractor_name)
def __init__(self, rule_set: InferlinkRuleSet):
    """Initialize the Inferlink HTML extractor.

    Args:
        rule_set: Inferlink rules applied to HTML input.
    """
    Extractor.__init__(self,
                       input_type=InputType.HTML,
                       category="HTML extractor",
                       name="Inferlink extractor")
    self.rule_set = rule_set
def __init__(self):
    """Initialize the HTML metadata extractor.

    Fix: removed a stray dangling triple-quote that followed the
    Extractor.__init__ call (an unterminated/misplaced docstring marker).
    """
    Extractor.__init__(self,
                       input_type=InputType.HTML,
                       category="HTML extractor",
                       name="HTML metadata extractor")
def __init__(self):
    """Initialize the HTML content extractor."""
    Extractor.__init__(self,
                       input_type=InputType.HTML,
                       category="HTML extractor",
                       name="HTML content extractor")
def __init__(self, extractor_name: str, search_url: str):
    """Initialize the extractor with its search endpoint.

    Args:
        extractor_name: name reported by this extractor.
        search_url: endpoint used for lookups.
    """
    Extractor.__init__(self,
                       input_type=InputType.TEXT,
                       category="built_in_extractor",
                       name=extractor_name)
    self.search_url = search_url
def __init__(self):
    """Initialize the language-identification extractor."""
    Extractor.__init__(self,
                       input_type=InputType.TEXT,
                       category="Text extractor",
                       name="Language Identification")
def __init__(self) -> None:
    """Initialize the table extractor under its class-level name."""
    Extractor.__init__(self,
                       input_type=InputType.TEXT,
                       category="content",
                       name=TableExtractor.extractor_name)
def __init__(self) -> None:
    """Initialize the entity-table data extractor.

    Starts with an empty glossary mapping; glossaries are registered
    after construction.
    """
    Extractor.__init__(self,
                       input_type=InputType.OBJECT,
                       category="data",
                       name=EntityTableDataExtraction.extractor_name)
    self.glossaries = dict()
def __init__(self) -> None:
    """Initialize the DIG table extractor and its backing instance."""
    Extractor.__init__(self,
                       input_type=InputType.TEXT,
                       category="content",
                       name="DigTableExtractor")
    # Underlying engine that performs the actual table extraction.
    self.tableExtractorInstance = TableExtraction()