def __init__(self, synonyms_collection_filepath, frames_collection_filepath,
             init_ner=True, init_stemmer=True, init_frames=True,
             use_ner_cache_only=False,
             ner_name=supported.ONTONOTES_BERT_MULT_NAME):
    """Create the text-processing resources selected by the ``init_*`` flags.

    Every resource attribute is first reset to ``None`` (or ``False``) and is
    then filled in only when its prerequisites are available, so a disabled
    resource is always observable as ``None`` rather than as a missing
    attribute.

    Args:
        synonyms_collection_filepath: passed as ``filepath`` to
            ``Default.create_default_synonyms_collection``.
        frames_collection_filepath: passed to
            ``Default.create_default_frames_collection`` when ``init_frames``
            is set.
        init_ner: when True (and ``use_ner_cache_only`` is False) the NER
            class resolved from ``ner_name`` is instantiated.
        init_stemmer: when True the default stemmer is created; the POS
            tagger and the frame variants are built only if a stemmer exists.
        init_frames: when True the default frames collection is loaded.
        use_ner_cache_only: stored on the instance; when True no NER
            instance is created here.
        ner_name: name resolved to a NER class through
            ``Default.get_class_by_ner_name``.

    Raises:
        AssertionError: if ``init_ner``, ``init_frames`` or ``init_stemmer``
            is not a ``bool``, or ``ner_name`` is not a ``str``.
    """
    assert (isinstance(init_ner, bool))
    assert (isinstance(init_frames, bool))
    assert (isinstance(init_stemmer, bool))
    assert (isinstance(ner_name, str))

    self.__auth_objects = None
    self.__use_ner_cache_only = use_ner_cache_only
    self.__synonyms = None
    self.__stemmer = None
    self.__frame_variants = None
    self.__frames = None
    # Initialised here like every other resource: it used to be assigned only
    # when frame variants were built, so reading it without a stemmer or
    # frames raised AttributeError instead of yielding None.
    self.__frames_helper = None
    self.__pos_tagger = None
    self.__syntax = None
    self.__use_auth_list = False
    self.__frames_cache = None

    # NER
    self.__ner_cache = None
    self.__ner_class_type = Default.get_class_by_ner_name(ner_name)
    self.__ner = None

    if init_stemmer:
        self.__stemmer = Default.create_default_stemmer()

    # The POS tagger is built from the stemmer's MystemInstance.
    if self.__stemmer is not None:
        self.__pos_tagger = POSMystemWrapper(self.__stemmer.MystemInstance)

    if init_frames:
        self.__frames = Default.create_default_frames_collection(
            frames_collection_filepath)

    # Frame variants need both the stemmer and the frames collection.
    if self.__stemmer is not None and self.__frames is not None:
        self.__frame_variants = Default.create_default_frame_variants_collection(
            frames=self.__frames,
            stemmer=self.__stemmer)

    if self.__frame_variants is not None:
        self.__frames_helper = FramesHelper(self.__frame_variants)

    # In cache-only mode no NER instance is created here.
    if init_ner and not use_ner_cache_only:
        self.__ner = self.__ner_class_type()

    # Synonyms are always loaded; the stemmer is withheld from the collection
    # when the DISABLE_LEMMA_FOR_SYNONYMS attribute is truthy.
    self.__synonyms = Default.create_default_synonyms_collection(
        filepath=synonyms_collection_filepath,
        stemmer=None if self.DISABLE_LEMMA_FOR_SYNONYMS else self.__stemmer)

    self.__auth_objects = AuthorizedObjectsCollection(OrderedDict())
# Read the run settings from `args`.
# NOTE(review): `args` is defined outside this fragment -- presumably the
# parsed command-line arguments; confirm against the caller.
ner_type = NerTypeArg.read_argument(args)
ner_cache_filepath = NerCacheFilepathArg.read_argument(args)
output_dir = OutputDirArg.read_argument(args)
source_dir = NewsSourceDirArg.read_argument(args)
reader = SourceNewsReaderArg.read_argument(args)

# Initialize the NER cache in read-only mode.
ner_cache = SQLiteNERCacheData.init_as_read_only(ner_cache_filepath)

# State for exporting results; the output file is named after the NER type.
news_processed = 0
added_words = set()
f_name = "{}.txt".format(ner_type)

# Initialize the object values extractor. Its default authorization check
# delegates to a TextObjectAuthorizer built for the selected NER class.
ner_class_type = Default.get_class_by_ner_name(ner_type)
text_object_authorizer = TextObjectAuthorizer(ner_type=ner_class_type)
obj_values_extractor = TextObjectValuesExtractor(
    ner_cache=ner_cache,
    stemmer=Default.create_default_stemmer(),
    default_auth_check=lambda text_obj: text_object_authorizer.is_auth(text_obj))

create_dir(output_dir)
print("Output dir: {}".format(output_dir))

# The cache is used as a context manager for the whole export, and the output
# file stays open while the news are iterated.
with ner_cache:
    with open(join(output_dir, f_name), "w") as f:
        for _, news_info in reader.get_news_iter(source_dir):
            # The reader is expected to yield NewsInfo instances as the
            # second element of each pair.
            assert (isinstance(news_info, NewsInfo))
            # NOTE(review): the visible fragment ends here; `news_processed`,
            # `added_words`, `obj_values_extractor` and `f` are not used yet,
            # so the loop body presumably continues beyond this point.