Code Example #1
    def __init__(self,
                 synonyms_collection_filepath,
                 frames_collection_filepath,
                 init_ner=True,
                 init_stemmer=True,
                 init_frames=True,
                 use_ner_cache_only=False,
                 ner_name=supported.ONTONOTES_BERT_MULT_NAME):
        assert (isinstance(init_ner, bool))
        assert (isinstance(init_frames, bool))
        assert (isinstance(init_stemmer, bool))
        assert (isinstance(use_ner_cache_only, bool))
        assert (isinstance(ner_name, str))

        self.__auth_objects = None
        self.__use_ner_cache_only = use_ner_cache_only
        self.__synonyms = None
        self.__stemmer = None
        self.__frame_variants = None
        self.__frames = None
        self.__frames_helper = None  # otherwise left unset when frames are disabled
        self.__pos_tagger = None
        self.__syntax = None
        self.__use_auth_list = False
        self.__frames_cache = None

        # NER
        self.__ner_cache = None
        self.__ner_class_type = Default.get_class_by_ner_name(ner_name)
        self.__ner = None

        if init_stemmer:
            self.__stemmer = Default.create_default_stemmer()

        if self.__stemmer is not None:
            self.__pos_tagger = POSMystemWrapper(self.__stemmer.MystemInstance)

        if init_frames:
            self.__frames = Default.create_default_frames_collection(
                frames_collection_filepath)

        if self.__stemmer is not None and self.__frames is not None:
            self.__frame_variants = Default.create_default_frame_variants_collection(
                frames=self.__frames, stemmer=self.__stemmer)

        if self.__frame_variants is not None:
            self.__frames_helper = FramesHelper(self.__frame_variants)

        if init_ner and not use_ner_cache_only:
            self.__ner = self.__ner_class_type()

        self.__synonyms = Default.create_default_synonyms_collection(
            filepath=synonyms_collection_filepath,
            stemmer=None if self.DISABLE_LEMMA_FOR_SYNONYMS else self.__stemmer)

        self.__auth_objects = AuthorizedObjectsCollection(OrderedDict())
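For orientation, a hedged instantiation sketch follows. The class name TextProcessor and both file paths are assumptions (the owning class is not shown above); the keyword arguments are exactly those of the constructor in this example.

    # Hypothetical usage; the class name and file paths are placeholders.
    processor = TextProcessor(
        synonyms_collection_filepath="data/synonyms.txt",
        frames_collection_filepath="data/frames.json",
        init_ner=True,
        init_stemmer=True,
        init_frames=True,
        use_ner_cache_only=False)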
Code Example #2
    def __init__(self,
                 settings,
                 contexts_printer,
                 opinion_statistic_printer,
                 object_statistic_printer,
                 parse_frames_in_news_sentences):
        assert(isinstance(settings, Settings))
        assert(isinstance(contexts_printer, ContextsPrinter))
        assert(isinstance(parse_frames_in_news_sentences, bool))
        assert(isinstance(opinion_statistic_printer, OpinionStatisticBasePrinter))
        assert(isinstance(object_statistic_printer, StatisticObjectsPrinter) or object_statistic_printer is None)
        self.__settings = settings
        self.__context_printer = contexts_printer
        self.__opinion_statistic_printer = opinion_statistic_printer
        self.__object_statistic_printer = object_statistic_printer
        self.__parse_frames_in_news_sentences = parse_frames_in_news_sentences
        self.__check_obj_preposition_in_title = True

        self.__text_object_authorizer = TextObjectAuthorizer(ner_type=settings.NERClassType)

        self.__ner_extractor = Default.create_ner_extractor(
            ner=settings.NER,
            ner_cache=settings.NerCache,
            default_auth_check=lambda text_obj: self.__text_object_authorizer.is_auth(text_obj))

        self.__debug_opinions_created = 0
        self.__debug_opinions_with_missed_synonyms = 0
        self.__debug_opinions_looped = 0
        self.__debug_opinions_total_extracted_from_titles = 0
        self.__debug_opinions_rejected_by_preps = 0
        self.__debug_opinions_title_synonymous_existed = 0
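A minimal wiring sketch for this constructor; the class name Worker and the pre-built dependency instances are hypothetical, while the asserts above confirm that object_statistic_printer may be None.

    # Hypothetical wiring; settings and the printers are assumed to be
    # pre-constructed instances of the asserted types.
    worker = Worker(
        settings=settings,
        contexts_printer=contexts_printer,
        opinion_statistic_printer=opinion_statistic_printer,
        object_statistic_printer=None,  # None is allowed per the assert above
        parse_frames_in_news_sentences=True)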
Code Example #3
    def __init__(self, ner_cache, stemmer, default_auth_check):
        assert(isinstance(stemmer, Stemmer))

        self.__stemmer = stemmer
        self.__ner_extractor = Default.create_ner_extractor(
            ner=None,
            ner_cache=ner_cache,
            default_auth_check=default_auth_check)
Code Example #4
    def expand_with_neutral(self, from_zip_filepath, cache_dir, log_filepath,
                            used_locations_filepath, neut_opin_stat_filepath):
        assert (isinstance(from_zip_filepath, str))
        assert (isinstance(cache_dir, str))
        assert (isinstance(log_filepath, str))
        assert (isinstance(used_locations_filepath, str))
        assert (isinstance(neut_opin_stat_filepath, str))

        # Clear the cache folder before refilling it with fresh data.
        os.system('rm -rf {cache_dir}'.format(cache_dir=cache_dir))

        # Extract everything into cache_dir.
        with zipfile.ZipFile(from_zip_filepath, 'r') as zip_input:
            zip_input.extractall(cache_dir)

        # Reading synonyms collection.
        print("Reading synonyms collection ...")
        stemmer = Default.create_default_stemmer()
        synonyms_filepath = join(cache_dir, "synonyms.txt")
        synonyms = SynonymsCollection.from_file(synonyms_filepath,
                                                stemmer=stemmer)

        # Initialize all the related from synonyms collection information.
        self.__init_from_synonyms(synonyms)

        # Start neutral opinion annotation process.
        print("Run processing ...")
        source_filepath = join(cache_dir, "collection.txt")
        target_filepath = join(cache_dir, "collection-neut.txt")
        with open(source_filepath, 'r') as f_src:
            with open(target_filepath, 'w') as f_to:
                neut_logger = self.__process(f_src=f_src, f_to=f_to)

        # Replacing old file in cache dir with a new one.
        os.system('mv {} {}'.format(target_filepath, source_filepath))

        # Saving everything in a new archive file.
        target_zip_filepath = join(dirname(from_zip_filepath),
                                   get_target_filename(from_zip_filepath))

        if neut_logger is not None:
            with open(log_filepath, 'w') as f:
                for line in neut_logger.iter_data():
                    f.write(line)

        with open(used_locations_filepath, 'w') as f:
            for key, value in sorted(self.__used_locations.items(),
                                     key=lambda pair: pair[1]):
                f.write("'{entry}': {count}\n".format(entry=key, count=value))

        self.__opin_stat_printer.print_statistic(neut_opin_stat_filepath)

        archive_all_files_in_zip(to_zip_filepath=target_zip_filepath,
                                 source_dir=cache_dir)
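A hedged invocation sketch for expand_with_neutral; annotator stands in for an instance of the owning class (not shown here), and every path is a placeholder.

    # Hypothetical call; the parameter names match the signature above.
    annotator.expand_with_neutral(
        from_zip_filepath="data/collection.zip",
        cache_dir="_cache",
        log_filepath="_cache/neutral.log",
        used_locations_filepath="_cache/used_locations.txt",
        neut_opin_stat_filepath="_cache/neutral_stat.txt")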
Code Example #5
    NewsSourceDirArg.add_argument(parser)
    SourceNewsReaderArg.add_argument(parser)
    NewsStartFromIndexArg.add_argument(parser)
    ParseFramesInSentencesArgs.add_argument(parser)
    RuSentiFramesCacheArgs.add_argument(parser)

    # Parsing arguments.
    args = parser.parse_args()

    # Reading arguments.
    src_dir = NewsSourceDirArg.read_argument(args)
    reader = SourceNewsReaderArg.read_argument(args)
    frames_filepath = RuSentiFramesCacheArgs.read_argument(args)
    parse_frames_in_sents = ParseFramesInSentencesArgs.read_argument(args)
    start_from_index = NewsStartFromIndexArg.read_argument(args)

    stemmer = Default.create_default_stemmer()
    frames = Default.create_default_frames_collection(frames_filepath)
    f_var = Default.create_default_frame_variants_collection(frames=frames,
                                                             stemmer=stemmer)

    run_frames_cache(reader=reader,
                     src_dir=src_dir,
                     version=basename(frames_filepath),
                     frames=frames,
                     frames_helper=FramesHelper(f_var),
                     stemmer=stemmer,
                     parse_frames_in_sentences=parse_frames_in_sents,
                     start_from_index=start_from_index,
                     miniter_count=2000000)
Code Example #6
    ner_type = NerTypeArg.read_argument(args)
    ner_cache_filepath = NerCacheFilepathArg.read_argument(args)
    output_dir = OutputDirArg.read_argument(args)
    source_dir = NewsSourceDirArg.read_argument(args)
    reader = SourceNewsReaderArg.read_argument(args)

    # Initializing the NER cache.
    ner_cache = SQLiteNERCacheData.init_as_read_only(ner_cache_filepath)

    # Exporting results.
    news_processed = 0
    added_words = set()
    f_name = "{}.txt".format(ner_type)

    # Init obj values extractor.
    ner_class_type = Default.get_class_by_ner_name(ner_type)
    text_object_authorizer = TextObjectAuthorizer(ner_type=ner_class_type)
    obj_values_extractor = TextObjectValuesExtractor(
        ner_cache=ner_cache,
        stemmer=Default.create_default_stemmer(),
        default_auth_check=lambda text_obj: text_object_authorizer.is_auth(
            text_obj))

    create_dir(output_dir)
    print("Output dir: {}".format(output_dir))

    with ner_cache:
        with open(join(output_dir, f_name), "w") as f:
            for _, news_info in reader.get_news_iter(source_dir):
                assert (isinstance(news_info, NewsInfo))
Code Example #7
                        nargs='?',
                        help='Source directory')

    # Adding parameters.
    SynonymsCollectionFilepathArg.add_argument(parser)

    # Parsing arguments.
    args = parser.parse_args()

    # Reading parameters.
    opinion_filepath = args.opinion_filepath
    source_filepath = args.source_filepath
    synonyms_filepath = SynonymsCollectionFilepathArg.read_argument(args)
    opinion_filename = basename(opinion_filepath)

    stemmer = Default.create_default_stemmer()
    synonyms = Default.create_default_synonyms_collection(
        filepath=synonyms_filepath, stemmer=stemmer)

    with open(opinion_filepath, 'r') as f:

        opinions = read_opinions(
            filepath=opinion_filepath,
            synonyms=synonyms,
            custom_opin_ends_iter=lambda use_sentiment:
            OpinionStatisticBasePrinter.iter_opinion_end_values(
                f=f, read_sentiment=use_sentiment),
            read_sentiment=False)

    file_ids_it = iter_relevant_file_ids(source_filepath=source_filepath,
                                         opinions=opinions)
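Presumably iter_relevant_file_ids yields identifiers of the source files that mention the collected opinions; a purely illustrative consumption sketch:

    # Hypothetical: the meaning of the yielded values is inferred
    # from the function name only.
    for file_id in file_ids_it:
        print(file_id)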
Code Example #8
    parser.add_argument('--obj-values-dir',
                        dest='obj_values_dir',
                        type=str,
                        nargs=1,
                        help='Source dir')

    # Parse arguments.
    OptionalOutputDirArg.add_argument(parser)

    # Reading arguments.
    args = parser.parse_args()
    source_dir = args.obj_values_dir[0]
    output_dir = OptionalOutputDirArg.read_argument(args)

    # Initialize the instances required for word grouping.
    stemmer = Default.create_default_stemmer()
    ruthes_nouns = RussianThesaurusSynsets.from_xml_file(filepath=args.ruthes_filepath[0])

    log_found_in_ruthes = 0
    log_lemmas_kept = 0
    syn_groups = {}

    # Processing all the files in the subdirectory.
    f_names_it = get_all_subfiles(data_folder=source_dir,
                                  f_name_check_rule=lambda _: True)
    for filename in f_names_it:
        print(filename)
        for obj_value, obj_type in iter_words_with_types_from_filepath(filename):

            if obj_value in ruthes_nouns:
                log_found_in_ruthes += 1