Ejemplo n.º 1
0
    def hist_tokens_texts(cls, texts_dir, vectorizer, ext='txt'):
        """Plot a token frequency-distribution histogram for a text folder.

        Reads every ``*.ext`` file under *texts_dir*, vectorizes the raw
        texts with the supplied *vectorizer*, and shows a 1080x720
        ``FreqDistVisualizer`` plot of the resulting vocabulary.
        """
        corpus = CorpusReader(input_folder_name=texts_dir,
                              doc_pattern=r'(.*?/).*\.' + ext,
                              categ_pattern=r'(.*?)/.*\.' + ext,
                              encoding='utf-8')
        raw_texts = list(corpus.readfiles(fileids=corpus.root_ids))

        doc_matrix = vectorizer.fit_transform(raw_texts)
        vocabulary = vectorizer.get_feature_names()

        plot = FreqDistVisualizer(features=vocabulary, size=(1080, 720))
        plot.fit(doc_matrix)
        plot.show()
Ejemplo n.º 2
0
 def start_stem_files_sync(self):
     """Stem every file in the temp stem folder synchronously.

     Builds the list of files from ``self.stem_temp_dir_name`` and feeds
     each stemming result to ``self.on_result`` one file at a time.
     """
     reader = CorpusReader(input_folder_name=self.stem_temp_dir_name,
                           doc_pattern=r'(.*?/).*\.txt',
                           categ_pattern=r'(.*?)/.*\.txt',
                           encoding=self.encoding)
     # Fix: join all path parts with os.path.join instead of a hard-coded
     # '\\' separator, which produced broken paths on non-Windows systems.
     file_ids = [
         os.path.join(self.stem_dir_name, self.temp_dir_name, item)
         for item in reader.root_ids
     ]
     for file in file_ids:
         self.on_result(self.stem_file(file, self.encoding))
Ejemplo n.º 3
0
def corpus_info(file_id,
                files_encoding,
                ignore_digits=True,
                ignore_punct=True):
    """Gather paragraph, sentence and word listings for one corpus file.

    *file_id* is a path to a ``.txt`` file; its directory becomes the
    corpus root and its basename the file id inside that corpus.
    Returns a dict with keys ``paras_list`` (empty paragraphs removed),
    ``sents_list`` and ``words_list`` (the latter honouring the
    *ignore_digits* / *ignore_punct* flags).
    """
    folder, name = os.path.split(file_id)
    reader = CorpusReader(input_folder_name=folder,
                          doc_pattern=r'(.*?/).*\.txt',
                          categ_pattern=r'(.*?)/.*\.txt',
                          encoding=files_encoding)

    paras = [p for p in reader.paras(fileids=[name]) if p != '']
    sents = list(reader.sents(fileids=[name]))
    words = list(reader.words(fileids=[name],
                              ignore_digits=ignore_digits,
                              ignore_punct=ignore_punct))

    return {
        'paras_list': paras,
        'sents_list': sents,
        'words_list': words,
    }
Ejemplo n.º 4
0
    def start_ngr_calc(self,
                       file_id,
                       files_encoding,
                       load_new_top_ngrms='',
                       filter_ngr_len=0):
        """Run n-gram counting for a single file.

        A ``CorpusReader`` is built over the file's directory and handed
        to an ``NgramPreserver`` (sized by ``self.ngr_num``) together with
        the pass-through options; the counting result is returned as-is.
        """
        folder = os.path.dirname(file_id)
        corpus = CorpusReader(input_folder_name=folder,
                              doc_pattern=r'(.*?/).*\.txt',
                              categ_pattern=r'(.*?)/.*\.txt',
                              encoding=files_encoding)

        preserver = NgramPreserver(self.ngr_num)
        return preserver.count_ngrams_file(corpus, file_id,
                                           load_new_top_ngrms,
                                           filter_ngr_len)
Ejemplo n.º 5
0
    def stem(self, path_to_dir, encoding_files, flag_async):
        """Stem all .txt files found under *path_to_dir*.

        Configures a Russian-language ``StemmerFiles`` (with the project's
        morphological dictionary and stop-symbol list) over the folder's
        root file ids, logs the batch size, then kicks off stemming —
        asynchronously or not, per *flag_async*.
        """
        reader = CorpusReader(input_folder_name=path_to_dir,
                              doc_pattern=r'(.*?/).*\.txt',
                              categ_pattern=r'(.*?)/.*\.txt',
                              encoding=encoding_files)
        fileids = reader.root_ids

        stemmer = StemmerFiles(
            path_to_dir,
            fileids,
            encoding=encoding_files,
            nltk_stop_lang=None,
            nltk_stemmer_lang='russian',
            dict_stem_file='../general_modules/словари/морфологический словарь.txt',
            stop_words_files=['../general_modules/словари/delete_sym.txt'],
            min_token_len=None,
            results_dir_name=self.results_dir_name,
            stem_dir_name=self.stem_dir_name,
            temp_dir_name=self.temp_dir_name,
            save_sents=False)

        self.logger.info('-----------\nstart stemming {} file pieces'.format(
            len(fileids)))
        stemmer.start_stem_files(flag_async=flag_async)
Ejemplo n.º 6
0
    def load_new_dir_sync(self,
                          path_to_load,
                          files_encoding,
                          load_new_top_ngrms='',
                          filter_ngr_len=0):
        """Run n-gram counting sequentially over every file in a folder.

        Enumerates the folder's root file ids via ``CorpusReader`` and
        calls ``self.start_ngr_calc`` once per file, collecting the
        results into a list in directory order.
        """
        reader = CorpusReader(input_folder_name=path_to_load,
                              doc_pattern=r'(.*?/).*\.txt',
                              categ_pattern=r'(.*?)/.*\.txt',
                              encoding=files_encoding)

        return [
            self.start_ngr_calc(file_id=os.path.join(path_to_load, item),
                                files_encoding=files_encoding,
                                load_new_top_ngrms=load_new_top_ngrms,
                                filter_ngr_len=filter_ngr_len)
            for item in reader.root_ids
        ]
Ejemplo n.º 7
0
    def multi_files_operator(self, func_oper, path_to_load, files_encoding,
                             **kwargs):
        """Fan *func_oper* out over every file in *path_to_load*.

        Each file is submitted to a ``ProcessPoolExecutor`` (sized by
        ``self.process_num``) as ``func_oper(file, files_encoding,
        **kwargs)``; an optional ``callback`` kwarg is extracted and
        forwarded to ``self.wait_for`` instead of the worker function.
        Returns whatever ``self.wait_for`` produces.
        """
        reader = CorpusReader(input_folder_name=path_to_load,
                              doc_pattern=r'(.*?/).*\.txt',
                              categ_pattern=r'(.*?)/.*\.txt',
                              encoding=files_encoding)
        file_ids = [os.path.join(path_to_load, item)
                    for item in reader.root_ids]

        # 'callback' is meant for wait_for, not for the worker function —
        # strip it from kwargs before submitting.
        call_back_args = kwargs.pop('callback', None)

        with concurrent.futures.ProcessPoolExecutor(
                max_workers=self.process_num) as executor:
            futures = {
                executor.submit(func_oper, file, files_encoding, **kwargs)
                for file in file_ids
            }
            results = self.wait_for(futures, call_back_args)

        return results
Ejemplo n.º 8
0
                        self.stop_words_files,
                        self.dict_stem_file,
                        save_sents=self.save_sents) + '\n')
        if not os.path.exists(os.path.dirname(file_name_out)):
            os.mkdir(os.path.dirname(file_name_out))

        with open(file_name_out, 'wt', encoding=file_encoding) as f_out:
            f_out.writelines(write_lines)

        return file_name


if __name__ == '__main__':

    INPUT_FOLDER = 'input_texts/temp3'

    # Manual smoke test: build a reader over the sample folder and stem
    # one known file from it.
    corpus = CorpusReader(input_folder_name=INPUT_FOLDER,
                          doc_pattern=r'(.*?/).*\.txt',
                          categ_pattern=r'(.*?)/.*\.txt',
                          encoding='utf8')

    stemmer = StemmerFiles(INPUT_FOLDER, corpus.root_ids, encoding='utf8')
    stemmer.stem_file('input_texts/temp3/export 2019-12-16 1175.txt', 'utf-8')