Example #1
 def stat(self):
     env = Environment()
     data = pd.DataFrame()
     file_stat = env.filename_results_csv()
     try:
         data = pd.read_csv(file_stat, index_col='idstat', encoding='utf-8')
     except:
         env.debug(1, ['Failed to read stat file:', file_stat])
     else:
         env.debug(1, ['Read stat file OK:', file_stat])
     #print(data)
     return data
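The same read-or-empty pattern recurs in most loaders below: try pd.read_csv, log on failure, and fall back to an empty DataFrame. A minimal standalone sketch of that pattern, assuming a hypothetical results.csv indexed by idstat and catching only the errors pandas actually raises:

import pandas as pd

def read_results(path='results.csv'):
    # Return the results table, or an empty frame if the file is missing or malformed.
    try:
        return pd.read_csv(path, index_col='idstat', encoding='utf-8')
    except (FileNotFoundError, pd.errors.ParserError) as err:
        print('Failed to read stat file:', path, err)
        return pd.DataFrame()
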
Example #2
 def tokenz(self):
     env = Environment()
     df_tokenz = pd.DataFrame()
     file_tokenz = env.filename_tokenz_csv()
     try:
         df_tokenz = pd.read_csv(file_tokenz,
                                 index_col='idcorpus',
                                 encoding='utf-8')
     except:
         env.debug(1, ['Failed to read tokenz file:', file_tokenz])
     else:
         env.debug(1, ['Read tokenz OK:', file_tokenz])
     return df_tokenz
Example #3
 def authors(self, mode=0):
     env = Environment()
     df = pd.DataFrame()
     filename = env.filename_authors_csv()
     try:
         df = pd.read_csv(filename, index_col='idauthor', encoding='utf-8')
     except:
         env.debug(1, ['Failed to load authors CSV file', filename])
     else:
         env.debug(1, ['Load authors CSV file', filename])
     if mode == 1:
         return df.to_dict().get('name')
     else:
         return df
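For reference, a small usage sketch of the mode=1 branch with made-up author rows; DataFrame.to_dict() nests by column, so .get('name') yields an idauthor-to-name mapping:

import pandas as pd

df = pd.DataFrame({'idauthor': [0, 1], 'name': ['Tolstoy', 'Chekhov']}).set_index('idauthor')
print(df.to_dict().get('name'))  # {0: 'Tolstoy', 1: 'Chekhov'}
print(df['name'].to_dict())      # same mapping via a shorter route
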
Example #4
 def grammemes(self, mode=0):
     env = Environment()
     dfgram = pd.DataFrame()
     filename_gram = env.filename_grammemes_csv()
     try:
         dfgram = pd.read_csv(filename_gram,
                              index_col='idgram',
                              encoding='utf-8')
     except:
         env.debug(1, ['Failed to load grammemes CSV file', filename_gram])
     else:
         env.debug(1, ['Load grammemes CSV file', filename_gram])
     if mode == 1:
         return dfgram.to_dict().get('name')
     else:
         return dfgram
Example #5
 def corpus_xml2txt(self, num=1, persistent=True):
     result = True
     env = Environment()
     file_xml = env.filename_corpus_xml(num)
     try:
         tree = ET.ElementTree(file=file_xml)
     except:
         env.debug(1, ['Failed to load XML:', file_xml])
         result = False
     else:
         file_txt = env.filename_corpus_txt(num)
         file = open(file_txt, mode='w')
         for elem in tree.iter('source'):
             # print(elem.text, elem.tag, elem.attrib)
             file.write(elem.text)
             file.write(' ')
         file.close()
         env.debug(1, ['Write corpus file to TXT:', file_txt])
     return result
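A condensed sketch of the same extraction, assuming an OpenCorpora-style XML file with <source> elements; the with block keeps the output file closed even if a write fails, and the or '' guards against empty elements:

import xml.etree.ElementTree as ET

def corpus_sources_to_txt(xml_path, txt_path):
    # Copy the text of every <source> element into a plain-text file.
    tree = ET.parse(xml_path)
    with open(txt_path, mode='w', encoding='utf-8') as out:
        for elem in tree.iter('source'):
            out.write((elem.text or '') + ' ')
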
Example #6
 def test(self, n_min=1, n_max=1):
     t_start = timer()
     env = Environment()
     df_test = pd.DataFrame()
     for i in range(n_min, n_max + 1):
         try:
             dffile = pd.read_csv(env.filename_corpus_csv(i),
                                  index_col='idcorpus',
                                  encoding='utf-8')
         except:
             env.debug(1, [
                 'POStagger', 'test', 'Failed to read corpus file:',
                 env.filename_corpus_csv(i)
             ])
         else:
             env.debug(1, [
                 'POStagger', 'test', 'Read OK:',
                 env.filename_corpus_csv(i)
             ])
             if not dffile.empty:
                 df_test = df_test.append(dffile)
     df_test = df_test.drop_duplicates()
     df_test.columns = ['word', 'gram', 'idgram']
     df_test = df_test.reset_index(drop=True)
     df_test.index.name = 'idcorpus'
     df_test['gram_valid'] = df_test['gram']
     n_testsize = df_test.shape[0]
     env.debug(1, ['POStagger', 'test', 'START %s words' % n_testsize])
     df_test = self.pos(df_test)
     print('Test result', df_test)
     df_err = df_test[df_test['gram_valid'] != df_test['gram']]
     print('Test errors:', df_err)
     df_err.to_csv(env.filename_test_err_csv(), encoding='utf-8')
     env.debug(1, [
         'POStagger', 'test',
         'test accuracy %s' % (1 - df_err.shape[0] / n_testsize)
     ])
     t_end = timer()
     env.debug(1, [
         'POSTagger', 'test', 'test time:',
         env.job_time(t_start, t_end), 'sec.'
     ])
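The accuracy printed above is just the share of rows whose predicted gram matches gram_valid; a hedged helper for the same computation on that frame:

def tagging_accuracy(df_test):
    # Share of rows where the predicted tag equals the reference tag.
    return (df_test['gram_valid'] == df_test['gram']).mean()
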
Example #7
 def vocabulary_from_corpus(self, n_min=1, n_max=10, persistent=True):
     env = Environment()
     df_voc = pd.DataFrame()
     #dfgram = self.grammemes()
     for i in range(n_min, n_max + 1):
         file_csv = env.filename_corpus_csv(i)
         try:
             dffile = pd.read_csv(file_csv,
                                  index_col='idcorpus',
                                  encoding='utf-8')
         except:
             env.debug(1, ['Failed to read corpus file:', file_csv])
         else:
             env.debug(1, ['Read OK:', file_csv])
             if not dffile.empty:
                 df_voc = df_voc.append(dffile)
     df_voc = df_voc.drop_duplicates()
     df_voc.columns = ['word', 'gram', 'idgram']
     df_voc = df_voc.reset_index(drop=True)
     df_voc.index.name = 'idcorpus'
     if persistent:
         file_voc = env.filename_vocabulary_csv()
         env.debug(1, ['Write vocabulary to CSV:', file_voc])
         df_voc.to_csv(file_voc, encoding='utf-8')
     return df_voc
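Both Example #6 and Example #7 grow their frame with DataFrame.append inside the loop, which is deprecated in recent pandas and removed in 2.0. An equivalent accumulation step, sketched under the same filename_corpus_csv assumption, collects the per-file frames in a list and concatenates once:

import pandas as pd

def corpus_frames(env, n_min=1, n_max=10):
    # Gather every readable corpus CSV, then concatenate in a single call.
    frames = []
    for i in range(n_min, n_max + 1):
        try:
            frames.append(pd.read_csv(env.filename_corpus_csv(i),
                                      index_col='idcorpus', encoding='utf-8'))
        except OSError:
            continue
    return pd.concat(frames).drop_duplicates() if frames else pd.DataFrame()
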
Example #8
 def corpus_xml2csv(self, num=1, persistent=True):
     env = Environment()
     file_xml = env.filename_corpus_xml(num)
     df_xml = pd.DataFrame()
     df_gram = self.grammemes()
     dgram = df_gram.to_dict().get('name')
     try:
         tree = ET.ElementTree(file=file_xml)
     except:
         env.debug(1, ['Failed to load XML:', file_xml])
     else:
         t_start = timer()
         env.debug(1, ['CORPUS', 'XML to CSV:', file_xml])
         for elem in tree.iter('token'):
             #print(elem.tag, elem.attrib)
             serie = pd.Series(data=[])
             badd = False
             s_text = elem.attrib.get('text')
             serie[len(serie)] = s_text.lower()
             for elem2 in elem.iter('g'):
                 #print(elem2.tag, elem2.attrib)
                 sgram = elem2.attrib.get('v')
                 sgram = sgram.upper()
                 if (df_gram[df_gram['name'].isin([sgram]) == True].size
                     ) > 0:
                     serie[len(serie)] = sgram
                     serie[len(serie)] = int(df_gram.index[
                         df_gram['name'] == sgram].tolist()[0])
                     #serie[len(serie)] = list(dgram.keys())[list(dgram.values()).index(sgram)]
                     badd = True
                 break
             #print(s)
             if badd:
                 df_xml = df_xml.append(serie, ignore_index=True)
         if not df_xml.empty:
             df_xml = df_xml.drop_duplicates()
             df_xml = df_xml.reset_index(drop=True)
             df_xml.index.name = 'idcorpus'
             df_xml.columns = ['word', 'gram', 'idgram']
             df_xml = df_xml.astype({"idgram": int})
             if persistent:
                 file_csv = env.filename_corpus_csv(num)
                 env.debug(1, ['Write corpus file to CSV:', file_csv])
                 df_xml.to_csv(file_csv, encoding='utf-8')
                 t_end = timer()
                 env.debug(1, [
                     'CORPUS', 'CSV written:', file_csv,
                     'takes %s sec.' % env.job_time(t_start, t_end)
                 ])
     return df_xml
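The commented-out dgram line above hints at a cheaper lookup: the grammeme id can be resolved through a plain dict instead of a boolean scan of df_gram for every token. A small sketch, assuming the grammemes frame produced in Example #9 (idgram index, name column):

def grammeme_ids(df_gram):
    # Invert the idgram -> name mapping once so each token lookup is a dict hit.
    return {name: idx for idx, name in df_gram['name'].to_dict().items()}

# build once before the token loop:
#   name2id = grammeme_ids(df_gram)
#   idgram = name2id.get(sgram)   # None when the grammeme is not included
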
Example #9
 def grammemes_xml2csv(self, persistent=True):
     env = Environment()
     filename_gram = env.filename_grammemes_xml()
     dfcols = ['name', 'alias', 'description']
     df_xml = pd.DataFrame(columns=dfcols)
     try:
         tree = ET.ElementTree(file=filename_gram)
     except:
         env.debug(1, ['Failed to load grammemes from XML:', filename_gram])
     else:
         env.debug(1, ['Read grammemes:', filename_gram])
         for elem in tree.iter('grammeme'):
             #print(elem.tag, elem.attrib)
             sattr = elem.attrib.get('include')
             if sattr == 'on':
                 sname = sali = sdesc = ''
                 for child in elem:
                     if child.tag.lower() == 'name':
                         sname = child.text.upper()
                     elif child.tag.lower() == 'alias':
                         sali = child.text.upper()
                     elif child.tag.lower() == 'description':
                         sdesc = child.text.lower()
                 s = pd.Series(data=[sname, sali, sdesc], index=dfcols)
                 df_xml = df_xml.append(s, ignore_index=True)
         df_xml.index.name = 'idgram'
         if persistent:
             filename_csv = env.filename_grammemes_csv()
             env.debug(1, ['Write grammemes to CSV:', filename_csv])
             df_xml.to_csv(filename_csv, encoding='utf-8')
     return df_xml
Example #10
 def tokenz_create_stat(self, dftokenz=pd.DataFrame(), n_frac=1):
     env = Environment()
     enc = Word_Encoder()
     di_letters = Environment.di_bgm_byletters
     bgm_columns = env.bgm_columns_list(mode=1)
     t_start = timer()
     if dftokenz.empty:
         dftokenz = self.tokenz()
     if n_frac < 1:
         dftokenz = dftokenz.sample(frac=n_frac)
     env.debug(1, [
         'POStagger', 'create_stat',
         'Collecting statistic START %s words' % dftokenz.shape[0]
     ])
     di_tokenz_stat = (dftokenz.count()).to_dict()
     di_tokenz_res = {}
     #print('di_letters', di_letters)
     print('di_tokenz_stat', di_tokenz_stat)
     bgm_astat = [['init', 0]]
     bgm_index = []
     for key in di_letters:
         di_n = di_letters.get(key)
         column_stat = di_tokenz_stat.get(bgm_columns[di_n])
         #di_tokenz_res[key] = column_stat
         bgm_astat.append([key, column_stat])
         bgm_index.append(di_n)
     bgm_astat = bgm_astat[1:]
     print('column stat', bgm_astat)
     df_bgm_stat = pd.DataFrame(data=bgm_astat,
                                columns=['bigram', 'counts'],
                                index=bgm_index)
     df_bgm_stat.index.name = 'idbigram'
     df_bgm_stat = df_bgm_stat.sort_values(by=['counts'], ascending=False)
     print('bgm_stat\n', df_bgm_stat)
     df_bgm_stat.to_csv(env.filename_stat_bigram_letters_csv(),
                        encoding='utf-8')
Example #11
 def vocabulary(self):
     env = Environment()
     file_voc = env.filename_vocabulary_csv()  #from vocabulary file
     file_dict = env.filename_dict_csv()  #from dictionary file
     try:
         df_voc = pd.read_csv(file_voc,
                              index_col='idcorpus',
                              encoding='utf-8')
     except:
         env.debug(1, ['Failed to read vocabulary file:', file_voc])
     else:
         env.debug(1, ['Read vocabulary OK:', file_voc])
     try:
         df_dict = pd.read_csv(file_dict,
                               index_col='idcorpus',
                               encoding='utf-8')
     except:
         env.debug(1, ['Failed to read dictionary file:', file_dict])
     else:
         env.debug(1, ['Read dictionary OK:', file_dict])
     #Concat
     df_res = pd.concat([df_voc, df_dict])
     df_res = df_res.drop_duplicates()
     #Apply patch words
     df_patch = pd.read_csv(env.filename_vocabulary_patch_csv(),
                            index_col='idcorpus',
                            encoding='utf-8')
     df_res = df_res.drop(df_res[df_res['word'].isin(
         df_patch['word'])].index,
                          axis=0)
     df_res = pd.concat([df_res, df_patch])
     #print(df_res[df_res['word'].isin(df_patch['word'])])
     df_res = df_res.reset_index(drop=True)
     df_res.index.name = 'idcorpus'
     #print(df_res)
     return df_res
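The patch step above drops every vocabulary row whose word also occurs in the patch file, then appends the patch rows. A compact equivalent, sketched with the same column layout:

import pandas as pd

def apply_word_patch(df_res, df_patch):
    # Rows overridden by the patch are removed before the patch rows are appended.
    keep = ~df_res['word'].isin(df_patch['word'])
    df_out = pd.concat([df_res[keep], df_patch]).reset_index(drop=True)
    df_out.index.name = 'idcorpus'
    return df_out
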
Example #12
    def process_from_texts_file(self, aidtext, mode='process', max_words=0):
        env = Environment()
        file_res = env.filename_results_csv()
        dfres = pd.read_csv(
            file_res, index_col='idstat',
            encoding='utf-8')  # File for recording statistical results
        #dfres = env.downcast_dtypes(dfres)
        df_texts = pd.read_csv(env.filename_texts_csv(),
                               index_col='idtext',
                               encoding='utf-8')  # Registry of texts
        mask = df_texts.index.isin(aidtext)
        df_texts = df_texts[mask]
        for index, row in df_texts.iterrows():  # For each text to be processed
            file_txt = df_texts.at[index, 'filename']
            #Read text file
            env.debug(1, ['START file TXT:', file_txt])
            t_start = timer()
            #file = open(file_txt, 'r')
            file = codecs.open(file_txt, "r", "utf_8_sig")
            text = file.read().strip()
            file.close()
            # print(text)
            # The author is specified in the training set
            idauthor = df_texts.at[index, 'idauthor']  # Author
            name = df_texts.at[index, 'name']  # Title
            columns = dfres.columns
            if mode == 'process':  # if we need to collect statistics about the text and write them to results
                # The actual text processing
                df_add = self.analyze_text(
                    columns, text, index, idauthor, name, file_txt,
                    max_words)  #Analyze text, get Series
                df_add.reset_index(drop=True, inplace=True)
                dfres = dfres.append(
                    df_add, ignore_index=True)  # Append to the results file
                dfres.reset_index(drop=True, inplace=True)
                dfres.index.name = 'idstat'
                #print(dfres)
                #return 0

            if mode == 'chunk_size':  # if we need to determine the chunk size
                n_chunk_size = self.validate_chunk_size(
                    columns, text, index, idauthor, name, file_txt)
            t_end = timer()
            env.debug(1, [
                'END file TXT:', file_txt, 'time:',
                env.job_time(t_start, t_end)
            ])
            # print(dfres.head())
        # Save the result to disk
        if mode == 'process':
            #dfres = dfres.reset_index(drop=True)
            int_cols = [
                'idtext', 'idchunk', 'idauthor', 'sentences_text',
                'words_text', 'sentences_chunk', 'words_chunk',
                'words_uniq_chunk'
            ]
            for col in int_cols:
                dfres[col] = dfres[col].astype(int)
            #dfres = env.downcast_dtypes(dfres)
            dfres.to_csv(file_res, encoding='utf-8')
Example #13
    def predict(self, aidtext, b_makestat=False):
        env = Environment()

        # Open the file with statistics on the test texts
        df_stat = pd.read_csv(
            env.filename_stat_test_csv(), index_col='idstat',
            encoding='utf-8')  # Statistics on the test texts

        df_texts = pd.read_csv(env.filename_predict_csv(),
                               index_col='idtext',
                               encoding='utf-8')  # Registry of texts
        mask = df_texts.index.isin(aidtext)
        df_texts = df_texts[mask]

        columns = ['idtext', 'idchunk', 'idauthor', 'author', 'name', 'file',
                   'sentences_text', 'words_text', 'sentence_mean',
                   'sentences_chunk', 'words_chunk',
                   'words_uniq_chunk', 'uniq_per_sent_chunk', 'uniq_per_words_chunk',
                   'NOUN', 'ADJF', 'ADJS', 'COMP', 'VERB', 'INFN', 'PRTF', 'PRTS', 'GRND', 'NUMR',
                   'ADVB', 'NPRO', 'PRED', 'PREP', 'CONJ', 'PRCL', 'INTJ', 'predict']
        y_result = []

        # If statistics for the test texts need to be prepared first
        if b_makestat:
            for index, row in df_texts.iterrows():  # For each text to be processed
                file_txt = df_texts.at[index, 'filename']
                # Read text file
                env.debug(1,
                          ['Analyzer', 'predict', 'START file TXT:', file_txt])
                t_start = timer()
                file = codecs.open(file_txt, "r", "utf_8_sig")
                text = file.read().strip()
                file.close()
                # Generally speaking, the test set has no known author
                idauthor = df_texts.at[index, 'idauthor']  # Author
                #idauthor = 0
                name = df_texts.at[index, 'name']  # Title

                # The actual text processing
                df_add = self.analyze_text(
                    columns, text, index, idauthor, name,
                    file_txt)  # Analyze text, get Series
                #print(df_add)
                df_add.reset_index(drop=True, inplace=True)
                df_stat = df_stat.append(
                    df_add, ignore_index=True)  # Append to the results file
                df_stat.reset_index(drop=True, inplace=True)
                df_stat.index.name = 'idstat'
                t_end = timer()
                env.debug(1, [
                    'END file TXT:', file_txt, 'time:',
                    env.job_time(t_start, t_end)
                ])
            # df_stat now holds information about every test text we wanted to process
            # Set the correct type for the integer columns
            int_cols = [
                'idtext', 'idchunk', 'idauthor', 'sentences_text',
                'words_text', 'sentences_chunk', 'words_chunk',
                'words_uniq_chunk'
            ]
            for col in int_cols:
                df_stat[col] = df_stat[col].astype(int)
            # Save the result to disk
            df_stat.to_csv(env.filename_stat_test_csv(), encoding='utf-8')
        # Statistics are ready

        # Open the file with statistics on the test texts again
        df_stat = pd.read_csv(
            env.filename_stat_test_csv(), index_col='idstat',
            encoding='utf-8')  # Statistics on the test texts
        #mask = df_stat.index.isin(aidtext)
        #df_stat2predict = df_stat[mask]
        # Predict the authors
        y_res = self.model_predict(df_stat.loc[aidtext])
        #print(y_res)
        df_stat.loc[aidtext, 'predict'] = y_res.astype(int)
        #print(df_stat)
        #y_result.append(y_res[0])
        # Save the updated file with the predictions
        df_stat.to_csv(env.filename_stat_test_csv(), encoding='utf-8')
        return y_res  # Return the predictions
Example #14
    def tokenize(self, dftokenz=pd.DataFrame(), persistent=True, n_frac=1):
        env = Environment()
        enc = Word_Encoder()
        t_start = timer()
        if dftokenz.empty:
            dftokenz = self.tokenz()
        if n_frac < 1:
            dftokenz = dftokenz.sample(frac=n_frac)
        env.debug(
            1, ['Transforming to tokenz: START %s words' % dftokenz.shape[0]])

        gmask = dftokenz.groupby(['gram'])
        df_posstat = gmask.count()
        df_posstat.to_csv(env.filename_stat_pos_tokenz_csv(), encoding='utf-8')
        print('POSTagger', 'train dataset stat:\n', gmask.count())

        fields = [
            's_suffix2', 's_suffix3', 's_prefix2', 's_prefix3', 'n_token',
            'n_len', 'n_tokens2', 'n_tokens3', 'n_tokenp2', 'n_tokenp3'
        ]

        for field in fields:
            val = 0.0
            if field[0] == 's':
                val = ''
            dftokenz[field] = val

        n_letters = 0
        s_letters = env.list_rus_letters()
        di_letters = env.di_bgm_byletters
        #bgm_columns_i = env.bgm_columns_list(mode=0)
        bgm_columns = env.bgm_columns_list(mode=1)

        #print('bgm_columns', bgm_columns)
        for column_name in bgm_columns:
            dftokenz[column_name] = None

        t_end = timer()
        env.debug(1, [
            'POStagger', 'Letters bigram columns added',
            env.job_time(t_start, t_end)
        ])

        #Form tokenz
        t_start = timer()
        for index, serie in dftokenz.iterrows():
            # print (serie.values)
            a_word = enc.s2token(index, serie)
            i = 2
            # print(a_word)
            for field in fields:
                dftokenz.at[index, field] = a_word[i]
                # print(field, a_word[i])
                i = i + 1
            # print(dftokenz.loc[index])
            #Letters bigram binaries
            for n_l in range(0, len(a_word[0]) - 1):
                n_l2 = n_l + 1
                di_n = di_letters.get('%s%s' %
                                      (a_word[0][n_l], a_word[0][n_l2]))
                if di_n is not None:
                    #print(di_n)
                    #print(bgm_columns[di_n])
                    dftokenz.at[index, bgm_columns[di_n]] = 1
        t_end = timer()
        env.debug(
            1,
            ['Transforming to tokenz: COMPLETE',
             env.job_time(t_start, t_end)])
        if persistent:
            dftokenz.to_csv(env.filename_tokenz_csv(), encoding='utf-8')
            env.debug(1, ['Tokenz written to CSV:', env.filename_tokenz_csv()])
        return dftokenz
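The bigram section above flags, for every token, which adjacent letter pairs occur in it. The same idea in isolation, as a sketch that assumes a di_letters mapping from two-letter strings to column indices (the role env.di_bgm_byletters plays here):

def bigram_flags(word, di_letters, n_columns):
    # flags[k] == 1 when the bigram mapped to column k occurs anywhere in the word.
    flags = [0] * n_columns
    for a, b in zip(word, word[1:]):
        idx = di_letters.get(a + b)
        if idx is not None:
            flags[idx] = 1
    return flags

# e.g. bigram_flags('мама', {'ма': 0, 'ам': 1}, 2) -> [1, 1]
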
Example #15
 def dict_xml2csv(self, persistent=True, lines=10000):
     t_start = timer()
     env = Environment()
     dfgram = self.grammemes()
     filename_dict = env.filename_dict_xml()
     dfcols = ['word', 'gram', 'idgram']
     df_xml = pd.DataFrame(columns=dfcols)
     env.debug(
         1, ['CORPUS', 'Start to load dictionary from XML:', filename_dict])
     try:
         fp = io.open(filename_dict, mode="r", encoding="utf-8")
     except:
         env.debug(1, [
             'CORPUS', 'Failed to open dictionary file XML:', filename_dict
         ])
     else:
         number_lines = sum(1 for line in fp)
         fp.seek(0)
         t_end = timer()
         env.debug(1, [
             'CORPUS', 'File opened:', 'lines',
             '%s' % number_lines, 'time:',
             env.job_time(t_start, t_end)
         ])
         t_start = timer()
         step = number_lines // lines
         env.debug(1, [
             'CORPUS', 'Read dictionary:', filename_dict,
             'lines: %s step %s' % (lines, step)
         ])
         n_line = 0
         for i in range(0, number_lines):
             line = fp.readline()
             #print(line[5:10])
             if (line[5:10] == 'lemma') and (n_line == 0):
                 #print(line)
                 tree = ET.fromstring(line)
                 for elem in tree.iter('l'):
                     s_word = elem.attrib.get('t')
                     gram = ['', 0]
                     j = 0
                     for elem2 in elem.iter('g'):
                         gram[j] = elem2.attrib.get('v')
                         break
                     gram[1] = int(dfgram.index[dfgram['name'] ==
                                                gram[0]].tolist()[0])
                 #print(s_word,gram)
                 s = pd.Series(data=[s_word, gram[0], gram[1]],
                               index=dfcols)
                 df_xml = df_xml.append(s, ignore_index=True)
                 n_line += 1
             n_line += 1
             if n_line >= step:
                 n_line = 0
         fp.close()
         df_xml.index.name = 'idcorpus'
         t_end = timer()
         env.debug(1, [
             'CORPUS', 'Dictionary loaded:', 'time:',
             env.job_time(t_start, t_end)
         ])
         if persistent:
             filename_csv = env.filename_dict_csv()
             env.debug(1,
                       ['CORPUS', 'Write dictionary to CSV:', filename_csv])
             df_xml.to_csv(filename_csv, encoding='utf-8')
             env.debug(1, ['CORPUS', 'Dictionary saved:', filename_csv])
     return df_xml
Example #16
    def pos(self, df, mode_fast=True, use_cache=True):
        env = Environment()
        enc = Word_Encoder()
        df_res = df
        t_start = timer()

        c = OpenCorpus()
        g = c.grammemes()
        dg = g.to_dict().get('name')

        #Cache file
        cache_columns = ['word', 'gram_ml', 'count']
        file_cache = env.filename_mlcache_csv()
        try:
            df_cache = pd.read_csv(file_cache,
                                   index_col='idcorpus',
                                   encoding='utf-8')
        except:
            env.debug(
                1,
                ['POSTagger', 'pos', 'Failed to read cache file:', file_cache])
            df_cache = pd.DataFrame(columns=cache_columns)
        else:
            env.debug(1, ['POSTagger', 'pos', 'Read ML cache OK:', file_cache])

        a_predict = np.array([enc.word2token('')])
        #a_words = ['']
        n_words = df_res.shape[0]

        env.debug(1, [
            'POStagger', 'pos',
            'START Vocabulary prediction %s words' % n_words
        ])
        a_words = df_res['word'].tolist()
        a_ml_words = []
        predictions_voc = self.pos_by_voc(a_words)
        p_se = pd.Series(predictions_voc)
        df_res['gram'] = p_se.values
        df_res['gram_voc'] = p_se.values
        df_res['gram_ml'] = ''
        t_end = timer()
        env.debug(1, [
            'POStagger', 'pos',
            'END Vocabulary prediction %s sec.' % env.job_time(t_start, t_end)
        ])
        #print(predictions_voc)

        if mode_fast:
            #env.debug(1, ['POStagger', 'pos', 'START Fast mode vocabulary search. Words %s' % df.shape[0]])
            df_ni_voc = df_res[df_res['gram_voc'] == '']
            n_words = df_ni_voc.shape[0]
        else:
            df_ni_voc = df_res
        #print('non-vocabulary',df_ni_voc)
        if not df_ni_voc.empty:
            env.debug(
                1, ['POStagger', 'pos',
                    'START Encoding %s words' % n_words])
            for index, serie in df_ni_voc.iterrows():
                word = df_ni_voc.at[index, 'word']
                #print(word)
                a_padd = np.array([enc.word2token(word)])
                a_predict = np.append(a_predict, a_padd, axis=0)
                a_ml_words.append(word)
                #print(a_words, a_predict)
            a_predict = a_predict[1:, :]
            #print(a_predict)
            #print('ml_words',a_ml_words)
            t_end = timer()
            env.debug(1, [
                'POStagger', 'pos',
                'END Encoding %s words %s sec.' %
                (n_words, env.job_time(t_start, t_end))
            ])

        t_start = timer()
        env.debug(1, ['POStagger', 'pos', 'START Model prediction'])
        clf = pickle.load(open(env.filename_model_tree(), 'rb'))
        predictions_ml = clf.predict(a_predict[:, 0:])
        # print('ml', predictions_ml)
        t_end = timer()
        env.debug(1, [
            'POStagger', 'pos',
            'END Model prediction %s sec.' % env.job_time(t_start, t_end)
        ])
        #print('ml_words_prediction',list(zip(a_ml_words,predictions_ml)))

        t_start = timer()
        i = 0
        s_pvoc = ''
        s_pml = ''
        for index, row in df_res.iterrows():
            word = df_res.at[index, 'word']
            s_pvoc = df_res.at[index, 'gram_voc']
            #s_pvoc = predictions_voc[i]
            #print('s_pvoc', word, s_pvoc)
            #df_res.at[index, 'gram_voc'] = s_pvoc
            if s_pvoc == '':
                if mode_fast:
                    try:
                        j = a_ml_words.index(word)
                    except:
                        pass
                    else:
                        s_pml = dg.get(predictions_ml[j])
                        #print(word,s_pml)
                else:
                    s_pml = dg.get(predictions_ml[i])
                df_res.at[index, 'gram_ml'] = s_pml
                df_res.at[index, 'gram'] = s_pml
            i = i + 1
        t_end = timer()
        env.debug(1, [
            'POStagger', 'pos',
            'ML predictions dataframe filled %s sec' %
            env.job_time(t_start, t_end)
        ])
        #print(df_res)
        df_cache = pd.concat([
            df_cache,
            df_res[df_res.gram_ml != ''][['word', 'gram_ml', 'count']]
        ])
        df_cache = df_cache.groupby(['word',
                                     'gram_ml']).agg({'count': ['sum']})
        df_cache.reset_index(inplace=True)
        df_cache.index.name = 'idcorpus'
        df_cache.columns = cache_columns
        df_cache.sort_values(by=['count'], inplace=True, ascending=False)
        #print(df_cache)
        env.debug(1,
                  ['POStagger', 'pos', 'Write ML cache to CSV:', file_cache])
        df_cache.to_csv(file_cache, encoding='utf-8')
        return df_res
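One small robustness note: the classifier is loaded with pickle.load(open(...)), which never closes the file handle. A hedged equivalent using a context manager, assuming the same filename_model_tree() helper:

import pickle

def load_tree_model(path):
    # Open in binary mode and close the handle deterministically.
    with open(path, 'rb') as fh:
        return pickle.load(fh)

# clf = load_tree_model(env.filename_model_tree())
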
Example #17
    def analyze_text(self,
                     columns,
                     text_to_analyze,
                     index=0,
                     idauthor=0,
                     name='',
                     file_txt='',
                     max_words=0):
        env = Environment()
        t_start = timer()
        env.debug(
            1, ['Analyzer', 'analyze_text',
                'START file TXT: %s' % file_txt])
        enc = Word_Encoder()
        postg = POSTagger()
        corpus = OpenCorpus()
        dfgram = corpus.grammemes()
        file_authors = env.filename_authors_csv()
        # Information about the authors
        authors = pd.read_csv(file_authors,
                              index_col='idauthor',
                              encoding='utf-8')

        dfres = pd.DataFrame()  # Empty dataframe to accumulate the results

        # Preprocessing: run the preliminary text processing
        #max_words = 6000
        achunks = self.preprocessor(text_to_analyze, max_words)
        #print(achunks)
        n_chunks = len(achunks)  # how many chunks the text was split into

        # the output is an array whose elements each hold the number of sentences, the number of words in the text, and an array of the words
        env.debug(1, [
            'Analyzer', 'analyze_text',
            '%s sentences %s words in %s chunks' %
            (achunks[0][0], achunks[0][1], n_chunks)
        ])
        #print(n_chunks)
        a_text_corp = []
        id_chunk = 0
        for chunk in achunks:
            t_start = timer()  #prepare data
            n_sent_all = chunk[0]
            n_words_all = chunk[1]
            n_sent_len_mean = chunk[2]
            n_sent_chunk = chunk[3]
            n_words_chunk = chunk[4]
            a_text_words = chunk[5]
            #print(n_sent_all, n_words_all, n_sent_len_mean, n_sent_chunk, n_words_chunk, a_text_words)
            #print(len(a_text_words))

            # Vectorize: treat each chunk as an individual text
            vectorizer = CountVectorizer(encoding='utf-8',
                                         token_pattern=r"(?u)\b\w+\b")
            # Transform all the words into a matrix with a single row (0) and many columns, where each word
            # gets its own column and the value in that column is the number of times the word occurs in the document
            #print([' '.join(map(str,a_text_words))])
            X = vectorizer.fit_transform([' '.join(map(str, a_text_words))])
            #print(X)
            n_words_chunk_check = X.sum()  # total number of words in the document being processed
            #print(n_words_chunk, n_words_chunk_check)
            #print(vectorizer.get_stop_words())

            env.debug(1, [
                'Analyzer', 'analyze_text',
                'START process chunk %s/%s with %s words' %
                (id_chunk, n_chunks - 1, n_words_chunk)
            ])
            word_freq = np.asarray(X.sum(axis=0)).ravel()  # total count for each word (one row, so the sum equals that row's value)
            #print(vectorizer.get_feature_names())
            #print(X)
            zl = zip(vectorizer.get_feature_names(), word_freq)  # words, count
            #print(list(zl))

            data_cols = ['gram', 'gram_voc', 'gram_ml']
            data = pd.DataFrame(list(zl), columns=['word', 'count'])
            for col in data_cols:
                data[col] = ''
            t_end = timer()
            env.debug(
                1, ['Ready for POS:', 'time:',
                    env.job_time(t_start, t_end)])

            t_start = timer()
            data = postg.pos(data)
            #print(data)
            t_end = timer()
            env.debug(1,
                      ['POS tagged:', 'time:',
                       env.job_time(t_start, t_end)])

            t_start = timer()
            grouped = data.sort_values('gram').groupby(['gram']).agg(
                {'count': ['sum']})
            grouped.columns = ['n_POS']
            grouped.reset_index(inplace=True)
            grouped['f_POS'] = grouped['n_POS'] / n_words_chunk
            #grouped.drop(columns=['n_POS'], inplace=True)
            #print(grouped)
            #print(grouped.set_index('gram').T)
            grouped = pd.merge(
                dfgram, grouped, left_on='name', right_on='gram',
                how='left').drop(
                    columns=['alias', 'description', 'name', 'n_POS']).fillna(
                        0).set_index('gram').T
            #grouped = pd.merge(dfgram, grouped, left_on='name', right_on='gram', how='left').fillna(0).set_index('gram')
            #print(grouped)
            #print(grouped.values.ravel())
            index_author = authors.index.get_loc(idauthor)
            n_uniq_words = data.shape[0]
            s_chunk = pd.Series({
                'idtext': index,
                'idchunk': id_chunk,
                'idauthor': idauthor,
                'author': authors.at[index_author, 'shortname'],
                'name': name,
                'file': file_txt,
                'sentences_text': np.int64(n_sent_all),
                'words_text': np.int64(n_words_all),
                'sentence_mean': n_sent_len_mean,
                'sentences_chunk': np.int64(n_sent_chunk),
                'words_chunk': np.int64(n_words_chunk),
                'words_uniq_chunk': np.int64(n_uniq_words),
                'uniq_per_sent_chunk': round(n_uniq_words / n_sent_chunk, 4),
                'uniq_per_words_chunk': round(n_uniq_words / n_words_chunk, 4)
            })
            s_chunk = pd.concat(
                [s_chunk, pd.Series(grouped.values.ravel())],
                ignore_index=True)
            s_chunk = pd.concat([s_chunk, pd.Series([np.nan])],
                                ignore_index=True)
            #print(s_chunk)
            #print(grouped)
            t_end = timer()
            env.debug(1, ['Analyzed', 'time:', env.job_time(t_start, t_end)])
            dfres = dfres.append(s_chunk, ignore_index=True)
            #dfres = env.downcast_dtypes(dfres)
            id_chunk = id_chunk + 1
        print(dfres)
        print(columns)
        dfres.columns = columns
        return dfres
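The CountVectorizer step above turns one chunk into a single-row term-count matrix. A self-contained sketch of that call on toy input (get_feature_names_out is the newer scikit-learn name for get_feature_names):

from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(encoding='utf-8', token_pattern=r"(?u)\b\w+\b")
X = vectorizer.fit_transform(['мама мыла раму мама'])
print(vectorizer.get_feature_names_out())  # ['мама' 'мыла' 'раму']
print(X.toarray())                         # [[2 1 1]]
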
Example #18
    def train(self,
              df=pd.DataFrame(),
              validation='eval',
              n_splits=5,
              b_smoketest=True,
              n_frac=1):
        env = Environment()
        enc = Word_Encoder()
        df_train = df
        bgm_columns = env.bgm_columns_list(mode=1)
        drop_columns = [
            'word', 'gram', 's_suffix2', 's_suffix3', 's_prefix2', 's_prefix3',
            'n_token'
        ]  #, 'bgm_l_None'
        #drop_columns.extend(['bgm_l_%s' % (i) for i in range(1, env.bgm_columns_max()) if 'bgm_l_%s' % (i) not in bgm_columns])
        env.debug(1,
                  ['POStagger', 'train',
                   'Drop columns: %s' % (drop_columns)])

        if df_train.empty:
            t_start = timer()
            df_train = self.tokenz()
            t_end = timer()
            env.debug(1, [
                'POSTagger', 'train', 'tokenz loaded:', 'time:',
                env.job_time(t_start, t_end)
            ])

        env.debug(1, [
            'POStagger', 'train',
            'All tokenz set shape %s' % df_train.shape[0]
        ])
        t_start = timer()
        env.debug(1, ['POStagger', 'train', 'Learning: START'])
        if n_frac < 1:
            df_train = df_train.sample(frac=n_frac)
            env.debug(1, [
                'POStagger', 'train',
                'Training tokenz set shape %s' % df_train.shape[0]
            ])
            #print(df_train.shape)

        #df_train2 = df_train[bgm_columns]
        #print(df_train2.shape)
        #df_train2 = df_train2.astype({"idgram": int})
        df_train = df_train.drop(columns=drop_columns, axis=1)
        env.debug(
            1, ['POStagger',
                'Train columns: %s' % (df_train.columns.tolist())])
        #print(df_train.columns)

        #df_train = df_train.drop_duplicates() #slow-slow
        #print(df_train.head())

        df_train = df_train.fillna(0)
        file_x = env.filename_xtrain_csv()
        df_train.to_csv(file_x, encoding='utf-8')
        env.debug(1, ['POStagger', 'train', 'Save X', file_x])
        y = df_train['idgram'].values
        df_train.drop(columns=['idgram'], inplace=True)
        X = df_train.values
        #array = df_train.values
        #print(df_train)
        #X = array[:, 1:]
        #Y = array[:, 0]

        #print(X, Y)
        #validation_size = 0.20
        seed = 241
        frac_test_size = 0.2

        sc = StandardScaler()
        #Y_sc = sc.fit_transform(Y)
        t2_start = timer()
        if validation == 'cv':  #Need cross-validation
            scoring = 'accuracy'
            # scoring = 'f1_samples'
            kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
            if True:  #Decision tree
                env.debug(1, ['Tree cross-validation'])
                # clf = DecisionTreeClassifier(criterion='gini', random_state=seed)  # 0.79
                # clf = KNeighborsClassifier(n_neighbors=230)
                model = DecisionTreeClassifier(criterion='entropy',
                                               random_state=seed)  # 0.81
                env.debug(
                    1, ['Calculate cross_val_score. Splits=%s' % (n_splits)])
                scores = cross_val_score(model, X, y, cv=kf)
                print('DTree scores:', scores.mean(), 'raw', scores)

            if False:  #Logistic regression
                env.debug(1, ['LGR cross-validation'])
                n_Cs = [0.01]
                X = array[:, 5:]
                X_sc = sc.fit_transform(X)
                Y = df_train['idgram'].values
                Y[Y > 0] = 1
                print(X_sc, Y)
                for n_c in n_Cs:
                    #clf = LogisticRegression(penalty='l2', solver='saga', C=n_c, multi_class='multinomial')
                    clf = LogisticRegression(penalty='l2',
                                             solver='liblinear',
                                             C=n_c)
                    # clf = SVC(kernel='linear', C=10000, random_state=241)
                    # clf = SVC(kernel='linear', C=0.01, random_state=seed)
                    # clf = SVC(random_state=seed)
                    # clf = Perceptron()
                    env.debug(1, [
                        'Calculate cross_val_score. Splits=%s C=%s' %
                        (n_splits, n_c)
                    ])
                    scores = cross_val_score(clf, X_sc, Y, cv=kf)
                    print(scores)

            if False:  #GBM, RandomForest
                env.debug(1, ['GBM cross-validation'])
                asteps = [20]  #GBM
                #asteps=[100] #RandomForest
                for i in asteps:
                    #clf = RandomForestClassifier(n_estimators=i)
                    clf = GradientBoostingClassifier(
                        n_estimators=i, max_depth=8)  #, max_features='sqrt'
                    env.debug(1, [
                        'Calculate cross_val_score. Splits=%s Estimators=%s' %
                        (n_splits, i)
                    ])
                    scores = cross_val_score(clf, X, Y, cv=kf)
                    print(scores)

        if validation == 'eval':
            # eval
            model = xgb.XGBClassifier(n_estimators=140,
                                      max_depth=16,
                                      colsample_bytree=1,
                                      subsample=0.5,
                                      seed=seed)
            X_train, X_test, y_train, y_test = train_test_split(
                X,
                y,
                test_size=frac_test_size,
                random_state=seed,
                shuffle=True)
            eval_set = [(X_train, y_train), (X_test, y_test)]
            # print(eval_set)
            f_eval = 'merror'
            # f_eval = 'mlogloss'
            model.fit(X_train,
                      y_train,
                      eval_metric=f_eval,
                      eval_set=eval_set,
                      verbose=False,
                      early_stopping_rounds=20)
            ev_scores = model.evals_result()
            ev_mean = np.array(ev_scores['validation_0'][f_eval]).mean()
            #print(model.feature_importances_)
            print(ev_mean, ev_scores)
            xgb.plot_importance(model)
            plt.show()
        t2_end = timer()
        t_end = timer()
        env.debug(1, ['CV completed:', 'time:', env.job_time(t_start, t_end)])

        if validation == 'cv':
            # Training on the full data set
            X_train, y_train = X, y

            # model = SVC()
            # model= DecisionTreeClassifier() #79
            # model= LinearDiscriminantAnalysis() #47
            # model=LogisticRegression() #48
            # model = KNeighborsClassifier(n_neighbors=200) #48
            # model = GaussianNB()   #43
            #print('Fit...')

            #print('Validate...')
            # predictions = model.predict(X_validation)

            # print(accuracy_score(Y_validation, predictions))
            # print(confusion_matrix(Y_validation, predictions))
            # print(classification_report(Y_validation, predictions))

            t_start = timer()
            env.debug(1, ['Training: START'])
            model.fit(X_train, y_train)
            t_end = timer()
            env.debug(1, ['Training: END', env.job_time(t_start, t_end)])

        pickle.dump(sc, open(env.filename_scaler(), 'wb'))
        pickle.dump(model, open(env.filename_model_tree(), 'wb'))

        # Smoke test
        if b_smoketest:
            X_smoke_predict = [
                'съеште', 'ещё', 'этих', 'мягких', 'французских', 'булок'
            ]
            a_smoke = np.array(
                [enc.word2token(elem) for elem in X_smoke_predict])
            y_predictions = model.predict(a_smoke[:, 0:])
            y_predictions_proba = model.predict_proba(a_smoke[:, 0:])
            #print(y_predictions)
            print('Prediction', list(zip(X_smoke_predict, y_predictions)))
            print('Proba', list(zip(X_smoke_predict, y_predictions_proba)))
        return model
Example #19
    def vizualize2d(self, n_frac=0.01, b_annotations=False):
        n_components = 2
        env = Environment()
        c = OpenCorpus()
        di_g = c.grammemes(mode=1)
        data = self.tokenz().sample(frac=n_frac)

        data = data.fillna(0)
        #print(data['idgram'].shape)
        #print(data.index.shape)
        tdf = pd.DataFrame(index=data.index)
        tdf['idgram'] = data['idgram']
        tdf['gram'] = data['gram']
        tdf['word'] = data['word']
        #print(tdf)

        drop_columns = [
            'word', 'gram', 's_suffix2', 's_suffix3', 's_prefix2', 's_prefix3',
            'n_token'
        ]  # , 'bgm_l_None'
        # drop_columns.extend(['bgm_l_%s' % (i) for i in range(1, env.bgm_columns_max()) if 'bgm_l_%s' % (i) not in bgm_columns])
        env.debug(
            1,
            ['POStagger', 'visualize2D',
             'Drop columns: %s' % (drop_columns)])
        data = data.drop(columns=drop_columns, axis=1)
        values = data.values
        X = values[:, 1:]
        y = values[:, 0]
        #print(data.head,X, y)
        #return 0

        #Scalers
        sc = StandardScaler()
        min_max_scaler = preprocessing.MinMaxScaler()
        max_abs_scaler = preprocessing.MaxAbsScaler()
        #X = sc.fit_transform(X)

        #PCA
        b_pca = False
        b_sne = True
        if b_pca:
            model = PCA(n_components=n_components)
        if b_sne:
            model = MDS(n_components=n_components)  #TSNE
        X_new = model.fit_transform(X, y)
        if b_pca:
            print('PCA ratio', n_components, 'components',
                  model.explained_variance_ratio_)
        #X_new = sc.fit_transform(X_new)
        #X_new = preprocessing.scale(X_new)
        if b_pca:
            X_new = max_abs_scaler.fit_transform(X_new)
        #return 0

        #tdf = pd.DataFrame(data=X_new, columns=['PC1', 'PC2'], index=data.index)
        tdf['PC1'] = X_new[:, 0]
        tdf['PC2'] = X_new[:, 1]
        #finalDf = pd.concat([tdf, data[['idgram']]], axis=1)
        df_groups = tdf.groupby('idgram').count()
        #print(df_groups)
        #return 0
        tdf['counts'] = 0
        for index, serie in tdf.iterrows():
            n_idgram = tdf.at[index, 'idgram']
            tdf.at[index,
                   'counts'] = df_groups[df_groups.index == n_idgram]['gram']
        tdf = tdf.sort_values(by=['counts'], ascending=False)
        #print(tdf)

        #Draw
        i = 0
        N = df_groups.shape[0]
        s_title = ''
        if b_pca:
            s_title = '2 component PCA. Accuracy %s' % (round(
                sum(float(i) for i in model.explained_variance_ratio_), 2))
        if b_sne:
            s_title = 't-SNE'

        #Plotly
        if False:  #Plotly
            py.sign_in('shashmaxus', 'AdfwTulrOoV3cSlbZT3B')
            c = [
                'hsl(' + str(h) + ',50%' + ',50%)'
                for h in np.linspace(0, 360, N)
            ]
            data_trace = []
            for index, row in df_groups.iterrows():
                #print(index)
                df_trace = tdf[tdf['idgram'] == index]
                #print(df_trace)
                g_trace = go.Scatter(
                    x=df_trace['PC1'].values,
                    y=df_trace['PC2'].values,
                    name=df_trace['gram'].values[0],
                    mode='markers',  #'markers+text'
                    marker=dict(
                        size=8,
                        color=i,  #c[i]
                        opacity=0.8,
                        colorscale='Viridis'),
                    text=df_trace['word'],
                    textfont=dict(family='sans serif', size=12))
                data_trace.append(g_trace)
                i += 1
            layout = go.Layout(
                title=s_title,
                xaxis=dict(
                    title=('Component 1. Contribution %s' %
                           (round(model.explained_variance_ratio_[0], 2)))),
                yaxis=dict(
                    title=('Component 2. Contribution %s' %
                           (round(model.explained_variance_ratio_[1], 2)))))
            fig2 = go.Figure(data=data_trace, layout=layout)
            py.image.save_as(fig2,
                             filename='c:/prj/mlivos_data/temp/Words2.png')

        #Bokeh
        if True:
            palette = d3['Category20'][len(tdf['gram'].unique())]
            #palette = all_palettes['Category20'][len(tdf['gram'].unique())]
            #palette = Viridis256[len(tdf['gram'].unique())]
            #palette = Viridis256
            color_map = CategoricalColorMapper(factors=tdf['gram'].unique(),
                                               palette=palette)
            #print(mapper)
            fig = figure(title=s_title, toolbar_location=None)
            source = ColumnDataSource(tdf[['gram', 'PC1', 'PC2']])
            fig.scatter(x='PC1',
                        y='PC2',
                        size=12,
                        color={
                            'field': 'gram',
                            'transform': color_map
                        },
                        legend='gram',
                        source=source)
            show(fig)
            export_png(fig, filename="c:/prj/mlivos_data/temp/PCA.png")
        return 0
Example #20
    def preprocessor(self, text, max_words=0):
        env = Environment()
        t_start = timer()
        text2 = text.lower()
        env.debug(1, ['Analyzer', 'preprocessor', 'START Preprocessing:'])
        tokenizer = RegexpTokenizer(self.word_tokenizers_custom())
        tokens_words = tokenizer.tokenize(text2)  # Words of the text
        tokens_sent = sent_tokenize(text2)  # Sentences - not yet used in our project

        n_words_count = len(tokens_words)  # Number of words in the text
        n_sent_count = len(tokens_sent)  # Number of sentences in the text
        n_sent_len_mean = n_words_count / n_sent_count  # Average sentence length in words

        # Split the text into parts (chunks)
        awords = []  # Result array
        # If the document is large, split it into several chunks and compute
        # the statistics for each one separately.
        # This lets us train the model correctly even with a small number of large documents
        if (max_words > 0):
            n_sent_chunk = int(max_words // n_sent_len_mean)  # how many sentences make one chunk holding max_words words

            print('n_sent_chunk', n_sent_chunk)
            # tune so that the text is split evenly
            i_chunks = 1
            tmp_sent_chunk = n_sent_count
            while tmp_sent_chunk > n_sent_chunk:
                i_chunks = i_chunks + 1
                tmp_sent_chunk = int(
                    math.ceil(n_sent_count // i_chunks) +
                    (n_sent_count % i_chunks))

            n = 0
            n_sent_chunk = tmp_sent_chunk  # final number of sentences per chunk
            print('tmp_sent_chunk', tmp_sent_chunk)

            while n < n_sent_count:
                #print(n, n_sent_chunk)
                asents = tokens_sent[n:n + n_sent_chunk]  # sentences from n to n + n_sent_chunk
                #print(asents)
                a_sent_words = []  # words of the current group of sentences
                for sent in asents:
                    words = tokenizer.tokenize(sent)
                    a_sent_words.extend(words)
                #print(a_sent_words)
                awords.append([
                    n_sent_count, n_words_count,
                    len(a_sent_words) / len(asents),
                    len(asents),
                    len(a_sent_words), a_sent_words
                ])
                n = n + n_sent_chunk
        else:
            awords.append([
                n_sent_count, n_words_count, n_sent_len_mean,
                len(tokens_sent),
                len(tokens_words), tokens_words
            ])
        #print(awords)
        t_end = timer()
        env.debug(1, ['Preprocessed:', 'time:', env.job_time(t_start, t_end)])
        return awords  # Array with the words and statistics
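The chunk-size search above keeps increasing the number of chunks until a single chunk holds no more sentences than the max_words budget allows. The same arithmetic in isolation, as a sketch that mirrors the original rounding, with an illustrative call:

import math

def sentences_per_chunk(n_sent_count, n_sent_len_mean, max_words):
    # Target: at most max_words words per chunk, i.e. max_words / mean sentence length sentences.
    target = int(max_words // n_sent_len_mean)
    i_chunks = 1
    chunk = n_sent_count
    while chunk > target:
        i_chunks += 1
        chunk = int(math.ceil(n_sent_count // i_chunks) + (n_sent_count % i_chunks))
    return chunk, i_chunks

# e.g. sentences_per_chunk(100, 20.0, 600) -> (25, 4): four chunks of 25 sentences each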