コード例 #1
0
ファイル: corpus.py プロジェクト: shashmaxus/mlivos
 def vocabulary_from_corpus(self, n_min=1, n_max=10, persistent=True):
     """Build a de-duplicated vocabulary from the numbered corpus CSV files.

     Corpus files ``n_min`` .. ``n_max`` (inclusive) are read one by one;
     files that cannot be read are logged and skipped. The rows of all
     readable, non-empty files are concatenated, exact duplicate rows are
     dropped and the result is re-indexed from zero.

     Args:
         n_min: Number of the first corpus file to read.
         n_max: Number of the last corpus file to read (inclusive).
         persistent: When true, write the vocabulary to the CSV file named
             by ``Environment.filename_vocabulary_csv()``.

     Returns:
         A DataFrame with columns ``word``, ``gram``, ``idgram`` and an
         index named ``idcorpus``. When no corpus data could be read the
         frame is empty and nothing is written to disk.
     """
     env = Environment()
     columns = ['word', 'gram', 'idgram']
     frames = []
     for i in range(n_min, n_max + 1):
         file_csv = env.filename_corpus_csv(i)
         try:
             dffile = pd.read_csv(file_csv,
                                  index_col='idcorpus',
                                  encoding='utf-8')
         except Exception:
             # Best effort: a missing or malformed file must not abort the
             # whole run, but KeyboardInterrupt/SystemExit still propagate.
             env.debug(1, ['Failed to read corpus file:', file_csv])
             continue
         env.debug(1, ['Read OK:', file_csv])
         if not dffile.empty:
             frames.append(dffile)

     if not frames:
         # Assigning three column names to a zero-column frame would raise,
         # so build the empty result explicitly. The CSV is not written, so
         # an existing vocabulary file is never replaced by an empty one.
         env.debug(1, ['No corpus data read, vocabulary is empty'])
         df_voc = pd.DataFrame(columns=columns)
         df_voc.index.name = 'idcorpus'
         return df_voc

     # One concat instead of repeated DataFrame.append (removed in pandas 2.0).
     df_voc = pd.concat(frames).drop_duplicates()
     df_voc.columns = columns
     df_voc = df_voc.reset_index(drop=True)
     df_voc.index.name = 'idcorpus'
     if persistent:
         file_voc = env.filename_vocabulary_csv()
         env.debug(1, ['Write vocabulary to CSV:', file_voc])
         df_voc.to_csv(file_voc, encoding='utf-8')
     return df_voc
コード例 #2
0
ファイル: postagger.py プロジェクト: shashmaxus/mlivos
 def test(self, n_min=1, n_max=1):
     """Run the tagger over corpus CSV files and report its accuracy.

     Corpus files ``n_min`` .. ``n_max`` (inclusive) are read; unreadable
     files are logged and skipped. The combined, de-duplicated rows keep
     their original ``gram`` value in a ``gram_valid`` column, the frame is
     passed through ``self.pos`` and rows whose ``gram`` then differs from
     ``gram_valid`` are treated as errors. The error rows are written to
     the CSV named by ``Environment.filename_test_err_csv()`` and the
     accuracy and elapsed time are logged.

     Args:
         n_min: Number of the first corpus file to read.
         n_max: Number of the last corpus file to read (inclusive).

     Returns:
         None. When no corpus data could be read the method logs that and
         returns without running the tagger.
     """
     t_start = timer()
     env = Environment()
     frames = []
     for i in range(n_min, n_max + 1):
         file_csv = env.filename_corpus_csv(i)
         try:
             dffile = pd.read_csv(file_csv,
                                  index_col='idcorpus',
                                  encoding='utf-8')
         except Exception:
             # Best effort: skip unreadable files, but do not swallow
             # KeyboardInterrupt/SystemExit as a bare ``except`` would.
             env.debug(1, [
                 'POStagger', 'test', 'Failed to read corpus file:',
                 file_csv
             ])
             continue
         env.debug(1, ['POStagger', 'test', 'Read OK:', file_csv])
         if not dffile.empty:
             frames.append(dffile)

     if not frames:
         # Without data the column assignment and the accuracy division
         # below would both raise; bail out with a log entry instead.
         env.debug(1, ['POStagger', 'test', 'No corpus data to test'])
         return

     # One concat instead of repeated DataFrame.append (removed in pandas 2.0).
     df_test = pd.concat(frames).drop_duplicates()
     df_test.columns = ['word', 'gram', 'idgram']
     df_test = df_test.reset_index(drop=True)
     df_test.index.name = 'idcorpus'
     # Keep the reference tag so it can be compared with the tagger output.
     df_test['gram_valid'] = df_test['gram']
     n_testsize = df_test.shape[0]
     env.debug(1, ['POStagger', 'test', 'START %s words' % n_testsize])
     df_test = self.pos(df_test)
     print('Test result', df_test)
     df_err = df_test[df_test['gram_valid'] != df_test['gram']]
     print('Test errors:', df_err)
     df_err.to_csv(env.filename_test_err_csv(), encoding='utf-8')
     env.debug(1, [
         'POStagger', 'test',
         'test accuracy %s' % (1 - df_err.shape[0] / n_testsize)
     ])
     t_end = timer()
     env.debug(1, [
         'POSTagger', 'test', 'test time:',
         env.job_time(t_start, t_end), 'sec.'
     ])
コード例 #3
0
ファイル: corpus.py プロジェクト: shashmaxus/mlivos
 def corpus_xml2csv(self, num=1, persistent=True):
     """Convert one corpus XML file into a word/grammeme table.

     Every ``token`` element contributes at most one row: its ``text``
     attribute lower-cased, the upper-cased ``v`` attribute of the first
     ``g`` element found beneath it, and the index label of that value in
     the ``name`` column of ``self.grammemes()``. Tokens whose first ``g``
     value is not a known grammeme name, or that lack the ``text`` / ``v``
     attribute or any ``g`` element, are skipped. Duplicate rows are
     dropped.

     Args:
         num: Number of the corpus file; selects both the XML input and
             the CSV output file names.
         persistent: When true, write the table to the CSV file named by
             ``Environment.filename_corpus_csv(num)``.

     Returns:
         A DataFrame with columns ``word``, ``gram``, ``idgram`` (int) and
         an index named ``idcorpus``; an empty DataFrame without columns
         when the XML cannot be loaded or yields no rows.
     """
     env = Environment()
     file_xml = env.filename_corpus_xml(num)
     df_gram = self.grammemes()
     try:
         tree = ET.ElementTree(file=file_xml)
     except Exception:
         # Best effort: report and return an empty frame, but let
         # KeyboardInterrupt/SystemExit through (a bare except would not).
         env.debug(1, ['Failed to load XML:', file_xml])
         return pd.DataFrame()

     t_start = timer()
     env.debug(1, ['CORPUS', 'XML to CSV:', file_xml])

     # Map each grammeme name to the index label of its first occurrence
     # once, instead of scanning the grammeme table for every token.
     gram_ids = {}
     for idgram, name in df_gram['name'].items():
         gram_ids.setdefault(name, idgram)

     rows = []
     for elem in tree.iter('token'):
         s_text = elem.attrib.get('text')
         # Only the first <g> element of a token is considered.
         first_g = next(elem.iter('g'), None)
         if s_text is None or first_g is None:
             continue
         sgram = first_g.attrib.get('v')
         if sgram is None:
             continue
         sgram = sgram.upper()
         if sgram in gram_ids:
             rows.append((s_text.lower(), sgram, int(gram_ids[sgram])))

     if not rows:
         return pd.DataFrame()

     # Build the frame in one step; appending row by row is quadratic and
     # DataFrame.append no longer exists in pandas 2.0.
     df_xml = pd.DataFrame(rows, columns=['word', 'gram', 'idgram'])
     df_xml = df_xml.drop_duplicates().reset_index(drop=True)
     df_xml.index.name = 'idcorpus'
     df_xml = df_xml.astype({"idgram": int})
     if persistent:
         file_csv = env.filename_corpus_csv(num)
         env.debug(1, ['Write corpus file to CSV:', file_csv])
         df_xml.to_csv(file_csv, encoding='utf-8')
         t_end = timer()
         env.debug(1, [
             'CORPUS', 'CSV written:', file_csv,
             'takes %s sec.' % env.job_time(t_start, t_end)
         ])
     return df_xml