コード例 #1
0
ファイル: corpus.py プロジェクト: shashmaxus/mlivos
 def corpus_xml2csv(self, num=1, persistent=True):
     """Convert corpus XML file *num* into a word/grammeme table.

     Every ``token`` element of the XML file is inspected. Only the first
     ``g`` element found under a token is considered; its ``v`` attribute
     (upper-cased) is looked up in the ``name`` column of the frame returned
     by ``self.grammemes()``. Tokens whose first grammeme is unknown, and
     tokens lacking a ``text`` or ``v`` attribute, are skipped.

     Args:
         num: Corpus number, passed to the ``Environment`` filename helpers.
         persistent: When true, the resulting table is also written to the
             CSV file named by ``Environment.filename_corpus_csv(num)``.

     Returns:
         A ``pandas.DataFrame`` with columns ``word`` (lower-cased token
         text), ``gram`` (grammeme name) and ``idgram`` (index label of that
         grammeme in the grammemes frame, as int), de-duplicated and indexed
         by ``idcorpus``. An empty frame is returned when the XML cannot be
         loaded or no token matched.
     """
     env = Environment()
     file_xml = env.filename_corpus_xml(num)
     df_xml = pd.DataFrame()
     df_gram = self.grammemes()
     try:
         tree = ET.ElementTree(file=file_xml)
     except Exception:
         # Best effort: report the failure and hand back the empty frame.
         # Unlike a bare ``except``, this lets KeyboardInterrupt/SystemExit
         # propagate.
         env.debug(1, ['Failed to load XML:', file_xml])
         return df_xml

     t_start = timer()
     env.debug(1, ['CORPUS', 'XML to CSV:', file_xml])

     # Build the name -> index-label lookup once instead of scanning the
     # grammemes frame for every token. ``setdefault`` keeps the first
     # occurrence, matching the original "first matching row" selection.
     gram_ids = {}
     for idx, name in zip(df_gram.index, df_gram['name']):
         gram_ids.setdefault(name, idx)

     # Collect plain rows and build the frame once; appending to a
     # DataFrame per token is quadratic and ``DataFrame.append`` no longer
     # exists in pandas 2.x.
     rows = []
     for elem in tree.iter('token'):
         s_text = elem.attrib.get('text')
         # Only the first <g> under the token is examined.
         first_g = next(elem.iter('g'), None)
         if s_text is None or first_g is None:
             continue
         sgram = first_g.attrib.get('v')
         if sgram is None:
             continue
         sgram = sgram.upper()
         if sgram in gram_ids:
             rows.append([s_text.lower(), sgram, int(gram_ids[sgram])])

     if rows:
         df_xml = pd.DataFrame(rows, columns=['word', 'gram', 'idgram'])
         df_xml = df_xml.drop_duplicates()
         df_xml = df_xml.reset_index(drop=True)
         df_xml.index.name = 'idcorpus'
         df_xml = df_xml.astype({"idgram": int})
         if persistent:
             file_csv = env.filename_corpus_csv(num)
             env.debug(1, ['Write corpus file to CSV:', file_csv])
             df_xml.to_csv(file_csv, encoding='utf-8')
             t_end = timer()
             env.debug(1, [
                 'CORPUS', 'CSV written:', file_csv,
                 'takes %s sec.' % env.job_time(t_start, t_end)
             ])
     return df_xml
コード例 #2
0
ファイル: corpus.py プロジェクト: shashmaxus/mlivos
 def corpus_xml2txt(self, num=1, persistent=True):
     """Dump the text of every ``source`` element of corpus XML *num* to a file.

     The texts are written one after another, each followed by a single
     space, to the file named by ``Environment.filename_corpus_txt(num)``.
     ``source`` elements that carry no text are skipped.

     Args:
         num: Corpus number, passed to the ``Environment`` filename helpers.
         persistent: Accepted for signature parity with the other corpus
             converters; not used by this method.

     Returns:
         ``True`` when the XML was loaded and the text file written,
         ``False`` when the XML could not be loaded.
     """
     result = True
     env = Environment()
     file_xml = env.filename_corpus_xml(num)
     try:
         tree = ET.ElementTree(file=file_xml)
     except Exception:
         # Best effort: report and signal failure through the return value.
         # Unlike a bare ``except``, this lets KeyboardInterrupt/SystemExit
         # propagate.
         env.debug(1, ['Failed to load XML:', file_xml])
         result = False
     else:
         file_txt = env.filename_corpus_txt(num)
         # NOTE(review): the file is opened with the platform default
         # encoding, as before; confirm what readers of this file expect
         # before pinning it to UTF-8.
         # The context manager closes the handle even if a write fails.
         with open(file_txt, mode='w') as out:
             for elem in tree.iter('source'):
                 if elem.text is None:
                     # An empty <source/> has no text; write() would raise.
                     continue
                 out.write(elem.text)
                 out.write(' ')
         env.debug(1, ['Write corpus file to TXT:', file_txt])
     return result