def corpus_xml2csv(self, num=1, persistent=True):
    """Convert corpus XML file number *num* into a word/grammeme table.

    Every ``<token>`` element contributes at most one row: its ``text``
    attribute lower-cased, the upper-cased ``v`` value of its first ``<g>``
    descendant whose name is listed in ``self.grammemes()``, and the index
    label of that name in the grammeme table.  Duplicate rows are dropped.

    Args:
        num: Corpus number, passed to ``Environment.filename_corpus_xml``
            and ``Environment.filename_corpus_csv`` to resolve file paths.
        persistent: When true and the table is not empty, the table is
            also written to the corpus CSV file (UTF-8).

    Returns:
        A DataFrame with columns ``word``, ``gram`` and ``idgram`` (int)
        and an index named ``idcorpus``.  An empty DataFrame is returned
        when the XML cannot be loaded or no token yields a row.
    """
    env = Environment()
    file_xml = env.filename_corpus_xml(num)
    df_gram = self.grammemes()

    try:
        tree = ET.ElementTree(file=file_xml)
    except Exception:
        # Best effort: a missing or malformed corpus is reported, not raised.
        env.debug(1, ['Failed to load XML:', file_xml])
        return pd.DataFrame()

    t_start = timer()
    env.debug(1, ['CORPUS', 'XML to CSV:', file_xml])

    # Map grammeme name -> first index label carrying that name, built once
    # so each <g> lookup is a dict hit instead of a scan of the table.
    gram_ids = {}
    for idx, name in df_gram['name'].items():
        gram_ids.setdefault(name, idx)

    rows = []
    for token in tree.iter('token'):
        text = token.attrib.get('text')
        if text is None:
            # Tokens without a text attribute cannot produce a word.
            continue
        for gram_elem in token.iter('g'):
            value = gram_elem.attrib.get('v')
            if value is None:
                continue
            gram = value.upper()
            if gram in gram_ids:
                rows.append((text.lower(), gram, int(gram_ids[gram])))
                # NOTE(review): only the first recognized grammeme of a
                # token is used; confirm that later <g> tags (and an
                # unrecognized leading <g>) are meant to be handled so.
                break

    if not rows:
        return pd.DataFrame()

    # Build the frame in one step; DataFrame.append was removed in pandas 2.
    df_xml = pd.DataFrame(rows, columns=['word', 'gram', 'idgram'])
    df_xml = df_xml.drop_duplicates().reset_index(drop=True)
    df_xml.index.name = 'idcorpus'
    df_xml = df_xml.astype({"idgram": int})

    if persistent:
        file_csv = env.filename_corpus_csv(num)
        env.debug(1, ['Write corpus file to CSV:', file_csv])
        df_xml.to_csv(file_csv, encoding='utf-8')
        t_end = timer()
        env.debug(1, [
            'CORPUS', 'CSV written:', file_csv,
            'takes %s sec.' % env.job_time(t_start, t_end)
        ])
    return df_xml
def corpus_xml2txt(self, num=1, persistent=True):
    """Dump the text of every ``<source>`` element of a corpus to a TXT file.

    The texts are written in document order, each followed by a single
    space, to the path given by ``Environment.filename_corpus_txt(num)``.

    Args:
        num: Corpus number, passed to ``Environment.filename_corpus_xml``
            and ``Environment.filename_corpus_txt`` to resolve file paths.
        persistent: Accepted for signature compatibility; this method does
            not read it and always writes the file.

    Returns:
        ``True`` when the XML was loaded and the text file written,
        ``False`` when the XML could not be loaded.
    """
    env = Environment()
    file_xml = env.filename_corpus_xml(num)

    try:
        tree = ET.ElementTree(file=file_xml)
    except Exception:
        # Best effort: a missing or malformed corpus is reported, not raised.
        env.debug(1, ['Failed to load XML:', file_xml])
        return False

    file_txt = env.filename_corpus_txt(num)
    # NOTE(review): the file is opened with the platform default encoding,
    # as before; confirm whether readers expect an explicit 'utf-8'.
    with open(file_txt, mode='w') as out:
        for source in tree.iter('source'):
            text = source.text
            if text is None:
                # An empty <source/> has no text to write.
                continue
            out.write(text)
            out.write(' ')
    env.debug(1, ['Write corpus file to TXT:', file_txt])
    return True