def vocabulary_from_corpus(self, n_min=1, n_max=10, persistent=True):
    """Build the vocabulary by merging corpus CSV files n_min..n_max.

    Args:
        n_min: first corpus-file index to read (inclusive).
        n_max: last corpus-file index to read (inclusive).
        persistent: when True, also write the vocabulary to its CSV file.

    Returns:
        DataFrame indexed by 'idcorpus' with columns ['word', 'gram', 'idgram'].
        Empty (but correctly shaped) if no corpus file could be read.
    """
    env = Environment()
    frames = []
    for i in range(n_min, n_max + 1):
        file_csv = env.filename_corpus_csv(i)
        try:
            dffile = pd.read_csv(file_csv, index_col='idcorpus', encoding='utf-8')
        except (OSError, ValueError, pd.errors.ParserError):
            # Missing/unreadable files are skipped: the merge is best-effort.
            env.debug(1, ['Failed to read corpus file:', file_csv])
        else:
            env.debug(1, ['Read OK:', file_csv])
            if not dffile.empty:
                frames.append(dffile)
    # Single concat instead of DataFrame.append per file (append was removed
    # in pandas 2.0 and was quadratic). The empty fallback carries the target
    # columns so the assignment below cannot fail on a 0-column frame.
    if frames:
        df_voc = pd.concat(frames)
    else:
        df_voc = pd.DataFrame(columns=['word', 'gram', 'idgram'])
    df_voc = df_voc.drop_duplicates()
    df_voc.columns = ['word', 'gram', 'idgram']
    df_voc = df_voc.reset_index(drop=True)
    df_voc.index.name = 'idcorpus'
    if persistent:
        file_voc = env.filename_vocabulary_csv()
        env.debug(1, ['Write vocabulary to CSV:', file_voc])
        df_voc.to_csv(file_voc, encoding='utf-8')
    return df_voc
def test(self, n_min=1, n_max=1):
    """Measure POS-tagging accuracy against the gold labels in corpus CSVs.

    Reads corpus files n_min..n_max, keeps the gold grammeme in 'gram_valid',
    re-tags every word via self.pos(), writes the mismatches to the error CSV
    and logs accuracy and elapsed time.

    Args:
        n_min: first corpus-file index to read (inclusive).
        n_max: last corpus-file index to read (inclusive).
    """
    t_start = timer()
    env = Environment()
    frames = []
    for i in range(n_min, n_max + 1):
        file_csv = env.filename_corpus_csv(i)
        try:
            dffile = pd.read_csv(file_csv, index_col='idcorpus', encoding='utf-8')
        except (OSError, ValueError, pd.errors.ParserError):
            # Unreadable files are skipped; the test runs on whatever loaded.
            env.debug(1, ['POStagger', 'test', 'Failed to read corpus file:', file_csv])
        else:
            env.debug(1, ['POStagger', 'test', 'Read OK:', file_csv])
            if not dffile.empty:
                frames.append(dffile)
    if not frames:
        # Original code crashed here (column assignment on a 0-column frame,
        # then division by zero in the accuracy calculation).
        env.debug(1, ['POStagger', 'test', 'No corpus data read; nothing to test'])
        return
    # Single concat instead of DataFrame.append per file (removed in pandas 2.0).
    df_test = pd.concat(frames).drop_duplicates()
    df_test.columns = ['word', 'gram', 'idgram']
    df_test = df_test.reset_index(drop=True)
    df_test.index.name = 'idcorpus'
    # Preserve the gold label: self.pos() overwrites the 'gram' column.
    df_test['gram_valid'] = df_test['gram']
    n_testsize = df_test.shape[0]
    env.debug(1, ['POStagger', 'test', 'START %s words' % n_testsize])
    df_test = self.pos(df_test)
    print('Test result', df_test)
    df_err = df_test[df_test['gram_valid'] != df_test['gram']]
    print('Test errors:', df_err)
    df_err.to_csv(env.filename_test_err_csv(), encoding='utf-8')
    env.debug(1, [
        'POStagger', 'test',
        'test accuracy %s' % (1 - df_err.shape[0] / n_testsize)
    ])
    t_end = timer()
    env.debug(1, [
        'POSTagger', 'test', 'test time:',
        env.job_time(t_start, t_end), 'sec.'
    ])
def corpus_xml2csv(self, num=1, persistent=True):
    """Convert one annotated corpus XML file into a word/grammeme CSV.

    For each <token> element, the first <g> tag whose 'v' attribute matches a
    known grammeme name is taken as the token's part of speech; tokens with no
    recognised grammeme are dropped.

    Args:
        num: index of the corpus XML/CSV file pair to process.
        persistent: when True, write the resulting frame to the corpus CSV.

    Returns:
        DataFrame indexed by 'idcorpus' with columns ['word', 'gram', 'idgram'].
        Empty if the XML could not be loaded or contained no tagged tokens.
    """
    env = Environment()
    file_xml = env.filename_corpus_xml(num)
    df_xml = pd.DataFrame()
    df_gram = self.grammemes()
    # One-time name -> index map instead of scanning df_gram for every token.
    # setdefault keeps the FIRST index per name, matching the original
    # df_gram.index[...].tolist()[0] behaviour if names ever repeat.
    gram_index = {}
    for idx, name in df_gram['name'].items():
        gram_index.setdefault(name, idx)
    try:
        tree = ET.ElementTree(file=file_xml)
    except (OSError, ET.ParseError):
        env.debug(1, ['Failed to load XML:', file_xml])
    else:
        t_start = timer()
        env.debug(1, ['CORPUS', 'XML to CSV:', file_xml])
        # Accumulate plain tuples and build the frame once: DataFrame.append
        # per token was removed in pandas 2.0 and was quadratic.
        rows = []
        for elem in tree.iter('token'):
            s_text = elem.attrib.get('text')
            for elem2 in elem.iter('g'):
                sgram = elem2.attrib.get('v').upper()
                if sgram in gram_index:
                    # First recognised grammeme wins; remaining <g> are ignored.
                    rows.append((s_text.lower(), sgram, int(gram_index[sgram])))
                    break
        if rows:
            df_xml = pd.DataFrame(rows, columns=['word', 'gram', 'idgram'])
            df_xml = df_xml.drop_duplicates()
            df_xml = df_xml.reset_index(drop=True)
            df_xml.index.name = 'idcorpus'
            df_xml = df_xml.astype({"idgram": int})
            if persistent:
                file_csv = env.filename_corpus_csv(num)
                env.debug(1, ['Write corpus file to CSV:', file_csv])
                df_xml.to_csv(file_csv, encoding='utf-8')
                t_end = timer()
                env.debug(1, [
                    'CORPUS', 'CSV written:', file_csv,
                    'takes %s sec.' % env.job_time(t_start, t_end)
                ])
    return df_xml