def vocabulary(self):
    """Build the combined vocabulary DataFrame.

    Merges the vocabulary CSV and the dictionary CSV, drops exact
    duplicate rows, then applies the patch file: any word present in
    the patch replaces all rows for that word from the merged data.

    Returns:
        pd.DataFrame: re-indexed frame (index name 'idcorpus') with the
        merged, de-duplicated, patched vocabulary. If a source file
        cannot be read, that source contributes no rows (the failure is
        logged via Environment.debug instead of raising).
    """
    env = Environment()
    file_voc = env.filename_vocabulary_csv()   # from vocabulary file
    file_dict = env.filename_dict_csv()        # from dictionary file
    # Pre-bind empty frames so a failed read degrades to "no rows from
    # that source" instead of a NameError at the concat below.
    df_voc = pd.DataFrame()
    df_dict = pd.DataFrame()
    try:
        df_voc = pd.read_csv(file_voc, index_col='idcorpus', encoding='utf-8')
    except (OSError, ValueError, pd.errors.ParserError):
        env.debug(1, ['Failed to read vocabulary file:', file_voc])
    else:
        env.debug(1, ['Read vocabulary OK:', file_voc])
    try:
        df_dict = pd.read_csv(file_dict, index_col='idcorpus', encoding='utf-8')
    except (OSError, ValueError, pd.errors.ParserError):
        env.debug(1, ['Failed to read dictionary file:', file_dict])
    else:
        env.debug(1, ['Read dictionary OK:', file_dict])
    # Concat both sources and drop exact duplicate rows.
    df_res = pd.concat([df_voc, df_dict])
    df_res = df_res.drop_duplicates()
    # Apply patch words: remove every row whose word appears in the
    # patch file, then append the patch rows so they take precedence.
    # NOTE(review): assumes both CSVs expose a 'word' column — if both
    # reads above failed, df_res has no columns and this raises KeyError.
    df_patch = pd.read_csv(env.filename_vocabulary_patch_csv(),
                           index_col='idcorpus', encoding='utf-8')
    df_res = df_res.drop(
        df_res[df_res['word'].isin(df_patch['word'])].index, axis=0)
    df_res = pd.concat([df_res, df_patch])
    # Rebuild a contiguous integer index under the project's index name.
    df_res = df_res.reset_index(drop=True)
    df_res.index.name = 'idcorpus'
    return df_res
def dict_xml2csv(self, persistent=True, lines=10000):
    """Convert the dictionary XML file into a word/grammeme DataFrame.

    Samples roughly ``lines`` lemma entries evenly across the XML file
    (one entry every ``number_lines // lines`` physical lines) and, for
    each ``<l>`` element, records the lemma text, its first grammeme
    name, and that grammeme's numeric id from ``self.grammemes()``.

    Args:
        persistent: when True, also write the result to the project
            dictionary CSV file (UTF-8).
        lines: approximate number of lemma entries to sample.

    Returns:
        pd.DataFrame: columns word/gram/idgram, index name 'idcorpus'.
        Empty (but well-formed) if the XML file could not be opened.
    """
    t_start = timer()
    env = Environment()
    dfgram = self.grammemes()
    filename_dict = env.filename_dict_xml()
    dfcols = ['word', 'gram', 'idgram']
    env.debug(
        1, ['CORPUS', 'Start to load dictionary from XML:', filename_dict])
    rows = []  # accumulate plain lists; one DataFrame build at the end
    try:
        fp = io.open(filename_dict, mode="r", encoding="utf-8")
    except OSError:
        env.debug(1, [
            'CORPUS', 'Failed to open dictionary file XML:', filename_dict
        ])
    else:
        # 'with' guarantees the handle is closed even if parsing raises
        # (the original called fp.close() manually and could leak it).
        with fp:
            number_lines = sum(1 for _ in fp)
            fp.seek(0)
            t_end = timer()
            env.debug(1, [
                'CORPUS', 'File opened:', 'lines', '%s' % number_lines,
                'time:', env.job_time(t_start, t_end)
            ])
            t_start = timer()
            step = number_lines // lines
            env.debug(1, [
                'CORPUS', 'Read dictionary:', filename_dict,
                'lines: %s step %s' % (lines, step)
            ])
            # Sampling counter, preserved from the original: a lemma
            # line is parsed only when n_line == 0, and the counter
            # resets every 'step' lines. When step == 0 (file shorter
            # than 'lines'), every lemma line is parsed.
            n_line = 0
            for _ in range(number_lines):
                line = fp.readline()
                # Cheap prefix probe: '<lemma' puts 'lemma' at [5:10].
                if line[5:10] == 'lemma' and n_line == 0:
                    tree = ET.fromstring(line)
                    for elem in tree.iter('l'):
                        s_word = elem.attrib.get('t')
                        # Only the first <g> (grammeme) is used; the
                        # original loop broke after one iteration.
                        gram_name = ''
                        for elem2 in elem.iter('g'):
                            gram_name = elem2.attrib.get('v')
                            break
                        gram_id = int(
                            dfgram.index[dfgram['name'] == gram_name]
                            .tolist()[0])
                        rows.append([s_word, gram_name, gram_id])
                    n_line += 1
                n_line += 1
                if n_line >= step:
                    n_line = 0
    # Single DataFrame construction replaces the removed-in-pandas-2.0
    # (and quadratic) per-row DataFrame.append of the original.
    df_xml = pd.DataFrame(rows, columns=dfcols)
    df_xml.index.name = 'idcorpus'
    t_end = timer()
    env.debug(1, [
        'CORPUS', 'Dictionary loaded:', 'time:',
        env.job_time(t_start, t_end)
    ])
    if persistent:
        filename_csv = env.filename_dict_csv()
        env.debug(1, ['CORPUS', 'Write dictionary to CSV:', filename_csv])
        df_xml.to_csv(filename_csv, encoding='utf-8')
        env.debug(1, ['CORPUS', 'Dictionary saved:', filename_csv])
    return df_xml