def vocabulary_from_corpus(self, n_min=1, n_max=10, persistent=True):
    """Build the vocabulary DataFrame from the n-gram corpus CSV files.

    Reads corpus files for n in [n_min, n_max] (paths supplied by
    Environment.filename_corpus_csv), concatenates them, de-duplicates rows
    and normalizes columns to ['word', 'gram', 'idgram'].

    Args:
        n_min: first corpus index to read (inclusive).
        n_max: last corpus index to read (inclusive).
        persistent: when True, also write the result to the vocabulary CSV.

    Returns:
        pandas.DataFrame indexed by 'idcorpus' with columns
        ['word', 'gram', 'idgram']; empty (but correctly shaped) if no
        corpus file could be read.
    """
    env = Environment()
    frames = []  # accumulate per-file frames; single concat avoids quadratic copying
    for i in range(n_min, n_max + 1):
        file_csv = env.filename_corpus_csv(i)
        try:
            dffile = pd.read_csv(file_csv, index_col='idcorpus', encoding='utf-8')
        # Narrow exceptions: a bare `except:` would also swallow
        # KeyboardInterrupt/SystemExit. Missing files and malformed CSV are
        # the expected, best-effort failure modes here.
        except (OSError, ValueError, pd.errors.ParserError):
            env.debug(1, ['Failed to read corpus file:', file_csv])
        else:
            env.debug(1, ['Read OK:', file_csv])
            if not dffile.empty:
                frames.append(dffile)
    if frames:
        # DataFrame.append was removed in pandas 2.x; concat once instead.
        df_voc = pd.concat(frames)
    else:
        # No readable corpus file: return an empty frame that still has the
        # expected shape (assigning 3 column names to a 0-column frame raises).
        df_voc = pd.DataFrame(columns=['word', 'gram', 'idgram'])
    df_voc = df_voc.drop_duplicates()
    df_voc.columns = ['word', 'gram', 'idgram']
    df_voc = df_voc.reset_index(drop=True)
    df_voc.index.name = 'idcorpus'
    if persistent:
        file_voc = env.filename_vocabulary_csv()
        env.debug(1, ['Write vocabulary to CSV:', file_voc])
        df_voc.to_csv(file_voc, encoding='utf-8')
    return df_voc
def vocabulary(self):
    """Return the combined vocabulary: corpus vocabulary + dictionary + patch.

    Reads the vocabulary CSV and the dictionary CSV (best effort — a missing
    or unreadable file contributes nothing), concatenates and de-duplicates
    them, then applies the patch file: any word present in the patch replaces
    all existing rows for that word.

    Returns:
        pandas.DataFrame indexed by 'idcorpus' with the merged vocabulary.
    """
    env = Environment()
    file_voc = env.filename_vocabulary_csv()   # from vocabulary file
    file_dict = env.filename_dict_csv()        # from dictionary file
    # Pre-initialize: the original left these unbound on read failure, which
    # made the later concat raise NameError instead of degrading gracefully.
    df_voc = pd.DataFrame()
    df_dict = pd.DataFrame()
    try:
        df_voc = pd.read_csv(file_voc, index_col='idcorpus', encoding='utf-8')
    # Narrow exceptions instead of a bare `except:` (which also catches
    # KeyboardInterrupt/SystemExit).
    except (OSError, ValueError, pd.errors.ParserError):
        env.debug(1, ['Failed to read vocabulary file:', file_voc])
    else:
        env.debug(1, ['Read vocabulary OK:', file_voc])
    try:
        df_dict = pd.read_csv(file_dict, index_col='idcorpus', encoding='utf-8')
    except (OSError, ValueError, pd.errors.ParserError):
        env.debug(1, ['Failed to read dictionary file:', file_dict])
    else:
        env.debug(1, ['Read dictionary OK:', file_dict])
    # Concat vocabulary + dictionary and drop exact duplicate rows.
    df_res = pd.concat([df_voc, df_dict])
    df_res = df_res.drop_duplicates()
    # Apply patch words: patched words fully replace their existing rows.
    df_patch = pd.read_csv(env.filename_vocabulary_patch_csv(),
                           index_col='idcorpus', encoding='utf-8')
    df_res = df_res.drop(df_res[df_res['word'].isin(df_patch['word'])].index, axis=0)
    df_res = pd.concat([df_res, df_patch])
    df_res = df_res.reset_index(drop=True)
    df_res.index.name = 'idcorpus'
    return df_res