Example #1
0
 def vocabulary_from_corpus(self, n_min=1, n_max=10, persistent=True):
     """Build a vocabulary by merging the numbered corpus CSV files.

     For every ``i`` in ``n_min..n_max`` (inclusive) the file returned by
     ``Environment.filename_corpus_csv(i)`` is read with ``idcorpus`` as
     the index column. Files that cannot be read are reported through
     ``env.debug`` and skipped; files that load but are empty are ignored.
     The remaining frames are concatenated, duplicate rows are dropped,
     the columns are renamed to ``word``, ``gram``, ``idgram`` and the
     index is renumbered from 0 under the name ``idcorpus``.

     Args:
         n_min: First corpus file number to read (inclusive).
         n_max: Last corpus file number to read (inclusive).
         persistent: When true, write the merged vocabulary to the path
             returned by ``Environment.filename_vocabulary_csv()``.

     Returns:
         pandas.DataFrame with columns ``word``, ``gram``, ``idgram`` and
         an index named ``idcorpus``. If no corpus data could be loaded,
         an empty frame with that schema is returned and nothing is
         written to disk.

     Raises:
         ValueError: Raised by pandas on the column assignment when the
             merged data does not have exactly three columns.
     """
     env = Environment()
     columns = ['word', 'gram', 'idgram']
     # Collect the frames and concatenate once: DataFrame.append was removed
     # in pandas 2.0 and re-copied the accumulated data on every call.
     frames = []
     for i in range(n_min, n_max + 1):
         file_csv = env.filename_corpus_csv(i)
         try:
             dffile = pd.read_csv(file_csv,
                                  index_col='idcorpus',
                                  encoding='utf-8')
         except Exception:
             # Best effort: a missing or malformed file must not abort the
             # whole merge. ``Exception`` (rather than a bare ``except``)
             # lets KeyboardInterrupt / SystemExit propagate.
             env.debug(1, ['Failed to read corpus file:', file_csv])
         else:
             env.debug(1, ['Read OK:', file_csv])
             if not dffile.empty:
                 frames.append(dffile)
     if not frames:
         # Nothing was loaded. Return an empty frame with the expected
         # schema instead of failing on the column assignment below, and
         # skip the write so an existing vocabulary file is not clobbered.
         df_voc = pd.DataFrame(columns=columns)
         df_voc.index.name = 'idcorpus'
         return df_voc
     df_voc = pd.concat(frames).drop_duplicates()
     df_voc.columns = columns
     df_voc = df_voc.reset_index(drop=True)
     df_voc.index.name = 'idcorpus'
     if persistent:
         file_voc = env.filename_vocabulary_csv()
         env.debug(1, ['Write vocabulary to CSV:', file_voc])
         df_voc.to_csv(file_voc, encoding='utf-8')
     return df_voc
Example #2
0
 def vocabulary(self):
     """Return the vocabulary merged from the vocabulary, dictionary and patch CSVs.

     The vocabulary file and the dictionary file are each read with
     ``idcorpus`` as the index column. A file that cannot be read is
     reported through ``env.debug`` and left out of the merge. The loaded
     frames are concatenated and de-duplicated, every row whose ``word``
     also occurs in the patch file is removed, and the patch rows are
     appended in their place. The result is renumbered from 0 under the
     index name ``idcorpus``.

     Returns:
         pandas.DataFrame indexed by ``idcorpus``. If neither the
         vocabulary nor the dictionary file could be read, the result
         contains the patch rows only.

     Raises:
         Exception: Any error raised by ``pandas.read_csv`` for the patch
             file propagates unchanged; that read is not guarded.
         KeyError: If the merged data or the patch has no ``word`` column.
     """
     env = Environment()
     sources = (
         ('vocabulary', env.filename_vocabulary_csv()),  # from vocabulary file
         ('dictionary', env.filename_dict_csv()),  # from dictionary file
     )
     frames = []
     for label, path in sources:
         try:
             df_src = pd.read_csv(path,
                                  index_col='idcorpus',
                                  encoding='utf-8')
         except Exception:
             # Best effort: continue with whatever sources are available.
             # Previously a failed read left the variable unbound and the
             # concat below crashed with UnboundLocalError.
             env.debug(1, ['Failed to read %s file:' % label, path])
         else:
             env.debug(1, ['Read %s OK:' % label, path])
             frames.append(df_src)
     # Apply patch words
     df_patch = pd.read_csv(env.filename_vocabulary_patch_csv(),
                            index_col='idcorpus',
                            encoding='utf-8')
     if frames:
         df_res = pd.concat(frames).drop_duplicates()
         # Filter with a boolean mask, not drop-by-label: after the concat
         # the ``idcorpus`` labels of the two sources can collide, so
         # dropping by label would also remove unrelated words that share
         # a label with a patched word.
         df_res = df_res[~df_res['word'].isin(df_patch['word'])]
         df_res = pd.concat([df_res, df_patch])
     else:
         df_res = df_patch
     df_res = df_res.reset_index(drop=True)
     df_res.index.name = 'idcorpus'
     return df_res