if __name__ == '__main__': ngram_path = sys.argv[1] n = int(ngram_path.split('gram')[0][-1]) result_dir = os.path.split(ngram_path)[0] hdf5_path = os.path.join(result_dir, 'variational_parameters.h5') df_ngram = pd.read_csv(ngram_path) df_stick = pd.read_hdf(hdf5_path, key='/sublex/stick') log_assignment_probs = ppi.get_log_assignment_probs(df_stick) df_code = pd.read_csv(os.path.join(result_dir, 'symbol_coding.csv'), encoding='utf-8') encoder, decoder = edcode.df2coder(df_code) data_path = sys.argv[2] df_data = pd.read_csv(data_path, encoding='utf-8', sep='\t') start_code = encoder['START'] base = edcode.encode_data( df_data.base_DISC.map(lambda x: ','.join(list(x))), encoder) df_data[ 'base_log_prob'] = ppi.get_unnormalized_log_posterior_predict_prob_of_target( base, df_ngram, log_assignment_probs, n, start_code) sity = edcode.encode_data( df_data.sity_DISC.map(lambda x: ','.join(list(x))), encoder) df_data[ 'sity_log_prob'] = ppi.get_unnormalized_log_posterior_predict_prob_of_target(
type=int, help='Length of substrings to rank.') parser.add_argument('top_k', type=int, help='Length of the ranking.') parser.add_argument( '-f', '--frequency_csv', type=str, default=None, help= 'Path to the csv file containing frequency info. If specified, the ranking will be limited to substrings with positive frequency in the file.' ) args = parser.parse_args() df_code = pd.read_csv(os.path.join(args.result_dir, 'symbol_coding.csv'), encoding='utf-8') encoder, decoder = encode_decode.df2coder(df_code) df_like = pd.read_csv(args.likelihood_csv, encoding='utf-8') string_cols = sorted([ col for col in df_like.columns.tolist() if col.startswith('symbol_') ])[-args.string_length:] df_like = df_like.groupby(string_cols + ['sublex']).sum().reset_index() df_like['log_like'] = df_like.prob.map(np.ma.log) if not args.frequency_csv is None: df_freq = pd.read_csv(args.frequency_csv, encoding='utf-8') df_freq = df_freq.rename(columns={ 'value': string_cols[-1], 'sublex_id': 'sublex' }) df_freq[string_cols[-1]] = df_freq[string_cols[-1]].map(