Example #1
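This excerpt starts at the main guard, so the imports it relies on are not shown. A minimal sketch of the standard ones follows; ppi and edcode are project-local helpers that appear only under these aliases here, so their import lines are left as a comment rather than guessed.

import os
import sys

import pandas as pd

# ppi and edcode are project-local modules (posterior predictive inference and
# symbol encoding/decoding; cf. encode_decode.df2coder in Example #2). Their
# actual import statements are not part of this excerpt.
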
if __name__ == '__main__':

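    # First positional argument: path to the n-gram result csv; the n-gram
    # order n is read off the file name (e.g. a name containing '3gram' -> 3).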
    ngram_path = sys.argv[1]
    n = int(ngram_path.split('gram')[0][-1])
    result_dir = os.path.split(ngram_path)[0]
    hdf5_path = os.path.join(result_dir, 'variational_parameters.h5')

    df_ngram = pd.read_csv(ngram_path)

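    # '/sublex/stick' presumably holds the stick-breaking parameters of the
    # sublexicon assignment; they are converted to log assignment probabilities.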
    df_stick = pd.read_hdf(hdf5_path, key='/sublex/stick')
    log_assignment_probs = ppi.get_log_assignment_probs(df_stick)

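    # Mapping between surface symbols and the integer codes used by the model.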
    df_code = pd.read_csv(os.path.join(result_dir, 'symbol_coding.csv'),
                          encoding='utf-8')
    encoder, decoder = edcode.df2coder(df_code)

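    # Second positional argument: tab-separated data with DISC transcriptions.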
    data_path = sys.argv[2]
    df_data = pd.read_csv(data_path, encoding='utf-8', sep='\t')
    start_code = encoder['START']

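    # Encode each base_DISC transcription as a comma-separated symbol sequence
    # and score it with the (unnormalized) posterior predictive n-gram model.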
    base = edcode.encode_data(
        df_data.base_DISC.map(lambda x: ','.join(list(x))), encoder)
    df_data['base_log_prob'] = ppi.get_unnormalized_log_posterior_predict_prob_of_target(
        base, df_ngram, log_assignment_probs, n, start_code)

    # Same scoring for the sity_DISC transcriptions.
    sity = edcode.encode_data(
        df_data.sity_DISC.map(lambda x: ','.join(list(x))), encoder)
    df_data['sity_log_prob'] = ppi.get_unnormalized_log_posterior_predict_prob_of_target(
        sity, df_ngram, log_assignment_probs, n, start_code)
Example #2
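This excerpt opens partway through the argument-parser setup, so the imports and the earlier add_argument calls are missing. The sketch below reconstructs a minimal preamble: the positional argument names are inferred from the later uses of args.result_dir, args.likelihood_csv and args.string_length, while the help texts and the plain import of encode_decode are assumptions.

import argparse
import os

import numpy as np
import pandas as pd

import encode_decode  # assumption: project-local module providing df2coder()

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    # Positional arguments inferred from args.* uses below; help texts assumed.
    parser.add_argument('result_dir',
                        type=str,
                        help='Directory containing the fitted model outputs.')
    parser.add_argument('likelihood_csv',
                        type=str,
                        help='Path to the csv with per-substring likelihoods.')
    parser.add_argument('string_length',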
                        type=int,
                        help='Length of substrings to rank.')
    parser.add_argument('top_k', type=int, help='Length of the ranking.')
    parser.add_argument(
        '-f',
        '--frequency_csv',
        type=str,
        default=None,
        help=
        'Path to the csv file containing frequency info. If specified, the ranking will be limited to substrings with positive frequency in the file.'
    )
    args = parser.parse_args()

    df_code = pd.read_csv(os.path.join(args.result_dir, 'symbol_coding.csv'),
                          encoding='utf-8')
    encoder, decoder = encode_decode.df2coder(df_code)

    df_like = pd.read_csv(args.likelihood_csv, encoding='utf-8')
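    # Keep the last `string_length` symbol columns and marginalize the
    # probabilities over the remaining positions, separately per sublexicon.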
    string_cols = sorted([
        col for col in df_like.columns.tolist() if col.startswith('symbol_')
    ])[-args.string_length:]
    df_like = df_like.groupby(string_cols + ['sublex']).sum().reset_index()
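    # np.ma.log masks zero probabilities instead of producing -inf values.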
    df_like['log_like'] = df_like.prob.map(np.ma.log)

    if args.frequency_csv is not None:
        df_freq = pd.read_csv(args.frequency_csv, encoding='utf-8')
        df_freq = df_freq.rename(columns={
            'value': string_cols[-1],
            'sublex_id': 'sublex'
        })
        df_freq[string_cols[-1]] = df_freq[string_cols[-1]].map(