Ejemplo n.º 1
0
    # Score three DISC-transcribed word columns under a trained sublexicon
    # n-gram mixture model (posterior predictive, unnormalized).
    # NOTE(review): this is a fragment — `result_dir`, `n`, and the modules
    # `ppi`/`edcode` are bound earlier, outside this excerpt; confirm there.
    hdf5_path = os.path.join(result_dir, 'variational_parameters.h5')

    df_ngram = pd.read_csv(ngram_path)

    # Stick-breaking weights of the sublexicon mixture -> per-sublexicon
    # log assignment probabilities.
    df_stick = pd.read_hdf(hdf5_path, key='/sublex/stick')
    log_assignment_probs = ppi.get_log_assignment_probs(df_stick)

    # Symbol<->integer coding table saved at training time; `encoder`
    # maps symbols (including 'START') to integer codes.
    df_code = pd.read_csv(os.path.join(result_dir, 'symbol_coding.csv'),
                          encoding='utf-8')
    encoder, decoder = edcode.df2coder(df_code)

    # Second CLI argument: path to the tab-separated evaluation data.
    data_path = sys.argv[2]
    df_data = pd.read_csv(data_path, encoding='utf-8', sep='\t')
    start_code = encoder['START']

    # For each DISC column (base/sity/kity): comma-join the characters so
    # each symbol is a separate token, encode, then attach the unnormalized
    # log posterior predictive probability as a new column.
    base = edcode.encode_data(
        df_data.base_DISC.map(lambda x: ','.join(list(x))), encoder)
    df_data[
        'base_log_prob'] = ppi.get_unnormalized_log_posterior_predict_prob_of_target(
            base, df_ngram, log_assignment_probs, n, start_code)

    sity = edcode.encode_data(
        df_data.sity_DISC.map(lambda x: ','.join(list(x))), encoder)
    df_data[
        'sity_log_prob'] = ppi.get_unnormalized_log_posterior_predict_prob_of_target(
            sity, df_ngram, log_assignment_probs, n, start_code)

    kity = edcode.encode_data(
        df_data.kity_DISC.map(lambda x: ','.join(list(x))), encoder)
    df_data[
        'kity_log_prob'] = ppi.get_unnormalized_log_posterior_predict_prob_of_target(
            kity, df_ngram, log_assignment_probs, n, start_code)
Ejemplo n.º 2
0
    # Compute normalized log posterior predictive probabilities for target
    # vs. control segments embedded in prefix/suffix frames.
    # NOTE(review): fragment — `ngram_path` and `n` are bound earlier,
    # outside this excerpt.
    result_dir = os.path.split(ngram_path)[0]
    hdf5_path = os.path.join(result_dir, 'variational_parameters.h5')

    df_ngram = pd.read_csv(ngram_path)

    # Sublexicon mixture weights -> log assignment probabilities.
    df_stick = pd.read_hdf(hdf5_path, key='/sublex/stick')
    log_assignment_probs = ppi.get_log_assignment_probs(df_stick)

    df_code = pd.read_csv(os.path.join(result_dir, 'symbol_coding.csv'),
                          encoding='utf-8')
    encoder, decoder = edcode.df2coder(df_code)

    # Tab-separated data; empty cells become '' so encoding yields empty
    # sequences rather than failing on NaN.
    data_path = sys.argv[2]
    df_data = pd.read_csv(data_path, encoding='utf-8', sep='\t').fillna('')
    # Prefix/target/control are word-internal, so no end symbol; only the
    # suffix (which closes the word) gets the default END symbol appended.
    prefixes = edcode.encode_data(df_data.prefix,
                                  encoder,
                                  add_end_symbol=False)
    targets = edcode.encode_data(df_data.target, encoder, add_end_symbol=False)
    controls = edcode.encode_data(df_data.control,
                                  encoder,
                                  add_end_symbol=False)
    suffixes = edcode.encode_data(df_data.suffix, encoder)
    # Symbol inventory used to normalize over alternatives, excluding the
    # boundary markers.
    inventory = [
        code for value, code in encoder.items()
        if not value in ['END', 'START']
    ]
    start_code = encoder['START']

    log_probs = ppi.get_log_posterior_predict_prob_of_target_and_control(
        prefixes, targets, controls, suffixes, df_ngram, log_assignment_probs,
        n, start_code, inventory)
Ejemplo n.º 3
0
    # Classify target and control words into sublexica by posterior
    # predictive probability.
    # Infer the n-gram order from the results filename: the digit that
    # immediately precedes the substring 'gram' (e.g. '..._3gram_...').
    n = int(ngram_path.split('gram')[0][-1])
    result_dir = os.path.split(ngram_path)[0]
    hdf5_path = os.path.join(result_dir, 'variational_parameters.h5')

    df_ngram = pd.read_csv(ngram_path)

    # Sublexicon mixture weights -> log assignment probabilities.
    df_stick = pd.read_hdf(hdf5_path, key='/sublex/stick')
    log_assignment_probs = ppi.get_log_assignment_probs(df_stick)

    df_code = pd.read_csv(os.path.join(result_dir, 'symbol_coding.csv'),
                          encoding='utf-8')
    encoder, decoder = edcode.df2coder(df_code)

    data_path = sys.argv[2]
    df_data = pd.read_csv(data_path, encoding='utf-8', sep='\t')
    target_data = edcode.encode_data(df_data.target_word, encoder)
    control_data = edcode.encode_data(df_data.control_word, encoder)
    start_code = encoder['START']

    # Per-word classification probabilities; one column per sublexicon
    # (shape: words x sublexica, judging by the .shape[1] use below).
    classification_target = ppi.posterior_predict_classification(
        target_data, df_ngram, log_assignment_probs, n, start_code)
    classification_control = ppi.posterior_predict_classification(
        control_data, df_ngram, log_assignment_probs, n, start_code)

    df_classification_target = pd.DataFrame(
        classification_target,
        columns=[('sublex_%i' % i)
                 for i in range(classification_target.shape[1])])
    # MAP classification = index of the most probable sublexicon per word.
    df_classification_target['MAP_classification'] = np.argmax(
        classification_target, axis=1)
    # Drop the comma separators to recover a plain IPA string for display.
    df_classification_target['IPA'] = df_data.target_word.str.replace(',', '')
    # NOTE(review): extraction artifact — the pipeline restarts here
    # (model reload, data reload). This appears to be a *separate* example
    # whose header was lost; it scores whole words and normalizes the
    # result over shared prefixes.
    df_stick = pd.read_hdf(hdf5_path, key='/sublex/stick')
    log_assignment_probs = ppi.get_log_assignment_probs(df_stick)

    df_code = pd.read_csv(os.path.join(result_dir, 'symbol_coding.csv'),
                          encoding='utf-8')
    encoder, decoder = edcode.df2coder(df_code)

    data_path = sys.argv[2]
    df_data = pd.read_csv(data_path, encoding='utf-8', sep='\t').fillna('')
    # prefixes = edcode.encode_data(df_data.prefix, encoder, add_end_symbol = False)
    # targets = edcode.encode_data(df_data.target_c + ',' + df_data.target_v, encoder, add_end_symbol = False)
    # suffixes = [(encoder['END'],)]*df_data.shape[0]
    # inventory = [code for value,code in encoder.iteritems() if not value in ['END', 'START']]
    # start_code = encoder['START']

    words = edcode.encode_data(df_data.word, encoder)
    start_code = encoder['START']

    # log_probs = ppi.get_log_posterior_predict_prob_of_target(prefixes, targets, suffixes, df_ngram, log_assignment_probs, n, start_code, inventory)
    # Unnormalized log posterior predictive probability of each full word.
    unnormalized_log_probs = ppi.get_unnormalized_log_posterior_predict_prob_of_target(
        words, df_ngram, log_assignment_probs, n, start_code)

    # df_data['log_prob_target'] = log_probs
    df_data['unnormalized_log_prob_target'] = unnormalized_log_probs
    # Helper defined elsewhere in the project; presumably renormalizes the
    # word scores within each prefix group — TODO confirm at its definition.
    normalize_over_prefix(df_data, 'unnormalized_log_prob_target')

    # classification_probs = ppi.posterior_predict_classification(words, df_ngram, log_assignment_probs, n, start_code)
    # for sublex_id, class_probs in enumerate(classification_probs.T):
    # 	df_data.loc[:,'sublex_%i' % sublex_id] = class_probs

    # Base name of the data file (no directory, no extension), used for
    # naming output files later (continuation not shown in this excerpt).
    datafile_root = os.path.splitext(os.path.split(data_path)[1])[0]
Ejemplo n.º 5
0
	# Classify words (given as comma-separated IPA in column IPA_csv) into
	# sublexica under the trained mixture model.
	# NOTE(review): fragment — `ngram_path` and `n` are bound earlier,
	# outside this excerpt. Indentation in this example uses tabs.
	result_dir = os.path.split(ngram_path)[0]
	hdf5_path = os.path.join(result_dir, 'variational_parameters.h5')

	df_ngram = pd.read_csv(ngram_path)

	# Sublexicon mixture weights -> log assignment probabilities.
	df_stick = pd.read_hdf(hdf5_path, key='/sublex/stick')
	log_assignment_probs = ppi.get_log_assignment_probs(df_stick)
	

	df_code = pd.read_csv(os.path.join(result_dir, 'symbol_coding.csv'), encoding='utf-8')
	encoder,decoder = edcode.df2coder(df_code)


	data_path = sys.argv[2]
	df_data = pd.read_csv(data_path, encoding='utf-8', sep='\t')
	target_data = edcode.encode_data(df_data.IPA_csv, encoder)
	start_code = encoder['START']



	# One row per word, one column per sublexicon.
	classification_target = ppi.posterior_predict_classification(target_data, df_ngram, log_assignment_probs, n, start_code)

	df_classification = pd.DataFrame(classification_target, columns=[('sublex_%i' % i) for i in range(classification_target.shape[1])])
	# MAP classification = index of the most probable sublexicon per word.
	df_classification['MAP_classification'] = np.argmax(classification_target, axis=1)
	# df_classification['katakana'] = df_data.katakana
	# df_classification['orthography'] = df_data.orthography
	# df_classification['wType'] = df_data.wType



	# Base name of the data file, for naming outputs (not shown here).
	datafile_root = os.path.splitext(os.path.split(data_path)[1])[0]
	# NOTE(review): extraction artifact — the pipeline restarts here
	# (model reload, data reload), so this appears to be another separate
	# example concatenated onto the previous one. It classifies unique
	# word prefixes into sublexica.
	hdf5_path = os.path.join(result_dir, 'variational_parameters.h5')

	df_ngram = pd.read_csv(ngram_path)

	# Sublexicon mixture weights -> log assignment probabilities.
	df_stick = pd.read_hdf(hdf5_path, key='/sublex/stick')
	log_assignment_probs = ppi.get_log_assignment_probs(df_stick)
	

	df_code = pd.read_csv(os.path.join(result_dir, 'symbol_coding.csv'), encoding='utf-8')
	encoder,decoder = edcode.df2coder(df_code)


	data_path = sys.argv[2]
	df_data = pd.read_csv(data_path, encoding='utf-8', sep='\t')
	# Classify each distinct prefix once; reset the index so the later
	# column-wise concat with df_classification aligns row-for-row.
	df_data = df_data.drop_duplicates(subset = ['prefix']).reset_index(drop=True)
	# Prefixes are word-internal, so no END symbol is appended.
	data = edcode.encode_data(df_data.prefix, encoder, add_end_symbol = False)
	start_code = encoder['START']

	# One row per prefix, one column per sublexicon.
	classification = ppi.posterior_predict_classification(data, df_ngram, log_assignment_probs, n, start_code)


	df_classification = pd.DataFrame(classification, columns=[('sublex_%i' % i) for i in range(classification.shape[1])])
	# MAP classification = index of the most probable sublexicon per prefix.
	df_classification['MAP_classification'] = np.argmax(classification, axis=1)
	df_classification = pd.concat([df_classification, df_data], axis=1)
	# Strip comma separators to recover plain strings for display.
	df_classification['IPA'] = df_data.word.str.replace(',','')
	df_classification['prefix'] = df_data.prefix.str.replace(',','')
	# df_classification['stimulus_type'] = 'target'
	# df_classification['experimenter'] = df_data.experimenter
	# df_classification['group_identifier'] = df_data.group_identifier
	# df_classification['actual_sublex'] = df_data.actual_sublex
	# The comma-separated 'word' column was replaced by 'IPA' above.
	df_classification.drop(columns='word', inplace=True)