def create_feature(data_file):
    """Write a one-column CSV holding ``kurtosis`` of each question-2 vector.

    The destination path comes from ``feature_output_file(data_file)``. If a
    file already exists there, a notice is printed and nothing is computed.
    The column is named ``'f'`` plus the part of the output file's base name
    that precedes the first underscore.

    NaNs are replaced via ``np.nan_to_num`` both on the input vectors and on
    the resulting values before they are written with five decimal places.

    NOTE(review): ``kurtosis`` is presumably ``scipy.stats.kurtosis`` and
    ``sentence2vec`` presumably returns one vector per row for each question;
    neither definition is visible here, so confirm against the file's imports.
    """
    output_path = feature_output_file(data_file)
    if os.path.exists(output_path):
        print('File exists {}.'.format(output_path))
        return

    # Only the second question's vectors feed this feature; the first set is
    # returned by sentence2vec but not used here.
    _, q2_vectors = sentence2vec(data_file)

    # Progress trace on stderr: script name and the data file being processed.
    print(sys.argv[0], data_file, file=sys.stderr)

    prefix, _, _ = os.path.basename(output_path).partition('_')
    feature_column = 'f{0}'.format(prefix)

    cleaned_q2 = np.nan_to_num(q2_vectors)
    scores = [kurtosis(vector) for vector in cleaned_q2]

    frame = pd.DataFrame()
    frame[feature_column] = np.nan_to_num(scores)
    frame[[feature_column]].to_csv(
        output_path,
        index=False,
        float_format='%.5f',
    )
Ejemplo n.º 2
0
def create_feature(data_file, model: gensim.models.KeyedVectors):
    """Write a one-column CSV of ``jaccard`` values for each question pair.

    For every pair of (question-1 vector, question-2 vector) returned by
    ``sentence2vec(data_file)``, ``jaccard(x, y)`` is evaluated after NaNs in
    the inputs are replaced with ``np.nan_to_num``. NaNs in the results are
    replaced the same way before the column is written with five decimals.

    The destination path comes from ``feature_output_file(data_file)``. If a
    file already exists there, a notice is printed and nothing is computed.
    The column is named ``'f'`` plus the part of the output file's base name
    that precedes the first underscore.

    Args:
        data_file: Input passed to ``sentence2vec`` and
            ``feature_output_file``.
        model: Accepted for interface compatibility; this function body does
            not read it.

    NOTE(review): ``jaccard`` is presumably
    ``scipy.spatial.distance.jaccard``; confirm against the file's imports.
    """
    # Resolve the output path once instead of recomputing it for every use.
    output_path = feature_output_file(data_file)
    if os.path.exists(output_path):
        print('File exists {}.'.format(output_path))
        return

    question1_vectors, question2_vectors = sentence2vec(data_file)

    # Progress trace on stderr: script name and the data file being processed.
    print(sys.argv[0], data_file, file=sys.stderr)

    column_name = 'f{0}'.format(os.path.basename(output_path).split('_')[0])

    vector_pairs = zip(np.nan_to_num(question1_vectors),
                       np.nan_to_num(question2_vectors))
    df = pd.DataFrame()
    df[column_name] = np.nan_to_num([jaccard(x, y) for x, y in vector_pairs])

    df[[column_name]].to_csv(output_path,
                             index=False,
                             float_format='%.5f')