def binarize(features, binarizers):
    """Binarize a dataset column-by-column.

    :param features: list of feature rows (list of lists); all rows must have
        the same number of columns
    :param binarizers: dict mapping column index -> fitted LabelBinarizer or
        MultiLabelBinarizer; columns without an entry are passed through
    :return: 2-D numpy array with all (binarized) columns stacked horizontally
    :raises NotImplementedError: for binarizer types other than the two above
    """
    assert (list_of_lists(features))
    num_features = len(features[0])
    # every binarizer key must address an existing column
    assert (binarizers == {} or max(binarizers.keys()) < num_features)

    binarized_cols = []
    for i in range(num_features):
        # extract column i
        cur_values = [f[i] for f in features]
        if i in binarizers:
            binarizer = binarizers[i]
            # isinstance instead of `type() ==` comparisons (sklearn's two
            # binarizer classes are unrelated, so the branch order is safe)
            if isinstance(binarizer, LabelBinarizer):
                try:
                    binarized_cols.append(binarizer.transform(cur_values))
                except Exception:
                    # best-effort as in the original, but no longer a bare
                    # `except:` (which would also trap SystemExit and
                    # KeyboardInterrupt); a dropped column is reported by the
                    # column-count assert below
                    pass
            elif isinstance(binarizer, MultiLabelBinarizer):
                assert (list_of_lists(cur_values))
                # MultiLabelBinarizer can't transform unseen values, so map
                # every unknown value to an arbitrary known class first
                cur_values_default = []
                default_value = binarizer.classes_[-1]
                for a_list in cur_values:
                    new_list = [val if val in binarizer.classes_ else default_value
                                for val in a_list]
                    cur_values_default.append(tuple(new_list))

                binarized_cols.append(binarizer.transform(cur_values_default))
            else:
                raise NotImplementedError(
                    'this function is not implemented for type: {}'.format(
                        type(binarizer)))
        else:
            # no binarizer for this column: keep the raw values as an (n, 1) array
            try:
                binarized_cols.append(
                    np.array(cur_values).reshape(len(cur_values), 1))
            except Exception:
                print(cur_values)
                sys.exit()

    assert (
        len(binarized_cols) == num_features
    ), 'the number of columns after binarization must match the number of features'
    return np.hstack(binarized_cols)
# Example #2 (score: 0)
def sequence_correlation(y_true, y_pred, good_label=1, bad_label=0, out='sequence_corr.out', verbose=False):
    """Compute per-sentence span correlation between reference and predicted label sequences.

    :param y_true: reference labels -- a flat list (one sequence) or a list of sequences
    :param y_pred: predicted labels, same shape as y_true
    :param good_label: label value denoting the 'good' class
    :param bad_label: label value denoting the 'bad' class
    :param out: file to write per-sentence diagnostics to when verbose is True
    :param verbose: if True, write reference/prediction/correlation triples to `out`
    :return: (list of per-sentence correlations, their average), or 0 on shape mismatch
    """
    assert(len(y_true) == len(y_pred))
    if not list_of_lists(y_true) and not list_of_lists(y_pred):
        logger.warning("You provided the labels in a flat list of length {}. Assuming them to be one sequence".format(len(y_true)))
        y_true = [y_true]
        y_pred = [y_pred]
    elif list_of_lists(y_true) and list_of_lists(y_pred):
        pass
    else:
        logger.error("Shapes of the hypothesis and the reference don't match")
        return 0

    sentence_pred = []
    # open the diagnostics file lazily; the try/finally guarantees it is
    # closed even if an assert fires mid-loop (the original leaked the handle)
    out_file = open(out, 'w') if verbose else None
    try:
        for true_sent, pred_sent in zip(y_true, y_pred):
            assert(len(true_sent) == len(pred_sent))
            true_spans_1, true_spans_0 = get_spans(true_sent, good_label=good_label, bad_label=bad_label)
            pred_spans_1, pred_spans_0 = get_spans(pred_sent, good_label=good_label, bad_label=bad_label)

            # counts of matching 'good' and 'bad' spans
            res_1 = intersect_spans(true_spans_1, pred_spans_1)
            res_0 = intersect_spans(true_spans_0, pred_spans_0)

            corr_val = (res_1+res_0)/float(len(true_sent))
            if verbose:
                out_file.write("Reference:  %s\nPrediction: %s\nCorrelation: %s\n" % (' '.join([str(t) for t in true_sent]), ' '.join([str(t) for t in pred_sent]), str(corr_val)))
            sentence_pred.append(corr_val)
    finally:
        if out_file is not None:
            out_file.close()
    return sentence_pred, np.average(sentence_pred)
# Example #3 (score: 0)
def sequence_correlation_weighted(y_true, y_pred, good_label=1, bad_label=0, out='sequence_corr.out', verbose=False):
    """Class-weighted per-sentence span correlation between reference and predicted labels.

    Like sequence_correlation, but weights 'good' and 'bad' span matches so that
    both classes contribute equally, and scales by the ratio of span counts.

    :param y_true: reference labels -- a flat list (one sequence) or a list of sequences
    :param y_pred: predicted labels, same shape as y_true
    :param good_label: label value denoting the 'good' class
    :param bad_label: label value denoting the 'bad' class
    :param out: file to write per-sentence diagnostics to when verbose is True
    :param verbose: if True, write reference/prediction/correlation triples to `out`
    :return: (list of per-sentence correlations, their average), or 0 on shape mismatch
    """
    assert(len(y_true) == len(y_pred))
    if not list_of_lists(y_true) and not list_of_lists(y_pred):
        logger.warning("You provided the labels in a flat list of length {}. Assuming them to be one sequence".format(len(y_true)))
        y_true = [y_true]
        y_pred = [y_pred]
    elif list_of_lists(y_true) and list_of_lists(y_pred):
        pass
    else:
        logger.error("Shapes of the hypothesis and the reference don't match")
        return 0

    sentence_pred = []
    # open the diagnostics file lazily; the try/finally guarantees it is
    # closed even if an assert fires mid-loop (the original leaked the handle)
    out_file = open(out, 'w') if verbose else None
    try:
        for true_sent, pred_sent in zip(y_true, y_pred):
            ref_bad = sum([1 for l in true_sent if l == bad_label])
            ref_good = sum([1 for l in true_sent if l == good_label])
            # every reference label must be either good or bad
            assert(ref_bad + ref_good == len(true_sent))
            # coefficients that ensure the equal influence of good and bad classes on the overall score
            try:
                coeff_bad = len(true_sent)/(2*ref_bad)
            except ZeroDivisionError:
                coeff_bad = 0.0
            try:
                coeff_good = len(true_sent)/(2*ref_good)
            except ZeroDivisionError:
                coeff_good = 0.0

            assert(len(true_sent) == len(pred_sent))
            true_spans_1, true_spans_0 = get_spans(true_sent, good_label=good_label, bad_label=bad_label)
            pred_spans_1, pred_spans_0 = get_spans(pred_sent, good_label=good_label, bad_label=bad_label)

            # counts of matching 'good' and 'bad' spans
            res_1 = intersect_spans(true_spans_1, pred_spans_1)
            res_0 = intersect_spans(true_spans_0, pred_spans_0)

            # penalty for differing segmentation granularity:
            # ratio of the smaller total span count to the larger one
            len_t_1, len_t_0 = len(true_spans_1), len(true_spans_0)
            len_p_1, len_p_0 = len(pred_spans_1), len(pred_spans_0)
            if len_t_1 + len_t_0 > len_p_1 + len_p_0:
                spans_ratio = (len_p_1 + len_p_0)/(len_t_1 + len_t_0)
            else:
                spans_ratio = (len_t_1 + len_t_0)/(len_p_1 + len_p_0)

            corr_val = (res_1*coeff_good + res_0*coeff_bad)*spans_ratio/float(len(true_sent))
            if verbose:
                out_file.write("Reference:  %s\nPrediction: %s\nCorrelation: %s\n" % (' '.join([str(t) for t in true_sent]), ' '.join([str(t) for t in pred_sent]), str(corr_val)))
            sentence_pred.append(corr_val)
    finally:
        if out_file is not None:
            out_file.close()
    return sentence_pred, np.average(sentence_pred)
def binarize(features, binarizers):
    """Binarize a dataset column-by-column.

    :param features: list of feature rows (list of lists); all rows must have
        the same number of columns
    :param binarizers: dict mapping column index -> fitted LabelBinarizer or
        MultiLabelBinarizer; columns without an entry are passed through
    :return: 2-D numpy array with all (binarized) columns stacked horizontally
    """
    assert(list_of_lists(features))
    num_features = len(features[0])
#    if binarizers != {} and max(binarizers.keys()) >= num_features:     
#        print("Binarizers keys max: ", max(binarizers.keys()))
#        print("Total feature number: ", num_features)
#        print("Features:", features[0])
    # every binarizer key must address an existing column
    assert(binarizers == {} or max(binarizers.keys()) < num_features)

    binarized_cols = []
    for i in range(num_features):
        # get this column
        cur_values = [f[i] for f in features]
        # if there's a binarizer for this column
        if i in binarizers:
            binarizer = binarizers[i]
            if type(binarizer) == LabelBinarizer:
                try:
                    binarized_cols.append(binarizer.transform(cur_values))
                # NOTE(review): a failed transform is silently swallowed here,
                # which makes the column-count assert at the end fire later
                except:
                    pass
#                    print(cur_values)
            elif type(binarizer) == MultiLabelBinarizer:
                assert(list_of_lists(cur_values))
                # MultiLabelBinarizer doesn't support unknown values -- they need to be replaced with a default value
                # we're going to use the empty list as the default value
                cur_values_default = []
                # NOTE(review): despite the comment above, unknown values are
                # actually replaced with the last known class, not dropped
                default_value = binarizer.classes_[-1]
                for a_list in cur_values:
                    new_list = list(a_list)
                    for j, val in enumerate(new_list):
                        if val not in binarizer.classes_:
                            new_list[j] = default_value
                    cur_values_default.append(tuple(new_list))

                transformed = binarizer.transform(cur_values_default)
                binarized_cols.append(transformed)
            else:
                raise NotImplementedError('this function is not implemented for type: {}'.format(type(binarizer)))
        else:
#            arr = np.array(cur_values)
#            print(arr.shape)
#            print(len(cur_values))
#            print(cur_values)
            # no binarizer for this column: keep it as an (n, 1) array
            try:
#                new_vals = np.array(cur_values).reshape(len(cur_values), 1)
                binarized_cols.append(np.array(cur_values).reshape(len(cur_values), 1))
            except:
                print(cur_values)
                sys.exit()

    assert (len(binarized_cols) == num_features), 'the number of columns after binarization must match the number of features'
    new_features = np.hstack(binarized_cols)

    return new_features
# Example #5 (score: 0)
def sequence_correlation(y_true,
                         y_pred,
                         good_label=1,
                         bad_label=0,
                         out='sequence_corr.out',
                         verbose=False):
    """Per-sentence span correlation between reference and predicted labels.

    :param y_true: reference labels -- a flat list (one sequence) or a list of sequences
    :param y_pred: predicted labels, same shape as y_true
    :param good_label: label value denoting the 'good' class
    :param bad_label: label value denoting the 'bad' class
    :param out: file to write per-sentence diagnostics to when verbose is True
    :param verbose: if True, write reference/prediction/correlation triples to `out`
    :return: (list of per-sentence correlations, their average), or 0 on shape mismatch
    """
    assert (len(y_true) == len(y_pred))
    if not list_of_lists(y_true) and not list_of_lists(y_pred):
        logger.warning(
            "You provided the labels in a flat list of length {}. Assuming them to be one sequence"
            .format(len(y_true)))
        y_true = [y_true]
        y_pred = [y_pred]
    elif list_of_lists(y_true) and list_of_lists(y_pred):
        pass
    else:
        logger.error("Shapes of the hypothesis and the reference don't match")
        return 0

    sentence_pred = []
    if verbose:
        out_file = open(out, 'w')
    for true_sent, pred_sent in zip(y_true, y_pred):
        assert (len(true_sent) == len(pred_sent))
        true_spans_1, true_spans_0 = get_spans(true_sent,
                                               good_label=good_label,
                                               bad_label=bad_label)
        pred_spans_1, pred_spans_0 = get_spans(pred_sent,
                                               good_label=good_label,
                                               bad_label=bad_label)

        # counts of matching 'good' and 'bad' spans
        res_1 = intersect_spans(true_spans_1, pred_spans_1)
        res_0 = intersect_spans(true_spans_0, pred_spans_0)

        corr_val = (res_1 + res_0) / float(len(true_sent))
        #        print(corr_val, type(corr_val))
        if verbose:
            out_file.write(
                "Reference:  %s\nPrediction: %s\nCorrelation: %s\n" %
                (' '.join([str(t) for t in true_sent]), ' '.join(
                    [str(t) for t in pred_sent]), str(corr_val)))
        sentence_pred.append(corr_val)

    if verbose:
        out_file.close()
    return sentence_pred, np.average(sentence_pred)
# Example #6 (score: 0)
def sequence_correlation(y_true, y_pred, good_label=1, bad_label=0):
    """Per-sentence span correlation between reference and predicted labels.

    :param y_true: reference labels -- a flat list (one sequence) or a list of sequences
    :param y_pred: predicted labels, same shape as y_true
    :param good_label: label value denoting the 'good' class
    :param bad_label: label value denoting the 'bad' class
    :return: (list of per-sentence correlations, their average), or 0 on shape mismatch
    """
    assert(len(y_true) == len(y_pred))
    if not list_of_lists(y_true) and not list_of_lists(y_pred):
        logger.warning("You provided the labels in a flat list of length {}. Assuming them to be one sequence".format(len(y_true)))
        y_true = [y_true]
        y_pred = [y_pred]
    elif list_of_lists(y_true) and list_of_lists(y_pred):
        pass
    else:
        logger.error("Shapes of the hypothesis and the reference don't match")
        return 0

    sentence_pred = []
    for true_sent, pred_sent in zip(y_true, y_pred):
        assert(len(true_sent) == len(pred_sent))
        true_spans_1, true_spans_0 = get_spans(true_sent, good_label=good_label, bad_label=bad_label)
        pred_spans_1, pred_spans_0 = get_spans(pred_sent, good_label=good_label, bad_label=bad_label)

        # counts of matching 'good' and 'bad' spans
        res_1 = intersect_spans(true_spans_1, pred_spans_1)
        res_0 = intersect_spans(true_spans_0, pred_spans_0)

        # cast to float so the score isn't truncated to 0/1 under Python 2
        # integer division (the sibling implementations already do this)
        sentence_pred.append((res_1+res_0)/float(len(true_sent)))

    return sentence_pred, np.average(sentence_pred)
def flatten(lofl):
    """Flatten one level of nesting.

    A list of lists becomes a single flat list; a dict yields its values.
    Any other input falls through and returns None implicitly.
    """
    if list_of_lists(lofl):
        flat = []
        for sublist in lofl:
            flat.extend(sublist)
        return flat
    elif type(lofl) == dict:
        return lofl.values()
# Example #8 (score: 0)
def sequence_correlation_weighted(y_true,
                                  y_pred,
                                  good_label=1,
                                  bad_label=0,
                                  out='sequence_corr.out',
                                  verbose=False):
    """Class-weighted per-sentence span correlation.

    Like sequence_correlation, but weights 'good' and 'bad' span matches so
    both classes contribute equally, and scales by the ratio of span counts.

    :param y_true: reference labels -- a flat list (one sequence) or a list of sequences
    :param y_pred: predicted labels, same shape as y_true
    :param good_label: label value denoting the 'good' class
    :param bad_label: label value denoting the 'bad' class
    :param out: file to write per-sentence diagnostics to when verbose is True
    :param verbose: if True, write reference/prediction/correlation triples to `out`
    :return: (list of per-sentence correlations, their average), or 0 on shape mismatch
    """
    assert (len(y_true) == len(y_pred))
    if not list_of_lists(y_true) and not list_of_lists(y_pred):
        logger.warning(
            "You provided the labels in a flat list of length {}. Assuming them to be one sequence"
            .format(len(y_true)))
        y_true = [y_true]
        y_pred = [y_pred]
    elif list_of_lists(y_true) and list_of_lists(y_pred):
        pass
    else:
        logger.error("Shapes of the hypothesis and the reference don't match")
        return 0

    sentence_pred = []
    if verbose:
        out_file = open(out, 'w')
    for true_sent, pred_sent in zip(y_true, y_pred):
        ref_bad = sum([1 for l in true_sent if l == bad_label])
        ref_good = sum([1 for l in true_sent if l == good_label])
        # every reference label must be either good or bad
        assert (ref_bad + ref_good == len(true_sent))
        # coefficients that ensure the equal influence of good and bad classes on the overall score
        try:
            coeff_bad = len(true_sent) / (2 * ref_bad)
        except ZeroDivisionError:
            coeff_bad = 0.0
        try:
            coeff_good = len(true_sent) / (2 * ref_good)
        except ZeroDivisionError:
            coeff_good = 0.0

        assert (len(true_sent) == len(pred_sent))
        true_spans_1, true_spans_0 = get_spans(true_sent,
                                               good_label=good_label,
                                               bad_label=bad_label)
        pred_spans_1, pred_spans_0 = get_spans(pred_sent,
                                               good_label=good_label,
                                               bad_label=bad_label)

        # counts of matching 'good' and 'bad' spans
        res_1 = intersect_spans(true_spans_1, pred_spans_1)
        res_0 = intersect_spans(true_spans_0, pred_spans_0)

        # penalty for differing segmentation granularity:
        # ratio of the smaller total span count to the larger one
        len_t_1, len_t_0 = len(true_spans_1), len(true_spans_0)
        len_p_1, len_p_0 = len(pred_spans_1), len(pred_spans_0)
        if len_t_1 + len_t_0 > len_p_1 + len_p_0:
            spans_ratio = (len_p_1 + len_p_0) / (len_t_1 + len_t_0)
        else:
            spans_ratio = (len_t_1 + len_t_0) / (len_p_1 + len_p_0)

        corr_val = (res_1 * coeff_good +
                    res_0 * coeff_bad) * spans_ratio / float(len(true_sent))
        #        try:
        #            corr_val = res_0/float(ref_bad)
        #        except ZeroDivisionError:
        #            corr_val = 1.0
        #        print(corr_val, type(corr_val))
        if verbose:
            out_file.write(
                "Reference:  %s\nPrediction: %s\nCorrelation: %s\n" %
                (' '.join([str(t) for t in true_sent]), ' '.join(
                    [str(t) for t in pred_sent]), str(corr_val)))
        sentence_pred.append(corr_val)

    if verbose:
        out_file.close()
    return sentence_pred, np.average(sentence_pred)
# Example #9 (score: 0)
def persist_features(dataset_name,
                     features,
                     persist_dir,
                     tags=None,
                     feature_names=None,
                     phrase_lengths=None,
                     file_format='crf_suite'):
    '''
    persist the features to persist_dir -- use dataset_name as the prefix for the persisted files
    :param dataset_name: prefix of the output file
    :param features: dataset -- a 2-D numpy array ('plain') or a list of sequences of per-word feature lists ('sequential')
    :param persist_dir: directory of output file(s)
    :param tags: tags for the dataset
    :param feature_names: names of features in the dataset
    :param phrase_lengths: optional phrase lengths, written to a '.phrase-lengths' file
    :param file_format: format of the output file for sequences. Values -- 'crf++', 'crf_suite', 'svm_light'
    :return: path of the main output file
    '''
    # create the output directory; tolerate it already existing
    try:
        os.makedirs(persist_dir)
    except OSError as exc:  # Python >2.5
        if exc.errno == errno.EEXIST and os.path.isdir(persist_dir):
            pass
        else:
            raise

    if file_format == 'crf_suite' and feature_names is None:
        print(
            "Feature names are required to save features in CRFSuite and SVMLight formats"
        )
        return
    # for the 'plain' datatype
    if type(features) == np.ndarray and features.shape[1] == len(
            feature_names):
        output_df = pd.DataFrame(data=features, columns=feature_names)
        output_path = os.path.join(persist_dir, dataset_name + '.csv')
        output_df.to_csv(output_path, index=False)
        logger.info('saved features in: {} to file: {}'.format(
            dataset_name, output_path))

    # for the 'sequential' datatype
    elif list_of_lists(features):
        if file_format == 'svm_light':
            # SVMLight: features are numbered from 1; only non-zero numeric values are emitted
            feature_names = range(1, len(features[0]) + 1)
            output_path = os.path.join(persist_dir, dataset_name + '.svm')
            output = open(output_path, 'w')
            # NOTE(review): assumes tags are 'OK'/'BAD' -- anything else raises KeyError
            tags_map = {'OK': '+1', 'BAD': '-1'}
            for a_tag, feat_seq in zip(tags, features):
                feat_list = []
                for f_name, f_val in zip(feature_names, feat_seq):
                    try:
                        if float(f_val) != 0.0:
                            feat_list.append(
                                str(f_name) + ':' + val_to_str(f_val))
                    # non-numeric feature values are always emitted
                    except ValueError:
                        feat_list.append(str(f_name) + ':' + val_to_str(f_val))
                output.write("%s %s\n" %
                             (tags_map[a_tag], ' '.join(feat_list)))
            return
        output_path = os.path.join(persist_dir, dataset_name + '.crf')
        output = open(output_path, 'w')
        if tags is not None:
            assert (len(features) == len(tags)
                    ), "Different numbers of tag and feature sequences"
            for s_idx, (seq, tag_seq) in enumerate(zip(features, tags)):
                assert (
                    len(seq) == len(tag_seq)
                ), "Lengths of tag and feature sequences don't match in sequence {}: {} and {} ({} and {})".format(
                    s_idx, len(seq), len(tag_seq), seq, tag_seq)
                for w_idx, (feature_list, tag) in enumerate(zip(seq, tag_seq)):
                    if len(feature_list) != len(feature_names):
                        print(feature_list)
                        print(feature_names)
                        sys.exit()
                    tag = str(tag)
                    feature_str = []
                    for f in feature_list:
                        # NOTE(review): `unicode` implies Python 2; this raises NameError on Python 3
                        if type(f) == unicode:
                            feature_str.append(f.encode('utf-8'))
#                        else:
#                            feature_str.append(str(f))
                        else:
                            feature_str.append(f)
                    if file_format == 'crf++':
                        feature_str = '\t'.join([str(f) for f in feature_str])
                        output.write('%s\t%s\n' % (feature_str, tag))
                    elif file_format == 'crf_suite':
                        # CRFSuite lines are: tag, then tab-separated name=value pairs
                        feature_str_all = []
                        for i in range(len(feature_str)):
                            #                            if isinstance(feature_str[i], (int, float, np.float32, np.float64, np.int32, np.int64)):
                            #                                feature_str_all.append(feature_names[i] + '=1:' + str(feature_str[i]))
                            #                            else:
                            feature_str_all.append(feature_names[i] + '=' +
                                                   str(feature_str[i]))
#                        feature_str = [feature_names[i] + '=' + feature_str[i] for i in range(len(feature_str))]
                        feature_str = '\t'.join(feature_str_all)
                        output.write("%s\t%s\n" % (tag, feature_str))
                    else:
                        print("Unknown data format:", file_format)
                        return False
                output.write("\n")
        else:
            # no tags: write feature lines only
            for s_idx, seq in enumerate(features):
                for w_idx, feature_list in enumerate(seq):
                    #assert(len(seq) == len(feature_names)), "Wrong number of features in sequence %d, word %d" % (s_idx, w_idx)
                    feature_str = []
                    for f in feature_list:
                        if type(f) == unicode:
                            feature_str.append(f.encode('utf-8'))
#                        else:
#                            feature_str.append(str(f))
                        else:
                            feature_str.append(f)
                    if file_format == 'crf++':
                        feature_str = '\t'.join([str(f) for f in feature_str])
                    elif file_format == 'crf_suite':
                        #                        feature_str = [feature_names[i] + '=' + feature_str[i] for i in range(len(feature_str))]
                        feature_str_all = []
                        for i in range(len(feature_str)):
                            #                            if isinstance(feature_str[i], (int, float, np.float32, np.float64, np.int32, np.int64)):
                            #                                feature_str_all.append(feature_names[i] + '=1:' + str(feature_str[i]))
                            #                            else:
                            feature_str_all.append(feature_names[i] + '=' +
                                                   str(feature_str[i]))
                        feature_str = '\t'.join(feature_str_all)
                    else:
                        print("Unknown data format:", file_format)
                        return False
                    output.write("%s\n" % feature_str)
                output.write("\n")
        if feature_names is not None:
            # persist the feature names alongside the data
            output_features = open(
                os.path.join(persist_dir, dataset_name + '.features'), 'w')
            for f_name in feature_names:
                output_features.write("%s\n" % f_name.encode('utf-8'))
            output_features.close()
        output.close()

        # write phrase lengths
        if phrase_lengths is not None:
            write_lofl(
                phrase_lengths,
                os.path.join(persist_dir, dataset_name + '.phrase-lengths'))

        # generate CRF++ template
        if file_format == 'crf++':
            feature_num = len(features[0][0])
            generate_crf_template(feature_num, tmp_dir=persist_dir)
    return output_path
# Example #10 (score: 0)
def flatten(lofl):
    """Flatten one level: a list of lists becomes a flat list; a dict yields
    its values. Any other input falls through and returns None implicitly.
    """
    if list_of_lists(lofl):
        return [item for sublist in lofl for item in sublist]
    elif type(lofl) == dict:
        return lofl.values()
# Example #11 (score: 0)
def persist_features(dataset_name, features, persist_dir, tags=None, feature_names=None, phrase_lengths=None, file_format='crf_suite'):
    '''
    persist the features to persist_dir -- use dataset_name as the prefix for the persisted files
    :param dataset_name: prefix of the output file
    :param features: dataset -- a 2-D numpy array ('plain') or a list of sequences of per-word feature lists ('sequential')
    :param persist_dir: directory of output file(s)
    :param tags: tags for the dataset
    :param feature_names: names of features in the dataset
    :param phrase_lengths: optional phrase lengths, written to a '.phrase-lengths' file
    :param file_format: format of the output file for sequences. Values -- 'crf++', 'crf_suite', 'svm_light'
    :return: path of the main output file
    '''
    # create the output directory; tolerate it already existing
    try:
        os.makedirs(persist_dir)
    except OSError as exc:  # Python >2.5
        if exc.errno == errno.EEXIST and os.path.isdir(persist_dir):
            pass
        else:
            raise

    if file_format == 'crf_suite' and feature_names is None:
        print("Feature names are required to save features in CRFSuite and SVMLight formats")
        return
    # for the 'plain' datatype
    if type(features) == np.ndarray and features.shape[1] == len(feature_names):
        output_df = pd.DataFrame(data=features, columns=feature_names)
        output_path = os.path.join(persist_dir, dataset_name + '.csv')
        output_df.to_csv(output_path, index=False)
        logger.info('saved features in: {} to file: {}'.format(dataset_name, output_path))

    # for the 'sequential' datatype
    elif list_of_lists(features):
        if file_format == 'svm_light':
            # SVMLight: features are numbered from 1; only non-zero numeric values are emitted
            feature_names = range(1, len(features[0]) + 1)
            output_path = os.path.join(persist_dir, dataset_name + '.svm')
            output = open(output_path, 'w')
            # NOTE(review): assumes tags are 'OK'/'BAD' -- anything else raises KeyError
            tags_map = {'OK': '+1', 'BAD': '-1'}
            for a_tag, feat_seq in zip(tags, features):
                feat_list = []
                for f_name, f_val in zip(feature_names, feat_seq):
                    try:
                        if float(f_val) != 0.0:
                            feat_list.append(str(f_name) + ':' + val_to_str(f_val))
                    # non-numeric feature values are always emitted
                    except ValueError:
                        feat_list.append(str(f_name) + ':' + val_to_str(f_val))
                output.write("%s %s\n" % (tags_map[a_tag], ' '.join(feat_list)))
            return
        output_path = os.path.join(persist_dir, dataset_name + '.crf')
        output = open(output_path, 'w')
        if tags is not None:
            assert(len(features) == len(tags)), "Different numbers of tag and feature sequences"
            for s_idx, (seq, tag_seq) in enumerate(zip(features, tags)):
                assert(len(seq) == len(tag_seq)), "Lengths of tag and feature sequences don't match in sequence {}: {} and {} ({} and {})".format(s_idx, len(seq), len(tag_seq), seq, tag_seq)
                for w_idx, (feature_list, tag) in enumerate(zip(seq, tag_seq)):
                    if len(feature_list) != len(feature_names):
                        print(feature_list)
                        print(feature_names)
                        sys.exit()
                    tag = str(tag)
                    feature_str = []
                    for f in feature_list:
                        # NOTE(review): `unicode` implies Python 2; this raises NameError on Python 3
                        if type(f) == unicode:
                            feature_str.append(f.encode('utf-8'))
#                        else:
#                            feature_str.append(str(f))
                        else:
                            feature_str.append(f)
                    if file_format == 'crf++':
                        feature_str = '\t'.join([str(f) for f in feature_str])
                        output.write('%s\t%s\n' % (feature_str, tag))
                    elif file_format == 'crf_suite':
                        # CRFSuite lines are: tag, then tab-separated name=value pairs
                        feature_str_all = []
                        for i in range(len(feature_str)):
#                            if isinstance(feature_str[i], (int, float, np.float32, np.float64, np.int32, np.int64)):
#                                feature_str_all.append(feature_names[i] + '=1:' + str(feature_str[i]))
#                            else:
                            feature_str_all.append(feature_names[i] + '=' + str(feature_str[i]))
#                        feature_str = [feature_names[i] + '=' + feature_str[i] for i in range(len(feature_str))]
                        feature_str = '\t'.join(feature_str_all)
                        output.write("%s\t%s\n" % (tag, feature_str))
                    else:
                        print("Unknown data format:", file_format)
                        return False
                output.write("\n")
        else:
            # no tags: write feature lines only
            for s_idx, seq in enumerate(features):
                for w_idx, feature_list in enumerate(seq):
                    #assert(len(seq) == len(feature_names)), "Wrong number of features in sequence %d, word %d" % (s_idx, w_idx)
                    feature_str = []
                    for f in feature_list:
                        if type(f) == unicode:
                            feature_str.append(f.encode('utf-8'))
#                        else:
#                            feature_str.append(str(f))
                        else:
                            feature_str.append(f)
                    if file_format == 'crf++':
                        feature_str = '\t'.join([str(f) for f in feature_str])
                    elif file_format == 'crf_suite':
#                        feature_str = [feature_names[i] + '=' + feature_str[i] for i in range(len(feature_str))]
                        feature_str_all = []
                        for i in range(len(feature_str)):
#                            if isinstance(feature_str[i], (int, float, np.float32, np.float64, np.int32, np.int64)):
#                                feature_str_all.append(feature_names[i] + '=1:' + str(feature_str[i]))
#                            else:
                            feature_str_all.append(feature_names[i] + '=' + str(feature_str[i]))
                        feature_str = '\t'.join(feature_str_all)
                    else:
                        print("Unknown data format:", file_format)
                        return False
                    output.write("%s\n" % feature_str)
                output.write("\n")
        if feature_names is not None:
            # persist the feature names alongside the data
            output_features = open(os.path.join(persist_dir, dataset_name + '.features'), 'w')
            for f_name in feature_names:
                output_features.write("%s\n" % f_name.encode('utf-8'))
            output_features.close()
        output.close()

        # write phrase lengths
        if phrase_lengths is not None:
            write_lofl(phrase_lengths, os.path.join(persist_dir, dataset_name + '.phrase-lengths'))

        # generate CRF++ template
        if file_format == 'crf++':
            feature_num = len(features[0][0])
            generate_crf_template(feature_num, tmp_dir=persist_dir)
    return output_path
# Example #12 (score: 0)
def persist_features(dataset_name, features, persist_dir, tags=None, feature_names=None, file_format='crf++'):
    '''
    persist the features to persist_dir -- use dataset_name as the prefix for the persisted files
    :param dataset_name: prefix of the output file
    :param features: dataset -- a 2-D numpy array ('plain') or a list of sequences of per-word feature lists ('sequential')
    :param persist_dir: directory of output file(s)
    :param tags: tags for the dataset
    :param feature_names: names of features in the dataset
    :param file_format: format of the output file for sequences. Values -- 'crf++' or 'crf_suite'
    :return:
    '''
    # create the output directory; tolerate it already existing
    try:
        os.makedirs(persist_dir)
    except OSError as exc:  # Python >2.5
        if exc.errno == errno.EEXIST and os.path.isdir(persist_dir):
            pass
        else:
            raise

    # for the 'plain' datatype
    if type(features) == np.ndarray and features.shape[1] == len(feature_names):
        output_df = pd.DataFrame(data=features, columns=feature_names)
        output_path = os.path.join(persist_dir, dataset_name + '.csv')
        output_df.to_csv(output_path, index=False)
        logger.info('saved features in: {} to file: {}'.format(dataset_name, output_path))

    # for the 'sequential' datatype
    elif list_of_lists(features):
        output_path = os.path.join(persist_dir, dataset_name + '.crf')
        output = open(output_path, 'w')
        if tags is not None:
            assert(len(features) == len(tags)), "Different numbers of tag and feature sequences"
            for s_idx, (seq, tag_seq) in enumerate(zip(features, tags)):
                assert(len(seq) == len(tag_seq)), "Lengths of tag and feature sequences don't match in sequence %d" % s_idx
                for w_idx, (feature_list, tag) in enumerate(zip(seq, tag_seq)):
                    assert(len(feature_list) == len(feature_names)), "Wrong number of features in sequence %d, word %d" % (s_idx, w_idx)
                    tag = str(tag)
                    feature_str = []
                    for f in feature_list:
                        # NOTE(review): `unicode` implies Python 2; this raises NameError on Python 3
                        if type(f) == unicode:
                            feature_str.append(f.encode('utf-8'))
                        else:
                            feature_str.append(str(f))
                    if file_format == 'crf++':
                        feature_str = '\t'.join(feature_str)
                        output.write('%s\t%s\n' % (feature_str, tag))
                    elif file_format == 'crf_suite':
                        # CRFSuite lines are: tag, then tab-separated name=value pairs
                        feature_str = [feature_names[i] + '=' + feature_str[i] for i in range(len(feature_str))]
                        feature_str = '\t'.join(feature_str)
                        output.write("%s\t%s\n" % (tag, feature_str))
                    else:
                        print("Unknown data format:", file_format)
                        return False
                output.write("\n")
        else:
            # no tags: write feature lines only
            for s_idx, seq in enumerate(features):
                for w_idx, feature_list in enumerate(seq):
                    # fixed: compare the per-word feature list (not the whole sequence)
                    # against feature_names, as the tagged branch above does
                    assert(len(feature_list) == len(feature_names)), "Wrong number of features in sequence %d, word %d" % (s_idx, w_idx)
                    feature_str = []
                    for f in feature_list:
                        if type(f) == unicode:
                            feature_str.append(f.encode('utf-8'))
                        else:
                            feature_str.append(str(f))
                    if file_format == 'crf++':
                        feature_str = '\t'.join(feature_str)
                    elif file_format == 'crf_suite':
                        # fixed: 'feature_name' was an undefined name (typo for 'feature_names')
                        feature_str = [feature_names[i] + '=' + feature_str[i] for i in range(len(feature_str))]
                        feature_str = '\t'.join(feature_str)
                    else:
                        print("Unknown data format:", file_format)
                        return False
                    output.write("%s\n" % feature_str)
                output.write("\n")
        # persist the feature names alongside the data
        output_features = open(os.path.join(persist_dir, dataset_name + '.features'), 'w')
        for f_name in feature_names:
            output_features.write("%s\n" % f_name.encode('utf-8'))
        output.close()
        output_features.close()