Example 1
    def load_get_coefficients(config_learning, config_data):

        output = open(os.path.expanduser(config_data.get("Learner", "models")) + "/" + "coefficients.txt", "w")

        feature_names = FeatureExtractor.get_features_from_config_file_unsorted(config_data)
        combination_methods = FeatureExtractor.get_combinations_from_config_file_unsorted(config_data)

        learning_config = config_learning.get("learning", None)
        method_name = learning_config.get("method", None)

        estimator = joblib.load(os.path.expanduser(config_data.get("Learner", "models")) + "/" + method_name + ".pkl")
        coefficients = estimator.coef_

        feature_list = []
        for i, feature_name in enumerate(feature_names):
            if combination_methods[i] == 'both':
                feature_list.append(feature_name)
                feature_list.append(feature_name)
            else:
                feature_list.append(feature_name)

        for i, name in enumerate(feature_list):
            output.write(name + "\t" + str(coefficients[0][i]) + "\n")

        output.close()
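
The duplication under 'both' keeps the printed names aligned with the coefficient vector, in which such a feature occupies two columns. A minimal sketch of the expansion, with made-up names:

# Hypothetical inputs, for illustration only.
feature_names = ['bleu', 'meteor']
combination_methods = ['both', 'minus']

feature_list = []
for name, method in zip(feature_names, combination_methods):
    # A 'both' feature fills two columns, so its name is listed twice.
    feature_list.extend([name, name] if method == 'both' else [name])

print(feature_list)  # ['bleu', 'bleu', 'meteor']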
Example 2
    def get_data(self):

        process_wmt = PrepareWmt()
        data_structure1 = process_wmt.get_data_structure(self.config)
        data_structure2 = process_wmt.get_data_structure2(self.config)
        process_wmt.print_data_set(self.config, data_structure1)

        if 'Parse' in loads(self.config.get("Resources", "processors")):
            process_wmt_parse = PrepareWmt(data_type='parse')
            data_structure_parse = process_wmt_parse.get_data_structure(self.config)
            process_wmt_parse.print_data_set(self.config, data_structure_parse)

        f_judgements = self.config.get('WMT', 'human_ranking')
        maximum_comparisons = int(self.config.get('WMT', 'maximum_comparisons'))
        human_rankings = HumanRanking()
        human_rankings.add_human_data(f_judgements, self.config, max_comparisons=maximum_comparisons)

        process = Process(self.config)
        sents_tgt, sents_ref = process.run_processors()

        extractor = FeatureExtractor(self.config)
        features_to_extract = FeatureExtractor.read_feature_names(self.config)

        extractor.extract_features(features_to_extract, sents_tgt, sents_ref)

        return data_structure2, human_rankings, extractor.vals
Example 3
    def get_data(self):

        human_scores = read_reference_file(os.path.expanduser(self.config.get('Data', 'human_scores')), '\t')
        process = Process(self.config)
        sents_tgt, sents_ref = process.run_processors()

        extractor = FeatureExtractor(self.config)
        features_to_extract = FeatureExtractor.read_feature_names(self.config)
        extractor.extract_features(features_to_extract, sents_tgt, sents_ref)

        return extractor.vals, human_scores
Example 4
    def recursive_feature_elimination(config_learning, config_data, number_features):

        output = open(os.path.expanduser(config_data.get("Learner", "models")) + "/" + "feature_ranks.txt", "w")

        feature_names = FeatureExtractor.get_features_from_config_file_unsorted(config_data)

        x_train = read_features_file(config_learning.get('x_train'), '\t')
        y_train = read_reference_file(config_learning.get('y_train'), '\t')
        x_test = read_features_file(config_learning.get('x_test'), '\t')
        estimator, scorers = learn_model.set_learning_method(config_learning, x_train, y_train)

        scale = config_learning.get("scale", True)

        if scale:
            x_train, x_test = scale_datasets(x_train, x_test)

        rfe = RFE(estimator, n_features_to_select=number_features, step=1)
        rfe.fit(x_train, y_train)

        for i, name in enumerate(feature_names):
            output.write(name + "\t" + str(rfe.ranking_[i]) + "\n")
            print(name + "\t" + str(rfe.ranking_[i]))

        predictions = rfe.predict(x_test)

        output.close()

        return predictions
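
If the selected subset itself is needed afterwards, a fitted RFE also exposes a boolean mask; a short follow-up sketch reusing rfe and feature_names from above (a ranking of 1 marks a kept feature):

# Names of the features RFE kept; support_ is True exactly where ranking_ == 1.
selected = [name for name, keep in zip(feature_names, rfe.support_) if keep]
print(selected)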
Example 5
def main(args):

    feature_extractor = FeatureExtractor(get_resources(), args.model)
    train_feats, train_labels, _, _ = feature_extractor.get_features(
        args.train, args.train_topk)
    test_feats, _, test_segs, test_gold_truths = feature_extractor.get_features(
        args.test, args.test_topk)

    epochs, lr1, lr2 = 100, 0.01, 0.05

    # Initialize model
    if args.model == "mse":
        model = mse_ranker.MSERanker(epochs, lr1)
    elif args.model == "mr":
        model = mr_ranker.MRRanker(epochs, lr1)
    elif args.model == "mse_multi":
        model = mse_multi_ranker.MSEMultiRanker(epochs, lr1, lr2)
    elif args.model == "mr_multi":
        model = mr_multi_ranker.MRMultiRanker(epochs, lr1, lr2)
    else:
        raise ValueError("Unknown model type: " + args.model)

    # Train model
    model.train(train_feats, train_labels)

    # Rerank top-k segmentations
    top_segmentations = []
    for segs_feats, segs, gds in zip(test_feats, test_segs, test_gold_truths):
        if len(segs) == 1:
            # Single candidate: append the list itself so every entry of
            # top_segmentations is a list of segmentations.
            top_segmentations.append(segs)
        else:
            reranked_segs = rerank(segs, segs_feats, model, args.model)
            top_segmentations.append(reranked_segs)

    if args.output is not None:
        with open(args.output, 'w') as fp:
            for segs in top_segmentations:
                target = "".join(segs[0].split())
                fp.write(target + "\t" + "\t".join([seg.strip() for seg in segs]) + "\n")

    # Evaluate metrics
    print("MRR:", mean_reciprocal_rank(test_gold_truths, top_segmentations))
    print("Accuracy@1:", accuracy(1, test_gold_truths, top_segmentations))
    print("Accuracy@2:", accuracy(2, test_gold_truths, top_segmentations))
    print("Fscore@1:", fscore(1, test_gold_truths, top_segmentations))
    print("Fscore@2:", fscore(2, test_gold_truths, top_segmentations))
Example 6
def get_writing_features(image_path):
    # Pre-processing
    gray_img = cv.imread(image_path, cv.IMREAD_GRAYSCALE)
    gray_img, bin_img = PreProcessor.process(gray_img)
    gray_lines, bin_lines = LineSegmentor(gray_img, bin_img).segment()

    # Feature extraction
    return FeatureExtractor(gray_lines, bin_lines).extract()
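
A one-line usage sketch (the image path is hypothetical):

# Features computed over the segmented lines of one scanned page.
features = get_writing_features('samples/page_001.png')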
Example 7
    def prepare_feature_files(self):

        process_wmt = PrepareWmt()
        data_structure1 = process_wmt.get_data_structure(self.config)
        data_structure2 = process_wmt.get_data_structure2(self.config)
        process_wmt.print_data_set(self.config, data_structure1)

        if 'Parse' in loads(self.config.get("Resources", "processors")):
            process_wmt_parse = PrepareWmt(data_type='parse')
            data_structure_parse = process_wmt_parse.get_data_structure(self.config)
            process_wmt_parse.print_data_set(self.config, data_structure_parse)

        process = Process(self.config)
        sents_tgt, sents_ref = process.run_processors()

        extractor = FeatureExtractor(self.config)
        features_to_extract = FeatureExtractor.read_feature_names(self.config)
        extractor.extract_features(features_to_extract, sents_tgt, sents_ref)
        feature_values = extractor.vals

        datasets_language_pairs = set((x[0], x[1]) for x in data_structure2)

        dataset_for_all = self.config.get('WMT', 'dataset')
        feature_set_name = os.path.basename(self.config.get('Features', 'feature_set')).replace(".txt", "")
        f_features_all = open(os.path.expanduser(self.config.get('WMT', 'output_dir')) + '/' + 'x_' + dataset_for_all + '.' + feature_set_name + '.' + 'all' + '.tsv', 'w')
        f_meta_data_all = open(os.path.expanduser(self.config.get('WMT', 'output_dir')) + '/' + 'meta_' + dataset_for_all + '.' + feature_set_name + '.' + 'all' + '.tsv', 'w')

        for dataset, lp in sorted(datasets_language_pairs):

            f_features = open(os.path.expanduser(self.config.get('WMT', 'output_dir')) + '/' + 'x_' + dataset + '.' + feature_set_name + '.' + lp + '.tsv', 'w')

            for i, sentence_data in enumerate(data_structure2):

                if dataset in sentence_data and lp in sentence_data:
                    f_features_all.write('\t'.join([str(x) for x in feature_values[i]]) + "\n")
                    f_meta_data_all.write('\t'.join([str(x) for x in sentence_data]) + "\n")
                    f_features.write('\t'.join([str(x) for x in feature_values[i]]) + "\n")

            f_features.close()

        f_features_all.close()
        f_meta_data_all.close()
Example 8
def feature_extraction(config_features_path):

    config = ConfigParser()
    config.read(config_features_path)
    wd = config.get('WMT', 'working_directory')
    if not os.path.exists(wd):
        os.mkdir(wd)

    data = RankingData(config)
    data.read_dataset()

    process = Process(config)
    sentences_tgt, sentences_ref = process.run_processors()

    feature_names = FeatureExtractor.read_feature_names(config)
    feature_values = FeatureExtractor.extract_features_static(feature_names, sentences_tgt, sentences_ref)
    write_feature_file(wd + '/' + 'x' + '_' + data.datasets[0].name + '.tsv', feature_values)

    my_dataset = data.plain[0].dataset
    my_lp = data.plain[0].lp
    f_path = wd + '/' + 'x' + '_' + my_dataset + '_' + my_lp + '.tsv'
    f_file = open(f_path, 'w')

    for i, instance in enumerate(data.plain):
        if instance.dataset != my_dataset or instance.lp != my_lp:
            # Dataset or language pair changed: switch to a new output file.
            f_file.close()
            my_dataset = instance.dataset
            my_lp = instance.lp
            f_path = wd + '/' + 'x' + '_' + my_dataset + '_' + my_lp + '.tsv'
            f_file = open(f_path, 'w')
        # Write the current instance's features in either case.
        f_file.write('\t'.join([str(x) for x in feature_values[i]]) + "\n")

    f_file.close()

    f_judgements = config.get('WMT', 'human_ranking')
    human_rankings = HumanRanking()
    human_rankings.add_human_data(f_judgements, config)
    human_rankings.get_sentence_ids(data)

    learn_to_rank(feature_values, human_rankings, wd + '/' + 'x_learn_to_rank.tsv', wd + '/' + 'y_learn_to_rank.tsv')
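
learn_to_rank is defined elsewhere; the standard construction behind such training files turns each human comparison into a classification instance built from the difference of the two systems' feature vectors. A minimal sketch of that idea with hypothetical names, not the project's actual implementation:

def pairwise_instances(feature_values, comparisons):
    # Each comparison holds the indices of the two compared translations and
    # a sign: '<' means system 1 was ranked better, '>' means system 2 was.
    xs, ys = [], []
    for idx1, idx2, sign in comparisons:
        diff = [a - b for a, b in zip(feature_values[idx1], feature_values[idx2])]
        xs.append(diff)
        ys.append(1 if sign == '<' else 0)
    return xs, ys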
Example 9
def get_features(path):
    # Read and pre-process the image
    gray_img = cv.imread(path, cv.IMREAD_GRAYSCALE)
    gray_img, bin_img = PreProcessor.process(gray_img)
    gray_lines, bin_lines = LineSegmentor(gray_img, bin_img).segment()

    # Extract features of every line separately
    x = []
    for g, b in zip(gray_lines, bin_lines):
        f = FeatureExtractor([g], [b]).extract()
        x.append(f)

    # Return list of features for every line in the image
    return x
Example 10
    def recursive_feature_elimination_cv(config_learning, config_data):

        output = open(os.path.expanduser(config_data.get("Learner", "models")) + "/" + "feature_ranks.txt", "w")

        feature_names = FeatureExtractor.get_features_from_config_file_unsorted(config_data)
        combination_methods = FeatureExtractor.get_combinations_from_config_file_unsorted(config_data)

        x_train = read_features_file(config_learning.get('x_train'), '\t')
        y_train = read_reference_file(config_learning.get('y_train'), '\t')
        x_test = read_features_file(config_learning.get('x_test'), '\t')
        estimator, scorers = learn_model.set_learning_method(config_learning, x_train, y_train)

        scale = config_learning.get("scale", True)

        if scale:
            x_train, x_test = scale_datasets(x_train, x_test)

        rfecv = RFECV(estimator=estimator, step=1, cv=StratifiedKFold(n_splits=2), scoring='accuracy')
        rfecv.fit(x_train, y_train)

        feature_list = []

        for i, feature_name in enumerate(feature_names):
            if combination_methods[i] == 'both':
                feature_list.append(feature_name)
                feature_list.append(feature_name)
            else:
                feature_list.append(feature_name)

        for i, name in enumerate(feature_list):
            output.write(name + "\t" + str(rfecv.ranking_[i]) + "\n")

        output.close()

        predictions = rfecv.predict(x_test)

        return predictions
Example 11
def average_feature_values():

    config_path = os.getcwd() + "/" + "config" + "/" + "wmt.cfg"
    config = ConfigParser()
    config.read(config_path)

    my_dir = os.path.expanduser("~/Dropbox/experiments_fluency/test_learn_to_rank")
    feature_file = my_dir + "/" + "x_newstest2015.cobalt_comb_min_fluency_features_all.cs-en.tsv"
    feature_names = FeatureExtractor.get_features_from_config_file_unsorted(config)
    strategies = FeatureExtractor.get_combinations_from_config_file(config)

    feature_values = read_features_file(feature_file, "\t")
    averages = np.mean(feature_values, axis=0)

    feature_list = []
    for i, feature_name in enumerate(feature_names):
        # if strategies[i] == 'both':
        #     feature_list.append(feature_name)
        #     feature_list.append(feature_name)
        # else:
        feature_list.append(feature_name)

    for i, name in enumerate(feature_list):
        print(name + "\t" + str(averages[i]))
Example 12
def test_feature_sets():

    cfg = ConfigParser()
    cfg.read(os.getcwd() + '/config/system.cfg')

    group_name = FE.get_features_group_name(cfg)
    features_to_test = FE.read_feature_names(cfg)

    if os.path.exists(cfg.get('Data', 'output') + '/' + group_name + '.' + 'summary'):
        print("Path exists!")
        return

    output_file = open(cfg.get('Data', 'output') + '/' + group_name + '.' + 'summary', 'w')

    name0 = group_name + '_' + 'all'
    corr0 = corr_feature_set(features_to_test, name0)
    output_file.write(name0 + '\t' + str(corr0) + '\n')

    for feat in features_to_test:

        name1 = group_name + '_' + feat + '_' + 'only'
        corr1 = corr_feature_set(feat, name1)
        output_file.write(name1 + '\t' + str(corr1) + '\n')

        name2 = group_name + '_' + feat + '_' + 'excluded'
        excluding = []

        for ffeat in features_to_test:
            if ffeat == feat:
                continue
            excluding.append(ffeat)

        corr2 = corr_feature_set(excluding, name2)
        output_file.write(name2 + '\t' + str(corr2) + '\n')

    output_file.close()
Example 13
def extract_features(txt_io, feat_io, cword_io, train=False, factor_files=None):
    # Avoid the shared-mutable-default pitfall.
    if factor_files is None:
        factor_files = {}
    csets = CSetPair(config['source-cset'], config['target-cset'])
    extractor = FeatureExtractor(csets, config['features'], config['costs'])

    check_factor_requirements(extractor.required_factors(), factor_files)

    finder = CWordFinder(csets, train)
    if config['nulls-ngrams']:
        null_finder = NullFinder(csets.src, config['nulls-ngrams'])
        finder.add_extra_finder(null_finder)
    reader = CWordReader(cword_io)

    log.info("Extract features from {}".format(txt_io.name))

    count = 0
    for sid, line, fact_sent in each_factorized_input(txt_io, factor_files):
        for cword in finder.find_confusion_words(line, fact_sent):
            feat_str = extractor.extract_features(cword, fact_sent)
            feat_io.write(feat_str)

            reader.format(sid, cword)
            count += 1

    log.info("Found {} confusion words".format(count))
Example 14
    def test_set_for_rank_to_scores(self, data_structure, feature_values, config_path_learning):

        sentences_systems = defaultdict(list)

        combination_methods = FeatureExtractor.get_combinations_from_config_file(self.config)
        data_set_name = self.config.get('WMT', 'dataset')
        f_features = open(os.path.expanduser(self.config.get('WMT', 'output_dir')) + '/' + 'x_' + data_set_name, 'w')
        meta_data = defaultdict(list)

        for data_set, lang_pair, system_name, phrase_number in data_structure:
            sentences_systems[data_set, lang_pair, phrase_number].append(system_name)

        for data_set, lang_pair, phrase_number in sorted(sentences_systems.keys()):

            system_pairs = list(combinations(sentences_systems[data_set, lang_pair, phrase_number], 2))

            for sys1, sys2 in sorted(system_pairs):

                idx_sys1, idx_sys2 = self.get_sentence_idx(data_set, lang_pair, data_structure, phrase_number, sys1, sys2)

                combined_features = []
                for i in range(len(feature_values[0])):
                    combined_feature = self.combine_feature_values(combination_methods[i], feature_values[idx_sys1][i],
                                                                   feature_values[idx_sys2][i])
                    combined_features.append(combined_feature)

                f_features.write('\t'.join([str(val) for val in combined_features]) + '\n')
                meta_data[data_set, lang_pair, phrase_number].append([sys1, sys2])

        f_features.close()

        results = defaultdict(list)
        confidence_scores = self.get_confidence_scores(config_path_learning)
        count = 0
        for data_set, lang_pair, phrase_number in sorted(meta_data.keys()):
            for sys1, sys2 in sorted(meta_data[data_set, lang_pair, phrase_number]):
                results[data_set, lang_pair, phrase_number].append([sys1, sys2, confidence_scores[count]])
                count += 1

        return results
Example 15
def get_writer_features(path, writer_id):
    # All lines of the writer
    total_gray_lines, total_bin_lines = [], []

    # Read and append all lines of the writer
    for root, dirs, files in os.walk(path):
        for filename in files:
            gray_img = cv.imread(os.path.join(root, filename), cv.IMREAD_GRAYSCALE)
            gray_img, bin_img = PreProcessor.process(gray_img)
            gray_lines, bin_lines = LineSegmentor(gray_img, bin_img).segment()
            total_gray_lines.extend(gray_lines)
            total_bin_lines.extend(bin_lines)
        break

    # Extract features of every line separately
    x, y = [], []
    for g, b in zip(total_gray_lines, total_bin_lines):
        f = FeatureExtractor([g], [b]).extract()
        x.append(f)
        y.append(writer_id)

    return x, y
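
A sketch of how per-writer features might feed a classifier; the directory layout and the choice of sklearn's SVC are assumptions, not part of the project:

from sklearn.svm import SVC

# Hypothetical directories, one per writer.
X, Y = [], []
for writer_id, writer_dir in enumerate(['data/writer_a/', 'data/writer_b/']):
    x, y = get_writer_features(writer_dir, writer_id)
    X.extend(x)
    Y.extend(y)

classifier = SVC()
classifier.fit(X, Y)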
Example 16
    def training_set_for_rank_direct(self, data_structure, human_rankings, feature_values, ignore_ties=True):

        combination_methods = FeatureExtractor.get_combinations_from_config_file(self.config)
        data_set_name = self.config.get('WMT', 'dataset')
        feature_set_name = os.path.basename(self.config.get('Features', 'feature_set')).replace(".txt", "")

        for dataset, lang_pair in sorted(human_rankings.keys()):

            f_features = open(os.path.expanduser(self.config.get('WMT', 'output_dir')) + '/' + 'x_' + data_set_name + '.' + feature_set_name + '.' + lang_pair + '.tsv', 'w')
            f_objective = open(os.path.expanduser(self.config.get('WMT', 'output_dir')) + '/' + 'y_' + data_set_name + '.' + feature_set_name + '.' + lang_pair + '.tsv', 'w')
            f_meta_data = open(os.path.expanduser(self.config.get('WMT', 'output_dir')) + '/' + 'meta_' + data_set_name + '.' + feature_set_name + '.' + lang_pair + '.tsv', 'w')

            for human_comparison in human_rankings[dataset, lang_pair]:

                label = self.signs_to_labels(human_comparison.sign, ignore_ties=ignore_ties)
                if label is None:
                    continue

                f_objective.write(label + '\n')

                seg_id = human_comparison.phrase
                sys1 = human_comparison.sys1
                sys2 = human_comparison.sys2
                idx_sys1, idx_sys2 = self.get_sentence_idx(dataset, lang_pair, data_structure, seg_id, sys1, sys2)
                f_meta_data.write(str(idx_sys1) + '\t' + str(idx_sys2) + '\n')

                combined_features = []
                for i in range(len(feature_values[0])):
                    combined_feature = self.combine_feature_values(combination_methods[i], feature_values[idx_sys1][i],
                                                                   feature_values[idx_sys2][i])
                    combined_features.append(combined_feature)

                f_features.write('\t'.join([str(val) for val in combined_features]) + '\n')

            f_features.close()
            f_objective.close()
            f_meta_data.close()
Example 17
    def __init__(self):
        FeatureExtractor.__init__(self, 'sentence_score')
Example 18
    def __init__(self):
        FeatureExtractor.__init__(self, 'word_size')
Example 19
    def __init__(self):
        FeatureExtractor.__init__(self, 'num_lines')
Example 20
    def __init__(self):
        FeatureExtractor.__init__(self, 'repetition_score')
        self.stemmer = SnowballStemmer("english")
Example 21
    def __init__(self):
        FeatureExtractor.__init__(self, 'obscurity_score')
        self.common = self.get_wordlist(1000)
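
Examples 17 through 21 (and 23 and 25 below) all follow the same pattern: each subclass passes its feature name to a shared constructor. That base class is not shown here; a minimal sketch of what the calls assume:

class FeatureExtractor:
    # Assumed shape of the base class: it records the feature's name and
    # leaves the actual computation to each subclass.
    def __init__(self, name):
        self.name = name

    def extract(self, item):
        raise NotImplementedError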
Example 22
import sys
import os

from configparser import ConfigParser
from utils.ranking_data import RankingData
from utils.write_parsed import write_parsed
from utils.human_ranking import HumanRanking
from processors.process import Process
from features.feature_extractor import FeatureExtractor
from utils.wmt import write_wmt_format
from utils.process_semeval import process_semeval
from nltk.corpus import stopwords

# Read configuration file
config = ConfigParser()
config.read('test.cfg')

# Prepare dataset
ranking_data = RankingData(config)
ranking_data.read_dataset()
ranking_data.write_dataset()
write_parsed(config.get('Data', 'input_dir').replace('plain', 'parse'), config.get('Data', 'working_dir'), ['cs-en'])

# Process dataset
process = Process(config)
sentences_target, sentences_reference = process.run_processors()
cobalt_scores = FeatureExtractor.extract_features_static(['cobalt'], sentences_target, sentences_reference)
# print(str(cobalt_scores[-1][0]))
ranking_data.write_scores_wmt_format(cobalt_scores, metric='cobalt', output_path='output/cobalt.scores')
Example 23
    def __init__(self):
        FeatureExtractor.__init__(self, 'width_in_char')
Example 24
# Choose statistical features and physical parameters
features_list = [fc.get_min, fc.get_max, fc.get_median]
params_name_list = ['TOTUSJH', 'TOTBSQ', 'TOTPOT', 'TOTUSJZ']

# Input and output path
#path_to_root = os.path.join('..', CONST.IN_PATH_TO_MVTS_FL)
#path_to_dest = os.path.join('..', CONST.OUT_PATH_TO_RAW_FEATURES)
#path_to_root =  CONST.IN_PATH_TO_MVTS_FL

path_to_root = CONST.IN_PATH_TO_MVTS_FL2
path_to_dest = CONST.OUT_PATH_TO_RAW_FEATURES
output_filename = 'raw_features_p2_FL.csv'

# Extract features
pc = FeatureExtractor(path_to_root, path_to_dest, output_filename)
pc.calculate_all(features_list, params_name_list=params_name_list)

path_to_root = CONST.IN_PATH_TO_MVTS_NF2
path_to_dest = CONST.OUT_PATH_TO_RAW_FEATURES
output_filename = 'raw_features_p2_NF.csv'

# Extract features
pc = FeatureExtractor(path_to_root, path_to_dest, output_filename)
pc.calculate_all(features_list, params_name_list=params_name_list)

path_to_root = CONST.IN_PATH_TO_MVTS_FL3
path_to_dest = CONST.OUT_PATH_TO_RAW_FEATURES
output_filename = 'raw_features_p3_FL.csv'

# Extract features
pc = FeatureExtractor(path_to_root, path_to_dest, output_filename)
pc.calculate_all(features_list, params_name_list=params_name_list)
Example 25
    def __init__(self):
        FeatureExtractor.__init__(self, 'num_words')