def __init__(self, ngram_file=None, corpus_file=None, srilm=None, tmp_dir=None, order=5):
        """Load n-gram counts into ``self.lm``, generating them with SRILM if needed.

        :param ngram_file: counts file with one ``ngram<TAB>count`` pair per
            line; when None it is produced by running ``ngram-count``
        :param corpus_file: text corpus passed to ``ngram-count -text``; only
            used when ``ngram_file`` is None
        :param srilm: directory that contains the ``ngram-count`` binary; falls
            back to the ``SRILM`` environment variable
        :param tmp_dir: passed to ``mk_tmp_dir``; the generated files are
            written to the directory it returns
        :param order: n-gram order given to ``ngram-count -order`` and stored
            as ``self.order``

        If the counts have to be generated but SRILM or the corpus is missing,
        a message is printed and the constructor returns early, leaving
        ``self.lm`` and ``self.order`` unset.
        """
        # generate ngram counts
        if ngram_file is None:
            if srilm is None:
                srilm = os.environ.get('SRILM')
                if srilm is None:
                    print("No SRILM found")
                    return
            if corpus_file is None:
                print("No corpus for LM generation")
                return

            srilm_ngram_count = os.path.join(srilm, 'ngram-count')

            tmp_dir = mk_tmp_dir(tmp_dir)
            lm_file = os.path.join(tmp_dir, 'lm_file')
            ngram_file = os.path.join(tmp_dir, 'ngram_count_file')
            call([srilm_ngram_count, '-text', corpus_file, '-lm', lm_file, '-order', str(order), '-write', ngram_file])

        self.lm = defaultdict(int)
        # the context manager closes the file even if a count fails to parse
        with codecs.open(ngram_file, encoding='utf-8') as counts:
            for line in counts:
                # strip only the line terminator: slicing off the last character
                # would truncate the count on an unterminated final line
                line = line.rstrip('\n')
                chunks = line.split('\t')
                if len(chunks) == 2:
                    self.lm[tuple(chunks[0].split())] = int(chunks[1])
                else:
                    print("Wrong ngram-counts file format at line '", line, "'")

        self.order = order
    def __init__(self, ngram_file=None, corpus_file=None, srilm=None, tmp_dir=None, order=5):
        """Load n-gram counts into ``self.lm``, generating them with SRILM if needed.

        :param ngram_file: counts file with one ``ngram<TAB>count`` pair per
            line; when None it is produced by running ``ngram-count``
        :param corpus_file: text corpus passed to ``ngram-count -text``; only
            used when ``ngram_file`` is None
        :param srilm: directory that contains the ``ngram-count`` binary; falls
            back to the ``SRILM`` environment variable
        :param tmp_dir: passed to ``mk_tmp_dir``; the generated files are
            written to the directory it returns
        :param order: n-gram order given to ``ngram-count -order`` and stored
            as ``self.order``

        If the counts have to be generated but SRILM or the corpus is missing,
        a message is printed and the constructor returns early, leaving
        ``self.lm`` and ``self.order`` unset.
        """
        # generate ngram counts
        if ngram_file is None:
            if srilm is None:
                srilm = os.environ.get('SRILM')
                if srilm is None:
                    print("No SRILM found")
                    return
            if corpus_file is None:
                print("No corpus for LM generation")
                return

            srilm_ngram_count = os.path.join(srilm, 'ngram-count')

            tmp_dir = mk_tmp_dir(tmp_dir)
            lm_file = os.path.join(tmp_dir, 'lm_file')
            ngram_file = os.path.join(tmp_dir, 'ngram_count_file')
            call([srilm_ngram_count, '-text', corpus_file, '-lm', lm_file, '-order', str(order), '-write', ngram_file])

        self.lm = defaultdict(int)
        # the context manager closes the file even if a count fails to parse
        with codecs.open(ngram_file, encoding='utf-8') as counts:
            for line in counts:
                # strip only the line terminator: slicing off the last character
                # would truncate the count on an unterminated final line
                line = line.rstrip('\n')
                chunks = line.split('\t')
                if len(chunks) == 2:
                    self.lm[tuple(chunks[0].split())] = int(chunks[1])
                else:
                    print("Wrong ngram-counts file format at line '", line, "'")

        self.order = order
    def __init__(self,
                 align_model=None,
                 src_file=None,
                 tg_file=None,
                 lex_prefix=None,
                 tmp_dir=None,
                 moses_dir=None,
                 moses_config=None,
                 workers=1):
        """Store the Moses settings and an alignment model, training one if needed.

        :param align_model: prefix of an existing alignment model; when None a
            model is trained from ``src_file`` and ``tg_file``
        :param src_file: source side of the parallel training corpus
        :param tg_file: target side of the parallel training corpus
        :param lex_prefix: stored unchanged as ``self.lex_prob``
        :param tmp_dir: passed to ``mk_tmp_dir``; the result is kept as
            ``self.tmp_dir``
        :param moses_dir: stored as ``self.moses_dir``
        :param moses_config: stored as ``self.moses_config``
        :param workers: stored as ``self.workers``

        When no model is given and a training file is missing, a message is
        printed and ``self.align_model`` is left unset.
        """
        self.tmp_dir = mk_tmp_dir(tmp_dir)
        self.time_stamp = str(time.time())
        self.moses_dir = moses_dir
        self.moses_config = moses_config
        self.workers = workers
        self.lex_prob = lex_prefix

        if align_model is None:
            if src_file is not None and tg_file is not None:
                # align_model is None in this branch and train_alignments joins
                # it onto the directory, so the model name is given explicitly;
                # the directory created above is reused so the model files end
                # up next to the other temporary files
                self.align_model = train_alignments(src_file,
                                                    tg_file,
                                                    self.tmp_dir,
                                                    align_model='align_model')
            else:
                print("Alignment model not defined, no files for training")
                return
        else:
            self.align_model = align_model
Exemple #4
0
def train_alignments(src_train, tg_train, tmp_dir, align_model='align_model'):
    """Train forward and reverse fast_align models with the cdec tools.

    The two corpus files are copied into the temporary directory, pasted into
    one file with cdec's ``paste-files.pl``, filtered with ``filter-length.pl``
    and aligned in both directions with ``fast_align``. Both aligner processes
    are started before either one is waited for.

    :param src_train: path to the source side of the parallel corpus
    :param tg_train: path to the target side of the parallel corpus
    :param tmp_dir: passed to ``mk_tmp_dir``; all output goes to the directory
        it returns
    :param align_model: file name prefix of the model inside that directory
    :return: the model prefix (directory joined with ``align_model``), or ``''``
        when ``CDEC_HOME`` is not set or a corpus path is missing
    """
    # .get() so that an unset variable reaches the message below instead of
    # raising KeyError
    cdec = os.environ.get('CDEC_HOME', '')
    if cdec == '':
        sys.stderr.write(
            'No CDEC_HOME variable found. Please install cdec and/or set the variable\n'
        )
        return ''
    if not src_train or not tg_train:
        sys.stderr.write('No parallel corpus for training\n')
        return ''

    # join source and target files
    tmp_dir = mk_tmp_dir(tmp_dir)
    shutil.copy(src_train, tmp_dir)
    shutil.copy(tg_train, tmp_dir)
    joint_name = os.path.join(
        tmp_dir,
        os.path.basename(src_train) + '_' + os.path.basename(tg_train))
    clean_name = joint_name + '.clean'

    # each output file is closed by its context manager, also on failure
    with open(joint_name, 'w') as src_tg_file:
        Popen([cdec + '/corpus/paste-files.pl', src_train, tg_train],
              stdout=src_tg_file).wait()
    with open(clean_name, 'w') as src_tg_clean:
        Popen([cdec + '/corpus/filter-length.pl', joint_name],
              stdout=src_tg_clean).wait()

    # train the alignment model
    align_model_full = os.path.join(tmp_dir, align_model)
    fast_align = cdec + '/word-aligner/fast_align'
    with open(align_model_full + '.fwd_align', 'w') as fwd_align, \
            open(align_model_full + '.rev_align', 'w') as rev_align, \
            open(align_model_full + '.fwd_err', 'w') as fwd_err, \
            open(align_model_full + '.rev_err', 'w') as rev_err:
        fwd = Popen([
            fast_align, '-i' + clean_name, '-d', '-v', '-o',
            '-p' + align_model_full + '.fwd_params'
        ],
                    stdout=fwd_align,
                    stderr=fwd_err)
        rev = Popen([
            fast_align, '-i' + clean_name, '-r', '-d', '-v', '-o',
            '-p' + align_model_full + '.rev_params'
        ],
                    stdout=rev_align,
                    stderr=rev_err)
        fwd.wait()
        rev.wait()

    return align_model_full
    def __init__(self, align_model=None, src_file=None, tg_file=None, tmp_dir=None):
        """Store an alignment model prefix, training a model when none is given.

        :param align_model: prefix of an existing alignment model; when None a
            model is trained from ``src_file`` and ``tg_file``
        :param src_file: source side of the parallel training corpus
        :param tg_file: target side of the parallel training corpus
        :param tmp_dir: passed to ``mk_tmp_dir``; training output goes to the
            directory it returns

        When no model is given and a training file is missing, a message is
        printed and ``self.align_model`` is left unset.
        """
        tmp_dir = mk_tmp_dir(tmp_dir)

        if align_model is None:
            if src_file is not None and tg_file is not None:
                # align_model is None in this branch and train_alignments joins
                # it onto the directory, so the model name is given explicitly
                self.align_model = train_alignments(src_file, tg_file, tmp_dir, align_model='align_model')
            else:
                print("Alignment model not defined, no files for training")
                return
        else:
            self.align_model = align_model
    def __init__(self, lex_file, align_model=None, src_file=None, tg_file=None, tmp_dir=None):
        """Set up the alignment model and load lexical probabilities.

        :param lex_file: passed to ``self.get_align_prob``; the result is kept
            as ``self.lex_prob``
        :param align_model: prefix of an existing alignment model; when None a
            model is trained from ``src_file`` and ``tg_file``
        :param src_file: source side of the parallel training corpus
        :param tg_file: target side of the parallel training corpus
        :param tmp_dir: passed to ``mk_tmp_dir``; training output goes to the
            directory it returns

        When no model is given and a training file is missing, a message is
        printed and neither ``self.align_model`` nor ``self.lex_prob`` is set.
        """
        tmp_dir = mk_tmp_dir(tmp_dir)

        if align_model is None:
            if src_file is not None and tg_file is not None:
                # align_model is None in this branch and train_alignments joins
                # it onto the directory, so the model name is given explicitly
                self.align_model = train_alignments(src_file, tg_file, tmp_dir, align_model='align_model')
            else:
                print("Alignment model not defined, no files for training")
                return
        else:
            self.align_model = align_model
        self.lex_prob = self.get_align_prob(lex_file)
    def __init__(self, lex_file, align_model=None, src_file=None, tg_file=None, tmp_dir=None):
        """Prepare the alignment model and read the lexical probabilities.

        :param lex_file: handed to ``self.get_align_prob``; its result becomes
            ``self.lex_prob``
        :param align_model: prefix of a ready alignment model; if None, one is
            trained from ``src_file`` and ``tg_file`` under the name
            ``'align_model'``
        :param src_file: source side of the parallel training corpus
        :param tg_file: target side of the parallel training corpus
        :param tmp_dir: handed to ``mk_tmp_dir``; the result is kept as
            ``self.tmp``

        Without a model and without both training files a message is printed
        and the constructor stops before setting ``self.align_model`` and
        ``self.lex_prob``.
        """
        work_dir = mk_tmp_dir(tmp_dir)
        self.tmp = work_dir

        if align_model is not None:
            # a ready model was supplied, nothing to train
            self.align_model = align_model
        elif src_file is None or tg_file is None:
            print("Alignment model not defined, no files for training")
            return
        else:
            self.align_model = train_alignments(src_file, tg_file, work_dir, align_model='align_model')

        self.lex_prob = self.get_align_prob(lex_file)
    def _parse_wmt_to_text(self,
                           wmt_file,
                           wmt_source_file,
                           tmp_dir,
                           persist=False):
        """Convert a tab-separated WMT annotation file into sentence lists.

        ``wmt_source_file`` holds one ``id<TAB>sentence`` pair per line; the
        sentence is tokenized with ``word_tokenize``. In ``wmt_file`` each
        line describes one token: column 0 is the sentence id, column 2 the
        token and column 5 its tag. Consecutive lines with the same id form
        one sentence. Sentences whose id is missing from the source file are
        dropped.

        :param wmt_file: path to the token-level annotation file
        :param wmt_source_file: path to the source sentences file
        :param tmp_dir: passed to ``mk_tmp_dir`` when ``persist`` is true
        :param persist: when true, the three lists are also written with
            ``self._write_to_file`` to ``.target``, ``.tags`` and ``.txt``
            files in the temporary directory
        :return: dict with the parallel lists ``target``, ``source``, ``tags``
        """
        # local import: io.open decodes while reading on Python 2 and 3 alike,
        # whereas str.decode on the lines of open() only exists on Python 2
        import io

        # parse source files
        source_sents = {}
        with io.open(wmt_source_file, encoding='utf-8') as source_lines:
            for line in source_lines:
                str_num = line.strip().split('\t')
                source_sents[str_num[0]] = word_tokenize(str_num[1])

        # parse target file and write new source, target, and tag files
        target, source, tags = [], [], []
        cur_num = None
        cur_sent, cur_tags = [], []
        with io.open(wmt_file, encoding='utf-8') as wmt_lines:
            for line in wmt_lines:
                # strip only the terminator so that an unterminated last line
                # does not lose the final character of its last column
                chunks = line.rstrip('\n').split('\t')
                if chunks[0] != cur_num:
                    if len(cur_sent) > 0:
                        # check that the sentence is in source
                        if cur_num in source_sents:
                            source.append(source_sents[cur_num])
                            target.append(cur_sent)
                            tags.append(cur_tags)
                        cur_sent = []
                        cur_tags = []
                    cur_num = chunks[0]
                cur_sent.append(chunks[2])
                cur_tags.append(chunks[5])
        # last sentence
        if len(cur_sent) > 0 and cur_num in source_sents:
            source.append(source_sents[cur_num])
            target.append(cur_sent)
            tags.append(cur_tags)

        if persist:
            tmp_dir = mk_tmp_dir(tmp_dir)
            wmt_name = os.path.basename(wmt_file)
            target_file = os.path.join(tmp_dir, wmt_name + '.target')
            tags_file = os.path.join(tmp_dir, wmt_name + '.tags')
            source_file = os.path.join(
                tmp_dir, os.path.basename(wmt_source_file) + '.txt')
            self._write_to_file(target_file, target)
            self._write_to_file(source_file, source)
            self._write_to_file(tags_file, tags)

        return {'target': target, 'source': source, 'tags': tags}
    def __init__(self, align_model=None, src_file=None, tg_file=None, lex_prefix=None, tmp_dir=None, moses_dir=None, moses_config=None, workers=1):
        """Store the Moses settings and an alignment model, training one if needed.

        :param align_model: prefix of an existing alignment model; when None a
            model is trained from ``src_file`` and ``tg_file``
        :param src_file: source side of the parallel training corpus
        :param tg_file: target side of the parallel training corpus
        :param lex_prefix: stored unchanged as ``self.lex_prob``
        :param tmp_dir: passed to ``mk_tmp_dir``; the result is kept as
            ``self.tmp_dir``
        :param moses_dir: stored as ``self.moses_dir``
        :param moses_config: stored as ``self.moses_config``
        :param workers: stored as ``self.workers``

        When no model is given and a training file is missing, a message is
        printed and ``self.align_model`` is left unset.
        """
        self.tmp_dir = mk_tmp_dir(tmp_dir)
        self.time_stamp = str(time.time())
        self.moses_dir = moses_dir
        self.moses_config = moses_config
        self.workers = workers
        self.lex_prob = lex_prefix

        if align_model is None:
            if src_file is not None and tg_file is not None:
                # align_model is None in this branch and train_alignments joins
                # it onto the directory, so the model name is given explicitly;
                # the directory created above is reused so the model files end
                # up next to the other temporary files
                self.align_model = train_alignments(src_file, tg_file, self.tmp_dir, align_model='align_model')
            else:
                print("Alignment model not defined, no files for training")
                return
        else:
            self.align_model = align_model
    def _parse_wmt_to_text(self, wmt_file, wmt_source_file, tmp_dir, persist=False):
        """Convert a tab-separated WMT annotation file into sentence lists.

        ``wmt_source_file`` holds one ``id<TAB>sentence`` pair per line; the
        sentence is tokenized with ``word_tokenize``. In ``wmt_file`` each
        line describes one token: column 0 is the sentence id, column 2 the
        token and column 5 its tag. Consecutive lines with the same id form
        one sentence. Sentences whose id is missing from the source file are
        dropped.

        :param wmt_file: path to the token-level annotation file
        :param wmt_source_file: path to the source sentences file
        :param tmp_dir: passed to ``mk_tmp_dir`` when ``persist`` is true
        :param persist: when true, the three lists are also written with
            ``self._write_to_file`` to ``.target``, ``.tags`` and ``.txt``
            files in the temporary directory
        :return: dict with the parallel lists ``target``, ``source``, ``tags``
        """
        # local import: io.open decodes while reading on Python 2 and 3 alike,
        # whereas str.decode on the lines of open() only exists on Python 2
        import io

        # parse source files
        source_sents = {}
        with io.open(wmt_source_file, encoding='utf-8') as source_lines:
            for line in source_lines:
                str_num = line.strip().split('\t')
                source_sents[str_num[0]] = word_tokenize(str_num[1])

        # parse target file and write new source, target, and tag files
        target, source, tags = [], [], []
        cur_num = None
        cur_sent, cur_tags = [], []
        with io.open(wmt_file, encoding='utf-8') as wmt_lines:
            for line in wmt_lines:
                # strip only the terminator so that an unterminated last line
                # does not lose the final character of its last column
                chunks = line.rstrip('\n').split('\t')
                if chunks[0] != cur_num:
                    if len(cur_sent) > 0:
                        # check that the sentence is in source
                        if cur_num in source_sents:
                            source.append(source_sents[cur_num])
                            target.append(cur_sent)
                            tags.append(cur_tags)
                        cur_sent = []
                        cur_tags = []
                    cur_num = chunks[0]
                cur_sent.append(chunks[2])
                cur_tags.append(chunks[5])
        # last sentence
        if len(cur_sent) > 0 and cur_num in source_sents:
            source.append(source_sents[cur_num])
            target.append(cur_sent)
            tags.append(cur_tags)

        if persist:
            tmp_dir = mk_tmp_dir(tmp_dir)
            wmt_name = os.path.basename(wmt_file)
            target_file = os.path.join(tmp_dir, wmt_name + '.target')
            tags_file = os.path.join(tmp_dir, wmt_name + '.tags')
            source_file = os.path.join(tmp_dir, os.path.basename(wmt_source_file) + '.txt')
            self._write_to_file(target_file, target)
            self._write_to_file(source_file, source)
            self._write_to_file(tags_file, tags)

        return {'target': target, 'source': source, 'tags': tags}
Exemple #11
0
def train_alignments(src_train, tg_train, tmp_dir, align_model='align_model'):
    """Train forward and reverse fast_align models with the cdec tools.

    The two corpus files are copied into the temporary directory, pasted into
    one file with cdec's ``paste-files.pl``, filtered with ``filter-length.pl``
    and aligned in both directions with ``fast_align``. Both aligner processes
    are started before either one is waited for.

    :param src_train: path to the source side of the parallel corpus
    :param tg_train: path to the target side of the parallel corpus
    :param tmp_dir: passed to ``mk_tmp_dir``; all output goes to the directory
        it returns
    :param align_model: file name prefix of the model inside that directory
    :return: the model prefix (directory joined with ``align_model``), or ``''``
        when ``CDEC_HOME`` is not set or a corpus path is missing
    """
    # .get() so that an unset variable reaches the message below instead of
    # raising KeyError
    cdec = os.environ.get('CDEC_HOME', '')
    if cdec == '':
        sys.stderr.write('No CDEC_HOME variable found. Please install cdec and/or set the variable\n')
        return ''
    if not src_train or not tg_train:
        sys.stderr.write('No parallel corpus for training\n')
        return ''

    # join source and target files
    tmp_dir = mk_tmp_dir(tmp_dir)
    shutil.copy(src_train, tmp_dir)
    shutil.copy(tg_train, tmp_dir)
    joint_name = os.path.join(tmp_dir, os.path.basename(src_train) + '_' + os.path.basename(tg_train))
    clean_name = joint_name + '.clean'

    # each output file is closed by its context manager, also on failure
    with open(joint_name, 'w') as src_tg_file:
        Popen([cdec+'/corpus/paste-files.pl', src_train, tg_train], stdout=src_tg_file).wait()
    with open(clean_name, 'w') as src_tg_clean:
        Popen([cdec+'/corpus/filter-length.pl', joint_name], stdout=src_tg_clean).wait()

    # train the alignment model
    align_model_full = os.path.join(tmp_dir, align_model)
    fast_align = cdec+'/word-aligner/fast_align'
    with open(align_model_full+'.fwd_align', 'w') as fwd_align, \
            open(align_model_full+'.rev_align', 'w') as rev_align, \
            open(align_model_full+'.fwd_err', 'w') as fwd_err, \
            open(align_model_full+'.rev_err', 'w') as rev_err:
        fwd = Popen([fast_align, '-i'+clean_name, '-d', '-v', '-o', '-p'+align_model_full+'.fwd_params'], stdout=fwd_align, stderr=fwd_err)
        rev = Popen([fast_align, '-i'+clean_name, '-r', '-d', '-v', '-o', '-p'+align_model_full+'.rev_params'], stdout=rev_align, stderr=rev_err)
        fwd.wait()
        rev.wait()

    return align_model_full
Exemple #12
0

if __name__ == "__main__":
    # command line: two feature files, the learner to run and the data layout
    parser = ArgumentParser()
    parser.add_argument("train_file", action="store", help="path to the training features in CRFSuite format")
    parser.add_argument("test_file", action="store", help="path to the test features in CRFSuite format")
    parser.add_argument("method", help="crf_suite | crfpp | svm_light")
    parser.add_argument("representation", help="sequence | plain")
    parser.add_argument("--params", default='', help="training params, string")
    parser.add_argument("--test_params", default='', help="test params, string")
    parser.add_argument("--tmp", default=None, action="store", help="temporary directory")
    args = parser.parse_args()

    # working directory: --tmp if given, otherwise 'tmp_dir' next to this script
    if args.tmp is None:
        tmp_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'tmp_dir')
    else:
        tmp_dir = args.tmp
    tmp_dir = mk_tmp_dir(os.path.abspath(tmp_dir))

    # the stamp names this run's files: method plus the training parameters
    # with blanks turned into underscores
    stamp = args.method
    if args.params:
        stamp = stamp + '.' + '_'.join(args.params.split(' '))
    print("Stamp: ", stamp)

    # 'sequence' stays undefined for an unknown representation
    known_representations = {'sequence': True, 'plain': False}
    if args.representation in known_representations:
        sequence = known_representations[args.representation]
    else:
        print("Unknown representation: {}".format(args.representation))

    if args.method == 'crf_suite':
        model = os.path.join(tmp_dir, 'crfsuite_model_file' + stamp)
        test_tags = get_test_tags(args.test_file)
        learn_command = ['crfsuite', 'learn']
        learn_command.extend(args.params.split())
        learn_command.extend(['-m', model, args.train_file])
        call(learn_command)
Exemple #13
0
def main(config):
    """Build the datasets, extract features, train a CRF tagger and score it.

    Steps, all driven by ``config``:

    1. build the training and test representations with the configured
       generators and apply the optional extra representation generators;
    2. turn them into contexts, extract tags and features, optionally
       binarize the features;
    3. persist the features, then train and run CRF++ or CRFSuite through
       their command-line tools inside the temporary directory;
    4. parse the tagged test file and print F1 scores.

    :param config: experiment configuration. Keys read here: ``workers``,
        ``datasets`` (``training`` and ``test``), ``feature_extractors``,
        ``features`` (``binarize``, ``persist`` and the optional
        ``persist_format`` / ``persist_dir``), plus the optional top-level
        ``tmp_dir``, ``representations``, ``contexts``, ``persist_format``
        and ``crfsuite_algorithm`` (read for the ``crf_suite`` format).
        ``persist_format`` is written into ``config`` when it is missing.
    """
    workers = config['workers']
    tmp_dir = mk_tmp_dir(config['tmp_dir'] if 'tmp_dir' in config else None)
    time_stamp = str(time.time())

    # REPRESENTATION GENERATION
    # main representations (source, target, tags)
    # training: the outputs of all generators are concatenated key by key
    train_data = {}
    for gen in build_objects(config['datasets']['training']):
        data = gen.generate()
        for key in data:
            if key not in train_data:
                train_data[key] = []
            train_data[key].extend(data[key])
    # test: only the first configured test set is used
    test_data = build_object(config['datasets']['test'][0]).generate()

    logger.info("Train data keys: {}".format(train_data.keys()))
    logger.info("Train data sequences: {}".format(len(train_data['target'])))
    logger.info("Sample sequence: {}".format([w.encode('utf-8') for w in train_data['target'][0]]))

    # additional representations
    if 'representations' in config:
        representation_generators = build_objects(config['representations'])
    else:
        representation_generators = []
    for r in representation_generators:
        train_data = r.generate(train_data)
        test_data = r.generate(test_data)

    logger.info('here are the keys in your representations: {}'.format(train_data.keys()))

    # the data_type is the format corresponding to the model of the data that the user wishes to learn
    data_type = config['contexts'] if 'contexts' in config else 'plain'

    test_contexts = create_contexts(test_data, data_type=data_type)
    test_contexts_seq = create_contexts(test_data, data_type='sequential')
    train_contexts = create_contexts(train_data, data_type=data_type)

    logger.info('Vocabulary comparison -- coverage for each dataset: ')
    logger.info(compare_vocabulary([train_data['target'], test_data['target']]))
    # END REPRESENTATION GENERATION

    # FEATURE EXTRACTION
    train_tags = call_for_each_element(train_contexts, tags_from_contexts, data_type=data_type)
    test_tags = call_for_each_element(test_contexts, tags_from_contexts, data_type=data_type)
    # NOTE(review): not used further down in this function; kept because the
    # call was part of the original flow
    test_tags_seq = call_for_each_element(test_contexts_seq, tags_from_contexts, data_type='sequential')

    logger.info('creating feature extractors...')
    feature_extractors = build_objects(config['feature_extractors'])
    logger.info('mapping the feature extractors over the contexts for test...')
    test_features = call_for_each_element(test_contexts, contexts_to_features, [feature_extractors, workers], data_type=data_type)
    logger.info('mapping the feature extractors over the contexts for train...')
    train_features = call_for_each_element(train_contexts, contexts_to_features, [feature_extractors, workers], data_type=data_type)

    logger.info('number of training instances: {}'.format(len(train_features)))
    logger.info('number of testing instances: {}'.format(len(test_features)))

    logger.info('All of your features now exist in their raw representation, but they may not be numbers yet')
    # END FEATURE EXTRACTION

    # BEGIN CONVERTING FEATURES TO NUMBERS
    logger.info('binarization flag: {}'.format(config['features']['binarize']))
    if config['features']['binarize'] is True:
        logger.info('Binarizing your features...')
        # flatten so that we can properly binarize the features
        all_values = []
        if data_type == 'sequential':
            all_values = flatten(train_features)
        elif data_type == 'plain':
            all_values = train_features
        elif data_type == 'token':
            all_values = flatten(train_features.values())

        logger.info('fitting binarizers...')
        binarizers = fit_binarizers(all_values)
        logger.info('binarizing test data...')
        test_features = call_for_each_element(test_features, binarize, [binarizers], data_type=data_type)
        logger.info('binarizing training data...')
        # TODO: this line hangs with alignment+w2v
        train_features = call_for_each_element(train_features, binarize, [binarizers], data_type=data_type)

        logger.info('All of your features are now scalars in numpy arrays')
    logger.info('training and test sets successfully generated')

    # built once, after binarization, and shared by both persisting steps below
    experiment_datasets = [
        {'name': 'test', 'features': test_features, 'tags': test_tags},
        {'name': 'train', 'features': train_features, 'tags': train_tags},
    ]
    feature_names = [f for extractor in feature_extractors for f in extractor.get_feature_names()]

    # the way that we persist depends upon the structure of the data (plain/sequence/token_dict)
    # TODO: remove this once we have a list containing all datasets
    if config['features']['persist']:
        if 'persist_format' in config['features']:
            persist_format = config['features']['persist_format']
        else:
            persist_format = 'crf++'

        # a missing or empty persist_dir falls back to the working directory
        if 'persist_dir' in config['features'] and config['features']['persist_dir']:
            persist_dir = config['features']['persist_dir']
        else:
            persist_dir = os.getcwd()
        logger.info('persisting your features to: {}'.format(persist_dir))
        # for each dataset, write a file and persist the features
        for dataset_obj in experiment_datasets:
            persist_features(dataset_obj['name'], dataset_obj['features'], persist_dir, feature_names=feature_names, tags=dataset_obj['tags'], file_format=persist_format)

    # BEGIN LEARNING

    # TODO: different sequence learning modules need different representation, we should wrap them in a class
    # TODO: create a consistent interface to sequence learners, will need to use *args and **kwargs because APIs are very different
    from sklearn.metrics import f1_score, precision_score, recall_score
    import numpy as np

    print("FEATURE NAMES: ", feature_names)
    persist_dir = tmp_dir
    logger.info('persisting your features to: {}'.format(persist_dir))
    # for each dataset, write a file and persist the features
    if 'persist_format' not in config:
        config['persist_format'] = 'crf_suite'
    for dataset_obj in experiment_datasets:
        persist_features(dataset_obj['name']+time_stamp, dataset_obj['features'], persist_dir, feature_names=feature_names, tags=dataset_obj['tags'], file_format=config['persist_format'])

    feature_num = len(train_features[0][0])
    train_file = os.path.join(tmp_dir, 'train'+time_stamp+'.crf')
    test_file = os.path.join(tmp_dir, 'test'+time_stamp+'.crf')
    tagged_file = test_file+'.tagged'

    if config['persist_format'] == 'crf++':
        crfpp_model = os.path.join(tmp_dir, 'crfpp_model_file'+time_stamp)
        # generate a template for CRF++ feature extractor
        generate_crf_template(feature_num, 'template', tmp_dir)
        # train a CRF++ model
        call(['crf_learn', '-a', 'MIRA', os.path.join(tmp_dir, 'template'), train_file, crfpp_model])
        # tag a test set
        call(['crf_test', '-m', crfpp_model, '-o', tagged_file, test_file])
    elif config['persist_format'] == 'crf_suite':
        crfsuite_algorithm = config['crfsuite_algorithm']
        crfsuite_model = os.path.join(tmp_dir, 'crfsuite_model_file'+time_stamp)
        call(['crfsuite', 'learn', '-a', crfsuite_algorithm, '-m', crfsuite_model, train_file])
        # the tagger writes to stdout; the file is closed even if the call fails
        with open(tagged_file, 'w') as test_out:
            call(['crfsuite', 'tag', '-tr', '-m', crfsuite_model, test_file], stdout=test_out)
    else:
        print("Unknown persist format: {}".format(config['persist_format']))

    # parse CRFSuite output
    flattened_ref, flattened_hyp = [], []
    tag_map = {'OK': 1, 'BAD': 0}
    with open(tagged_file) as tagged:
        for line in tagged:
            chunks = line.strip('\n').split('\t')
            # blank separator lines and malformed rows carry no ref/hyp pair
            if len(chunks) != 2:
                continue
            # map both tags before appending so that an unknown tag in either
            # column skips the whole row and the two lists stay aligned
            try:
                ref_tag, hyp_tag = tag_map[chunks[0]], tag_map[chunks[1]]
            except KeyError:
                continue
            flattened_ref.append(ref_tag)
            flattened_hyp.append(hyp_tag)

    print("Ref, hyp: ", len(flattened_ref), len(flattened_hyp))
    logger.info('Structured prediction f1: ')
    print(f1_score(flattened_ref, flattened_hyp, average=None))
    print(f1_score(flattened_ref, flattened_hyp, average='weighted', pos_label=None))
    logger.info("Sequence correlation: ")
Exemple #14
0
def main(config):
    """Extract features for the configured datasets and persist them to disk.

    Steps, all driven by ``config``:

    1. build the training representation and, when configured, the test and
       dev representations, then apply the optional extra representation
       generators;
    2. turn them into contexts, extract tags and features, optionally
       binarize the features with binarizers fitted on the training data;
    3. write every dataset with ``persist_features`` and, for the ``crf++``
       format, generate a CRF++ template next to them.

    :param config: experiment configuration. Keys read here: ``workers``,
        ``tmp_dir``, ``datasets`` (``training``, optional ``test`` and
        ``dev``; only the first entry of each is used), ``data_type``,
        ``feature_extractors``, ``features`` (``binarize``), the optional
        ``representations``, and ``persist_dir`` / ``persist_format``, which
        are looked up at the top level first and in ``features`` otherwise.
    """
    workers = config['workers']
    tmp_dir = config['tmp_dir']
    tmp_dir = mk_tmp_dir(tmp_dir)

    # REPRESENTATION GENERATION
    # main representations (source, target, tags)
    # training
    train_data_generator = build_object(config['datasets']['training'][0])
    train_data = train_data_generator.generate()
    dev, test = False, False
    # test
    if 'test' in config['datasets']:
        test = True
        test_data_generator = build_object(config['datasets']['test'][0])
        test_data = test_data_generator.generate()

    # dev
    if 'dev' in config['datasets']:
        dev = True
        dev_data_generator = build_object(config['datasets']['dev'][0])
        dev_data = dev_data_generator.generate()

    # additional representations
    if 'representations' in config:
        representation_generators = build_objects(config['representations'])
    else:
        representation_generators = []
    for r in representation_generators:
        train_data = r.generate(train_data)
        if test:
            test_data = r.generate(test_data)
        if dev:
            dev_data = r.generate(dev_data)

    # debug output only: skipped when there is no test set or no alignments
    # representation, instead of aborting the run
    if test and 'alignments' in test_data:
        print("TEST DATA", test_data['alignments'][0])
    logger.info("Simple representations: {}".format(len(train_data['target'])))
    logger.info('here are the keys in your representations: {}'.format(train_data.keys()))

    # the data_type is the format corresponding to the model of the data that the user wishes to learn
    data_type = config['data_type']
    print("DATA TYPE:", data_type)
    print("Train data: ", len(train_data['target']))
    if dev:
        print("Dev data: ", len(dev_data['target']))
    if test:
        print("Test data: ", len(test_data['target']))
    print("In different representations: ")

    for rep in train_data:
        print(rep, len(train_data[rep]))

    train_contexts = create_contexts(train_data, data_type=data_type)
    if test:
        test_contexts = create_contexts(test_data, data_type=data_type)
        logger.info('Vocabulary comparison -- coverage for test dataset: ')
        logger.info(compare_vocabulary([train_data['target'], test_data['target']]))
    if dev:
        dev_contexts = create_contexts(dev_data, data_type=data_type)
    print("Train contexts: ", len(train_contexts))
    if dev:
        print("Dev contexts: ", len(dev_contexts))
    if test:
        print("Test contexts: ", len(test_contexts))
    print('Train context example: {}'.format(train_contexts[0]))
    # END REPRESENTATION GENERATION

    # FEATURE EXTRACTION
    train_tags = call_for_each_element(train_contexts, tags_from_contexts, data_type=data_type)
    if test:
        test_tags = call_for_each_element(test_contexts, tags_from_contexts, data_type=data_type)
    if dev:
        dev_tags = call_for_each_element(dev_contexts, tags_from_contexts, data_type=data_type)
    print("Train tags: ", len(train_tags))
    if dev:
        print("Dev tags: ", len(dev_tags))
    if test:
        print("Test tags: ", len(test_tags))

    logger.info('creating feature extractors...')
    feature_extractors = build_objects(config['feature_extractors'])
    if test:
        logger.info('mapping the feature extractors over the contexts for test...')
        test_features = call_for_each_element(test_contexts, contexts_to_features, [feature_extractors, workers], data_type=data_type)
        print("Test features sample: ", test_features[0])
    if dev:
        logger.info('mapping the feature extractors over the contexts for dev...')
        dev_features = call_for_each_element(dev_contexts, contexts_to_features, [feature_extractors, workers], data_type=data_type)
    logger.info('mapping the feature extractors over the contexts for train...')
    # NOTE(review): training uses a single worker while test/dev use
    # config['workers'] -- kept as found; confirm whether this is intended
    train_features = call_for_each_element(train_contexts, contexts_to_features, [feature_extractors, 1], data_type=data_type)
    print("Train features sample: ", train_features[0])

    logger.info('number of training instances: {}'.format(len(train_features)))
    if dev:
        logger.info('number of development instances: {}'.format(len(dev_features)))
    if test:
        logger.info('number of testing instances: {}'.format(len(test_features)))

    logger.info('All of your features now exist in their raw representation, but they may not be numbers yet')
    # END FEATURE EXTRACTION

    # binarizing features
    logger.info('binarization flag: {}'.format(config['features']['binarize']))
    if config['features']['binarize'] is True:
        logger.info('Binarizing your features...')
        # flatten so that we can properly binarize the features
        all_values = []
        if data_type == 'sequential':
            all_values = flatten(train_features)
        elif data_type == 'plain':
            all_values = train_features
        elif data_type == 'token':
            all_values = flatten(train_features.values())

        logger.info('fitting binarizers...')
        binarizers = fit_binarizers(all_values)
        # test and dev are optional, so each is binarized only when present;
        # dev goes through the same binarizers so that all persisted datasets
        # share one encoding
        if test:
            logger.info('binarizing test data...')
            test_features = call_for_each_element(test_features, binarize, [binarizers], data_type=data_type)
        if dev:
            logger.info('binarizing dev data...')
            dev_features = call_for_each_element(dev_features, binarize, [binarizers], data_type=data_type)
        logger.info('binarizing training data...')
        # TODO: this line hangs with alignment+w2v
        train_features = call_for_each_element(train_features, binarize, [binarizers], data_type=data_type)

        logger.info('All of your features are now scalars in numpy arrays')

    # persisting features
    logger.info('training and test sets successfully generated')

    experiment_datasets = [{'name': 'train', 'features': train_features, 'tags': train_tags}]
    if test:
        experiment_datasets.append({'name': 'test', 'features': test_features, 'tags': test_tags})
    if dev:
        experiment_datasets.append({'name': 'dev', 'features': dev_features, 'tags': dev_tags})
    feature_names = [f for extractor in feature_extractors for f in extractor.get_feature_names()]

    # top-level settings take precedence over the ones under 'features'
    persist_dir = config['persist_dir'] if 'persist_dir' in config else config['features']['persist_dir']
    persist_dir = mk_tmp_dir(persist_dir)
    persist_format = config['persist_format'] if 'persist_format' in config else config['features']['persist_format']
    logger.info('persisting your features to: {}'.format(persist_dir))
    # for each dataset, write a file and persist the features
    for dataset_obj in experiment_datasets:
        persist_features(dataset_obj['name'], dataset_obj['features'], persist_dir, feature_names=feature_names, tags=dataset_obj['tags'], file_format=persist_format)
    # generate a template for CRF++ feature extractor
    feature_num = len(feature_names)
    if persist_format == 'crf++':
        generate_crf_template(feature_num, 'template', persist_dir)

    logger.info('Features persisted to: {}'.format(', '.join([os.path.join(persist_dir, obj['name']) for obj in experiment_datasets])))
Exemple #15
0
def main(config):
    """Extract features, train an SVM-light model and score it on the test set.

    Steps: generate the raw representations, turn them into contexts, extract
    (and optionally binarize) features, write train/test files with
    ``persist_to_svm_blind``, run the external ``svm_learn`` and
    ``svm_classify`` binaries, then log per-class and weighted F1.

    Config keys read here:
        workers, tmp_dir, contexts, feature_extractors
        datasets: 'training' (all entries are used), 'test' (required, only
            the first entry is used), 'dev' (optional, first entry only)
        representations (optional)
        features: 'binarize', plus 'persist_dir' when there is no top-level
            'persist_dir'
        svm_params (optional): 'kernel', the value passed to ``svm_learn -t``
        svm_light_dir (optional): directory containing ``svm_learn`` and
            ``svm_classify``; defaults to the previously hard-coded location

    Raises:
        ValueError: if no test dataset is configured. Every stage after the
            feature extraction needs one, so we fail before doing any work
            instead of hitting a NameError half-way through.
    """
    workers = config['workers']
    # The returned path is not needed in this pipeline; the call is kept as in
    # the original (presumably it creates the directory -- TODO confirm).
    mk_tmp_dir(config['tmp_dir'])

    datasets = config['datasets']
    if 'test' not in datasets:
        raise ValueError("A 'test' dataset is required to train and evaluate the SVM model")

    # REPRESENTATION GENERATION
    # main representations (source, target, tags)
    # training: concatenate the output of every configured generator, key by key
    train_data = {}
    for gen in build_objects(datasets['training']):
        data = gen.generate()
        for key in data:
            train_data.setdefault(key, []).extend(data[key])

    # test (only the first configured generator is used)
    test_data = build_object(datasets['test'][0]).generate()

    # dev (optional, only the first configured generator is used)
    dev = 'dev' in datasets
    if dev:
        dev_data = build_object(datasets['dev'][0]).generate()

    # additional representations
    if 'representations' in config:
        representation_generators = build_objects(config['representations'])
    else:
        representation_generators = []
    for r in representation_generators:
        train_data = r.generate(train_data)
        test_data = r.generate(test_data)
        if dev:
            dev_data = r.generate(dev_data)

    logger.info("Simple representations: {}".format(len(train_data['target'])))
    logger.info('here are the keys in your representations: {}'.format(train_data.keys()))

    # the data_type is the format corresponding to the model of the data that the user wishes to learn
    data_type = config['contexts']
    print("DATA TYPE:", data_type)

    train_contexts = create_contexts(train_data, data_type=data_type)
    test_contexts = create_contexts(test_data, data_type=data_type)
    if dev:
        dev_contexts = create_contexts(dev_data, data_type=data_type)

    logger.info('Vocabulary comparison -- coverage for each dataset: ')
    logger.info(compare_vocabulary([train_data['target'], test_data['target']]))

    # END REPRESENTATION GENERATION

    # FEATURE EXTRACTION
    train_tags = call_for_each_element(train_contexts, tags_from_contexts, data_type=data_type)
    test_tags = call_for_each_element(test_contexts, tags_from_contexts, data_type=data_type)
    if dev:
        # NOTE(review): dev tags/features are computed but not used by the SVM stage below.
        dev_tags = call_for_each_element(dev_contexts, tags_from_contexts, data_type=data_type)

    logger.info('creating feature extractors...')
    feature_extractors = build_objects(config['feature_extractors'])

    logger.info('mapping the feature extractors over the contexts for test...')
    test_features = call_for_each_element(test_contexts, contexts_to_features, [feature_extractors, workers], data_type=data_type)
    print("Test features sample: ", test_features[0])
    if dev:
        logger.info('mapping the feature extractors over the contexts for dev...')
        dev_features = call_for_each_element(dev_contexts, contexts_to_features, [feature_extractors, workers], data_type=data_type)
    logger.info('mapping the feature extractors over the contexts for train...')
    # NOTE(review): the training set is mapped with 1 instead of `workers`, as in
    # the original -- looks deliberate, confirm before changing.
    train_features = call_for_each_element(train_contexts, contexts_to_features, [feature_extractors, 1], data_type=data_type)
    print("Train features sample: ", train_features[0])

    logger.info('number of training instances: {}'.format(len(train_features)))
    logger.info('number of testing instances: {}'.format(len(test_features)))

    logger.info('All of your features now exist in their raw representation, but they may not be numbers yet')
    # END FEATURE EXTRACTION

    # binarizing features
    logger.info('binarization flag: {}'.format(config['features']['binarize']))
    if config['features']['binarize'] is True:
        logger.info('Binarizing your features...')
        # flatten so that we can properly binarize the features
        all_values = []
        if data_type == 'sequential':
            all_values = flatten(train_features)
        elif data_type == 'plain':
            all_values = train_features
        elif data_type == 'token':
            all_values = flatten(train_features.values())

        logger.info('fitting binarizers...')
        # binarizers are fitted on training values only and then applied to both sets
        binarizers = fit_binarizers(all_values)
        logger.info('binarizing test data...')
        test_features = call_for_each_element(test_features, binarize, [binarizers], data_type=data_type)
        logger.info('binarizing training data...')
        # TODO: this line hangs with alignment+w2v
        train_features = call_for_each_element(train_features, binarize, [binarizers], data_type=data_type)

        logger.info('All of your features are now scalars in numpy arrays')

    # persisting features
    logger.info('training and test sets successfully generated')

    feature_names = [f for extractor in feature_extractors for f in extractor.get_feature_names()]
    persist_dir = config['persist_dir'] if 'persist_dir' in config else config['features']['persist_dir']
    persist_dir = mk_tmp_dir(persist_dir)
    train_file_name, test_file_name = persist_to_svm_blind(train_features, test_features, train_tags, test_tags, feature_names, persist_dir)

    # location of the SVM-light binaries; overridable so the script is usable
    # outside the machine it was written on
    svm_light_dir = '/export/tools/varvara/svm_multiclass/svm_light'
    if 'svm_light_dir' in config:
        svm_light_dir = config['svm_light_dir']

    model_name = os.path.join(persist_dir, 'model')
    logger.info("Start training")
    kernel = 0  # linear kernel (default)
    if 'svm_params' in config:
        # Validate the *requested* kernel. The previous check compared the
        # default (0) against 4, so it always passed and never fell back.
        requested_kernel = int(config['svm_params']['kernel'])
        if 0 <= requested_kernel <= 4:
            kernel = requested_kernel
        else:
            logger.warning('Unsupported SVM kernel type {}, using the linear kernel'.format(requested_kernel))
    call([os.path.join(svm_light_dir, 'svm_learn'), '-t', str(kernel), train_file_name, model_name])
    logger.info("Training completed, start testing")
    test_file = os.path.join(persist_dir, 'out')
    call([os.path.join(svm_light_dir, 'svm_classify'), '-f', '0', test_file_name, model_name, test_file])
    logger.info("Testing completed")

    predicted = get_test_score_blind(test_file)
    tag_map = {'OK': 1, 'BAD': 0}
    # assumes test_tags is a flat list of 'OK'/'BAD' labels -- TODO confirm for non-'plain' data types
    test_tags_num = [tag_map[t] for t in test_tags]
    # Reference labels go first: the weighted average is weighted by the
    # support of the first argument, which must be the true labels.
    logger.info(f1_score(test_tags_num, predicted, average=None))
    logger.info(f1_score(test_tags_num, predicted, average='weighted', pos_label=None))
Exemple #16
0
def main(config):
    """Extract n-gram-context features for train/test/dev and persist them.

    Steps: generate the raw representations, build sequential contexts with
    ``create_contexts_ngram``, extract features with the configured
    extractors, then write one file per dataset with ``persist_features``
    (plus a CRF++ template when the persist format is ``'crf++'``).

    Config keys read here:
        workers, tmp_dir, bad_tagging, unambiguous, feature_extractors,
        persist_format
        datasets: 'training' (required, first entry only), 'test' and 'dev'
            (both optional, first entry only)
        representations (optional)
        tags_format (optional, defaults to 'word')
        persist_dir (optional, defaults to the temporary directory)

    Raises:
        ValueError: if no training dataset is configured.
    """
    workers = config['workers']
    tmp_dir = mk_tmp_dir(config['tmp_dir'])

    # REPRESENTATION GENERATION
    # main representations (source, target, tags)
    datasets = config['datasets']
    # training -- everything below needs it, so a missing entry is reported
    # here instead of surfacing later as a NameError
    if 'training' not in datasets:
        raise ValueError("A 'training' dataset is required")
    train_data = build_object(datasets['training'][0]).generate()
    # test
    test = 'test' in datasets
    if test:
        test_data = build_object(datasets['test'][0]).generate()
    # dev
    dev = 'dev' in datasets
    if dev:
        dev_data = build_object(datasets['dev'][0]).generate()

    # additional representations
    if 'representations' in config:
        representation_generators = build_objects(config['representations'])
    else:
        representation_generators = []
    for r in representation_generators:
        train_data = r.generate(train_data)
        if test:
            test_data = r.generate(test_data)
        if dev:
            dev_data = r.generate(dev_data)

    logger.info("Simple representations: {}".format(len(train_data['target'])))
    logger.info('here are the keys in your representations: {}'.format(
        train_data.keys()))

    # the data_type is the format corresponding to the model of the data that the user wishes to learn
    data_type = 'sequential'

    bad_tagging = config['bad_tagging']
    tags_format = config['tags_format'] if 'tags_format' in config else 'word'
    unambiguous = config['unambiguous']

    def ngram_contexts(data, is_test):
        # The three datasets share every option except the `test` flag.
        return create_contexts_ngram(data,
                                     data_type=data_type,
                                     test=is_test,
                                     bad_tagging=bad_tagging,
                                     unambiguous=unambiguous,
                                     tags_format=tags_format)

    train_contexts = ngram_contexts(train_data, False)
    if test:
        test_contexts = ngram_contexts(test_data, True)
    if dev:
        # dev is built with test=True, exactly like the test set
        dev_contexts = ngram_contexts(dev_data, True)

    if test:
        # only meaningful (and only possible) when a test set was loaded
        logger.info('Vocabulary comparison -- coverage for each dataset: ')
        logger.info(compare_vocabulary([train_data['target'],
                                        test_data['target']]))

    # END REPRESENTATION GENERATION

    # FEATURE EXTRACTION
    train_tags = call_for_each_element(train_contexts,
                                       tags_from_contexts,
                                       data_type=data_type)
    if test:
        test_tags = call_for_each_element(test_contexts,
                                          tags_from_contexts,
                                          data_type=data_type)
    if dev:
        dev_tags = call_for_each_element(dev_contexts,
                                         tags_from_contexts,
                                         data_type=data_type)

    # word-level tags and phrase lengths
    if test:
        test_phrase_lengths = [
            get_contexts_words_number(cont) for cont in test_contexts
        ]
    if dev:
        dev_phrase_lengths = [
            get_contexts_words_number(cont) for cont in dev_contexts
        ]

    logger.info('creating feature extractors...')
    feature_extractors = build_objects(config['feature_extractors'])
    if test:
        logger.info(
            'mapping the feature extractors over the contexts for test...')
        test_features = call_for_each_element(test_contexts,
                                              contexts_to_features,
                                              [feature_extractors, workers],
                                              data_type=data_type)
    if dev:
        logger.info(
            'mapping the feature extractors over the contexts for dev...')
        dev_features = call_for_each_element(dev_contexts,
                                             contexts_to_features,
                                             [feature_extractors, workers],
                                             data_type=data_type)
    logger.info(
        'mapping the feature extractors over the contexts for train...')
    train_features = call_for_each_element(train_contexts,
                                           contexts_to_features,
                                           [feature_extractors, workers],
                                           data_type=data_type)

    logger.info('number of training instances: {}'.format(len(train_features)))
    if test:
        logger.info('number of testing instances: {}'.format(len(test_features)))

    logger.info(
        'All of your features now exist in their raw representation, but they may not be numbers yet'
    )
    # END FEATURE EXTRACTION

    # persisting features
    logger.info('training and test sets successfully generated')

    # no phrase lengths are collected for the training set
    experiment_datasets = [{
        'name': 'train',
        'features': train_features,
        'tags': train_tags,
        'phrase_lengths': None
    }]
    if test:
        experiment_datasets.append({
            'name': 'test',
            'features': test_features,
            'tags': test_tags,
            'phrase_lengths': test_phrase_lengths
        })
    if dev:
        experiment_datasets.append({
            'name': 'dev',
            'features': dev_features,
            'tags': dev_tags,
            'phrase_lengths': dev_phrase_lengths
        })
    feature_names = [
        f for extractor in feature_extractors
        for f in extractor.get_feature_names()
    ]

    persist_dir = config['persist_dir'] if 'persist_dir' in config else tmp_dir
    persist_dir = mk_tmp_dir(persist_dir)
    persist_format = config['persist_format']
    logger.info('persisting your features to: {}'.format(persist_dir))
    # for each dataset, write a file and persist the features
    for dataset_obj in experiment_datasets:
        # NOTE(review): tags=None is passed although each dataset carries its
        # tags -- kept as in the original, confirm this is intended.
        persist_features(dataset_obj['name'],
                         dataset_obj['features'],
                         persist_dir,
                         feature_names=feature_names,
                         phrase_lengths=dataset_obj['phrase_lengths'],
                         tags=None,
                         file_format=persist_format)
    # generate a template for CRF++ feature extractor
    feature_num = len(feature_names)
    if persist_format == 'crf++':
        generate_crf_template(feature_num, 'template', persist_dir)

    logger.info('Features persisted to: {}'.format(', '.join([
        os.path.join(persist_dir, obj['name']) for obj in experiment_datasets
    ])))
Exemple #17
0
def main(config):
    """Extract features, train a CRF++ or CRFSuite tagger and score the test set.

    Steps: generate the raw representations, build contexts, extract (and
    optionally binarize) features, optionally persist them to the configured
    feature directory, persist them again (time-stamped) to the temporary
    directory, call the external CRF tools on those files, then parse the
    tagged output and print per-class and weighted F1.

    Config keys read here:
        workers, feature_extractors
        tmp_dir (optional)
        datasets: 'training' (all entries are used), 'test' (first entry only)
        representations (optional)
        contexts (optional, defaults to 'plain')
        features: 'binarize', 'persist', and -- when 'persist' is truthy --
            optional 'persist_format' (default 'crf++') and 'persist_dir'
            (default: current working directory)
        persist_format (optional, defaults to 'crf_suite'): format used for
            the files handed to the learner
        crfsuite_algorithm: required when the learner format is 'crf_suite'

    Raises:
        ValueError: if the learner format is neither 'crf++' nor 'crf_suite'.
    """
    workers = config['workers']
    tmp_dir = mk_tmp_dir(config['tmp_dir'] if 'tmp_dir' in config else None)
    # suffix that keeps the files of concurrent/successive runs apart
    time_stamp = str(time.time())

    # REPRESENTATION GENERATION
    # main representations (source, target, tags)
    # training: concatenate the output of every configured generator, key by key
    train_data = {}
    for gen in build_objects(config['datasets']['training']):
        data = gen.generate()
        for key in data:
            train_data.setdefault(key, []).extend(data[key])
    # test
    test_data = build_object(config['datasets']['test'][0]).generate()

    logger.info("Train data keys: {}".format(train_data.keys()))
    logger.info("Train data sequences: {}".format(len(train_data['target'])))
    logger.info("Sample sequence: {}".format(
        [w.encode('utf-8') for w in train_data['target'][0]]))

    # additional representations
    if 'representations' in config:
        representation_generators = build_objects(config['representations'])
    else:
        representation_generators = []
    for r in representation_generators:
        train_data = r.generate(train_data)
        test_data = r.generate(test_data)

    # NOTE: the 'multiply_data_train' / 'multiply_data_test' data-multiplication
    # options that used to be applied here were already disabled.

    logger.info('here are the keys in your representations: {}'.format(
        train_data.keys()))

    # the data_type is the format corresponding to the model of the data that the user wishes to learn
    data_type = config['contexts'] if 'contexts' in config else 'plain'

    test_contexts = create_contexts(test_data, data_type=data_type)
    test_contexts_seq = create_contexts(test_data, data_type='sequential')
    train_contexts = create_contexts(train_data, data_type=data_type)

    logger.info('Vocabulary comparison -- coverage for each dataset: ')
    logger.info(compare_vocabulary([train_data['target'],
                                    test_data['target']]))

    # END REPRESENTATION GENERATION

    # FEATURE EXTRACTION
    train_tags = call_for_each_element(train_contexts,
                                       tags_from_contexts,
                                       data_type=data_type)
    test_tags = call_for_each_element(test_contexts,
                                      tags_from_contexts,
                                      data_type=data_type)
    # NOTE(review): the sequential test tags are not used in the visible part
    # of this function (it ends right after the "Sequence correlation" log).
    test_tags_seq = call_for_each_element(test_contexts_seq,
                                          tags_from_contexts,
                                          data_type='sequential')

    logger.info('creating feature extractors...')
    feature_extractors = build_objects(config['feature_extractors'])
    logger.info('mapping the feature extractors over the contexts for test...')
    test_features = call_for_each_element(test_contexts,
                                          contexts_to_features,
                                          [feature_extractors, workers],
                                          data_type=data_type)
    logger.info(
        'mapping the feature extractors over the contexts for train...')
    train_features = call_for_each_element(train_contexts,
                                           contexts_to_features,
                                           [feature_extractors, workers],
                                           data_type=data_type)

    logger.info('number of training instances: {}'.format(len(train_features)))
    logger.info('number of testing instances: {}'.format(len(test_features)))

    logger.info(
        'All of your features now exist in their raw representation, but they may not be numbers yet'
    )
    # END FEATURE EXTRACTION

    # BEGIN CONVERTING FEATURES TO NUMBERS
    logger.info('binarization flag: {}'.format(config['features']['binarize']))
    if config['features']['binarize'] is True:
        logger.info('Binarizing your features...')
        # flatten so that we can properly binarize the features
        all_values = []
        if data_type == 'sequential':
            all_values = flatten(train_features)
        elif data_type == 'plain':
            all_values = train_features
        elif data_type == 'token':
            all_values = flatten(train_features.values())

        logger.info('fitting binarizers...')
        # binarizers are fitted on training values only and then applied to both sets
        binarizers = fit_binarizers(all_values)
        logger.info('binarizing test data...')
        test_features = call_for_each_element(test_features,
                                              binarize, [binarizers],
                                              data_type=data_type)
        logger.info('binarizing training data...')
        # TODO: this line hangs with alignment+w2v
        train_features = call_for_each_element(train_features,
                                               binarize, [binarizers],
                                               data_type=data_type)

        logger.info('All of your features are now scalars in numpy arrays')
    logger.info('training and test sets successfully generated')

    # Built once, after binarization, and shared by both persisting steps below.
    experiment_datasets = [{
        'name': 'test',
        'features': test_features,
        'tags': test_tags
    }, {
        'name': 'train',
        'features': train_features,
        'tags': train_tags
    }]
    feature_names = [
        f for extractor in feature_extractors
        for f in extractor.get_feature_names()
    ]

    # the way that we persist depends upon the structure of the data (plain/sequence/token_dict)
    # TODO: remove this once we have a list containing all datasets
    feature_config = config['features']
    if feature_config['persist']:
        if 'persist_format' in feature_config:
            persist_format = feature_config['persist_format']
        else:
            persist_format = 'crf++'

        if 'persist_dir' in feature_config and feature_config['persist_dir']:
            persist_dir = feature_config['persist_dir']
        else:
            # os.getcwd(), not os.path.getcwd(): the latter does not exist
            persist_dir = os.getcwd()
        logger.info('persisting your features to: {}'.format(persist_dir))
        # for each dataset, write a file and persist the features
        for dataset_obj in experiment_datasets:
            persist_features(dataset_obj['name'],
                             dataset_obj['features'],
                             persist_dir,
                             feature_names=feature_names,
                             tags=dataset_obj['tags'],
                             file_format=persist_format)

    # BEGIN LEARNING

    # TODO: different sequence learning modules need different representation, we should wrap them in a class
    # TODO: create a consistent interface to sequence learners, will need to use *args and **kwargs because APIs are very different
    from sklearn.metrics import f1_score, precision_score, recall_score
    import numpy as np

    print("FEATURE NAMES: ", feature_names)
    persist_dir = tmp_dir
    logger.info('persisting your features to: {}'.format(persist_dir))
    # Format of the files handed to the learner. Resolved into a local so the
    # caller's config is not modified just to apply a default.
    if 'persist_format' in config:
        learner_format = config['persist_format']
    else:
        learner_format = 'crf_suite'
    # for each dataset, write a file and persist the features
    for dataset_obj in experiment_datasets:
        persist_features(dataset_obj['name'] + time_stamp,
                         dataset_obj['features'],
                         persist_dir,
                         feature_names=feature_names,
                         tags=dataset_obj['tags'],
                         file_format=learner_format)

    # presumably persist_features writes '<name>.crf' for these formats -- TODO confirm
    train_file = os.path.join(tmp_dir, 'train' + time_stamp + '.crf')
    test_file = os.path.join(tmp_dir, 'test' + time_stamp + '.crf')
    tagged_file = test_file + '.tagged'

    if learner_format == 'crf++':
        # generate a template for CRF++ feature extractor
        # (the feature count is only needed -- and only computed -- here)
        feature_num = len(train_features[0][0])
        generate_crf_template(feature_num, 'template', tmp_dir)
        crfpp_model = os.path.join(tmp_dir, 'crfpp_model_file' + time_stamp)
        # train a CRF++ model
        call([
            'crf_learn', '-a', 'MIRA',
            os.path.join(tmp_dir, 'template'), train_file, crfpp_model
        ])
        # tag a test set
        call(['crf_test', '-m', crfpp_model, '-o', tagged_file, test_file])
    elif learner_format == 'crf_suite':
        crfsuite_algorithm = config['crfsuite_algorithm']
        crfsuite_model = os.path.join(tmp_dir,
                                      'crfsuite_model_file' + time_stamp)
        call([
            'crfsuite', 'learn', '-a', crfsuite_algorithm, '-m',
            crfsuite_model, train_file
        ])
        # crfsuite writes its tagging to stdout; capture it in the tagged file
        with open(tagged_file, 'w') as test_out:
            call(['crfsuite', 'tag', '-tr', '-m', crfsuite_model, test_file],
                 stdout=test_out)
    else:
        # Nothing was trained or tagged, so there is no output to evaluate.
        raise ValueError("Unknown persist format: {}".format(learner_format))

    # parse CRFSuite output
    # NOTE(review): only lines with exactly two tab-separated fields are read;
    # confirm this also matches the tagged output produced in the CRF++ branch.
    flattened_ref, flattened_hyp = [], []
    tag_map = {'OK': 1, 'BAD': 0}
    with open(tagged_file) as tagged:
        for line in tagged:
            if line == "\n":
                continue
            chunks = line.strip('\n').split('\t')
            if len(chunks) != 2:
                continue
            ref_tag, hyp_tag = chunks
            # Check both tags before appending either one, so an unknown
            # hypothesis tag cannot leave the two lists with different lengths.
            if ref_tag not in tag_map or hyp_tag not in tag_map:
                continue
            flattened_ref.append(tag_map[ref_tag])
            flattened_hyp.append(tag_map[hyp_tag])

    print("Ref, hyp: ", len(flattened_ref), len(flattened_hyp))
    logger.info('Structured prediction f1: ')
    print(f1_score(flattened_ref, flattened_hyp, average=None))
    print(
        f1_score(flattened_ref,
                 flattened_hyp,
                 average='weighted',
                 pos_label=None))
    logger.info("Sequence correlation: ")
def main(config):
    """Extract features for train/test/dev, optionally binarize, and persist them.

    Steps: generate the raw representations, build contexts of the configured
    type, extract features with the configured extractors, optionally
    binarize them, then write one file per dataset with ``persist_features``
    (plus a CRF++ template when the persist format is ``'crf++'``).

    Config keys read here:
        workers, tmp_dir, contexts, feature_extractors
        datasets: 'training' (first entry only), 'test' and 'dev' (both
            optional, first entry only)
        representations (optional)
        features: 'binarize', plus 'persist_dir' / 'persist_format' when the
            top-level keys of the same name are absent
    """
    workers = config['workers']
    # The returned path is not needed in this pipeline; the call is kept as in
    # the original (presumably it creates the directory -- TODO confirm).
    mk_tmp_dir(config['tmp_dir'])

    # REPRESENTATION GENERATION
    # main representations (source, target, tags)
    # training (only the first configured generator is used)
    datasets = config['datasets']
    train_data = build_object(datasets['training'][0]).generate()
    # test
    test = 'test' in datasets
    if test:
        test_data = build_object(datasets['test'][0]).generate()
    # dev
    dev = 'dev' in datasets
    if dev:
        dev_data = build_object(datasets['dev'][0]).generate()

    # additional representations
    if 'representations' in config:
        representation_generators = build_objects(config['representations'])
    else:
        representation_generators = []
    for r in representation_generators:
        train_data = r.generate(train_data)
        if test:
            test_data = r.generate(test_data)
        if dev:
            dev_data = r.generate(dev_data)

    # debug output; guarded so a run without a test set (or without an
    # 'alignments' representation) does not crash on a diagnostic print
    if test and 'alignments' in test_data:
        print("TEST DATA", test_data['alignments'][0])
    logger.info("Simple representations: {}".format(len(train_data['target'])))
    logger.info('here are the keys in your representations: {}'.format(
        train_data.keys()))

    # the data_type is the format corresponding to the model of the data that the user wishes to learn
    data_type = config['contexts']
    print("DATA TYPE:", data_type)

    train_contexts = create_contexts(train_data, data_type=data_type)
    if test:
        test_contexts = create_contexts(test_data, data_type=data_type)
        logger.info('Vocabulary comparison -- coverage for test dataset: ')
        logger.info(
            compare_vocabulary([train_data['target'], test_data['target']]))
    if dev:
        dev_contexts = create_contexts(dev_data, data_type=data_type)

    # END REPRESENTATION GENERATION

    # FEATURE EXTRACTION
    train_tags = call_for_each_element(train_contexts,
                                       tags_from_contexts,
                                       data_type=data_type)
    if test:
        test_tags = call_for_each_element(test_contexts,
                                          tags_from_contexts,
                                          data_type=data_type)
    if dev:
        dev_tags = call_for_each_element(dev_contexts,
                                         tags_from_contexts,
                                         data_type=data_type)

    logger.info('creating feature extractors...')
    feature_extractors = build_objects(config['feature_extractors'])
    if test:
        logger.info(
            'mapping the feature extractors over the contexts for test...')
        test_features = call_for_each_element(test_contexts,
                                              contexts_to_features,
                                              [feature_extractors, workers],
                                              data_type=data_type)
        print("Test features sample: ", test_features[0])
    if dev:
        logger.info(
            'mapping the feature extractors over the contexts for dev...')
        dev_features = call_for_each_element(dev_contexts,
                                             contexts_to_features,
                                             [feature_extractors, workers],
                                             data_type=data_type)
    logger.info(
        'mapping the feature extractors over the contexts for train...')
    # NOTE(review): the training set is mapped with 1 instead of `workers`, as in
    # the original -- looks deliberate, confirm before changing.
    train_features = call_for_each_element(train_contexts,
                                           contexts_to_features,
                                           [feature_extractors, 1],
                                           data_type=data_type)
    print("Train features sample: ", train_features[0])

    logger.info('number of training instances: {}'.format(len(train_features)))
    if test:
        logger.info('number of testing instances: {}'.format(len(test_features)))

    logger.info(
        'All of your features now exist in their raw representation, but they may not be numbers yet'
    )
    # END FEATURE EXTRACTION

    # binarizing features
    logger.info('binarization flag: {}'.format(config['features']['binarize']))
    if config['features']['binarize'] is True:
        logger.info('Binarizing your features...')
        # flatten so that we can properly binarize the features
        all_values = []
        if data_type == 'sequential':
            all_values = flatten(train_features)
        elif data_type == 'plain':
            all_values = train_features
        elif data_type == 'token':
            all_values = flatten(train_features.values())

        logger.info('fitting binarizers...')
        # binarizers are fitted on training values only and then applied to
        # every dataset that gets persisted below
        binarizers = fit_binarizers(all_values)
        if test:
            logger.info('binarizing test data...')
            test_features = call_for_each_element(test_features,
                                                  binarize, [binarizers],
                                                  data_type=data_type)
        if dev:
            # dev is persisted together with train/test, so it has to go
            # through the same transformation
            logger.info('binarizing dev data...')
            dev_features = call_for_each_element(dev_features,
                                                 binarize, [binarizers],
                                                 data_type=data_type)
        logger.info('binarizing training data...')
        # TODO: this line hangs with alignment+w2v
        train_features = call_for_each_element(train_features,
                                               binarize, [binarizers],
                                               data_type=data_type)

        logger.info('All of your features are now scalars in numpy arrays')

    # persisting features
    logger.info('training and test sets successfully generated')

    experiment_datasets = [{
        'name': 'train',
        'features': train_features,
        'tags': train_tags
    }]
    if test:
        experiment_datasets.append({
            'name': 'test',
            'features': test_features,
            'tags': test_tags
        })
    if dev:
        experiment_datasets.append({
            'name': 'dev',
            'features': dev_features,
            'tags': dev_tags
        })
    feature_names = [
        f for extractor in feature_extractors
        for f in extractor.get_feature_names()
    ]

    # top-level keys win over the ones nested under 'features'
    if 'persist_dir' in config:
        persist_dir = config['persist_dir']
    else:
        persist_dir = config['features']['persist_dir']
    persist_dir = mk_tmp_dir(persist_dir)
    if 'persist_format' in config:
        persist_format = config['persist_format']
    else:
        persist_format = config['features']['persist_format']
    logger.info('persisting your features to: {}'.format(persist_dir))
    # for each dataset, write a file and persist the features
    for dataset_obj in experiment_datasets:
        # NOTE(review): tags=None is passed although each dataset carries its
        # tags (an earlier variant passed dataset_obj['tags']) -- kept as in
        # the original, confirm this is intended.
        persist_features(dataset_obj['name'],
                         dataset_obj['features'],
                         persist_dir,
                         feature_names=feature_names,
                         tags=None,
                         file_format=persist_format)
    # generate a template for CRF++ feature extractor
    feature_num = len(feature_names)
    if persist_format == 'crf++':
        generate_crf_template(feature_num, 'template', persist_dir)

    logger.info('Features persisted to: {}'.format(', '.join([
        os.path.join(persist_dir, obj['name']) for obj in experiment_datasets
    ])))
def main(config):
    """Build representations, extract features and persist them to disk.

    The pipeline has three stages:

    1. Representation generation: every configured dataset is generated and
       then passed through each additional representation generator.
    2. Feature extraction: contexts, tags and features are computed for each
       dataset; phrase lengths are computed for test and dev only.
    3. Persistence: each dataset is written to the persist directory and,
       for the ``crf++`` format, a CRF++ template is generated as well.

    Args:
        config: mapping with the experiment settings. Keys read here:
            ``workers``, ``tmp_dir``, ``datasets`` (``training`` is
            required, ``test`` and ``dev`` are optional),
            ``representations`` (optional), ``bad_tagging``,
            ``unambiguous``, ``tags_format`` (optional, defaults to
            ``'word'``), ``feature_extractors``, ``persist_dir`` (optional,
            defaults to the temporary directory) and ``persist_format``.

    Raises:
        ValueError: if ``config['datasets']`` has no ``training`` entry.
        KeyError: if any other required configuration key is missing.
    """
    workers = config['workers']
    tmp_dir = mk_tmp_dir(config['tmp_dir'])

    # REPRESENTATION GENERATION
    # main representations (source, target, tags)
    datasets = config['datasets']
    if 'training' not in datasets:
        # Everything below needs the training data; fail early with a clear
        # message instead of an UnboundLocalError further down.
        raise ValueError("config['datasets'] must define a 'training' dataset")
    test = 'test' in datasets
    dev = 'dev' in datasets

    # only the first generator listed for each dataset is used
    train_data = build_object(datasets['training'][0]).generate()
    if test:
        test_data = build_object(datasets['test'][0]).generate()
    if dev:
        dev_data = build_object(datasets['dev'][0]).generate()

    # additional representations
    if 'representations' in config:
        representation_generators = build_objects(config['representations'])
    else:
        representation_generators = []
    for r in representation_generators:
        train_data = r.generate(train_data)
        if test:
            test_data = r.generate(test_data)
        if dev:
            dev_data = r.generate(dev_data)

    logger.info("Simple representations: {}".format(len(train_data['target'])))
    logger.info('here are the keys in your representations: {}'.format(train_data.keys()))

    # the data_type is the format corresponding to the model of the data that the user wishes to learn
    data_type = 'sequential'

    bad_tagging = config['bad_tagging']
    tags_format = config.get('tags_format', 'word')
    unambiguous = config['unambiguous']
    # the training contexts are built with test=False, test and dev with test=True
    train_contexts = create_contexts_ngram(train_data, data_type=data_type, test=False, bad_tagging=bad_tagging, unambiguous=unambiguous, tags_format=tags_format)
    if test:
        test_contexts = create_contexts_ngram(test_data, data_type=data_type, test=True, bad_tagging=bad_tagging, unambiguous=unambiguous, tags_format=tags_format)
    if dev:
        dev_contexts = create_contexts_ngram(dev_data, data_type=data_type, test=True, bad_tagging=bad_tagging, unambiguous=unambiguous, tags_format=tags_format)

    # The comparison is train vs. test, so it only makes sense (and test_data
    # only exists) when a test dataset was configured.
    if test:
        logger.info('Vocabulary comparison -- coverage for each dataset: ')
        logger.info(compare_vocabulary([train_data['target'], test_data['target']]))

    # END REPRESENTATION GENERATION

    # FEATURE EXTRACTION
    train_tags = call_for_each_element(train_contexts, tags_from_contexts, data_type=data_type)
    if test:
        test_tags = call_for_each_element(test_contexts, tags_from_contexts, data_type=data_type)
    if dev:
        dev_tags = call_for_each_element(dev_contexts, tags_from_contexts, data_type=data_type)

    # word-level tags and phrase lengths (not computed for the training set)
    if test:
        test_phrase_lengths = [get_contexts_words_number(cont) for cont in test_contexts]
    if dev:
        dev_phrase_lengths = [get_contexts_words_number(cont) for cont in dev_contexts]

    logger.info('creating feature extractors...')
    feature_extractors = build_objects(config['feature_extractors'])
    if test:
        logger.info('mapping the feature extractors over the contexts for test...')
        test_features = call_for_each_element(test_contexts, contexts_to_features, [feature_extractors, workers], data_type=data_type)
    if dev:
        logger.info('mapping the feature extractors over the contexts for dev...')
        dev_features = call_for_each_element(dev_contexts, contexts_to_features, [feature_extractors, workers], data_type=data_type)
    logger.info('mapping the feature extractors over the contexts for train...')
    train_features = call_for_each_element(train_contexts, contexts_to_features, [feature_extractors, workers], data_type=data_type)

    logger.info('number of training instances: {}'.format(len(train_features)))
    if test:
        # test_features is only bound when a test dataset was configured
        logger.info('number of testing instances: {}'.format(len(test_features)))

    logger.info('All of your features now exist in their raw representation, but they may not be numbers yet')
    # END FEATURE EXTRACTION

    # persisting features
    logger.info('training and test sets successfully generated')

    # the training set carries no phrase lengths, hence None
    experiment_datasets = [{'name': 'train', 'features': train_features, 'tags': train_tags, 'phrase_lengths': None}]
    if test:
        experiment_datasets.append({'name': 'test', 'features': test_features, 'tags': test_tags, 'phrase_lengths': test_phrase_lengths})
    if dev:
        experiment_datasets.append({'name': 'dev', 'features': dev_features, 'tags': dev_tags, 'phrase_lengths': dev_phrase_lengths})
    feature_names = [f for extractor in feature_extractors for f in extractor.get_feature_names()]

    # fall back to the temporary directory when no persist_dir is configured
    persist_dir = mk_tmp_dir(config.get('persist_dir', tmp_dir))
    persist_format = config['persist_format']
    logger.info('persisting your features to: {}'.format(persist_dir))
    # for each dataset, write a file and persist the features
    for dataset_obj in experiment_datasets:
        persist_features(dataset_obj['name'],
                         dataset_obj['features'],
                         persist_dir,
                         feature_names=feature_names,
                         phrase_lengths=dataset_obj['phrase_lengths'],
                         tags=dataset_obj['tags'],
                         file_format=persist_format)
    # generate a template for CRF++ feature extractor
    if persist_format == 'crf++':
        generate_crf_template(len(feature_names), 'template', persist_dir)

    persisted_paths = [os.path.join(persist_dir, obj['name']) for obj in experiment_datasets]
    logger.info('Features persisted to: {}'.format(', '.join(persisted_paths)))
    def __init__(self, tagger, parameters, data_label, tmp_dir=None):
        """Record the tagger settings and set up the working directory.

        ``tmp_dir`` is handed to ``mk_tmp_dir`` and the value it returns is
        kept as ``self.tmp_dir``. ``tagger`` and ``parameters`` are stored
        unchanged, and ``data_label`` is stored as ``self.data``.
        """
        # the directory is resolved first, as in the original statement order
        working_dir = mk_tmp_dir(tmp_dir)
        self.tmp_dir = working_dir
        self.tagger, self.parameters, self.data = tagger, parameters, data_label
Exemple #21
0
    parser.add_argument("method", help="crf_suite | crfpp | svm_light")
    parser.add_argument("representation", help="sequence | plain")
    parser.add_argument("--params", default='', help="training params, string")
    parser.add_argument("--test_params",
                        default='',
                        help="test params, string")
    parser.add_argument("--tmp",
                        default=None,
                        action="store",
                        help="temporary directory")
    args = parser.parse_args()

    tmp_dir = args.tmp if args.tmp is not None else os.path.join(
        os.path.dirname(os.path.realpath(__file__)), 'tmp_dir')
    tmp_dir = os.path.abspath(tmp_dir)
    tmp_dir = mk_tmp_dir(tmp_dir)
    stamp = args.method
    if args.params != '':
        stamp += ('.' + args.params.replace(' ', '_'))
    print("Stamp: ", stamp)
    if args.representation == 'sequence':
        sequence = True
    elif args.representation == 'plain':
        sequence = False
    else:
        print("Unknown representation: {}".format(args.representation))

    if args.method == 'crf_suite':
        model = os.path.join(tmp_dir, 'crfsuite_model_file' + stamp)
        test_tags = get_test_tags(args.test_file)
        call(['crfsuite', 'learn'] + args.params.split() +
Exemple #22
0
    def __init__(self, tagger, parameters, data_label, tmp_dir=None):
        """Set up the working directory and remember the tagger settings.

        The result of ``mk_tmp_dir(tmp_dir)`` becomes ``self.tmp_dir``;
        ``tagger`` and ``parameters`` are kept as given, and ``data_label``
        is exposed as ``self.data``.
        """
        # resolve the directory before storing anything else
        resolved_tmp_dir = mk_tmp_dir(tmp_dir)
        self.tmp_dir = resolved_tmp_dir
        self.tagger, self.parameters = tagger, parameters
        self.data = data_label