def train_classifer_on_fold(essays_TD, essays_VD, regular_tags, fold, code_freq, training_opt):
    """Train a single CRF tagger for one fold and collect per-code binary tags.

    The training (TD) and validation (VD) essays are converted to tagged
    sentences with ``to_most_common_code_tagged_sentences``; a ``CRFTagger``
    is trained on the TD sentences and then used to tag both sets.

    :param essays_TD: training essays, passed through to the sentence helpers.
    :param essays_VD: validation essays, passed through to the sentence helpers.
    :param regular_tags: the codes handed to every conversion helper.
    :param fold: fold number; used in the log line and the model file name.
    :param code_freq: passed to ``to_most_common_code_tagged_sentences``.
    :param training_opt: training options forwarded to ``CRFTagger``.
    :return: ``(wd_td_ys_bytag, wd_vd_ys_bytag, td_wd_predictions_by_code,
        vd_wd_predictions_by_code)`` - actual TD/VD tags followed by predicted
        TD/VD tags, each as returned by ``to_flattened_binary_tags_by_code``.
    """
    print("Fold %i Training code" % fold)

    # For training
    td_sents = to_most_common_code_tagged_sentences(essays_TD, regular_tags, code_freq)
    vd_sents = to_most_common_code_tagged_sentences(essays_VD, regular_tags, code_freq)

    model_filename = models_folder + "/" + "%i_%s__%s" % (fold, "most_freq_code", str(randint(0, 9999999)))

    model = CRFTagger(feature_func=comp_feat_extactor, verbose=False, training_opt=training_opt)
    try:
        model.train(td_sents, model_filename)

        td_predictions = model.tag_sents(to_sentences(td_sents))
        vd_predictions = model.tag_sents(to_sentences(vd_sents))
    finally:
        # The file name is randomised, so nothing else will ever clean it up:
        # remove it even when training or tagging fails. The existence check
        # keeps a failed train() from being masked by a FileNotFoundError.
        if os.path.exists(model_filename):
            os.remove(model_filename)

    # for evaluation - binary tags
    # YS (ACTUAL)
    td_sents_pset = to_label_powerset_tagged_sentences(essays_TD, regular_tags)
    vd_sents_pset = to_label_powerset_tagged_sentences(essays_VD, regular_tags)

    wd_td_ys_bytag = to_flattened_binary_tags_by_code(td_sents_pset, regular_tags)
    wd_vd_ys_bytag = to_flattened_binary_tags_by_code(vd_sents_pset, regular_tags)

    # YS (PREDICTED)
    td_wd_predictions_by_code = to_flattened_binary_tags_by_code(td_predictions, regular_tags)
    vd_wd_predictions_by_code = to_flattened_binary_tags_by_code(vd_predictions, regular_tags)

    return wd_td_ys_bytag, wd_vd_ys_bytag, td_wd_predictions_by_code, vd_wd_predictions_by_code
def train_classifer_on_fold(essays_TD, essays_VD, regular_tags, fold):
    """Train one CRF tagger per code for a fold and collect binary tags.

    For every code in ``regular_tags`` (in sorted order) a separate
    ``CRFTagger`` is trained on that code's training sentences and used to tag
    both the training (TD) and validation (VD) sentences.

    :param essays_TD: training essays, passed to ``to_tagged_sentences_by_code``.
    :param essays_VD: validation essays, passed to ``to_tagged_sentences_by_code``.
    :param regular_tags: iterable of codes; one model is trained per code.
    :param fold: fold number; used in the log line and the model file name.
    :return: four dicts keyed by code - actual TD tags, actual VD tags,
        predicted TD tags, predicted VD tags - whose values are whatever
        ``to_flattened_binary_tags`` returns.
    """
    td_sents_by_code = to_tagged_sentences_by_code(essays_TD, regular_tags)
    vd_sents_by_code = to_tagged_sentences_by_code(essays_VD, regular_tags)

    wd_td_ys_bytag = dict()
    wd_vd_ys_bytag = dict()
    td_wd_predictions_by_code = dict()
    vd_wd_predictions_by_code = dict()

    for code in sorted(regular_tags):
        print("Fold %i Training code: %s" % (fold, code))
        td, vd = td_sents_by_code[code], vd_sents_by_code[code]

        model_filename = models_folder + "/" + "%i_%s__%s" % (fold, code, str(randint(0, 9999999)))

        # documentation: http://www.chokkan.org/software/crfsuite/manual.html
        training_opt = {"feature.possible_states": False,
                        "feature.possible_transitions": False,
                        "c2": 2.0
                        }
        model = CRFTagger(feature_func=comp_feat_extactor, verbose=False, training_opt=training_opt)
        try:
            model.train(td, model_filename)

            wd_td_ys_bytag[code] = to_flattened_binary_tags(td)
            wd_vd_ys_bytag[code] = to_flattened_binary_tags(vd)

            td_predictions = model.tag_sents(to_sentences(td))
            vd_predictions = model.tag_sents(to_sentences(vd))
        finally:
            # Delete the model file once predictions are obtained - or if
            # anything above failed. The name is randomised, so a file left
            # behind here would never be overwritten or cleaned up later.
            if os.path.exists(model_filename):
                os.remove(model_filename)

        td_wd_predictions_by_code[code] = to_flattened_binary_tags(td_predictions)
        vd_wd_predictions_by_code[code] = to_flattened_binary_tags(vd_predictions)
    return wd_td_ys_bytag, wd_vd_ys_bytag, td_wd_predictions_by_code, vd_wd_predictions_by_code
def train_classifer_on_fold(essays_TD, essays_VD, regular_tags, fold,
                            code_freq, training_opt):
    """Train a single CRF tagger for one fold and collect per-code binary tags.

    Training (TD) and validation (VD) essays are turned into tagged sentences
    by ``to_most_common_code_tagged_sentences``. A ``CRFTagger`` is fitted on
    the TD sentences and then tags both sets; the temporary model file is
    always deleted afterwards.

    :param essays_TD: training essays, passed through to the sentence helpers.
    :param essays_VD: validation essays, passed through to the sentence helpers.
    :param regular_tags: the codes handed to every conversion helper.
    :param fold: fold number; used in the log line and the model file name.
    :param code_freq: passed to ``to_most_common_code_tagged_sentences``.
    :param training_opt: training options forwarded to ``CRFTagger``.
    :return: actual TD tags, actual VD tags, predicted TD tags and predicted
        VD tags, each as produced by ``to_flattened_binary_tags_by_code``.
    """
    print("Fold %i Training code" % fold)

    # For training
    td_sents = to_most_common_code_tagged_sentences(essays_TD, regular_tags,
                                                    code_freq)
    vd_sents = to_most_common_code_tagged_sentences(essays_VD, regular_tags,
                                                    code_freq)

    model_filename = models_folder + "/" + "%i_%s__%s" % (
        fold, "most_freq_code", str(randint(0, 9999999)))

    model = CRFTagger(feature_func=comp_feat_extactor,
                      verbose=False,
                      training_opt=training_opt)
    try:
        model.train(td_sents, model_filename)

        td_predictions = model.tag_sents(to_sentences(td_sents))
        vd_predictions = model.tag_sents(to_sentences(vd_sents))
    finally:
        # Randomised file name => this is the only place that can clean it
        # up, so do it on the failure path as well. Guard on existence so a
        # train() failure is not hidden behind a FileNotFoundError.
        if os.path.exists(model_filename):
            os.remove(model_filename)

    # for evaluation - binary tags
    # YS (ACTUAL)
    td_sents_pset = to_label_powerset_tagged_sentences(essays_TD, regular_tags)
    vd_sents_pset = to_label_powerset_tagged_sentences(essays_VD, regular_tags)

    wd_td_ys_bytag = to_flattened_binary_tags_by_code(td_sents_pset,
                                                      regular_tags)
    wd_vd_ys_bytag = to_flattened_binary_tags_by_code(vd_sents_pset,
                                                      regular_tags)

    # YS (PREDICTED)
    td_wd_predictions_by_code = to_flattened_binary_tags_by_code(
        td_predictions, regular_tags)
    vd_wd_predictions_by_code = to_flattened_binary_tags_by_code(
        vd_predictions, regular_tags)

    return wd_td_ys_bytag, wd_vd_ys_bytag, td_wd_predictions_by_code, vd_wd_predictions_by_code
Beispiel #4
0
def train_classifer_on_fold(essays_TD, essays_VD, regular_tags, fold):
    """Train one CRF tagger per code for a fold and collect binary tags.

    Codes are processed in sorted order. For each code a fresh ``CRFTagger``
    is trained on that code's training (TD) sentences and then tags both the
    TD and validation (VD) sentences. The temporary model file is removed
    after every code, whether or not training and tagging succeeded.

    :param essays_TD: training essays, passed to ``to_tagged_sentences_by_code``.
    :param essays_VD: validation essays, passed to ``to_tagged_sentences_by_code``.
    :param regular_tags: iterable of codes; one model is trained per code.
    :param fold: fold number; used in the log line and the model file name.
    :return: four dicts keyed by code - actual TD tags, actual VD tags,
        predicted TD tags, predicted VD tags - holding the results of
        ``to_flattened_binary_tags``.
    """
    td_sents_by_code = to_tagged_sentences_by_code(essays_TD, regular_tags)
    vd_sents_by_code = to_tagged_sentences_by_code(essays_VD, regular_tags)

    wd_td_ys_bytag = dict()
    wd_vd_ys_bytag = dict()
    td_wd_predictions_by_code = dict()
    vd_wd_predictions_by_code = dict()

    for code in sorted(regular_tags):
        print("Fold %i Training code: %s" % (fold, code))
        td, vd = td_sents_by_code[code], vd_sents_by_code[code]

        model_filename = models_folder + "/" + "%i_%s__%s" % (
            fold, code, str(randint(0, 9999999)))

        # documentation: http://www.chokkan.org/software/crfsuite/manual.html
        training_opt = {
            "feature.possible_states": False,
            "feature.possible_transitions": False,
            "c2": 2.0
        }
        model = CRFTagger(feature_func=comp_feat_extactor,
                          verbose=False,
                          training_opt=training_opt)
        try:
            model.train(td, model_filename)

            wd_td_ys_bytag[code] = to_flattened_binary_tags(td)
            wd_vd_ys_bytag[code] = to_flattened_binary_tags(vd)

            td_predictions = model.tag_sents(to_sentences(td))
            vd_predictions = model.tag_sents(to_sentences(vd))
        finally:
            # Delete the model file now that predictions are obtained (or an
            # error occurred). The name is randomised above, so a leftover
            # file would otherwise accumulate in models_folder forever.
            if os.path.exists(model_filename):
                os.remove(model_filename)

        td_wd_predictions_by_code[code] = to_flattened_binary_tags(
            td_predictions)
        vd_wd_predictions_by_code[code] = to_flattened_binary_tags(
            vd_predictions)
    return wd_td_ys_bytag, wd_vd_ys_bytag, td_wd_predictions_by_code, vd_wd_predictions_by_code
Beispiel #5
0
 def demo(self, test_sents):
     """Tag ``test_sents`` with the stored model and print the results.

     A ``CRFTagger`` using ``self.feature_detector`` is pointed at the model
     file ``self.modelpath``. Each sentence is untagged, re-tagged and
     printed via ``self._to_sentence``; finally the value returned by the
     tagger's ``evaluate`` over all of ``test_sents`` is printed.
     """
     crf = CRFTagger(feature_func=self.feature_detector)
     crf.set_model_file(self.modelpath)
     for gold_sent in test_sents:
         prediction = crf.tag(untag(gold_sent))
         for rendered in self._to_sentence(prediction):
             print(rendered)
     score = crf.evaluate(test_sents)
     print(score)
class CRF:
    """Character-level CRF that tags each consonant of a word with a vowel.

    Words in the corpus are strings whose even-indexed characters are treated
    as the input symbols (``w[::2]``) and whose odd-indexed characters are the
    labels to predict (``w[1::2]``). Every word becomes one CRF sequence; each
    token of that sequence is a ``(char, word_index, sentence)`` triple so the
    feature function can also look at neighbouring words of the sentence.

    ``config`` must provide:
      * ``'word_ftrs'``     - names from ``CRF.WORD_FTRS`` to extract per word,
      * ``'stc_ftrs'``      - names from ``CRF.STC_FTRS`` to extract per sentence,
      * ``'extr_word_idx'`` - relative word offsets (0 = current word) whose
        word features are extracted.

    All pipeline methods except ``eval`` and ``predict`` return ``self`` so
    they can be chained.
    """

    # Labels the model can emit; the position in this list is the row/column
    # index used by the confusion matrix in eval().
    VOWELS = [u'a', u'e', u'u', u'i', u'o', u'*']
    VOWELS_IDX = {x: i for i, x in enumerate(VOWELS)}
    # Supported word-level feature names.
    WORD_FTRS = [
        'IS_FIRST', 'IS_LAST', 'IDX', 'VAL', 'PRV_VAL', 'NXT_VAL', 'FRST_VAL',
        'LST_VAL', 'SCND_VAL', 'SCND_LST_VAL', 'LEN'
    ]
    # Supported sentence-level feature names.
    STC_FTRS = ['IS_FIRST', 'IS_LAST', 'IDX']

    def __init__(self, config):
        """Validate and store the feature configuration.

        :raises Exception: if a configured feature name is not supported; the
            message names the offending feature.
        """
        self.word_ftrs = config['word_ftrs']
        for ftr in self.word_ftrs:
            if ftr not in CRF.WORD_FTRS:
                # Report the unknown feature itself, not the supported list.
                raise Exception(
                    'Unknown feature {}. See CRF.WORD_FTRS for supported ones.'
                    .format(ftr))
        self.stc_ftrs = config['stc_ftrs']
        for ftr in self.stc_ftrs:
            if ftr not in CRF.STC_FTRS:
                raise Exception(
                    'Unknown feature {}. See CRF.STC_FTRS for supported ones.'.
                    format(ftr))
        self.words_ids = config['extr_word_idx']

    def prep_data(self, file='data/HaaretzOrnan_annotated.txt'):
        """Load the annotated corpus into ``self.data`` as a list of sentences.

        Lines starting with ``#`` are skipped, an empty line ends the current
        sentence, and for every other line the fourth space-separated field
        (with ``-`` removed) is appended as a word to the current sentence.
        """
        self.data = [[]]
        with codecs.open(file, encoding='utf-8') as f:
            for line in f:
                line = line.rstrip()
                if line.startswith(u'#'):
                    continue
                if len(line) == 0:
                    # Start a new sentence, but never stack empty ones.
                    if len(self.data[-1]) > 0:
                        self.data.append([])
                    continue
                # Append word to last sentence
                w = line.split(u' ')[3]
                w = w.replace(u'-', u'')
                self.data[-1].append(w)

        # Only the trailing sentence can be empty (see above) - drop it.
        if len(self.data[-1]) == 0:
            self.data.pop()
        return self

    def shuffle(self, seed=None):
        """Shuffle ``self.data``; note this re-seeds numpy's global RNG."""
        inds = np.arange(len(self.data))
        np.random.seed(seed)
        np.random.shuffle(inds)
        self.data = [self.data[i] for i in inds]
        return self

    def split(self, valid_ratio=0.1):
        """Split ``self.data`` into ``train_set`` and ``valid_set``.

        With ``valid_ratio == 0`` all data is used for training and
        ``valid_set`` is ``None``.
        """
        num_train = int(len(self.data) * (1 - valid_ratio))
        self.train_set = self.data[:num_train]
        self.valid_set = None if valid_ratio == 0 else self.data[num_train:]
        return self

    def train(self, load_model=None):
        """Train a ``CRFTagger`` on ``train_set``; the model is written to
        the file ``'stc_crf_model'``. ``load_model`` is currently unused.
        """
        train_set = CRF._fin_data_prep(self.train_set)
        _extract_ftr = self._gen_ftr_func()
        self.model = CRFTagger(_extract_ftr,
                               verbose=False,
                               training_opt={
                                   "num_memories": 500,
                                   "delta": 1e-8
                               })
        self.model.train(train_set, 'stc_crf_model')
        return self

    def eval(self):
        """Return the confusion matrix over ``valid_set``.

        Entry ``[p, g]`` counts characters predicted as ``VOWELS[p]`` whose
        gold label is ``VOWELS[g]``.
        """
        conf_mat = np.zeros((len(CRF.VOWELS), len(CRF.VOWELS)))
        valid_set = CRF._fin_data_prep(self.valid_set)
        valid_stc_cons = [[x[0] for x in w] for w in valid_set]
        valid_stc_vowel = [[x[1] for x in w] for w in valid_set]
        predicted = self.model.tag_sents(valid_stc_cons)
        predicted = [[x[1] for x in w] for w in predicted]
        for pred_word, gold_word in zip(predicted, valid_stc_vowel):
            for pred_vow, gold_vow in zip(pred_word, gold_word):
                conf_mat[self.VOWELS_IDX[pred_vow],
                         self.VOWELS_IDX[gold_vow]] += 1
        return conf_mat

    def predict(self, pred_set):
        """Tag unlabelled sentences.

        :param pred_set: list of sentences, each a list of words made of the
            input characters only.
        :return: list of sentences where every word is rebuilt by following
            each input character with its predicted label.
        """
        data = []
        for sent in pred_set:
            sent_cons = u' '.join(sent)
            for i, w in enumerate(sent):
                w_cons = list(w)
                w_pos = [i] * len(w)
                unif_sent = [sent_cons] * len(w)
                d = list(zip(w_cons, w_pos, unif_sent))
                data.append(d)
        pred = self.model.tag_sents(data)
        result = []
        word_idx = 0
        for sent in pred_set:
            result.append([])
            for word in sent:
                pred_smpl = pred[word_idx]
                # entry is (token, label); token[0] is the input character.
                w = ''.join([entry[0][0] + entry[-1] for entry in pred_smpl])
                result[-1].append(w)
                word_idx += 1
        return result

    @staticmethod
    def _fin_data_prep(data_set):
        """Turn labelled sentences into one tagged sequence per word.

        Each sequence element is ``((char, word_index, sentence), label)``,
        where ``sentence`` is the space-joined input characters of all words.
        """
        data = []
        for sent in data_set:
            sent_cons = u' '.join([x[::2] for x in sent])
            for i, w in enumerate(sent):
                w_cons = list(w[::2])
                w_pos = [i] * len(w[::2])
                unif_sent = [sent_cons] * len(w[::2])
                d = list(zip(w_cons, w_pos, unif_sent))
                data.append(list(zip(d, list(w[1::2]))))
        return data

    @staticmethod
    def _len(x):
        """Return ``len(x)`` for strings, ``int(x)`` for anything else."""
        return len(x) if isinstance(x, str) else int(x)

    def _gen_ftr_func(self):
        """Build the feature function handed to ``CRFTagger``.

        The returned closure reads the feature configuration from ``self``.
        """

        def _extract_wrd_ftr(tokens, i, suff):
            # Features of one word (``tokens`` is the word's characters).
            # ``i`` is the current character index, or None when the word is
            # a neighbour of the current word - then only the
            # position-independent features are emitted.
            feature_list = []

            if i is not None:
                if 'IS_FIRST' in self.word_ftrs:
                    feature_list.append("is_first{}={}".format(
                        suff, 1 if i == 0 else 0))

                if 'IS_LAST' in self.word_ftrs:
                    feature_list.append("is_last{}={}".format(
                        suff, 1 if i == (len(tokens) - 1) else 0))

                if 'IDX' in self.word_ftrs:
                    feature_list.append("pos{}={}".format(suff, i))

                if 'VAL' in self.word_ftrs:
                    feature_list.append("cur{}={}".format(suff, tokens[i]))

                if 'PRV_VAL' in self.word_ftrs:
                    if i > 0:
                        feature_list.append("prev{}={}".format(
                            suff, tokens[i - 1]))

                if 'NXT_VAL' in self.word_ftrs:
                    if i < (len(tokens) - 1):
                        feature_list.append("next{}={}".format(
                            suff, tokens[i + 1]))

            if 'FRST_VAL' in self.word_ftrs:
                feature_list.append("first{}={}".format(suff, tokens[0]))

            if 'LST_VAL' in self.word_ftrs:
                feature_list.append("last{}={}".format(suff, tokens[-1]))

            if 'LEN' in self.word_ftrs:
                feature_list.append("len{}={}".format(suff, len(tokens)))

            if 'SCND_VAL' in self.word_ftrs:
                if len(tokens) > 1:
                    feature_list.append("scnd{}={}".format(suff, tokens[1]))

            if 'SCND_LST_VAL' in self.word_ftrs:
                if len(tokens) > 1:
                    feature_list.append("scnd_last{}={}".format(
                        suff, tokens[-2]))

            return feature_list

        def _extract_ftr(tokens, i):
            # Every token of a word carries the same word index and sentence,
            # so they are read from the first token.
            feature_list = []
            word_pos = tokens[0][1]
            sent = tokens[0][2].split(' ')

            # Sentence features
            if 'IS_FIRST' in self.stc_ftrs:
                if word_pos == 0:
                    feature_list.append('FIRST_WORD')
            if 'IS_LAST' in self.stc_ftrs:
                if word_pos == (len(sent) - 1):
                    feature_list.append('LAST_WORD')
            if 'IDX' in self.stc_ftrs:
                feature_list.append("idx=" + str(word_pos))

            # Word features for the current word and configured neighbours;
            # offsets falling outside the sentence are ignored.
            for rel_pos in self.words_ids:
                word_pos = tokens[0][1] + rel_pos
                if word_pos >= 0 and word_pos < len(sent):
                    word = sent[word_pos]
                    feature_list += _extract_wrd_ftr(
                        word, i if rel_pos == 0 else None,
                        '_w{}'.format(rel_pos))

            return feature_list

        return _extract_ftr
Beispiel #7
0
class CRF:
    """Word-level CRF that tags each consonant of a word with a vowel.

    Every word is one CRF sequence: its even-indexed characters (``w[::2]``)
    are the inputs and its odd-indexed characters (``w[1::2]``) the labels.
    ``config['ftrs']`` lists the feature names (from ``CRF.WORD_FTRS``) to
    extract for each character.

    All pipeline methods except ``eval`` and ``predict`` return ``self`` so
    they can be chained.
    """

    # Labels the model can emit; the position in this list is the row/column
    # index used by the confusion matrix in eval().
    VOWELS = [u'a', u'e', u'u', u'i', u'o', u'*']
    VOWELS_IDX = {x: i for i, x in enumerate(VOWELS)}
    # Supported feature names.
    WORD_FTRS = ['IS_FIRST', 'IS_LAST', 'IDX', 'VAL', 'PRV_VAL', 'NXT_VAL', 'FRST_VAL', 'LST_VAL', 'SCND_VAL', 'SCND_LST_VAL', 'LEN']

    def __init__(self, config):
        """Validate and store the feature configuration.

        :raises Exception: if a configured feature name is not supported; the
            message names the offending feature.
        """
        self.ftrs = config['ftrs']
        for ftr in self.ftrs:
            if ftr not in CRF.WORD_FTRS:
                # Name the bad feature and point at the attribute that really
                # holds the supported names (there is no CRF.CONFIG).
                raise Exception('Unknown feature {}. See CRF.WORD_FTRS for supported ones.'.format(ftr))

    def prep_data(self, file='data/HaaretzOrnan_annotated.txt'):
        """Load the annotated corpus into ``self.data``.

        Comment lines (starting with ``#``) and empty lines are skipped. For
        every other line the fourth space-separated field, with ``-``
        removed, is stored as a list of ``(input_char, label_char)`` pairs.
        """
        self.data = []
        with codecs.open(file, encoding='utf-8') as f:
            for line in f:
                line = line.rstrip()
                if line.startswith(u'#') or len(line) == 0:
                    continue
                w = line.split(u' ')[3]
                w = w.replace(u'-', u'')
                self.data.append(list(zip(w[::2], w[1::2])))
        return self

    def shuffle(self, seed=None):
        """Shuffle ``self.data``; note this re-seeds numpy's global RNG."""
        inds = np.arange(len(self.data))
        np.random.seed(seed)
        np.random.shuffle(inds)
        self.data = [self.data[i] for i in inds]
        return self

    def split(self, valid_ratio=0.1):
        """Split ``self.data`` into ``train_set`` and ``valid_set``.

        With ``valid_ratio == 0`` all data is used for training and
        ``valid_set`` is ``None``.
        """
        num_train = int(len(self.data) * (1 - valid_ratio))
        self.train_set = self.data[:num_train]
        self.valid_set = None if valid_ratio == 0 else self.data[num_train:]
        return self

    def train(self, load_model=None):
        """Train a ``CRFTagger`` on ``train_set``; the model is written to
        the file ``'word_crf_model'``. ``load_model`` is currently unused.
        """
        _extract_ftr = self._gen_ftr_func()
        self.model = CRFTagger(_extract_ftr, verbose=False,
                               training_opt={"num_memories": 500, "delta": 1e-8})
        self.model.train(self.train_set, 'word_crf_model')
        return self

    def eval(self):
        """Return the confusion matrix over ``valid_set``.

        Entry ``[p, g]`` counts characters predicted as ``VOWELS[p]`` whose
        gold label is ``VOWELS[g]``.
        """
        conf_mat = np.zeros((len(CRF.VOWELS), len(CRF.VOWELS)))
        valid_word_cons = [[x[0] for x in w] for w in self.valid_set]
        valid_word_vowel = [[x[1] for x in w] for w in self.valid_set]
        predicted = self.model.tag_sents(valid_word_cons)
        predicted = [[x[1] for x in w] for w in predicted]
        for pred_word, gold_word in zip(predicted, valid_word_vowel):
            for pred_vow, gold_vow in zip(pred_word, gold_word):
                conf_mat[self.VOWELS_IDX[pred_vow], self.VOWELS_IDX[gold_vow]] += 1
        return conf_mat

    def predict(self, pred_set):
        """Tag unlabelled sentences.

        :param pred_set: list of sentences; each sentence is a list of words
            and each word is tagged as its own sequence.
        :return: list of sentences where every word is rebuilt by following
            each input character with its predicted label.
        """
        result = []
        for sent in pred_set:
            predicted = self.model.tag_sents(sent)
            result.append([''.join(x + y for x, y in w_cons)
                           for w_cons in predicted])
        return result

    @staticmethod
    def _len(x):
        """Return ``len(x)`` for strings, ``int(x)`` for anything else."""
        return len(x) if isinstance(x, str) else int(x)

    def _gen_ftr_func(self):
        """Build the feature function handed to ``CRFTagger``.

        The returned closure takes the character sequence of a word and the
        index of the current character, and emits the features enabled in
        ``self.ftrs``.
        """
        # Closure
        def _extract_ftr(tokens, i):
            feature_list = []
            if 'IS_FIRST' in self.ftrs:
                feature_list.append("is_first=" + str(1 if i == 0 else 0))

            if 'IS_LAST' in self.ftrs:
                feature_list.append("is_last=" + str(1 if i == (len(tokens) - 1) else 0))

            if 'IDX' in self.ftrs:
                feature_list.append("pos=" + str(i))

            if 'VAL' in self.ftrs:
                feature_list.append("cur=" + tokens[i])

            if 'PRV_VAL' in self.ftrs:
                if i > 0:
                    feature_list.append("prev=" + tokens[i - 1])

            if 'NXT_VAL' in self.ftrs:
                if i < (len(tokens) - 1):
                    feature_list.append("next=" + tokens[i + 1])

            if 'FRST_VAL' in self.ftrs:
                feature_list.append("first=" + tokens[0])

            if 'LST_VAL' in self.ftrs:
                feature_list.append("last=" + tokens[-1])

            if 'LEN' in self.ftrs:
                feature_list.append("len=" + str(len(tokens)))

            if 'SCND_VAL' in self.ftrs:
                if len(tokens) > 1:
                    feature_list.append("scnd=" + tokens[1])

            if 'SCND_LST_VAL' in self.ftrs:
                if len(tokens) > 1:
                    feature_list.append("scnd_last=" + tokens[-2])

            return feature_list
        return _extract_ftr
Beispiel #8
0
def main():
    """Command-line entry point of the Daba disambiguator.

    Two modes are supported:

    * ``--learn F`` together with ``--pos`` or ``--tone``: read the files
      listed in ``--filelist`` (relative to ``--root``), build tagged
      sentences, split them with ``sampling``, train one CRF model per phase
      with pycrfsuite (tone models are stored in ``F.zip``), then evaluate on
      the held-out part and print the accuracy.
    * ``--disambiguate F``: load model ``F`` and, for ``--pos``, reorder the
      gloss options of every token of ``--infile`` by marginal probability
      before writing ``--outfile``. Tone disambiguation is not implemented.

    Without a usable mode the help text is printed. The function always
    terminates the process through ``sys.exit``.
    """
    aparser = argparse.ArgumentParser(description='Daba disambiguator')
    aparser.add_argument('-v',
                         '--verbose',
                         help='Verbose output',
                         default=False,
                         action='store_true')
    aparser.add_argument(
        '-l',
        '--learn',
        help='Learn model from data (and save as F if provided)',
        default=None)
    aparser.add_argument('-p',
                         '--pos',
                         help='Prediction for POS',
                         default=False,
                         action='store_true')
    aparser.add_argument('-t',
                         '--tone',
                         help='Prediction for tones',
                         default=False,
                         action='store_true')
    aparser.add_argument('-r', '--root', help='Corpus root dir')
    aparser.add_argument('-f',
                         '--filelist',
                         help='Path to a list of files to learn from')
    # aparser.add_argument('-g', '--gloss', help='Prediction for gloses', default=False, action='store_true')
    aparser.add_argument(
        '-e',
        '--evalsize',
        type=int,
        default=10,
        help=
        'Percent of training data with respect to training and test one (default 10)'
    )
    aparser.add_argument(
        '-d',
        '--disambiguate',
        help=
        'Use model F to disambiguate data, the gloss list will be ordered by the probability growth order',
        default=None)
    aparser.add_argument(
        '--select',
        help=
        'Option that will be taken into account only with the use of -d, which specifies the disambiguation modality is to select only the most likely gloss in each list.',
        action='store_true')
    aparser.add_argument('-i',
                         '--infile',
                         help='Input file (.html)',
                         default=sys.stdin)
    aparser.add_argument('-o',
                         '--outfile',
                         help='Output file (.html)',
                         default=sys.stdout)
    aparser.add_argument(
        '-s',
        '--store',
        help=
        'Store tagged raw data in file (.csv) for further research purpose',
        default=None)

    args = aparser.parse_args()
    if args.verbose:
        print(args)

    # The '--gloss' option is disabled above, so the parsed namespace has no
    # 'gloss' attribute. Read it defensively instead of raising
    # AttributeError whenever neither --pos nor --tone is given.
    wants_gloss = getattr(args, 'gloss', False)

    if args.learn:

        if not (args.pos or args.tone or wants_gloss):
            print('Choose pos, tone, gloss or combination of them')
            sys.exit(0)

        print('Make list of files')
        allfiles = []
        with codecs.open(args.filelist, 'r', encoding="utf-8") as filelist:
            for line in filelist:
                allfiles.append(line.strip())
        allsents = []

        if args.tone:
            try:
                enc = encoder_tones()
            except Exception:
                # NOTE(review): learning continues with enc = None and will
                # fail on first use of the encoder - confirm whether exiting
                # here would be preferable.
                enc = None
                print(("Error : unable to initialize the tone encoder !"))

        print('Open files and find features / supervision tags')
        for infile in allfiles:
            if (infile):
                print('-', infile)
                sent = []

                html_parser = FileParser()
                html_parser.read_file(os.path.join(args.root, infile))

                for sentence in html_parser.glosses:
                    for token in sentence[2]:
                        if token.type == 'w' or token.type == 'c':
                            if args.pos:
                                tags = '/'.join(token.gloss.ps)
                                wordform = detone(token.gloss.form)
                                sent.append((wordform, tags))
                            elif args.tone:
                                # Why not learn tonal forms containing a
                                # vertical bar? Across the disambiguated
                                # corpora they occur fewer than 10 times,
                                # which seems too rare to really improve the
                                # tonalisation model. Nothing in the design
                                # prevents including them in the training
                                # data to observe their contribution.
                                if '|' not in token.gloss.form:
                                    [codes, chunks] = enc.differential_encode(
                                        token.token, token.gloss.form)
                                    sent.extend(zip(chunks, codes))
                            # Gloss prediction is not supported at the moment.

                    # Sentences with at most one item are not stored; their
                    # content is carried over into the next sentence.
                    if len(sent) > 1:
                        allsents.append(sent)
                        sent = []

        if args.verbose and args.tone:
            enc.report()

        # Build the training and evaluation sets
        p = (1 - args.evalsize / 100.0)
        train_set, eval_set = sampling(allsents, p)
        print('Split the data in train (', len(train_set),
              ' sentences) / test (', len(eval_set), ' sentences)')

        print('Building classifier (CRF/NLTK)')
        # Initialization
        t1 = time.time()
        if args.tone:
            num_phases = len([False, True]) * len(mode_indicators)
            myzip = zipfile.ZipFile(args.learn + '.zip', 'w')
        else:
            num_phases = 1

        # Training
        for phase in range(num_phases):
            # The NLTK tagger is only used as a holder of the verbosity flag
            # and training options handed to the pycrfsuite trainer.
            tagger = CRFTagger(verbose=args.verbose,
                               training_opt={'feature.minfreq': 10})
            trainer = pycrfsuite.Trainer(verbose=tagger._verbose)
            trainer.set_params(tagger._training_options)
            if num_phases > 1:
                model_name = args.learn + '.' + str(phase)
            else:
                model_name = args.learn

            # train_set : list(list((str,list(str))))
            for sent in train_set:
                unzipped = unzip(sent)
                tokens = unzipped[0]
                labels = unzipped[1]
                if num_phases > 1:
                    labels = [
                        code_dispatcher(label)[phase] for label in labels
                    ]
                features = [
                    _get_features_customised_for_tones(tokens, i)
                    for i in range(len(tokens))
                ]
                trainer.append(features, labels)
            trainer.train(model=model_name)
            if num_phases > 1:
                # Per-phase models live only inside the zip archive.
                myzip.write(model_name)
                os.remove(model_name)
        if num_phases > 1:
            myzip.close()

        print("... done in", get_duration(t1_secs=t1, t2_secs=time.time()))

        # Evaluation
        print('Evaluating classifier')
        # gold_set, predicted_set : list(list((str, str)))
        # input_set, output_gold_set : list(list(str))
        gold_set = eval_set
        input_set = [unzip(sent)[0] for sent in gold_set]
        predicted_set = [list() for sent in gold_set]
        if num_phases > 1:
            myzip = zipfile.ZipFile(args.learn + '.zip', 'r')
        for phase in range(num_phases):
            tagger = CRFTagger(verbose=args.verbose,
                               training_opt={'feature.minfreq': 10})
            if num_phases > 1:
                model_name = args.learn + '.' + str(phase)
                myzip.extract(model_name)
            else:
                model_name = args.learn
            tagger.set_model_file(model_name)
            for i, sent in enumerate(input_set):
                features = [
                    _get_features_customised_for_tones(sent, j)
                    for j in range(len(sent))
                ]
                labels = tagger._tagger.tag(features)
                if num_phases > 1:
                    labels = [
                        code_dispatcher(label)[phase] for label in labels
                    ]
                tagged_sent = list(zip(sent, labels))
                if not predicted_set[i]:
                    predicted_set[i] = tagged_sent
                else:
                    # Later phases append their part of the code to the
                    # labels accumulated so far.
                    sent_acc, labels_acc = unzip(predicted_set[i])
                    labels_acc = [
                        label_acc + label
                        for label_acc, label in zip(labels_acc, labels)
                    ]
                    predicted_set[i] = list(zip(sent_acc, labels_acc))
            if num_phases > 1:
                os.remove(model_name)
        # Close the archive only after every phase has extracted its model;
        # closing inside the loop broke extraction from the second phase on.
        if num_phases > 1:
            myzip.close()

        # gold_tokens, predicted_tokens : list((str,str))
        predicted_tokens = list(itertools.chain(*predicted_set))
        if num_phases > 1:
            predicted_tokens = [
                tuple([pair[0], code_resort(pair[1])])
                for pair in predicted_tokens
            ]
        gold_tokens = list(itertools.chain(*gold_set))
        # gold_tokens_eval, predicted_tokens_eval : list(str)
        if args.tone:
            gold_tokens_eval = getTag(gold_tokens)
            predicted_tokens_eval = getTag(predicted_tokens)
        else:
            gold_tokens_eval = gold_tokens
            predicted_tokens_eval = predicted_tokens

        if args.store and args.tone:
            stored_filename = args.store
            csv_export(enc, stored_filename, gold_tokens, predicted_tokens)

        print("Accuracy : {:>5.3f}".format(
            accuracy(gold_tokens_eval, predicted_tokens_eval)))

        if args.verbose and args.store:
            print(("Tagged result is exported in {}".format(args.store)))

    elif args.disambiguate and args.infile and args.outfile:
        # Read the input text (.html)
        html_parser = FileParser()
        tagger = CRFTagger()

        if args.pos:
            try:
                tagger.set_model_file(args.disambiguate)
            except IOError:
                # Report the model path, not the input file.
                print("Error : unable to open the model {} !".format(
                    args.disambiguate))
                sys.exit(1)
            try:
                html_parser.read_file(args.infile)
            except IOError:
                print("Error : unable to open the input file {} !".format(
                    args.infile))
                sys.exit(1)

            # Reorder the gloss options of every token by model probability
            for snum, sentence in enumerate(html_parser.glosses):
                tokens = [token.token for token in sentence[2]]
                features = [
                    _get_features_customised_for_tones(tokens, i)
                    for i in range(len(tokens))
                ]
                tagger._tagger.set(features)
                for tnum, token in enumerate(sentence[2]):
                    options = list()
                    if token.value and len(token.value) > 2:
                        for option in token.value[2]:
                            try:
                                tag = option.ps[0]
                            except IndexError:
                                tag = ''
                            prob = tagger._tagger.marginal(tag, tnum)
                            options.append((prob, option))
                        reordered_probs, reordered_options = unzip(
                            sorted(options, reverse=True))
                        if args.select:
                            # Keep only the option(s) tied for the best
                            # probability.
                            prob_max = reordered_probs[0]
                            reordered_options = tuple([
                                reordered_options[i]
                                for i, p in enumerate(reordered_probs)
                                if p >= prob_max
                            ])
                        html_parser.glosses[snum][1][tnum] = reordered_options

        elif args.tone:
            # Tone disambiguation is not implemented.
            pass

        try:
            html_parser.write(args.outfile)
        except IOError:
            print("Error : unable to create the output file {}".format(
                args.outfile))

    else:
        aparser.print_help()
    sys.exit(0)
Beispiel #9
0
def main():
    """Train and evaluate a CRF tagger on the ``../corbama`` corpus.

    With ``--learn F`` every ``*.dis.html`` file matched directly under
    ``../corbama`` or one directory below it is read.  The annotation layers
    selected with ``--pos`` / ``--tone`` / ``--gloss`` are concatenated into
    one supervision tag per token, the sentences are shuffled with a fixed
    seed, split into a training and a test part, and a ``CRFTagger`` is
    trained (model file ``F``) and evaluated.  Without ``--learn`` the usage
    help is printed.  The function always finishes through ``exit(0)``.
    """
    aparser = argparse.ArgumentParser(description='Daba disambiguator')
    # aparser.add_argument('-i', '--infile', help='Input file (.html)', default="sys.stdin")
    # aparser.add_argument('-o', '--outfile', help='Output file (.html)', default="sys.stdout")
    aparser.add_argument('-l', '--learn', help='Learn model from data (and save as F if provided)', default=None)
    aparser.add_argument('-p', '--pos', help='Prediction for POS', default=False, action='store_true')
    aparser.add_argument('-t', '--tone', help='Prediction for tones', default=False, action='store_true')
    aparser.add_argument('-g', '--gloss', help='Prediction for gloses', default=False, action='store_true')
    # type=float: without it a value given on the command line stays a string
    # and the percentage arithmetic below raises TypeError.
    aparser.add_argument('-e', '--evalsize', help='Percent of randomized data to use for evaluation (default 10)', default=10, type=float)
    aparser.add_argument('-v', '--verbose', help='Verbose output', default=False, action='store_true')
    args = aparser.parse_args()

    if args.learn:

        # At least one annotation layer has to be selected.  The parentheses
        # matter: "not a or b or c" negates only the first operand.
        if not (args.pos or args.tone or args.gloss):
            print('Choose pos, tone, gloss or combination of them')
            exit(0)

        print('Make list of files')
        # Keep every match of both patterns; pairing the two globs with zip()
        # would drop files as soon as one pattern matches more than the other.
        allfiles = (glob.glob("../corbama/*/*.dis.html")
                    + glob.glob("../corbama/*.dis.html"))
        allsents = []

        print('Open files and find features / supervision tags')
        for infile in allfiles:
            print('- %s' % infile)
            sent = []
            in_handler = formats.HtmlReader(infile, compatibility_mode=False)
            for token in in_handler:
                if token.type in ('w', 'c'):
                    # The supervision tag is the concatenation of the
                    # selected annotation layers.
                    tags = ''
                    if args.pos:
                        tags += ''.join(token.gloss.ps)
                    if args.tone:
                        tags += token.gloss.form.encode('utf-8')
                    if args.gloss:
                        tags += token.gloss.gloss.encode('utf-8')
                    sent.append((token.token, tags))
                # A sentence ends on terminal punctuation; sentences of a
                # single token are discarded.
                if token.type == 'c' and token.token in ['.', '?', '!']:
                    if len(sent) > 1:
                        allsents.append(sent)
                    sent = []

        datalength = len(allsents)
        p = 1 - args.evalsize / 100.0
        n_train = int(p * datalength)
        print('Randomize and split the data in train ( %d  sentences) / test ( %d  sentences)'
              % (n_train, datalength - n_train))
        # Fixed seed so that the split is reproducible between runs.
        random.seed(123456)
        random.shuffle(allsents)
        train_set = allsents[:n_train]
        test_set = allsents[n_train:]

        print('Building classifier (CRF/NLTK)')
        tagger = CRFTagger(verbose=args.verbose, training_opt={'feature.minfreq': 10})
        t1 = time.time()
        tagger.train(train_set, args.learn)
        texec = time.time() - t1
        # gmtime, not localtime: texec is a duration, so the local UTC offset
        # must not be added to it.
        print("... done in %s" % time.strftime('%H %M %S', time.gmtime(texec)))

        print('Evaluating classifier')
        print(tagger.evaluate(test_set))

        if args.verbose:
            print('Compute detailed output')

    else:
        print('USE...')
        aparser.print_help()

    exit(0)
Beispiel #10
0
 def train(self, load_model=None):
     """Fit a new CRF tagger on ``self.train_set`` and return ``self``.

     The feature function is obtained from ``self._gen_ftr_func()`` and the
     tagger is stored in ``self.model`` before training, with
     ``'word_crf_model'`` passed as the model file name.  ``load_model`` is
     accepted but not used here.
     """
     feature_fn = self._gen_ftr_func()
     crf_options = {"num_memories": 500, "delta": 1e-8}
     self.model = CRFTagger(feature_fn, verbose=False,
                            training_opt=crf_options)
     self.model.train(self.train_set, 'word_crf_model')
     return self
Beispiel #11
0
def main():
    """Command-line entry point of the tonalizer.

    Exactly one of three modes is executed, tested in this order:

    * ``--undiacritize``: ``fileReader.read2`` is called on the input and
      output files.
    * ``--learn MODEL``: sentences are read from the input file, encoded
      with ``encoder_tones``, split into a training and an evaluation part,
      a pycrfsuite model is trained and saved as ``MODEL``, then the
      evaluation part is tagged and an accuracy figure is printed
      (optionally exported with ``--store``).
    * ``--diacritize MODEL``: the model is loaded, the input text is tagged
      chunk by chunk and the decoded forms are written to the output file.

    When no mode is given an error and the usage help are printed and the
    process exits with status 0.
    """

    aparser = argparse.ArgumentParser(
        description=u'Tonalizer - CRF-based Tone Reconstitution Tool')
    aparser.add_argument('-v',
                         '--verbose',
                         help='Verbose output',
                         default=False,
                         action='store_true')
    aparser.add_argument(
        '-l',
        '--learn',
        help='Learn model from diacritized text (and save as file if provided)',
        default=None,
        type=lambda s: unicode(s, 'utf8'))
    aparser.add_argument(
        '-e',
        '--evalsize',
        help=
        'Percent of training data with respect to training and test one (default 10)',
        default=10,
        type=float)
    #aparser.add_argument('-c', '--chunkmode', help='Word segmentation width (default 3)', default=3, type=int)
    aparser.add_argument('-d',
                         '--diacritize',
                         help='Use model file to diacritize a raw text',
                         default=None)
    aparser.add_argument('-u',
                         '--undiacritize',
                         help='Undiacritize a raw text',
                         default=False,
                         action='store_true')
    aparser.add_argument('-f',
                         '--filtering',
                         help='Keep only one insertion for one poistion',
                         default=False,
                         action='store_true')
    aparser.add_argument('-m',
                         '--markers',
                         help='Custumed set of markers to learn',
                         default=None,
                         type=lambda s: unicode(s, 'utf8'))
    aparser.add_argument('-i',
                         '--infile',
                         help='Input file (.txt)',
                         default=sys.stdin,
                         type=lambda s: unicode(s, 'utf8'))
    aparser.add_argument('-o',
                         '--outfile',
                         help='Output file (.txt)',
                         default=sys.stdout,
                         type=lambda s: unicode(s, 'utf8'))
    aparser.add_argument(
        '-s',
        '--store',
        help=
        'Store evaluation result in file (.csv), effective only in learning mode',
        default=None,
        type=lambda s: unicode(s, 'utf8'))
    args = aparser.parse_args()

    # One of the three modes is mandatory.
    if not (args.learn or args.diacritize or args.undiacritize):
        print 'Error : choose -learn, -diacritize or -undiacritize !'
        aparser.print_help()
        exit(0)

    # Verbose mode: dump every parsed argument, with its type name when the
    # value is truthy.
    if args.verbose:
        print 'Arguments received by script'
        dico = vars(args)
        for key, val in dico.items():
            typeName = type(val).__name__
            sys.stdout.write(u"\t{} = {} ".format(key, val))
            if val:
                sys.stdout.write(u"({})".format(typeName))
            print ""

    if args.undiacritize:
        fr = fileReader.fileReader(args.markers)
        fr.read2(args.infile, args.outfile)

    elif args.learn:
        fr = fileReader.fileReader(args.markers)
        allsents = []
        print 'Making observation data from diacritized text'
        # Each sentence becomes a list of (token[0], utf-8 encoded token[1])
        # pairs; sentences with fewer than two tokens are dropped.
        for sentence in fr.read(args.infile):
            sent = []
            for token in sentence:
                sent.append((token[0], token[1].encode('utf-8')))
            if len(sent) > 1:
                allsents.append(sent)

        print 'Word segmentation and diacritic informaiotn compression'
        enc = encoder_tones()
        allsents2 = allsents
        allsents = []
        # Replace every (token, tags) pair by a list of (syllabe, code) pairs
        # produced by the encoder.
        # NOTE(review): `chunkmode` is not assigned anywhere in this function
        # (its -c option above is commented out); presumably a module-level
        # value -- confirm.
        for sent in allsents2:
            sent2 = []
            for token_tags in sent:
                token, tags = token_tags
                [codes,
                 syllabes] = enc.differential_encode(token,
                                                     tags.decode('utf-8'),
                                                     chunkmode)
                token2 = [(syllabe, code.encode('utf-8'))
                          for syllabe, code in zip(syllabes, codes)]
                sent2.append(token2)
            allsents.append(sent2)

        if args.verbose:
            enc.report()

        # p is the complement of the evaluation percentage, handed to
        # sampling() together with the data (presumably the training share --
        # verify against sampling()).
        p = (1 - args.evalsize / 100.0)
        train_set, eval_set = sampling(allsents, p)
        print 'Split the data in train (', len(
            train_set), ' sentences) / test (', len(eval_set), ' sentences)'

        print 'Building classifier (pyCRFsuite)'
        # Initialization
        t1 = time.time()

        # A.1. Initialize a new CRF trainer
        # The CRFTagger instance is only used here as a holder of the verbose
        # flag and training options given to the pycrfsuite trainer.
        tagger = CRFTagger(verbose=args.verbose,
                           training_opt={'feature.minfreq': 10})
        trainer = pycrfsuite.Trainer(verbose=tagger._verbose)
        trainer.set_params(tagger._training_options)

        # A.2. Prepare training set
        for sent in train_set:
            [tokens, labels] = make_tokens_from_sentence(sent, True)
            features = make_features_from_tokens(tokens, True)
            # The labels returned above are replaced by the sub tone codes of
            # the sentence, flattened into one list.
            labels = get_sub_tone_code_of_sentence(sent, sel_en=args.filtering)
            labels = list(itertools.chain(*labels))

            trainer.append(features, labels)
        trainer.train(args.learn.encode('utf-8'))

        print "... done in", get_duration(t1_secs=t1, t2_secs=time.time())

        # B. Evaluation
        print 'Evaluating classifier'
        gold_set = eval_set
        predicted_set_acc = list()

        # B.1. Load trained model
        # The trainer rebuilt here is not used again in the evaluation part.
        tagger = CRFTagger(verbose=args.verbose,
                           training_opt={'feature.minfreq': 10})
        trainer = pycrfsuite.Trainer(verbose=tagger._verbose)
        trainer.set_params(tagger._training_options)
        tagger.set_model_file(args.learn.encode('utf-8'))

        # B.2 Tagging segment by segment
        predicted_set = list()
        # Note: `p` is rebound here as a loop index; the split ratio computed
        # above is not needed any more at this point.
        for p, sent in enumerate(gold_set):

            [tokens, gold_labels] = make_tokens_from_sentence(sent, True)
            features = make_features_from_tokens(tokens, True)
            labels = tagger._tagger.tag(features)
            labels = reshape_tokens_as_sentnece(labels, sent)

            # Pair each chunk of token i with its predicted label.
            predicted_tokens = list()
            for i, token in enumerate(sent):
                predicted_tokens.append(map(list, zip(tokens[i], labels[i])))
            predicted_set.append(predicted_tokens)

        # B.3 Assemble segements to get annotated token
        # predicted_set_acc is still the empty list created above, so the
        # accumulator is always initialised here with ['', ''] placeholders
        # mirroring the nesting of predicted_set.
        if not predicted_set_acc:
            predicted_set_acc = \
             [[[['',''] for syllabe in token] for token in sent] for sent in predicted_set]

        predicted_set_acc = accumulate_tone_code_of_dataset(
            predicted_set_acc, predicted_set)
        predicted_set = predicted_set_acc

        if args.filtering:
            gold_set = apply_filter_to_base_element(gold_set,
                                                    sel_en=args.filtering)

        print "Accuracy : {:>5.3f}".format(
            accuray2(gold_set, predicted_set, True))

        if args.store:
            stored_filename = args.store
            csv_export(stored_filename, gold_set, predicted_set, True)

        if args.verbose and args.store:
            print("Tagged result is exported in {}".format(
                args.store.encode('utf-8')))

    elif args.diacritize and args.infile and args.outfile:

        t1 = time.time()
        # todo : store and load chunkmode value

        # A.1. Load a CRF tagger
        tagger = CRFTagger()
        tagger.set_model_file(args.diacritize.encode('utf-8'))

        # Making observation data from undiacritized text
        fr = fileReader.fileReader(args.markers)
        allsents = []
        print 'Making observation data from diacritized text'

        # non-processed token -> non-processed sentence
        for sentence in fr.read(args.infile):
            sent = []
            for token in sentence:
                sent.append(
                    token[1]
                )  # token[1] : non-processed token from a undiacritized text
            #if len(sent) > 1:
            allsents.append(sent)

        # Word segmentation
        # NOTE(review): `chunkmode` is not assigned in this function either;
        # see the todo above -- confirm where it comes from.
        enc = encoder_tones()
        allsents2 = allsents
        allsents = []
        for sent in allsents2:
            sent2 = []
            for token in sent:
                # here, we use encode as a simple chunker to get segment level
                [NONE,
                 chunks] = enc.differential_encode(token, token, chunkmode)
                # put (chunk,chunk) instead of chunk to fit the input format of "make_tokens_from_sentence"
                token2 = [(chunk, chunk) for chunk in chunks]
                sent2.append(token2)
            allsents.append(sent2)

        # A.2 Tagging segment by segment
        predicted_set = list()
        for p, sent in enumerate(allsents):

            [tokens, NONE] = make_tokens_from_sentence(sent, True)
            features = make_features_from_tokens(tokens, True)
            labels = tagger._tagger.tag(features)
            if args.verbose:
                sys.stdout.write(u"{}/{}\n".format(p, len(allsents)))
            labels = reshape_tokens_as_sentnece(labels, sent)

            # Pair each chunk of token i with its predicted label.
            predicted_tokens = list()
            for i, token in enumerate(sent):
                predicted_tokens.append(map(list, zip(tokens[i], labels[i])))
            predicted_set.append(predicted_tokens)

        # Simple raw file writer (still part of the --diacritize branch).
        # cara_to_ignore concatenates what fr.get_cat_startwith returns for
        # the prefixes Zl, Zp, Zs, Pi, Pf, Po, plus the newline character;
        # chunks found in it are copied unchanged.
        cara_to_ignore = \
                      fr.get_cat_startwith('Zl') + \
                      fr.get_cat_startwith('Zp') + \
                      fr.get_cat_startwith('Zs') + u'\n' + \
                      fr.get_cat_startwith('Pi') + \
                      fr.get_cat_startwith('Pf') + \
                      fr.get_cat_startwith('Po')

        enc = encoder_tones()
        with fileReader.utf8_open(args.outfile, 'w') as fidout:
            for sent in predicted_set:
                for token in sent:
                    form = u''
                    for syllabe in token:
                        #if type(syllabe[0]) == type(cara_to_ignore) :
                        #    print "good syllable type"
                        #else :
                        #    print "bad syllable type"
                        # syllabe[0], syllabe[1] -> token by chunk, label by chunk
                        if syllabe[0] in cara_to_ignore:
                            form += syllabe[0]
                        else:
                            # Rebuild the output form of the chunk from the
                            # chunk and its predicted code.
                            form += enc.differential_decode(
                                syllabe[0], syllabe[1].decode('utf-8'))
                    fidout.write(form)
                #fidout.write('\n')

            print u"... done in", get_duration(t1_secs=t1, t2_secs=time.time())
Beispiel #12
0
def main():
    """Command-line entry point of the Daba CRF disambiguator.

    Two modes are supported:

    * ``--learn F`` together with ``--pos`` and/or ``--tone``: the files
      listed in ``--filelist`` (relative to ``--root``) are parsed, a
      pycrfsuite model is trained and evaluated, and an accuracy figure is
      printed.  For tones one model per phase is trained and the models are
      stored in ``F.zip``.
    * ``--disambiguate F`` with ``--pos``: the gloss options of every token
      of ``--infile`` are reordered by the marginal probability of their
      first POS tag (with ``--select`` only the most probable ones are
      kept), and the result is written to ``--outfile``.

    In any other case the usage help is printed.  The function normally
    finishes through ``exit(0)``; it exits with status 1 when learning mode
    has no file list or when the model or input file cannot be opened.
    """
    aparser = argparse.ArgumentParser(description='Daba disambiguator')
    aparser.add_argument('-v', '--verbose', help='Verbose output', default=False, action='store_true')
    aparser.add_argument('-l', '--learn', help='Learn model from data (and save as F if provided)', default=None)
    aparser.add_argument('-p', '--pos', help='Prediction for POS', default=False, action='store_true')
    aparser.add_argument('-t', '--tone', help='Prediction for tones', default=False, action='store_true')
    aparser.add_argument('-r', '--root', help='Corpus root dir')
    aparser.add_argument('-f', '--filelist', help='Path to a list of files to learn from')
    # aparser.add_argument('-g', '--gloss', help='Prediction for gloses', default=False, action='store_true')
    aparser.add_argument('-e', '--evalsize', type=int, default=10,
                         help='Percent of training data with respect to training and test one (default 10)')
    aparser.add_argument('-d', '--disambiguate', help='Use model F to disambiguate data, the gloss list will be ordered by the probability growth order', default=None)
    aparser.add_argument('--select', help = 'Option that will be taken into account only with the use of -d, which specifies the disambiguation modality is to select only the most likely gloss in each list.', action='store_true')
    aparser.add_argument('-i', '--infile' , help='Input file (.html)' , default=sys.stdin)
    aparser.add_argument('-o', '--outfile', help='Output file (.html)', default=sys.stdout)
    aparser.add_argument('-s', '--store', help='Store tagged raw data in file (.csv) for further research purpose', default=None)

    args = aparser.parse_args()
    if args.verbose:
        print(args)

    # The -g/--gloss option is disabled above, so the attribute does not
    # exist on args; read it defensively instead of raising AttributeError.
    use_gloss = getattr(args, 'gloss', False)

    if args.learn:

        # Tell the user what is missing when no annotation layer is selected.
        if not (args.pos or args.tone or use_gloss):
            print('Choose pos, tone, gloss or combination of them')
            exit(0)

        if not args.filelist:
            print('Error : a list of files (-f) is required to learn a model !')
            exit(1)

        print('Make list of files')
        with codecs.open(args.filelist, 'r', encoding="utf-8") as filelist:
            allfiles = [line.strip() for line in filelist]
        allsents = []

        # For debugging:
        # allfiles = '../corbama/sisoko-daa_ka_kore.dis.html'

        enc = None
        if args.tone:
            try:
                enc = encoder_tones()
            except Exception:
                # Best effort, as before: report and carry on without encoder.
                print("Error : unable to initialize the tone encoder !")

        print('Open files and find features / supervision tags')
        # Without --root the listed paths are used as they are.
        corpus_root = args.root or ''
        for infile in allfiles:
            if infile:
                print('- ' + infile)
                sent = []

                html_parser = FileParser()
                html_parser.read_file(os.path.join(corpus_root, infile))

                for sentence in html_parser.glosses:
                    for token in sentence[2]:
                        if token.type == 'w' or token.type == 'c':
                            if args.pos:
                                tags = '/'.join(token.gloss.ps).encode('utf-8')
                                wordform = detone(token.gloss.form)
                                sent.append((wordform, tags))
                            elif args.tone:
                                # Why not learn tonal forms that contain a
                                # vertical bar?  Across the disambiguated
                                # corpora they occur fewer than 10 times,
                                # which seems too rare to improve the
                                # tonalization model.  Nothing in the design
                                # forbids adding them to the training data to
                                # observe their contribution.
                                if '|' not in token.gloss.form:
                                    [codes, chunks] = enc.differential_encode(token.token, token.gloss.form)
                                    for chunk, code in zip(chunks, codes):
                                        try:
                                            sent.append((chunk, code.encode('utf-8')))
                                        except LookupError:
                                            pass
                            # Learning from glosses is not implemented.

                    # NOTE(review): a sentence that yields fewer than two
                    # items is not reset here, so its content is carried
                    # over into the next sentence -- confirm this is wanted.
                    if len(sent) > 1:
                        allsents.append(sent)
                        sent = []

        if args.verbose and args.tone:
            enc.report()

        # Build the training and evaluation sets
        p = (1 - args.evalsize / 100.0)
        train_set, eval_set = sampling(allsents, p)
        print('Split the data in train ( {}  sentences) / test ( {}  sentences)'.format(
            len(train_set), len(eval_set)))

        print('Building classifier (CRF/NLTK)')
        # Initialization
        t1 = time.time()
        if args.tone:
            # One model per phase; all of them are archived in <learn>.zip.
            num_phases = len([False, True]) * len(mode_indicators)
            myzip = zipfile.ZipFile(args.learn + '.zip', 'w')
        else:
            num_phases = 1

        # Training
        for phase in range(num_phases):
            tagger = CRFTagger(verbose = args.verbose, training_opt = {'feature.minfreq' : 10})
            trainer = pycrfsuite.Trainer(verbose = tagger._verbose)
            trainer.set_params(tagger._training_options)
            if num_phases > 1:
                model_name = args.learn + '.' + str(phase)
            else:
                model_name = args.learn

            # train_set : list(list((str,list(str))))
            for sent in train_set:
                tokens = unzip(sent)[0]
                labels = unzip(sent)[1]
                if num_phases > 1:
                    # Keep only the part of each code that belongs to the
                    # current phase.
                    labels = [code_dispatcher(label.decode('utf-8'))[phase].encode('utf-8') for label in labels]
                features = [_get_features_customised_for_tones(tokens, i) for i in range(len(tokens))]
                trainer.append(features, labels)
            trainer.train(model = model_name)
            if num_phases > 1:
                myzip.write(model_name)
                os.remove(model_name)
        if num_phases > 1:
            myzip.close()

        print("... done in {}".format(get_duration(t1_secs=t1, t2_secs=time.time())))

        # Evaluation
        print('Evaluating classifier')
        # gold_set, predicted_set : list(list((str, str)))
        # input_set, output_gold_set : list(list(str))
        gold_set = eval_set
        input_set = [unzip(sent)[0] for sent in gold_set]
        predicted_set = [list() for sent in gold_set]
        if num_phases > 1:
            myzip = zipfile.ZipFile(args.learn + '.zip', 'r')
        for phase in range(num_phases):
            tagger = CRFTagger(verbose=args.verbose, training_opt={'feature.minfreq' : 10})
            trainer = pycrfsuite.Trainer(verbose=tagger._verbose)
            trainer.set_params(tagger._training_options)
            if num_phases > 1:
                model_name = args.learn + '.' + str(phase)
                myzip.extract(model_name)
            else:
                model_name = args.learn
            tagger.set_model_file(model_name)
            for i, sent in enumerate(input_set):
                features = [_get_features_customised_for_tones(sent,j) for j in range(len(sent))]
                labels = tagger._tagger.tag(features)
                if num_phases > 1:
                    labels = [code_dispatcher(label.decode('utf-8'))[phase].encode('utf-8') for label in labels]
                tagged_sent = list(zip(sent, labels))
                if not predicted_set[i]:
                    predicted_set[i] = tagged_sent
                else:
                    # Append this phase's labels to those accumulated so far.
                    sent_acc, labels_acc = unzip(predicted_set[i])
                    labels_acc = [label_acc + label for label_acc, label in zip(labels_acc, labels)]
                    predicted_set[i] = list(zip(sent_acc, labels_acc))
            if num_phases > 1:
                os.remove(model_name)
        # The archive must stay open until every phase has been extracted;
        # closing it inside the loop breaks the second phase.
        if num_phases > 1:
            myzip.close()

        # gold_tokens, predicted_tokens : list((str,str))
        predicted_tokens = list(itertools.chain(*predicted_set))
        if num_phases > 1:
            predicted_tokens = [
                tuple([pair[0], code_resort(pair[1].decode('utf-8')).encode('utf-8')])
                for pair in predicted_tokens]
        gold_tokens = list(itertools.chain(*gold_set))
        # gold_tokens_eval, predicted_tokens_eval : list(str)
        if args.tone:
            gold_tokens_eval = getTag(gold_tokens)
            predicted_tokens_eval = getTag(predicted_tokens)
        else:
            gold_tokens_eval = gold_tokens
            predicted_tokens_eval = predicted_tokens

        if args.store and args.tone:
            stored_filename = args.store
            csv_export(enc, stored_filename, gold_tokens, predicted_tokens)

        print("Accuracy : {:>5.3f}".format(accuracy(gold_tokens_eval, predicted_tokens_eval)))

        if args.verbose and args.store:
            print("Tagged result is exported in {}".format(args.store))

    elif args.disambiguate and args.infile and args.outfile:
        # Read the .HTML text
        html_parser = FileParser()
        tagger = CRFTagger()

        if args.pos:
            try:
                tagger.set_model_file(args.disambiguate)
            except IOError:
                # Report the model path, not the input file.
                print("Error : unable to open the model {} !".format(args.disambiguate))
                exit(1)
            try:
                html_parser.read_file(args.infile)
            except IOError:
                print("Error : unable to open the input file {} !".format(args.infile))
                exit(1)

            # Export the disambiguation result as .HTML
            for snum, sentence in enumerate(html_parser.glosses):
                tokens = [token.token for token in sentence[2]]
                features = [_get_features_customised_for_tones(tokens, i) for i in range(len(tokens))]
                tagger._tagger.set(features)
                for tnum, token in enumerate(sentence[2]):
                    options = list()
                    if token.value and len(token.value) > 2:
                        for option in token.value[2]:
                            # An option without POS is scored with the empty tag.
                            try:
                                tag = option.ps[0]
                            except IndexError:
                                tag = ''
                            prob = tagger._tagger.marginal(tag, tnum)
                            options.append((prob, option))
                        reordered_probs, reordered_options = unzip(sorted(options, reverse = True))
                        if args.select:
                            # Keep every option that ties for the best score.
                            prob_max = reordered_probs[0]
                            reordered_options = tuple([
                                reordered_options[i]
                                for i, p in enumerate(reordered_probs)
                                if p >= prob_max])
                        html_parser.glosses[snum][1][tnum] = reordered_options

        elif args.tone:
            pass

        try:
            html_parser.write(args.outfile)
        except IOError:
            print("Error : unable to create the output file {}".format(args.outfile))

    else:
        aparser.print_help()
    exit(0)
y = np.array(y)
y_hat = np.array(y_hat)

# Fraction of positions at which y and y_hat agree.
print("hmm acc : ", (y == y_hat).mean())

# Named-entity recognition
import pickle

# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load chunker models that come from a trusted source.
# The file is opened in a `with` block so that the handle is closed again.
with open(
        "/users/Etu0/3770640/M1/Sem2/TAL/TME1/maxent_ne_chunker/PY3/english_ace_multiclass.pickle",
        "rb") as chunker_file:
    a = pickle.load(chunker_file)

from nltk.tag.crf import CRFTagger

tagger = CRFTagger()
# The second argument is the file in which the result of the training
# computation is stored.
tagger.train(alldocs, u'crf.model')

tagger.tag(['Je suis à la maison'])
print(tagger._get_features([u"Je"], 0))

from nltk.tag.perceptron import PerceptronTagger
tagger = PerceptronTagger(load=False)
tagger.train(alldocs)

# adT_seq: list of lists of words (i.e. a list of sentences)
# "smart" tags each sentence as a whole; "stupid" tags every word on its own,
# without its sentence context.
allpred_smart = [[t for w, t in tagger.tag(sentence)]
                 for sentence in adT_seq]
allpred_stupid = [[tagger.tag([w])[0][1] for w in sentence]
                  for sentence in adT_seq]
Beispiel #14
0
def main():
    """Train and evaluate a CRF tagger on the ``../corbama`` corpus.

    With ``--learn F`` every ``*.dis.html`` file matched directly under
    ``../corbama`` or one directory below it is read.  The annotation layers
    selected with ``--pos`` / ``--tone`` / ``--gloss`` are concatenated into
    one supervision tag per token, the sentences are shuffled with a fixed
    seed, split into a training and a test part, and a ``CRFTagger`` is
    trained (model file ``F``) and evaluated.  Without ``--learn`` the usage
    help is printed.  The function always finishes through ``exit(0)``.
    """
    aparser = argparse.ArgumentParser(description='Daba disambiguator')
    # aparser.add_argument('-i', '--infile', help='Input file (.html)', default="sys.stdin")
    # aparser.add_argument('-o', '--outfile', help='Output file (.html)', default="sys.stdout")
    aparser.add_argument('-l', '--learn', help='Learn model from data (and save as F if provided)', default=None)
    aparser.add_argument('-p', '--pos', help='Prediction for POS', default=False, action='store_true')
    aparser.add_argument('-t', '--tone', help='Prediction for tones', default=False, action='store_true')
    aparser.add_argument('-g', '--gloss', help='Prediction for gloses', default=False, action='store_true')
    # type=float: without it a value given on the command line stays a string
    # and the percentage arithmetic below raises TypeError.
    aparser.add_argument('-e', '--evalsize', help='Percent of randomized data to use for evaluation (default 10)', default=10, type=float)
    aparser.add_argument('-v', '--verbose', help='Verbose output', default=False, action='store_true')
    args = aparser.parse_args()

    if args.learn:

        # At least one annotation layer has to be selected.  The parentheses
        # matter: "not a or b or c" negates only the first operand.
        if not (args.pos or args.tone or args.gloss):
            print('Choose pos, tone, gloss or combination of them')
            exit(0)

        print('Make list of files')
        # Keep every match of both patterns; pairing the two globs with zip()
        # would drop files as soon as one pattern matches more than the other.
        allfiles = (glob.glob("../corbama/*/*.dis.html")
                    + glob.glob("../corbama/*.dis.html"))
        allsents = []

        print('Open files and find features / supervision tags')
        for infile in allfiles:
            print('- %s' % infile)
            sent = []
            in_handler = formats.HtmlReader(infile, compatibility_mode=False)
            for token in in_handler:
                if token.type in ('w', 'c'):
                    # The supervision tag is the concatenation of the
                    # selected annotation layers.
                    tags = ''
                    if args.pos:
                        tags += ''.join(token.gloss.ps)
                    if args.tone:
                        tags += token.gloss.form.encode('utf-8')
                    if args.gloss:
                        tags += token.gloss.gloss.encode('utf-8')
                    sent.append((token.token, tags))
                # A sentence ends on terminal punctuation; sentences of a
                # single token are discarded.
                if token.type == 'c' and token.token in ['.', '?', '!']:
                    if len(sent) > 1:
                        allsents.append(sent)
                    sent = []

        datalength = len(allsents)
        p = 1 - args.evalsize / 100.0
        n_train = int(p * datalength)
        print('Randomize and split the data in train ( %d  sentences) / test ( %d  sentences)'
              % (n_train, datalength - n_train))
        # Fixed seed so that the split is reproducible between runs.
        random.seed(123456)
        random.shuffle(allsents)
        train_set = allsents[:n_train]
        test_set = allsents[n_train:]

        print('Building classifier (CRF/NLTK)')
        tagger = CRFTagger(verbose=args.verbose, training_opt={'feature.minfreq': 10})
        t1 = time.time()
        tagger.train(train_set, args.learn)
        texec = time.time() - t1
        # gmtime, not localtime: texec is a duration, so the local UTC offset
        # must not be added to it.
        print("... done in %s" % time.strftime('%H %M %S', time.gmtime(texec)))

        print('Evaluating classifier')
        # Show the score instead of computing it and throwing it away.
        print(tagger.evaluate(test_set))

        if args.verbose:
            print('Compute detailed output')

    else:
        print('USE...')
        aparser.print_help()

    exit(0)
Beispiel #15
0
def _load_tweets(paths):
    """Read tweet texts and their 'part1.Response' annotations from JSON-lines files.

    Returns two parallel lists: the ``text`` field of every record and the
    matching ``annotation['part1.Response']`` value.
    """
    texts, responses = [], []
    for path in paths:
        # The context manager guarantees the reader is closed after use.
        with jsonlines.open(path) as reader:
            for obj in reader:
                texts.append(obj['text'])
                responses.append(obj['annotation']['part1.Response'])
    return texts, responses


def _lemmatize_sentence(lemmatizer, sentence):
    """Tokenize and POS-tag ``sentence`` and return the list of lemmas."""
    # First letter of the tagger's tag -> WordNet POS. Penn Treebank adjective
    # tags start with "J" (JJ, JJR, JJS), so "j" has to map to WordNet's "a".
    wordnet_pos = {'a': 'a', 'j': 'a', 'n': 'n', 'v': 'v'}
    lemmas = []
    for word, tag in pos_tag(word_tokenize(sentence)):
        pos = wordnet_pos.get(tag[:1].lower())
        if pos is None:
            lemmas.append(lemmatizer.lemmatize(word))
        else:
            lemmas.append(lemmatizer.lemmatize(word, pos))
    return lemmas


def _label_tokens(tokens, answer):
    """Pair each token with its event tag.

    Keyword tokens get "P-Yes"/"P-No" (test-positive keywords) or
    "D-Yes"/"D-No" (death keywords) depending on whether ``answer`` equals
    ``["yes"]`` or ``["no"]``; everything else is tagged "Irr".
    """
    positive_words = {"test", "positive", "result"}
    death_words = {"die", "dead", "death", "pass", "away"}

    if answer == ["yes"]:
        suffix = "Yes"
    elif answer == ["no"]:
        suffix = "No"
    else:
        suffix = None  # any other answer: no keyword is labelled

    labelled = []
    for token in tokens:
        lowered = token.lower()
        if suffix is not None and lowered in positive_words:
            tag = "P-" + suffix
        elif suffix is not None and lowered in death_words:
            tag = "D-" + suffix
        else:
            tag = "Irr"
        labelled.append((token, tag))
    return labelled


def _write_predictions(crf, dev_data, filename):
    """Tag ``dev_data`` with ``crf`` and write token/gold/prediction rows to a TSV file.

    One line per token, with an empty line after each sentence. The rows are
    also returned as ``[token, gold, prediction]`` lists so that evaluation
    does not have to parse the file again.
    """
    rows = []
    with open(filename, 'w', encoding='utf-8') as pred_file:
        for sent in dev_data:
            sent_words = [word for word, _ in sent]
            gold_tags = [tag for _, tag in sent]
            for (original_word, tag_prediction), gold in zip(crf.tag(sent_words), gold_tags):
                pred_file.write(f"{original_word}\t{gold}\t{tag_prediction}\n")
                rows.append([original_word, gold, tag_prediction])
            # add an empty line after each sentence
            pred_file.write("\n")
    return rows


def _ratio(numerator, denominator):
    """Return ``numerator / denominator`` as a float, or 0.0 for a zero denominator."""
    return numerator / denominator if denominator else 0.0


def _event_scores(rows):
    """Return (accuracy, precision, recall, F1) for ``[token, gold, prediction]`` rows.

    A row counts as positive when its tag contains "Yes". Every row is
    included, and undefined ratios (empty input, no positives) are reported
    as 0.0 instead of raising ZeroDivisionError.
    """
    correct = sum(1 for _, gold, predicted in rows if gold == predicted)
    gold_yes = sum(1 for _, gold, _ in rows if 'Yes' in gold)
    predicted_yes = sum(1 for _, _, predicted in rows if 'Yes' in predicted)
    both_yes = sum(1 for _, gold, predicted in rows
                   if 'Yes' in gold and 'Yes' in predicted)

    accuracy = _ratio(correct, len(rows))
    precision = _ratio(both_yes, predicted_yes)
    recall = _ratio(both_yes, gold_yes)
    f1 = _ratio(2 * precision * recall, precision + recall)
    return accuracy, precision, recall, f1


def main(positive, death):
    """Train and evaluate CRF taggers for "test positive" and "death" tweet events.

    Args:
        positive: path of the JSON-lines file passed first to ``jsonlines.open``.
        death: path of the JSON-lines file passed second to ``jsonlines.open``.
            Every record in both files must provide ``text`` and
            ``annotation['part1.Response']``.

    Side effects:
        Trains one CRF per feature function (``my_features`` and
        ``neighbor_features``), each saved to ``model.tagger``; writes
        ``base_final_output.tsv`` and ``include_neighbors_final_output.tsv``;
        prints dataset sizes, per-condition scores, a confusion matrix and the
        label counts of the training data.
    """
    ############# Compile the dataset ###############
    ## Load the dataset
    text, response = _load_tweets([positive, death])

    ## Tweet Preprocessing
    prep_text = [p.clean(tweet) for tweet in text]

    ## Tag Keywords and Create Labels
    ### Focus on verbs--therefore, try lemmatization first
    wnl = WordNetLemmatizer()
    token_data = [
        _label_tokens(_lemmatize_sentence(wnl, sent), answer)
        for sent, answer in zip(prep_text, response)
    ]
    # A tweet that is empty after cleaning yields no tokens; such a sentence
    # cannot be unpacked into (tokens, labels) during CRF training, so drop it.
    token_data = [sent for sent in token_data if sent]

    ## Shuffle and split into train data and dev data
    token_data = shuffle(token_data, random_state=6)
    train_data, dev_data = train_test_split(token_data,
                                            test_size=0.3,
                                            random_state=616)
    print(
        f"The number of sentences in training data: {len(train_data)}; The number of sentences in dev data: {len(dev_data)};"
    )

    ############# Fit A CRF Model And Predict ###############
    condition_to_func = {
        "base": my_features,
        "include_neighbors": neighbor_features
    }
    rows_by_condition = {}
    for cond, func in condition_to_func.items():
        crf = CRFTagger(feature_func=func)
        crf.train(train_data, 'model.tagger')
        rows_by_condition[cond] = _write_predictions(
            crf, dev_data, cond + "_final_output.tsv")

    ############# Evaluation ###############
    ## Extract Data with Meaning Labels
    column_name = ['token', 'label', 'prediction']
    covid_event = ['Test Positive', 'Death Case']
    Total_df = pd.DataFrame([], columns=column_name)

    for cond, rows in rows_by_condition.items():
        P_data = [row for row in rows if row[1] in ('P-Yes', 'P-No')]
        D_data = [row for row in rows if row[1] in ('D-Yes', 'D-No')]
        Total_df = pd.DataFrame(P_data + D_data, columns=column_name)

        ## Overall Accuracy over both events
        T_a = _event_scores(P_data + D_data)[0]

        # Report performance
        print("condition: " + cond)
        print(f"Overall Accuracy {T_a:0.03}")
        for event, event_rows in zip(covid_event, (P_data, D_data)):
            accuracy, precision, recall, f1 = _event_scores(event_rows)
            print(
                f"Scores for {event} : \taccuracy {accuracy:0.03}\tprecision {precision:0.03}\trecall {recall:0.03}\tF1 {f1:0.03}"
            )

    ## Basicline Performance / Confusion Matrix
    # The confusion matrix is printed once, for the last condition evaluated.
    print("Confusion Matrix:")
    print(pd.crosstab(Total_df['label'], Total_df['prediction']))
    print("Training data:")
    labels = ["P-Yes", "P-No", "D-Yes", "D-No"]
    # Count every tag in a single pass over the training sentences.
    label_counts = dict.fromkeys(labels, 0)
    for sent in train_data:
        for _, tag in sent:
            if tag in label_counts:
                label_counts[tag] += 1
    for label in labels:
        print(f"Number of {label}: {label_counts[label]}")