def train_classifer_on_fold(essays_TD, essays_VD, regular_tags, fold, code_freq, training_opt):
    """Train a single CRF tagger on one fold and return actual/predicted tags.

    The model is trained on the sentences produced by
    ``to_most_common_code_tagged_sentences`` and then used to tag both the
    training and the validation sentences. The ground truth is derived from
    ``to_label_powerset_tagged_sentences`` applied to the same essays.

    Args:
        essays_TD: training essays of the fold.
        essays_VD: validation essays of the fold.
        regular_tags: codes forwarded to the tagging/flattening helpers.
        fold: fold number; used in the log line and the model file name.
        code_freq: forwarded to ``to_most_common_code_tagged_sentences``.
        training_opt: passed to ``CRFTagger`` as ``training_opt``.

    Returns:
        ``(wd_td_ys_bytag, wd_vd_ys_bytag, td_wd_predictions_by_code,
        vd_wd_predictions_by_code)`` - actual tags for training/validation
        data followed by predicted tags for training/validation data, each
        being whatever ``to_flattened_binary_tags_by_code`` returns.
    """
    print("Fold %i Training code" % fold)

    # For training
    td_sents = to_most_common_code_tagged_sentences(essays_TD, regular_tags, code_freq)
    vd_sents = to_most_common_code_tagged_sentences(essays_VD, regular_tags, code_freq)

    # The random suffix keeps concurrently running folds from sharing a file.
    model_filename = models_folder + "/" + "%i_%s__%s" % (fold, "most_freq_code", str(randint(0, 9999999)))

    model = CRFTagger(feature_func=comp_feat_extactor, verbose=False, training_opt=training_opt)
    try:
        model.train(td_sents, model_filename)

        td_predictions = model.tag_sents(to_sentences(td_sents))
        vd_predictions = model.tag_sents(to_sentences(vd_sents))
    finally:
        # The model file is only a scratch artifact: delete it even when
        # training or tagging fails, so randomly named files do not pile up.
        if os.path.exists(model_filename):
            os.remove(model_filename)

    # for evaluation - binary tags
    # YS (ACTUAL)
    td_sents_pset = to_label_powerset_tagged_sentences(essays_TD, regular_tags)
    vd_sents_pset = to_label_powerset_tagged_sentences(essays_VD, regular_tags)

    wd_td_ys_bytag = to_flattened_binary_tags_by_code(td_sents_pset, regular_tags)
    wd_vd_ys_bytag = to_flattened_binary_tags_by_code(vd_sents_pset, regular_tags)

    # YS (PREDICTED)
    td_wd_predictions_by_code = to_flattened_binary_tags_by_code(td_predictions, regular_tags)
    vd_wd_predictions_by_code = to_flattened_binary_tags_by_code(vd_predictions, regular_tags)

    return wd_td_ys_bytag, wd_vd_ys_bytag, td_wd_predictions_by_code, vd_wd_predictions_by_code
def train_classifer_on_fold(essays_TD, essays_VD, regular_tags, fold):
    """Train one CRF tagger per code on a fold and collect actual/predicted tags.

    For every code in ``regular_tags`` (processed in sorted order) a separate
    model is trained on that code's training sentences and used to tag the
    training and validation sentences for the same code.

    Args:
        essays_TD: training essays of the fold.
        essays_VD: validation essays of the fold.
        regular_tags: iterable of codes; one model is trained per code.
        fold: fold number; used in log lines and model file names.

    Returns:
        ``(wd_td_ys_bytag, wd_vd_ys_bytag, td_wd_predictions_by_code,
        vd_wd_predictions_by_code)`` - four dicts keyed by code holding the
        actual tags (training, validation) and the predicted tags (training,
        validation), each value produced by ``to_flattened_binary_tags``.
    """
    td_sents_by_code = to_tagged_sentences_by_code(essays_TD, regular_tags)
    vd_sents_by_code = to_tagged_sentences_by_code(essays_VD, regular_tags)

    wd_td_ys_bytag = dict()
    wd_vd_ys_bytag = dict()
    td_wd_predictions_by_code = dict()
    vd_wd_predictions_by_code = dict()

    for code in sorted(regular_tags):
        print("Fold %i Training code: %s" % (fold, code))
        td, vd = td_sents_by_code[code], vd_sents_by_code[code]

        # The random suffix keeps concurrently running folds from sharing a file.
        model_filename = models_folder + "/" + "%i_%s__%s" % (fold, code, str(randint(0, 9999999)))

        # documentation: http://www.chokkan.org/software/crfsuite/manual.html
        training_opt = {
            "feature.possible_states": False,
            "feature.possible_transitions": False,
            "c2": 2.0,
        }
        model = CRFTagger(feature_func=comp_feat_extactor, verbose=False, training_opt=training_opt)
        try:
            model.train(td, model_filename)

            wd_td_ys_bytag[code] = to_flattened_binary_tags(td)
            wd_vd_ys_bytag[code] = to_flattened_binary_tags(vd)

            td_predictions = model.tag_sents(to_sentences(td))
            vd_predictions = model.tag_sents(to_sentences(vd))
        finally:
            # Delete the model file once predictions are obtained - or when
            # anything above failed. The name is randomized, so a file left
            # behind would never be overwritten by a later run.
            if os.path.exists(model_filename):
                os.remove(model_filename)

        td_wd_predictions_by_code[code] = to_flattened_binary_tags(td_predictions)
        vd_wd_predictions_by_code[code] = to_flattened_binary_tags(vd_predictions)

    return wd_td_ys_bytag, wd_vd_ys_bytag, td_wd_predictions_by_code, vd_wd_predictions_by_code
def train_classifer_on_fold(essays_TD, essays_VD, regular_tags, fold, code_freq, training_opt):
    """Train a single CRF tagger on one fold and return actual/predicted tags.

    The model is trained on the sentences produced by
    ``to_most_common_code_tagged_sentences`` and then used to tag both the
    training and the validation sentences. The ground truth is derived from
    ``to_label_powerset_tagged_sentences`` applied to the same essays.

    Args:
        essays_TD: training essays of the fold.
        essays_VD: validation essays of the fold.
        regular_tags: codes forwarded to the tagging/flattening helpers.
        fold: fold number; used in the log line and the model file name.
        code_freq: forwarded to ``to_most_common_code_tagged_sentences``.
        training_opt: passed to ``CRFTagger`` as ``training_opt``.

    Returns:
        ``(wd_td_ys_bytag, wd_vd_ys_bytag, td_wd_predictions_by_code,
        vd_wd_predictions_by_code)`` - actual tags for training/validation
        data followed by predicted tags for training/validation data, each
        being whatever ``to_flattened_binary_tags_by_code`` returns.
    """
    print("Fold %i Training code" % fold)

    # For training
    td_sents = to_most_common_code_tagged_sentences(essays_TD, regular_tags, code_freq)
    vd_sents = to_most_common_code_tagged_sentences(essays_VD, regular_tags, code_freq)

    # The random suffix keeps concurrently running folds from sharing a file.
    model_filename = models_folder + "/" + "%i_%s__%s" % (
        fold, "most_freq_code", str(randint(0, 9999999)))

    model = CRFTagger(feature_func=comp_feat_extactor,
                      verbose=False,
                      training_opt=training_opt)
    try:
        model.train(td_sents, model_filename)

        td_predictions = model.tag_sents(to_sentences(td_sents))
        vd_predictions = model.tag_sents(to_sentences(vd_sents))
    finally:
        # The model file is only a scratch artifact: delete it even when
        # training or tagging fails, so randomly named files do not pile up.
        if os.path.exists(model_filename):
            os.remove(model_filename)

    # for evaluation - binary tags
    # YS (ACTUAL)
    td_sents_pset = to_label_powerset_tagged_sentences(essays_TD, regular_tags)
    vd_sents_pset = to_label_powerset_tagged_sentences(essays_VD, regular_tags)

    wd_td_ys_bytag = to_flattened_binary_tags_by_code(td_sents_pset, regular_tags)
    wd_vd_ys_bytag = to_flattened_binary_tags_by_code(vd_sents_pset, regular_tags)

    # YS (PREDICTED)
    td_wd_predictions_by_code = to_flattened_binary_tags_by_code(
        td_predictions, regular_tags)
    vd_wd_predictions_by_code = to_flattened_binary_tags_by_code(
        vd_predictions, regular_tags)

    return wd_td_ys_bytag, wd_vd_ys_bytag, td_wd_predictions_by_code, vd_wd_predictions_by_code
def train_classifer_on_fold(essays_TD, essays_VD, regular_tags, fold):
    """Train one CRF tagger per code on a fold and collect actual/predicted tags.

    For every code in ``regular_tags`` (processed in sorted order) a separate
    model is trained on that code's training sentences and used to tag the
    training and validation sentences for the same code.

    Args:
        essays_TD: training essays of the fold.
        essays_VD: validation essays of the fold.
        regular_tags: iterable of codes; one model is trained per code.
        fold: fold number; used in log lines and model file names.

    Returns:
        ``(wd_td_ys_bytag, wd_vd_ys_bytag, td_wd_predictions_by_code,
        vd_wd_predictions_by_code)`` - four dicts keyed by code holding the
        actual tags (training, validation) and the predicted tags (training,
        validation), each value produced by ``to_flattened_binary_tags``.
    """
    td_sents_by_code = to_tagged_sentences_by_code(essays_TD, regular_tags)
    vd_sents_by_code = to_tagged_sentences_by_code(essays_VD, regular_tags)

    wd_td_ys_bytag = dict()
    wd_vd_ys_bytag = dict()
    td_wd_predictions_by_code = dict()
    vd_wd_predictions_by_code = dict()

    for code in sorted(regular_tags):
        print("Fold %i Training code: %s" % (fold, code))
        td, vd = td_sents_by_code[code], vd_sents_by_code[code]

        # The random suffix keeps concurrently running folds from sharing a file.
        model_filename = models_folder + "/" + "%i_%s__%s" % (
            fold, code, str(randint(0, 9999999)))

        # documentation: http://www.chokkan.org/software/crfsuite/manual.html
        training_opt = {
            "feature.possible_states": False,
            "feature.possible_transitions": False,
            "c2": 2.0,
        }
        model = CRFTagger(feature_func=comp_feat_extactor,
                          verbose=False,
                          training_opt=training_opt)
        try:
            model.train(td, model_filename)

            wd_td_ys_bytag[code] = to_flattened_binary_tags(td)
            wd_vd_ys_bytag[code] = to_flattened_binary_tags(vd)

            td_predictions = model.tag_sents(to_sentences(td))
            vd_predictions = model.tag_sents(to_sentences(vd))
        finally:
            # Delete the model file once predictions are obtained - or when
            # anything above failed. The name is randomized, so a file left
            # behind would never be overwritten by a later run.
            if os.path.exists(model_filename):
                os.remove(model_filename)

        td_wd_predictions_by_code[code] = to_flattened_binary_tags(
            td_predictions)
        vd_wd_predictions_by_code[code] = to_flattened_binary_tags(
            vd_predictions)

    return wd_td_ys_bytag, wd_vd_ys_bytag, td_wd_predictions_by_code, vd_wd_predictions_by_code
def demo(self, test_sents):
    """Tag every test sentence with the stored model and print the results.

    A ``CRFTagger`` is built around ``self.feature_detector`` and loaded from
    ``self.modelpath``. Each sentence is stripped of its tags with ``untag``,
    re-tagged, and printed line by line via ``self._to_sentence``. Finally the
    value returned by ``tagger.evaluate(test_sents)`` is printed.

    Args:
        test_sents: tagged sentences; used both as tagging input (after
            ``untag``) and as the reference passed to ``evaluate``.
    """
    crf = CRFTagger(feature_func=self.feature_detector)
    crf.set_model_file(self.modelpath)

    for gold_sentence in test_sents:
        prediction = crf.tag(untag(gold_sentence))
        for rendered in self._to_sentence(prediction):
            print(rendered)

    # NOTE(review): the collapsed source does not show whether this line was
    # inside the loop; it is placed after it, once per demo run - confirm.
    score = crf.evaluate(test_sents)
    print(score)
class CRF:
    """CRF model that tags every even-indexed character of a word.

    Words are strings in which even-indexed characters are the observed
    symbols (named "consonants" in the code) and odd-indexed characters are
    the tags to predict (one of ``VOWELS``). Each word is one CRF sequence;
    every item of the sequence additionally carries the index of the word in
    its sentence and the consonant-only sentence, so that features of
    neighbouring words can be extracted.

    Config keys read by ``__init__``:
        ``word_ftrs``: feature names from ``WORD_FTRS`` (per-word features).
        ``stc_ftrs``: feature names from ``STC_FTRS`` (sentence features).
        ``extr_word_idx``: relative word offsets (0 = current word) for which
            word features are extracted.
    """

    def __init__(self, config):
        """Store the feature configuration after validating feature names.

        Raises:
            ValueError: if a requested feature name is not supported.
            KeyError: if a required config key is missing.
        """
        self.word_ftrs = config['word_ftrs']
        for ftr in self.word_ftrs:
            if ftr not in CRF.WORD_FTRS:
                # Report the offending name, not the list of supported ones.
                raise ValueError(
                    'Unknown feature {}. See CRF.WORD_FTRS for supported ones.'
                    .format(ftr))
        self.stc_ftrs = config['stc_ftrs']
        for ftr in self.stc_ftrs:
            if ftr not in CRF.STC_FTRS:
                raise ValueError(
                    'Unknown feature {}. See CRF.STC_FTRS for supported ones.'
                    .format(ftr))
        self.words_ids = config['extr_word_idx']

    def prep_data(self, file='data/HaaretzOrnan_annotated.txt'):
        """Read the annotated corpus into ``self.data`` (list of sentences).

        Lines starting with ``#`` are skipped, an empty line ends the current
        sentence, and every other line contributes its fourth
        space-separated field (with ``-`` removed) as one word.

        Returns:
            ``self``, to allow call chaining.
        """
        self.data = [[]]
        with codecs.open(file, encoding='utf-8') as f:
            for line in f:
                line = line.rstrip()
                if line.startswith(u'#'):
                    continue
                if len(line) == 0:
                    # Start a new sentence, but never stack empty ones.
                    if len(self.data[-1]) > 0:
                        self.data.append([])
                    continue
                # Append word to last sentence
                w = line.split(u' ')[3]
                w = w.replace(u'-', u'')
                self.data[-1].append(w)
        # Only the last sentence can still be empty - drop it.
        if len(self.data[-1]) == 0:
            self.data.pop()
        return self

    def shuffle(self, seed=None):
        """Shuffle ``self.data``; note this reseeds NumPy's global RNG.

        Returns:
            ``self``, to allow call chaining.
        """
        inds = np.arange(len(self.data))
        np.random.seed(seed)
        np.random.shuffle(inds)
        self.data = [self.data[i] for i in inds]
        return self

    def split(self, valid_ratio=0.1):
        """Split ``self.data`` into ``train_set`` and ``valid_set``.

        The last ``valid_ratio`` share of the data becomes the validation
        set. With a ratio of 0 all data is used for training and
        ``valid_set`` is ``None``.

        Returns:
            ``self``, to allow call chaining.
        """
        num_train = int(len(self.data) * (1 - valid_ratio))
        self.train_set = self.data[:num_train]
        self.valid_set = None if valid_ratio == 0 else self.data[num_train:]
        return self

    def train(self, load_model=None):
        """Train a ``CRFTagger`` on ``self.train_set``.

        The model is written to the file ``'stc_crf_model'``.
        ``load_model`` is accepted for interface compatibility but not used.

        Returns:
            ``self``, to allow call chaining.
        """
        train_set = CRF._fin_data_prep(self.train_set)
        _extract_ftr = self._gen_ftr_func()
        self.model = CRFTagger(_extract_ftr,
                               verbose=False,
                               training_opt={
                                   "num_memories": 500,
                                   "delta": 1e-8
                               })
        self.model.train(train_set, 'stc_crf_model')
        return self

    def eval(self):
        """Return the confusion matrix of the model on ``self.valid_set``.

        Rows are indexed by the predicted tag and columns by the actual tag,
        both in ``VOWELS`` order. Requires ``split`` to have been called with
        a non-zero ratio (``valid_set`` must not be ``None``).
        """
        conf_mat = np.zeros((len(CRF.VOWELS), len(CRF.VOWELS)))
        valid_set = CRF._fin_data_prep(self.valid_set)
        valid_stc_cons = [[x[0] for x in w] for w in valid_set]
        valid_stc_vowel = [[x[1] for x in w] for w in valid_set]
        predicted = self.model.tag_sents(valid_stc_cons)
        predicted = [[x[1] for x in w] for w in predicted]
        for pred_word, gold_word in zip(predicted, valid_stc_vowel):
            for vow_ind, pred_vow in enumerate(pred_word):
                conf_mat[self.VOWELS_IDX[pred_vow],
                         self.VOWELS_IDX[gold_word[vow_ind]]] += 1
        return conf_mat

    def predict(self, pred_set):
        """Tag the words of ``pred_set`` and return them with tags inserted.

        Args:
            pred_set: list of sentences, each a list of untagged words.

        Returns:
            A list of sentences of the same shape where every word is the
            concatenation of ``character + predicted tag`` pairs.
        """
        data = []
        for sent in pred_set:
            sent_cons = u' '.join(sent)
            for i, w in enumerate(sent):
                w_cons = list(w)
                w_pos = [i] * len(w)
                unif_sent = [sent_cons] * len(w)
                data.append(list(zip(w_cons, w_pos, unif_sent)))
        pred = self.model.tag_sents(data)

        # ``pred`` is flat (one entry per word); rebuild the sentence shape.
        result = []
        word_idx = 0
        for sent in pred_set:
            result.append([])
            for _ in sent:
                pred_smpl = pred[word_idx]
                # entry = ((char, word_pos, sentence), tag)
                w = ''.join([entry[0][0] + entry[-1] for entry in pred_smpl])
                result[-1].append(w)
                word_idx += 1
        return result

    @staticmethod
    def _fin_data_prep(data_set):
        """Turn sentences of annotated words into tagged CRF sequences.

        Each word yields one sequence of ``((char, word_pos, sentence), tag)``
        items, where ``char`` runs over the even-indexed characters, ``tag``
        over the odd-indexed ones, ``word_pos`` is the word's index in the
        sentence and ``sentence`` is the space-joined even-indexed characters
        of all words of the sentence.
        """
        data = []
        for sent in data_set:
            sent_cons = u' '.join([x[::2] for x in sent])
            for i, w in enumerate(sent):
                w_cons = list(w[::2])
                w_pos = [i] * len(w_cons)
                unif_sent = [sent_cons] * len(w_cons)
                d = list(zip(w_cons, w_pos, unif_sent))
                data.append(list(zip(d, list(w[1::2]))))
        return data

    @staticmethod
    def _len(x):
        """Return ``len(x)`` for a ``str`` and ``int(x)`` for anything else."""
        return len(x) if isinstance(x, str) else int(x)

    VOWELS = [u'a', u'e', u'u', u'i', u'o', u'*']
    VOWELS_IDX = {x: i for i, x in enumerate(VOWELS)}
    WORD_FTRS = [
        'IS_FIRST', 'IS_LAST', 'IDX', 'VAL', 'PRV_VAL', 'NXT_VAL', 'FRST_VAL',
        'LST_VAL', 'SCND_VAL', 'SCND_LST_VAL', 'LEN'
    ]
    STC_FTRS = ['IS_FIRST', 'IS_LAST', 'IDX']

    def _gen_ftr_func(self):
        """Build the feature function handed to ``CRFTagger``.

        The returned closure has the ``(tokens, i)`` signature used by
        ``CRFTagger`` feature functions and reads the feature configuration
        from ``self`` at call time.
        """

        def _extract_wrd_ftr(tokens, i, suff):
            """Features of one word; ``i`` is None for context words."""
            feature_list = []
            # Position-dependent features only exist for the current word.
            if i is not None:
                if 'IS_FIRST' in self.word_ftrs:
                    feature_list.append("is_first{}={}".format(
                        suff, 1 if i == 0 else 0))
                if 'IS_LAST' in self.word_ftrs:
                    feature_list.append("is_last{}={}".format(
                        suff, 1 if i == (len(tokens) - 1) else 0))
                if 'IDX' in self.word_ftrs:
                    feature_list.append("pos{}={}".format(suff, i))
                if 'VAL' in self.word_ftrs:
                    feature_list.append("cur{}={}".format(suff, tokens[i]))
                if 'PRV_VAL' in self.word_ftrs:
                    if i > 0:
                        feature_list.append("prev{}={}".format(
                            suff, tokens[i - 1]))
                if 'NXT_VAL' in self.word_ftrs:
                    if i < (len(tokens) - 1):
                        feature_list.append("next{}={}".format(
                            suff, tokens[i + 1]))
            if 'FRST_VAL' in self.word_ftrs:
                feature_list.append("first{}={}".format(suff, tokens[0]))
            if 'LST_VAL' in self.word_ftrs:
                feature_list.append("last{}={}".format(suff, tokens[-1]))
            if 'LEN' in self.word_ftrs:
                feature_list.append("len{}={}".format(suff, len(tokens)))
            if 'SCND_VAL' in self.word_ftrs:
                if len(tokens) > 1:
                    feature_list.append("scnd{}={}".format(suff, tokens[1]))
            if 'SCND_LST_VAL' in self.word_ftrs:
                if len(tokens) > 1:
                    feature_list.append("scnd_last{}={}".format(
                        suff, tokens[-2]))
            return feature_list

        def _extract_ftr(tokens, i):
            """Features of item ``i`` of a word sequence."""
            feature_list = []
            # Every item of a sequence carries the same word index/sentence.
            cur_word_pos = tokens[0][1]
            sent = tokens[0][2].split(' ')

            # Sentence features
            if 'IS_FIRST' in self.stc_ftrs:
                if cur_word_pos == 0:
                    feature_list.append('FIRST_WORD')
            if 'IS_LAST' in self.stc_ftrs:
                if cur_word_pos == (len(sent) - 1):
                    feature_list.append('LAST_WORD')
            if 'IDX' in self.stc_ftrs:
                feature_list.append("idx=" + str(cur_word_pos))

            # Word features of the current word and its configured neighbours
            for rel_pos in self.words_ids:
                word_pos = cur_word_pos + rel_pos
                if 0 <= word_pos < len(sent):
                    word = sent[word_pos]
                    feature_list += _extract_wrd_ftr(
                        word, i if rel_pos == 0 else None,
                        '_w{}'.format(rel_pos))
            return feature_list

        return _extract_ftr
class CRF:
    """Word-level CRF model that tags every even-indexed character of a word.

    Words are strings in which even-indexed characters are the observed
    symbols (named "consonants" in the code) and odd-indexed characters are
    the tags to predict (one of ``VOWELS``). Each word is one CRF sequence.

    Config keys read by ``__init__``:
        ``ftrs``: feature names from ``WORD_FTRS``.
    """

    def __init__(self, config):
        """Store the feature configuration after validating feature names.

        Raises:
            ValueError: if a requested feature name is not supported.
            KeyError: if ``config`` has no ``'ftrs'`` key.
        """
        self.ftrs = config['ftrs']
        for ftr in self.ftrs:
            if ftr not in CRF.WORD_FTRS:
                # Report the offending name and point at the attribute that
                # actually exists (there is no ``CRF.CONFIG``).
                raise ValueError(
                    'Unknown feature {}. See CRF.WORD_FTRS for supported ones.'
                    .format(ftr))

    def prep_data(self, file='data/HaaretzOrnan_annotated.txt'):
        """Read the annotated corpus into ``self.data`` (list of words).

        Lines that are empty or start with ``#`` are skipped. From every
        other line the fourth space-separated field (with ``-`` removed) is
        taken and stored as a list of ``(even char, odd char)`` pairs.

        Returns:
            ``self``, to allow call chaining.
        """
        self.data = []
        with codecs.open(file, encoding='utf-8') as f:
            for line in f:
                line = line.rstrip()
                if line.startswith(u'#') or len(line) == 0:
                    continue
                w = line.split(u' ')[3]
                w = w.replace(u'-', u'')
                self.data.append(list(zip(list(w[::2]), list(w[1::2]))))
        return self

    def shuffle(self, seed=None):
        """Shuffle ``self.data``; note this reseeds NumPy's global RNG.

        Returns:
            ``self``, to allow call chaining.
        """
        inds = np.arange(len(self.data))
        np.random.seed(seed)
        np.random.shuffle(inds)
        self.data = [self.data[i] for i in inds]
        return self

    def split(self, valid_ratio=0.1):
        """Split ``self.data`` into ``train_set`` and ``valid_set``.

        The last ``valid_ratio`` share of the data becomes the validation
        set. With a ratio of 0 all data is used for training and
        ``valid_set`` is ``None``.

        Returns:
            ``self``, to allow call chaining.
        """
        num_train = int(len(self.data) * (1 - valid_ratio))
        self.train_set = self.data[:num_train]
        self.valid_set = None if valid_ratio == 0 else self.data[num_train:]
        return self

    def train(self, load_model=None):
        """Train a ``CRFTagger`` on ``self.train_set``.

        The model is written to the file ``'word_crf_model'``.
        ``load_model`` is accepted for interface compatibility but not used.

        Returns:
            ``self``, to allow call chaining.
        """
        _extract_ftr = self._gen_ftr_func()
        self.model = CRFTagger(_extract_ftr,
                               verbose=False,
                               training_opt={"num_memories": 500, "delta": 1e-8})
        self.model.train(self.train_set, 'word_crf_model')
        return self

    def eval(self):
        """Return the confusion matrix of the model on ``self.valid_set``.

        Rows are indexed by the predicted tag and columns by the actual tag,
        both in ``VOWELS`` order. Requires ``split`` to have been called with
        a non-zero ratio (``valid_set`` must not be ``None``).
        """
        conf_mat = np.zeros((len(CRF.VOWELS), len(CRF.VOWELS)))
        valid_word_cons = [[x[0] for x in w] for w in self.valid_set]
        valid_word_vowel = [[x[1] for x in w] for w in self.valid_set]
        predicted = self.model.tag_sents(valid_word_cons)
        predicted = [[x[1] for x in w] for w in predicted]
        for pred_word, gold_word in zip(predicted, valid_word_vowel):
            for vow_ind, pred_vow in enumerate(pred_word):
                conf_mat[self.VOWELS_IDX[pred_vow],
                         self.VOWELS_IDX[gold_word[vow_ind]]] += 1
        return conf_mat

    def predict(self, pred_set):
        """Tag the words of ``pred_set`` and return them with tags inserted.

        Args:
            pred_set: list of sentences; each sentence is passed as-is to
                ``tag_sents``, i.e. every word is treated as one sequence.

        Returns:
            A list of sentences where every word is the concatenation of
            ``character + predicted tag`` pairs.
        """
        result = []
        for sent in pred_set:
            predicted = self.model.tag_sents(sent)
            result.append([''.join(x + y for x, y in w_cons)
                           for w_cons in predicted])
        return result

    @staticmethod
    def _len(x):
        """Return ``len(x)`` for a ``str`` and ``int(x)`` for anything else."""
        return len(x) if isinstance(x, str) else int(x)

    VOWELS = [u'a', u'e', u'u', u'i', u'o', u'*']
    VOWELS_IDX = {x: i for i, x in enumerate(VOWELS)}
    WORD_FTRS = ['IS_FIRST', 'IS_LAST', 'IDX', 'VAL', 'PRV_VAL', 'NXT_VAL',
                 'FRST_VAL', 'LST_VAL', 'SCND_VAL', 'SCND_LST_VAL', 'LEN']

    def _gen_ftr_func(self):
        """Build the feature function handed to ``CRFTagger``.

        The returned closure has the ``(tokens, i)`` signature used by
        ``CRFTagger`` feature functions and reads ``self.ftrs`` at call time.
        """

        def _extract_ftr(tokens, i):
            """Features of item ``i`` of the sequence ``tokens``."""
            feature_list = []
            if 'IS_FIRST' in self.ftrs:
                feature_list.append("is_first=" + str(1 if i == 0 else 0))
            if 'IS_LAST' in self.ftrs:
                feature_list.append(
                    "is_last=" + str(1 if i == (len(tokens) - 1) else 0))
            if 'IDX' in self.ftrs:
                feature_list.append("pos=" + str(i))
            if 'VAL' in self.ftrs:
                feature_list.append("cur=" + tokens[i])
            if 'PRV_VAL' in self.ftrs:
                if i > 0:
                    feature_list.append("prev=" + tokens[i - 1])
            if 'NXT_VAL' in self.ftrs:
                if i < (len(tokens) - 1):
                    feature_list.append("next=" + tokens[i + 1])
            if 'FRST_VAL' in self.ftrs:
                feature_list.append("first=" + tokens[0])
            if 'LST_VAL' in self.ftrs:
                feature_list.append("last=" + tokens[-1])
            if 'LEN' in self.ftrs:
                feature_list.append("len=" + str(len(tokens)))
            if 'SCND_VAL' in self.ftrs:
                if len(tokens) > 1:
                    feature_list.append("scnd=" + tokens[1])
            if 'SCND_LST_VAL' in self.ftrs:
                if len(tokens) > 1:
                    feature_list.append("scnd_last=" + tokens[-2])
            return feature_list

        return _extract_ftr
def main():
    """Command-line entry point of the Daba disambiguator.

    Two modes are supported:

    * ``--learn F`` together with ``--pos`` or ``--tone``: read the files
      listed in ``--filelist`` (relative to ``--root``), train CRF model(s),
      save them as ``F`` (POS) or ``F.zip`` (tones, one model per phase) and
      print the accuracy measured on the held-out part of the data.
    * ``--disambiguate F``: load model ``F``, reorder the gloss options of
      every token of ``--infile`` by decreasing marginal probability and
      write the result to ``--outfile``.

    Without a usable mode the help text is printed. The function terminates
    the process via ``sys.exit`` in several branches.

    NOTE(review): this block was recovered from a whitespace-collapsed
    source; the nesting of a few statements (marked below) is the most
    plausible reading and should be confirmed against the original file.
    """
    aparser = argparse.ArgumentParser(description='Daba disambiguator')
    aparser.add_argument('-v', '--verbose', help='Verbose output',
                         default=False, action='store_true')
    aparser.add_argument(
        '-l', '--learn',
        help='Learn model from data (and save as F if provided)',
        default=None)
    aparser.add_argument('-p', '--pos', help='Prediction for POS',
                         default=False, action='store_true')
    aparser.add_argument('-t', '--tone', help='Prediction for tones',
                         default=False, action='store_true')
    aparser.add_argument('-r', '--root', help='Corpus root dir')
    aparser.add_argument('-f', '--filelist',
                         help='Path to a list of files to learn from')
    # aparser.add_argument('-g', '--gloss', help='Prediction for gloses', default=False, action='store_true')
    aparser.add_argument(
        '-e', '--evalsize', type=int, default=10,
        help='Percent of training data with respect to training and test one (default 10)')
    aparser.add_argument(
        '-d', '--disambiguate',
        help='Use model F to disambiguate data, the gloss list will be ordered by the probability growth order',
        default=None)
    aparser.add_argument(
        '--select',
        help='Option that will be taken into account only with the use of -d, which specifies the disambiguation modality is to select only the most likely gloss in each list.',
        action='store_true')
    aparser.add_argument('-i', '--infile', help='Input file (.html)',
                         default=sys.stdin)
    aparser.add_argument('-o', '--outfile', help='Output file (.html)',
                         default=sys.stdout)
    aparser.add_argument(
        '-s', '--store',
        help='Store tagged raw data in file (.csv) for further research purpose',
        default=None)
    args = aparser.parse_args()

    if args.verbose:
        print(args)

    if args.learn:
        # The --gloss option is disabled above, so ``args.gloss`` does not
        # exist; only pos/tone can select what is learned.
        if not (args.pos or args.tone):
            print('Choose pos, tone, gloss or combination of them')
            sys.exit(0)

        print('Make list of files')
        allfiles = []
        with codecs.open(args.filelist, 'r', encoding="utf-8") as filelist:
            for line in filelist:
                allfiles.append(line.strip())
        allsents = []

        # for debugging
        # allfiles = '../corbama/sisoko-daa_ka_kore.dis.html'

        enc = None
        # POS takes precedence over tones below, so the encoder is only
        # indispensable when tones are actually being learned.
        if args.tone:
            try:
                enc = encoder_tones()
            except Exception:
                print(("Error : unable to initialize the tone encoder !"))
                if not args.pos:
                    # Every tone-mode step below needs the encoder.
                    sys.exit(1)

        print('Open files and find features / supervision tags')
        for infile in allfiles:
            if (infile):
                print('-', infile)
                sent = []

                html_parser = FileParser()
                html_parser.read_file(os.path.join(args.root, infile))

                for snum, sentence in enumerate(html_parser.glosses):
                    for tnum, token in enumerate(sentence[2]):
                        if token.type == 'w' or token.type == 'c':
                            if args.pos:
                                tags = '/'.join(token.gloss.ps)
                                wordform = detone(token.gloss.form)
                                sent.append((wordform, tags))
                            elif args.tone:
                                # Why not learn tonal forms containing a
                                # vertical bar? Across the disambiguated
                                # corpora they occur fewer than 10 times,
                                # which seems too rare to really improve the
                                # tonalization model. Nothing in the software
                                # design forbids adding them to the training
                                # data to observe their contribution.
                                if '|' not in token.gloss.form:
                                    [codes, chunks] = enc.differential_encode(
                                        token.token, token.gloss.form)
                                    for chunk, code in zip(chunks, codes):
                                        sent.append((chunk, code))
                            """
                            elif args.gloss:
                                tags += token.gloss.gloss
                                sent.append((token.token, tags))
                            """

                    # NOTE(review): placed at sentence level, resetting the
                    # buffer after every parsed sentence - confirm nesting.
                    if len(sent) > 1:
                        allsents.append(sent)
                    sent = []

        if args.verbose and enc is not None:
            enc.report()

        # Build the training and evaluation sets
        p = (1 - args.evalsize / 100.0)
        train_set, eval_set = sampling(allsents, p)
        print('Split the data in train (', len(train_set),
              ' sentences) / test (', len(eval_set), ' sentences)')

        print('Building classifier (CRF/NLTK)')

        # Initialization
        t1 = time.time()
        if args.tone:
            num_phases = len([False, True]) * len(mode_indicators)
            myzip = zipfile.ZipFile(args.learn + '.zip', 'w')
        else:
            num_phases = 1

        # Training
        for phase in range(num_phases):
            tagger = CRFTagger(verbose=args.verbose,
                               training_opt={'feature.minfreq': 10})
            trainer = pycrfsuite.Trainer(verbose=tagger._verbose)
            trainer.set_params(tagger._training_options)
            if num_phases > 1:
                model_name = args.learn + '.' + str(phase)
            else:
                model_name = args.learn

            # train_set : list(list((str,list(str))))
            for sent in train_set:
                tokens = unzip(sent)[0]
                labels = unzip(sent)[1]
                if num_phases > 1:
                    # Keep only the part of each code handled by this phase.
                    labels = [
                        code_dispatcher(label)[phase] for label in labels
                    ]
                features = [
                    _get_features_customised_for_tones(tokens, i)
                    for i in range(len(tokens))
                ]
                trainer.append(features, labels)
            trainer.train(model=model_name)
            if num_phases > 1:
                # The per-phase model only lives inside the archive.
                myzip.write(model_name)
                os.remove(model_name)
        if num_phases > 1:
            myzip.close()

        print("... done in", get_duration(t1_secs=t1, t2_secs=time.time()))

        # Evaluation
        print('Evaluating classifier')
        # gold_set, predicted_set : list(list((str, str)))
        # input_set, output_gold_set : list(list(str))
        gold_set = eval_set
        input_set = [unzip(sent)[0] for sent in gold_set]
        predicted_set = [list() for sent in gold_set]
        if num_phases > 1:
            myzip = zipfile.ZipFile(args.learn + '.zip', 'r')
        for phase in range(num_phases):
            tagger = CRFTagger(verbose=args.verbose,
                               training_opt={'feature.minfreq': 10})
            trainer = pycrfsuite.Trainer(verbose=tagger._verbose)
            trainer.set_params(tagger._training_options)
            if num_phases > 1:
                model_name = args.learn + '.' + str(phase)
                myzip.extract(model_name)
            else:
                model_name = args.learn
            tagger.set_model_file(model_name)
            for i, sent in enumerate(input_set):
                features = [
                    _get_features_customised_for_tones(sent, j)
                    for j in range(len(sent))
                ]
                labels = tagger._tagger.tag(features)
                if num_phases > 1:
                    labels = [
                        code_dispatcher(label)[phase] for label in labels
                    ]
                tagged_sent = list(zip(sent, labels))
                if not predicted_set[i]:
                    predicted_set[i] = tagged_sent
                else:
                    # Append this phase's labels to those of earlier phases.
                    sent_acc, labels_acc = unzip(predicted_set[i])
                    labels_acc = [
                        label_acc + label
                        for label_acc, label in zip(labels_acc, labels)
                    ]
                    predicted_set[i] = list(zip(sent_acc, labels_acc))
            if num_phases > 1:
                # Remove every extracted per-phase model once it was used.
                os.remove(model_name)
        if num_phases > 1:
            # Close the archive only after all phases were extracted;
            # closing it earlier would make the next ``extract`` fail.
            myzip.close()

        # gold_tokens, predicted_tokens : list((str,str))
        predicted_tokens = list(itertools.chain(*predicted_set))
        if num_phases > 1:
            predicted_tokens = [
                tuple([pair[0], code_resort(pair[1])])
                for pair in predicted_tokens
            ]
        gold_tokens = list(itertools.chain(*gold_set))

        # gold_tokens_eval, predicted_tokens_eval : list(str)
        if args.tone:
            gold_tokens_eval = getTag(gold_tokens)
            predicted_tokens_eval = getTag(predicted_tokens)
        else:
            gold_tokens_eval = gold_tokens
            predicted_tokens_eval = predicted_tokens

        if args.store and args.tone:
            stored_filename = args.store
            csv_export(enc, stored_filename, gold_tokens, predicted_tokens)

        print("Accuracy : {:>5.3f}".format(
            accuracy(gold_tokens_eval, predicted_tokens_eval)))

        if args.verbose and args.store:
            print(("Tagged result is exported in {}".format(args.store)))

    elif args.disambiguate and args.infile and args.outfile:
        # Read the .HTML text
        html_parser = FileParser()
        tagger = CRFTagger()

        if args.pos:
            try:
                tagger.set_model_file(args.disambiguate)
            except IOError:
                # Name the model file that failed, not the input file.
                print("Error : unable to open the model {} !".format(
                    args.disambiguate))
                sys.exit(1)
            try:
                html_parser.read_file(args.infile)
            except IOError:
                print("Error : unable to open the input file {} !".format(
                    args.infile))
                sys.exit(1)

            # Export the disambiguation result as .HTML
            for snum, sentence in enumerate(html_parser.glosses):
                tokens = [token.token for token in sentence[2]]
                features = [
                    _get_features_customised_for_tones(tokens, i)
                    for i in range(len(tokens))
                ]
                tagger._tagger.set(features)
                for tnum, token in enumerate(sentence[2]):
                    options = list()
                    if token.value and len(token.value) > 2:
                        for nopt, option in enumerate(token.value[2]):
                            try:
                                tag = option.ps[0]
                            except IndexError:
                                tag = ''
                            prob = tagger._tagger.marginal(tag, tnum)
                            options.append((prob, option))
                        reordered_probs, reordered_options = unzip(
                            sorted(options, reverse=True))
                        if args.select:
                            # Keep only the option(s) with the top probability.
                            prob_max = reordered_probs[0]
                            reordered_options = tuple([
                                reordered_options[i]
                                for i, p in enumerate(reordered_probs)
                                if p >= prob_max
                            ])
                        html_parser.glosses[snum][1][tnum] = reordered_options

        elif args.tone:
            pass

        try:
            html_parser.write(args.outfile)
        except IOError:
            print("Error : unable to create the output file {}".format(
                args.outfile))

    else:
        aparser.print_help()
        sys.exit(0)
def main():
    """Command-line entry point: learn and evaluate a CRF tagger (Python 2).

    With ``--learn F`` and at least one of ``--pos``/``--tone``/``--gloss``
    the disambiguated corpus files below ``../corbama`` are read, split into
    sentences at ``.``, ``?`` and ``!``, shuffled with a fixed seed, and
    divided into a training and a test part according to ``--evalsize``.
    A ``CRFTagger`` is trained, saved as ``F`` and evaluated on the test
    part. Without ``--learn`` the help text is printed.

    Every ``print`` takes a single, pre-formatted string so the output is the
    same under Python 2 (where the file's other code lives) and Python 3.
    """
    aparser = argparse.ArgumentParser(description='Daba disambiguator')
    # aparser.add_argument('-i', '--infile', help='Input file (.html)', default="sys.stdin")
    # aparser.add_argument('-o', '--outfile', help='Output file (.html)', default="sys.stdout")
    aparser.add_argument('-l', '--learn', help='Learn model from data (and save as F if provided)', default=None)
    aparser.add_argument('-p', '--pos', help='Prediction for POS', default=False, action='store_true')
    aparser.add_argument('-t', '--tone', help='Prediction for tones', default=False, action='store_true')
    aparser.add_argument('-g', '--gloss', help='Prediction for gloses', default=False, action='store_true')
    # type=float: without it a user-supplied value stays a string and the
    # percentage arithmetic below raises TypeError.
    aparser.add_argument('-e', '--evalsize', help='Percent of randomized data to use for evaluation (default 10)', default=10, type=float)
    aparser.add_argument('-v', '--verbose', help='Verbose output', default=False, action='store_true')
    args = aparser.parse_args()

    if args.learn:
        # At least one kind of supervision tag must be requested.
        if not (args.pos or args.tone or args.gloss):
            print('Choose pos, tone, gloss or combination of them')
            exit(0)

        print('Make list of files')
        files1 = list(glob.iglob("../corbama/*/*.dis.html"))
        files2 = list(glob.iglob("../corbama/*.dis.html"))
        # Interleave both listings, then append whatever is left of the
        # longer one; a plain zip() silently drops the unmatched files.
        allfiles = []
        for file1, file2 in zip(files1, files2):
            allfiles.append(file1)
            allfiles.append(file2)
        paired = min(len(files1), len(files2))
        allfiles.extend(files1[paired:])
        allfiles.extend(files2[paired:])
        allsents = []

        print('Open files and find features / supervision tags')
        for infile in allfiles:
            if len(infile):
                print('- %s' % infile)
                sent = []
                in_handler = formats.HtmlReader(infile, compatibility_mode=False)
                for token in in_handler:
                    if token.type == 'w' or token.type == 'c':
                        tags = ''
                        if args.pos:
                            for ps in token.gloss.ps:
                                tags += ps
                        if args.tone:
                            tags += token.gloss.form.encode('utf-8')
                        if args.gloss:
                            tags += token.gloss.gloss.encode('utf-8')
                        sent.append((token.token, tags))
                    if token.type == 'c' and token.token in ['.', '?', '!']:
                        if len(sent) > 1:
                            allsents.append(sent)
                        # NOTE(review): the buffer is reset at every sentence
                        # terminator; the collapsed source does not show
                        # whether this was nested in the ``if`` - confirm.
                        sent = []

        datalength = len(allsents)
        p = (1 - args.evalsize / 100.0)
        print('Randomize and split the data in train ( %d  sentences) / test ( %d  sentences)'
              % (int(p * datalength), int(datalength - p * datalength)))
        # Fixed seed keeps the train/test split reproducible between runs.
        random.seed(123456)
        random.shuffle(allsents)
        train_set = allsents[:int(p * datalength)]
        test_set = allsents[int(p * datalength):datalength]

        print('Building classifier (CRF/NLTK)')
        tagger = CRFTagger(verbose=args.verbose, training_opt={'feature.minfreq': 10})
        t1 = time.time()
        tagger.train(train_set, args.learn)
        t2 = time.time()
        texec = t2 - t1
        # gmtime, not localtime: ``texec`` is a duration, and localtime would
        # add the local timezone offset to the displayed hours.
        print("... done in %s" % time.strftime('%H %M %S', time.gmtime(texec)))

        print('Evaluating classifier')
        print(tagger.evaluate(test_set))

        if args.verbose:
            print('Compute detailed output')

    else:
        print('USE...')
        aparser.print_help()

    exit(0)
def train(self, load_model=None):
    """Fit a new ``CRFTagger`` on ``self.train_set`` and keep it in ``self.model``.

    The feature function comes from ``self._gen_ftr_func()`` and the trained
    model is saved to the file ``'word_crf_model'``. ``load_model`` is part
    of the signature but is not used.

    Returns:
        ``self``, so calls can be chained.
    """
    feature_func = self._gen_ftr_func()
    crf_options = {"num_memories": 500, "delta": 1e-8}
    tagger = CRFTagger(feature_func, verbose=False, training_opt=crf_options)
    self.model = tagger
    tagger.train(self.train_set, 'word_crf_model')
    return self
def main():
    """Command-line entry point of the Tonalizer (Python 2 code).

    Exactly one of three modes is executed, checked in this order:

    * ``--undiacritize``: delegate to ``fileReader.read2(infile, outfile)``.
    * ``--learn FILE``: read the text from ``--infile``, encode every token
      with ``encoder_tones.differential_encode``, split the sentences into a
      training and an evaluation part (``--evalsize`` percent), train a
      pycrfsuite model saved as ``FILE``, then tag the evaluation part and
      print the accuracy (optionally exporting a CSV via ``--store``).
    * ``--diacritize FILE``: load model ``FILE``, tag the text from
      ``--infile`` segment by segment and write the decoded forms to
      ``--outfile``.

    If none of the three options is given, an error and the help text are
    printed and the process exits with status 0.

    NOTE(review): this block was recovered from a whitespace-collapsed
    source; statement nesting is the most plausible reading and should be
    confirmed against the original file.
    NOTE(review): ``chunkmode`` is used below but the ``--chunkmode`` option
    is commented out; presumably it is a module-level name - confirm.
    """
    aparser = argparse.ArgumentParser(
        description=u'Tonalizer - CRF-based Tone Reconstitution Tool')
    aparser.add_argument('-v', '--verbose', help='Verbose output', default=False, action='store_true')
    aparser.add_argument(
        '-l', '--learn',
        help='Learn model from diacritized text (and save as file if provided)',
        default=None, type=lambda s: unicode(s, 'utf8'))
    aparser.add_argument(
        '-e', '--evalsize',
        help='Percent of training data with respect to training and test one (default 10)',
        default=10, type=float)
    #aparser.add_argument('-c', '--chunkmode', help='Word segmentation width (default 3)', default=3, type=int)
    aparser.add_argument('-d', '--diacritize', help='Use model file to diacritize a raw text', default=None)
    aparser.add_argument('-u', '--undiacritize', help='Undiacritize a raw text', default=False, action='store_true')
    aparser.add_argument('-f', '--filtering', help='Keep only one insertion for one poistion', default=False, action='store_true')
    aparser.add_argument('-m', '--markers', help='Custumed set of markers to learn', default=None, type=lambda s: unicode(s, 'utf8'))
    aparser.add_argument('-i', '--infile', help='Input file (.txt)', default=sys.stdin, type=lambda s: unicode(s, 'utf8'))
    aparser.add_argument('-o', '--outfile', help='Output file (.txt)', default=sys.stdout, type=lambda s: unicode(s, 'utf8'))
    aparser.add_argument(
        '-s', '--store',
        help='Store evaluation result in file (.csv), effective only in learning mode',
        default=None, type=lambda s: unicode(s, 'utf8'))
    args = aparser.parse_args()

    # A mode is mandatory; without one only the usage is shown.
    if not (args.learn or args.diacritize or args.undiacritize):
        print 'Error : choose -learn, -diacritize or -undiacritize !'
        aparser.print_help()
        exit(0)

    if args.verbose:
        # Echo every parsed argument; the type is only shown for truthy values.
        print 'Arguments received by script'
        dico = vars(args)
        for key, val in dico.items():
            typeName = type(val).__name__
            sys.stdout.write(u"\t{} = {} ".format(key, val))
            if val:
                sys.stdout.write(u"({})".format(typeName))
            print ""

    if args.undiacritize:
        fr = fileReader.fileReader(args.markers)
        fr.read2(args.infile, args.outfile)

    elif args.learn:
        fr = fileReader.fileReader(args.markers)

        allsents = []
        print 'Making observation data from diacritized text'
        for sentence in fr.read(args.infile):
            sent = []
            for token in sentence:
                sent.append((token[0], token[1].encode('utf-8')))
            # Sentences with fewer than two tokens are discarded.
            if len(sent) > 1:
                allsents.append(sent)

        print 'Word segmentation and diacritic informaiotn compression'
        enc = encoder_tones()
        allsents2 = allsents
        allsents = []
        for sent in allsents2:
            sent2 = []
            for token_tags in sent:
                token, tags = token_tags
                [codes, syllabes] = enc.differential_encode(token, tags.decode('utf-8'), chunkmode)
                # Each token becomes a list of (segment, utf-8 encoded code) pairs.
                token2 = [(syllabe, code.encode('utf-8')) for syllabe, code in zip(syllabes, codes)]
                sent2.append(token2)
            allsents.append(sent2)

        if args.verbose:
            enc.report()

        # Share of the data used for training (evalsize is a percentage).
        p = (1 - args.evalsize / 100.0)
        train_set, eval_set = sampling(allsents, p)
        print 'Split the data in train (', len(
            train_set), ' sentences) / test (', len(eval_set), ' sentences)'

        print 'Building classifier (pyCRFsuite)'

        # Initialization
        t1 = time.time()

        # A.1. Initialize a new CRF trainer
        # The CRFTagger instance is only used here as a carrier of the
        # verbosity flag and training options handed to pycrfsuite.
        tagger = CRFTagger(verbose=args.verbose, training_opt={'feature.minfreq': 10})
        trainer = pycrfsuite.Trainer(verbose=tagger._verbose)
        trainer.set_params(tagger._training_options)

        # A.2. Prepare training set
        for sent in train_set:
            [tokens, labels] = make_tokens_from_sentence(sent, True)
            features = make_features_from_tokens(tokens, True)
            # The labels from make_tokens_from_sentence are replaced by the
            # (optionally filtered) sub tone codes, flattened to one list.
            labels = get_sub_tone_code_of_sentence(sent, sel_en=args.filtering)
            labels = list(itertools.chain(*labels))
            trainer.append(features, labels)
        trainer.train(args.learn.encode('utf-8'))

        print "... done in", get_duration(t1_secs=t1, t2_secs=time.time())

        # B. Evaluation
        print 'Evaluating classifier'
        gold_set = eval_set
        predicted_set_acc = list()

        # B.1. Load trained model
        tagger = CRFTagger(verbose=args.verbose, training_opt={'feature.minfreq': 10})
        # NOTE(review): this trainer is configured but not used afterwards
        # within this block.
        trainer = pycrfsuite.Trainer(verbose=tagger._verbose)
        trainer.set_params(tagger._training_options)
        tagger.set_model_file(args.learn.encode('utf-8'))

        # B.2 Tagging segment by segment
        predicted_set = list()
        # Note: ``p`` is reused as the loop index here; the split ratio
        # computed above is no longer needed at this point.
        for p, sent in enumerate(gold_set):
            [tokens, gold_labels] = make_tokens_from_sentence(sent, True)
            features = make_features_from_tokens(tokens, True)
            labels = tagger._tagger.tag(features)
            labels = reshape_tokens_as_sentnece(labels, sent)

            predicted_tokens = list()
            for i, token in enumerate(sent):
                predicted_tokens.append(map(list, zip(tokens[i], labels[i])))
            predicted_set.append(predicted_tokens)

        # B.3 Assemble segements to get annotated token
        # NOTE(review): assumed to run once after the tagging loop - confirm.
        if not predicted_set_acc:
            # Start from an accumulator of empty [segment, code] pairs with
            # the same nesting as predicted_set.
            predicted_set_acc = \
                [[[['',''] for syllabe in token] for token in sent] for sent in predicted_set]

        predicted_set_acc = accumulate_tone_code_of_dataset(
            predicted_set_acc, predicted_set)
        predicted_set = predicted_set_acc

        if args.filtering:
            gold_set = apply_filter_to_base_element(gold_set, sel_en=args.filtering)

        print "Accuracy : {:>5.3f}".format(
            accuray2(gold_set, predicted_set, True))

        if args.store:
            stored_filename = args.store
            csv_export(stored_filename, gold_set, predicted_set, True)

        if args.verbose and args.store:
            print("Tagged result is exported in {}".format(
                args.store.encode('utf-8')))

    elif args.diacritize and args.infile and args.outfile:
        t1 = time.time()

        # todo : store and load chunkmode value

        # A.1. Load a CRF tagger
        tagger = CRFTagger()
        tagger.set_model_file(args.diacritize.encode('utf-8'))

        # Making observation data from undiacritized text
        fr = fileReader.fileReader(args.markers)
        allsents = []
        print 'Making observation data from diacritized text'
        # non-processed token -> non-processed sentence
        for sentence in fr.read(args.infile):
            sent = []
            for token in sentence:
                sent.append(token[1])  # token[1] : non-processed token from a undiacritized text
            # Unlike in learning mode, sentences are kept whatever their length.
            # NOTE(review): the collapsed source shows a commented-out
            # ``if len(sent) > 1:`` right before this append; the append is
            # assumed to be live code - confirm.
            #if len(sent) > 1:
            allsents.append(sent)

        # Word segmentation
        enc = encoder_tones()
        allsents2 = allsents
        allsents = []
        for sent in allsents2:
            sent2 = []
            for token in sent:
                # here, we use encode as a simple chunker to get segment level
                [NONE, chunks] = enc.differential_encode(token, token, chunkmode)
                # put (chunk,chunk) instead of chunk to fit the input format of "make_tokens_from_sentence"
                token2 = [(chunk, chunk) for chunk in chunks]
                sent2.append(token2)
            allsents.append(sent2)

        # A.2 Tagging segment by segment
        predicted_set = list()
        for p, sent in enumerate(allsents):
            [tokens, NONE] = make_tokens_from_sentence(sent, True)
            features = make_features_from_tokens(tokens, True)
            labels = tagger._tagger.tag(features)
            if args.verbose:
                # Progress indicator: index of the sentence just tagged.
                sys.stdout.write(u"{}/{}\n".format(p, len(allsents)))
            labels = reshape_tokens_as_sentnece(labels, sent)

            predicted_tokens = list()
            for i, token in enumerate(sent):
                predicted_tokens.append(map(list, zip(tokens[i], labels[i])))
            predicted_set.append(predicted_tokens)

        # simple raw file writer
        # Segments found in this string are written unchanged instead of
        # being decoded with their predicted code.
        cara_to_ignore = \
            fr.get_cat_startwith('Zl') + \
            fr.get_cat_startwith('Zp') + \
            fr.get_cat_startwith('Zs') + u'\n' + \
            fr.get_cat_startwith('Pi') + \
            fr.get_cat_startwith('Pf') + \
            fr.get_cat_startwith('Po')

        enc = encoder_tones()
        with fileReader.utf8_open(args.outfile, 'w') as fidout:
            for sent in predicted_set:
                for token in sent:
                    form = u''
                    for syllabe in token:
                        #if type(syllabe[0]) == type(cara_to_ignore) :
                        #    print "good syllable type"
                        #else :
                        #    print "bad syllable type"
                        # syllabe[0], syllabe[1] -> token by chunk, label by chunk
                        if syllabe[0] in cara_to_ignore:
                            form += syllabe[0]
                        else:
                            form += enc.differential_decode(
                                syllabe[0], syllabe[1].decode('utf-8'))
                    fidout.write(form)
                #fidout.write('\n')

        print u"... done in", get_duration(t1_secs=t1, t2_secs=time.time())
def main():
    """Command-line entry point of the Daba CRF disambiguator.

    Two modes are selected from the command line:

    * ``--learn F`` with ``--pos`` or ``--tone``: read the corpus files listed
      in ``--filelist`` (relative to ``--root``), split the collected
      sentences into a training and an evaluation set, train a CRF model
      saved as ``F`` (in tone mode one model per phase, archived in
      ``F.zip``) and print the accuracy on the evaluation set.  When both
      ``--pos`` and ``--tone`` are given, POS tags are used as supervision.
    * ``--disambiguate F``: with ``--pos``, load model ``F`` and reorder the
      gloss options of every token of ``--infile`` by decreasing probability
      of their first POS tag, then write the document to ``--outfile``.

    With no usable combination of options the help text is printed.  The
    function ends by calling ``exit``: status 0 normally, status 1 when the
    model, the input file or the tone encoder cannot be loaded.
    """
    aparser = argparse.ArgumentParser(description='Daba disambiguator')
    aparser.add_argument('-v', '--verbose', help='Verbose output', default=False, action='store_true')
    aparser.add_argument('-l', '--learn', help='Learn model from data (and save as F if provided)', default=None)
    aparser.add_argument('-p', '--pos', help='Prediction for POS', default=False, action='store_true')
    aparser.add_argument('-t', '--tone', help='Prediction for tones', default=False, action='store_true')
    aparser.add_argument('-r', '--root', help='Corpus root dir')
    aparser.add_argument('-f', '--filelist', help='Path to a list of files to learn from')
    # aparser.add_argument('-g', '--gloss', help='Prediction for gloses', default=False, action='store_true')
    aparser.add_argument('-e', '--evalsize', type=int, default=10,
                         help='Percent of training data with respect to training and test one (default 10)')
    aparser.add_argument('-d', '--disambiguate', help='Use model F to disambiguate data, the gloss list will be ordered by the probability growth order', default=None)
    aparser.add_argument('--select', help = 'Option that will be taken into account only with the use of -d, which specifies the disambiguation modality is to select only the most likely gloss in each list.', action='store_true')
    aparser.add_argument('-i', '--infile' , help='Input file (.html)' , default=sys.stdin)
    aparser.add_argument('-o', '--outfile', help='Output file (.html)', default=sys.stdout)
    aparser.add_argument('-s', '--store', help='Store tagged raw data in file (.csv) for further research purpose', default=None)

    args = aparser.parse_args()
    if args.verbose:
        print(args)

    # The --gloss option is commented out above, so ``args.gloss`` may not
    # exist; getattr avoids an AttributeError while staying compatible with a
    # parser that re-enables the option.
    target_chosen = args.pos or args.tone or getattr(args, 'gloss', False)

    if args.learn and not target_chosen and not args.disambiguate:
        print('Choose pos, tone, gloss or combination of them')
        exit(0)

    if args.learn and target_chosen:

        print('Make list of files')
        with codecs.open(args.filelist, 'r', encoding="utf-8") as filelist:
            allfiles = [line.strip() for line in filelist]
        allsents = []

        # for debugging
        # allfiles = '../corbama/sisoko-daa_ka_kore.dis.html'

        enc = None
        if args.tone:
            try:
                enc = encoder_tones()
            except Exception:
                # Tone learning cannot proceed without the encoder: fail
                # here instead of crashing later on ``None``.
                print("Error : unable to initialize the tone encoder !")
                exit(1)

        print('Open files and find features / supervision tags')
        for infile in allfiles:
            if not infile:
                continue
            print('- ' + infile)
            sent = []

            html_parser = FileParser()
            html_parser.read_file(os.path.join(args.root, infile))

            for sentence in html_parser.glosses:
                for token in sentence[2]:
                    if token.type != 'w' and token.type != 'c':
                        continue
                    if args.pos:
                        tags = '/'.join(token.gloss.ps).encode('utf-8')
                        wordform = detone(token.gloss.form)
                        sent.append((wordform, tags))
                    elif args.tone:
                        # Why not learn tonal forms containing a vertical
                        # bar?  Across all disambiguated corpora they occur
                        # fewer than 10 times, which seems too rare to bring
                        # a real improvement to tone modelling.  Nothing in
                        # the framework prevents including them in the
                        # training data to observe their contribution.
                        if '|' not in token.gloss.form:
                            codes, chunks = enc.differential_encode(token.token, token.gloss.form)
                            for chunk, code in zip(chunks, codes):
                                try:
                                    sent.append((chunk, code.encode('utf-8')))
                                except LookupError:
                                    # best effort: skip chunks whose code
                                    # cannot be encoded
                                    pass
                    # Disabled gloss supervision, kept for reference:
                    #   elif args.gloss:
                    #       tags += token.gloss.gloss.encode('utf-8')
                    #       sent.append((token.token, tags))

                # Only sequences longer than one item are kept; a shorter one
                # is carried over and merged into the next sentence.
                if len(sent) > 1:
                    allsents.append(sent)
                    sent = []

        if args.verbose and args.tone:
            enc.report()

        # Build the training and evaluation sets
        p = (1 - args.evalsize / 100.0)
        train_set, eval_set = sampling(allsents, p)
        print('Split the data in train ( {}  sentences) / test ( {}  sentences)'.format(
            len(train_set), len(eval_set)))

        print('Building classifier (CRF/NLTK)')
        # Initialization
        t1 = time.time()
        if args.tone:
            num_phases = len([False, True]) * len(mode_indicators)
            myzip = zipfile.ZipFile(args.learn + '.zip', 'w')
        else:
            num_phases = 1

        # Training: one CRF model per phase
        for phase in range(num_phases):
            tagger = CRFTagger(verbose = args.verbose, training_opt = {'feature.minfreq' : 10})
            trainer = pycrfsuite.Trainer(verbose = tagger._verbose)
            trainer.set_params(tagger._training_options)
            if num_phases > 1:
                model_name = args.learn + '.' + str(phase)
            else:
                model_name = args.learn

            # train_set : list(list((str,list(str))))
            for sent in train_set:
                columns = unzip(sent)
                tokens = columns[0]
                labels = columns[1]
                if num_phases > 1:
                    # keep only the part of each code handled by this phase
                    labels = [code_dispatcher(label.decode('utf-8'))[phase].encode('utf-8') for label in labels]
                features = [_get_features_customised_for_tones(tokens, i) for i in range(len(tokens))]
                trainer.append(features, labels)
            trainer.train(model = model_name)
            if num_phases > 1:
                # the per-phase model only survives inside the archive
                myzip.write(model_name)
                os.remove(model_name)
        if num_phases > 1:
            myzip.close()

        print("... done in {}".format(get_duration(t1_secs=t1, t2_secs=time.time())))

        # Evaluation
        print('Evaluating classifier')
        # gold_set, predicted_set : list(list((str, str)))
        # input_set, output_gold_set : list(list(str))
        gold_set = eval_set
        input_set = [unzip(sent)[0] for sent in gold_set]
        predicted_set = [list() for sent in gold_set]
        if num_phases > 1:
            myzip = zipfile.ZipFile(args.learn + '.zip', 'r')
        for phase in range(num_phases):
            tagger = CRFTagger(verbose=args.verbose, training_opt={'feature.minfreq' : 10})
            if num_phases > 1:
                model_name = args.learn + '.' + str(phase)
                myzip.extract(model_name)
            else:
                model_name = args.learn
            tagger.set_model_file(model_name)
            for i, sent in enumerate(input_set):
                features = [_get_features_customised_for_tones(sent, j) for j in range(len(sent))]
                labels = tagger._tagger.tag(features)
                if num_phases > 1:
                    labels = [code_dispatcher(label.decode('utf-8'))[phase].encode('utf-8') for label in labels]
                if not predicted_set[i]:
                    predicted_set[i] = list(zip(sent, labels))
                else:
                    # concatenate this phase's labels to those already
                    # accumulated for the sentence
                    sent_acc, labels_acc = unzip(predicted_set[i])
                    labels_acc = [label_acc + label for label_acc, label in zip(labels_acc, labels)]
                    predicted_set[i] = list(zip(sent_acc, labels_acc))
            if num_phases > 1:
                os.remove(model_name)
        if num_phases > 1:
            # closed only after every phase has been extracted
            myzip.close()

        # gold_tokens, predicted_tokens : list((str,str))
        predicted_tokens = list(itertools.chain(*predicted_set))
        if num_phases > 1:
            predicted_tokens = [
                tuple([pair[0], code_resort(pair[1].decode('utf-8')).encode('utf-8')])
                for pair in predicted_tokens]
        gold_tokens = list(itertools.chain(*gold_set))

        # gold_tokens_eval, predicted_tokens_eval : list(str)
        if args.tone:
            gold_tokens_eval = getTag(gold_tokens)
            predicted_tokens_eval = getTag(predicted_tokens)
        else:
            gold_tokens_eval = gold_tokens
            predicted_tokens_eval = predicted_tokens

        if args.store and args.tone:
            stored_filename = args.store
            csv_export(enc, stored_filename, gold_tokens, predicted_tokens)

        print("Accuracy : {:>5.3f}".format(accuracy(gold_tokens_eval, predicted_tokens_eval)))

        if args.verbose and args.store:
            print("Tagged result is exported in {}".format(args.store))

    elif args.disambiguate and args.infile and args.outfile:
        # Read the .HTML text
        html_parser = FileParser()
        tagger = CRFTagger()

        if args.pos:
            try:
                tagger.set_model_file(args.disambiguate)
            except IOError:
                # report the model path, not the input file
                print("Error : unable to open the model {} !".format(args.disambiguate))
                exit(1)
            try:
                html_parser.read_file(args.infile)
            except IOError:
                print("Error : unable to open the input file {} !".format(args.infile))
                exit(1)

            # Export the disambiguation result as .HTML
            for snum, sentence in enumerate(html_parser.glosses):
                tokens = [token.token for token in sentence[2]]
                features = [_get_features_customised_for_tones(tokens, i) for i in range(len(tokens))]
                tagger._tagger.set(features)
                for tnum, token in enumerate(sentence[2]):
                    if not (token.value and len(token.value) > 2):
                        continue
                    options = list()
                    for option in token.value[2]:
                        try:
                            tag = option.ps[0]
                        except IndexError:
                            tag = ''
                        prob = tagger._tagger.marginal(tag, tnum)
                        options.append((prob, option))
                    if not options:
                        # nothing to reorder; unzip() of an empty list
                        # could not be unpacked
                        continue
                    reordered_probs, reordered_options = unzip(sorted(options, reverse = True))
                    if args.select:
                        # keep only the option(s) with the highest probability
                        prob_max = reordered_probs[0]
                        reordered_options = tuple([
                            reordered_options[i]
                            for i, p in enumerate(reordered_probs) if p >= prob_max])
                    html_parser.glosses[snum][1][tnum] = reordered_options

        elif args.tone:
            pass

        try:
            html_parser.write(args.outfile)
        except IOError:
            print("Error : unable to create the output file {}".format(args.outfile))

    else:
        aparser.print_help()
    exit(0)
# Accuracy of the HMM predictions: share of positions where the predicted
# tag equals the reference tag.
y = np.array(y)
y_hat = np.array(y_hat)
print("hmm acc : ", (y == y_hat).mean())

# named entities recognition
import pickle

# NOTE(review): unpickling runs arbitrary code; only load this file from a
# trusted location.  The handle is closed as soon as the object is loaded.
with open("/users/Etu0/3770640/M1/Sem2/TAL/TME1/maxent_ne_chunker/PY3/english_ace_multiclass.pickle", "rb") as chunker_file:
    a = pickle.load(chunker_file)

from nltk.tag.crf import CRFTagger

tagger = CRFTagger()
# the second argument is the file in which the trained model is stored
tagger.train(alldocs, u'crf.model')
# tag() expects a list of tokens, not a one-element list holding a sentence
tagger.tag('Je suis à la maison'.split())
print(tagger._get_features([u"Je"], 0))

from nltk.tag.perceptron import PerceptronTagger

tagger = PerceptronTagger(load=False)
tagger.train(alldocs)

# adT_seq: list of lists of words (= list of sentences)
# "smart": tag each sentence as a whole; "stupid": tag every word in isolation
allpred_smart = [[t for w, t in tagger.tag(sentence)] for sentence in adT_seq]
allpred_stupid = [[tagger.tag([w])[0][1] for w in sentence] for sentence in adT_seq]
def main():
    """Train and evaluate a CRF tagger on the disambiguated Bambara corpus.

    With ``--learn F`` and at least one of ``--pos``, ``--tone``, ``--gloss``
    the function reads every ``*.dis.html`` file found under ``../corbama``
    (top level and one directory below), builds one tagged sentence per run
    of tokens ending in ``.``, ``?`` or ``!``, shuffles the sentences with a
    fixed seed, trains a CRF model saved as ``F`` on the training share and
    prints the accuracy measured on the remaining ``--evalsize`` percent.

    Without ``--learn`` a short usage notice is printed and the process
    exits with status 0.
    """
    aparser = argparse.ArgumentParser(description='Daba disambiguator')
    # aparser.add_argument('-i', '--infile', help='Input file (.html)', default="sys.stdin")
    # aparser.add_argument('-o', '--outfile', help='Output file (.html)', default="sys.stdout")
    aparser.add_argument('-l', '--learn', help='Learn model from data (and save as F if provided)', default=None)
    aparser.add_argument('-p', '--pos', help='Prediction for POS', default=False, action='store_true')
    aparser.add_argument('-t', '--tone', help='Prediction for tones', default=False, action='store_true')
    aparser.add_argument('-g', '--gloss', help='Prediction for gloses', default=False, action='store_true')
    # type=int: without it a value given on the command line stays a string
    # and the percentage arithmetic below raises TypeError
    aparser.add_argument('-e', '--evalsize', type=int, help='Percent of randomized data to use for evaluation (default 10)', default=10)
    aparser.add_argument('-v', '--verbose', help='Verbose output', default=False, action='store_true')

    args = aparser.parse_args()

    if args.learn:

        # Parenthesised on purpose: "not a or b or c" rejected every run that
        # selected tone or gloss.
        if not (args.pos or args.tone or args.gloss):
            print('Choose pos, tone, gloss or combination of them')
            exit(0)

        print('Make list of files')
        # Concatenate both listings; zipping them dropped every file beyond
        # the length of the shorter listing.
        allfiles = glob.glob("../corbama/*/*.dis.html") + glob.glob("../corbama/*.dis.html")
        allsents = []

        print('Open files and find features / supervision tags')
        for infile in allfiles:
            if not infile:
                continue
            print('- ' + infile)
            sent = []
            in_handler = formats.HtmlReader(infile, compatibility_mode=False)
            for token in in_handler:
                if token.type == 'w' or token.type == 'c':
                    # The supervision tag is the concatenation of every
                    # requested layer.
                    # NOTE(review): the POS part is appended as-is while form
                    # and gloss are UTF-8 encoded; confirm the types mix
                    # safely when several layers are combined.
                    tags = ''
                    if args.pos:
                        for ps in token.gloss.ps:
                            tags += ps
                    if args.tone:
                        tags += token.gloss.form.encode('utf-8')
                    if args.gloss:
                        tags += token.gloss.gloss.encode('utf-8')
                    sent.append((token.token, tags))
                if token.type == 'c' and token.token in ['.', '?', '!']:
                    # sentence boundary: keep sequences longer than one token
                    if len(sent) > 1:
                        allsents.append(sent)
                        sent = []

        datalength = len(allsents)
        p = (1-args.evalsize/100.0)
        # Report the sizes actually produced by the slicing below
        n_train = int(p*datalength)
        print('Randomize and split the data in train ( {}  sentences) / test ( {}  sentences)'.format(
            n_train, datalength - n_train))
        random.seed(123456)
        random.shuffle(allsents)
        train_set = allsents[:n_train]
        test_set = allsents[n_train:datalength]

        print('Building classifier (CRF/NLTK)')
        tagger = CRFTagger(verbose = args.verbose, training_opt = {'feature.minfreq' : 10})
        t1 = time.time()
        tagger.train(train_set, args.learn)
        t2 = time.time()
        texec = t2-t1
        # gmtime, not localtime: texec is a duration, so no timezone offset
        # must be added to it
        print("... done in " + time.strftime('%H %M %S', time.gmtime(texec)))

        print('Evaluating classifier')
        # the score used to be computed and thrown away
        score = tagger.evaluate(test_set)
        print("Accuracy : {:>5.3f}".format(score))

        if args.verbose:
            print('Compute detailed output')

    else:
        print('USE...')
        exit(0)
def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator`` as a float, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def _yes_scores(labels, predictions):
    """Return (precision, recall, F1) of the "Yes" class over paired labels and predictions."""
    pairs = list(zip(labels, predictions))
    true_pos = sum(1 for gold, pred in pairs if 'Yes' in gold and 'Yes' in pred)
    predicted_pos = sum(1 for _, pred in pairs if 'Yes' in pred)
    actual_pos = sum(1 for gold, _ in pairs if 'Yes' in gold)
    precision = _safe_ratio(true_pos, predicted_pos)
    recall = _safe_ratio(true_pos, actual_pos)
    f1 = _safe_ratio(2 * precision * recall, precision + recall)
    return precision, recall, f1


def main(positive, death):
    """Train CRF taggers on annotated tweets and report their scores.

    Args:
        positive: path of a JSON-lines file; every object must provide
            ``text`` and ``annotation['part1.Response']``.
        death: path of a second JSON-lines file with the same layout.

    The tweets are cleaned, lemmatised and labelled token by token
    (``P-Yes``/``P-No`` for test-related keywords, ``D-Yes``/``D-No`` for
    death-related keywords, ``Irr`` otherwise).  One CRF model is trained per
    feature function; its predictions on the dev split are written to
    ``<condition>_final_output.tsv`` and then scored.  Results are printed to
    standard output; nothing is returned.
    """
    from collections import Counter

    ############# Compile the dataset ###############
    ## Load the dataset
    text = []
    response = []
    for path in (positive, death):
        # the reader is closed as soon as the file has been consumed
        with jsonlines.open(path) as input_file:
            for obj in input_file:
                text.append(obj['text'])
                response.append(obj['annotation']['part1.Response'])

    ## Tweet Preprocessing
    prep_text = [p.clean(tweet) for tweet in text]

    ## Tag Keywords and Create Labels
    ### Focus on verbs--therefore, try lemmatization first
    wnl = WordNetLemmatizer()
    token_data = []
    for sent in prep_text:
        token_data.append([
            wnl.lemmatize(word, tag[0].lower())
            if tag[0].lower() in ['a', 'n', 'v'] else wnl.lemmatize(word)
            for word, tag in pos_tag(word_tokenize(sent))
        ])

    ### Create labels
    positive_list = ["test", "positive", "result"]
    death_list = ["die", "dead", "death", "pass", "away"]
    for sent, answer in zip(token_data, response):
        for idx, token in enumerate(sent):
            lowered = token.lower()
            if lowered in positive_list and answer == ["yes"]:
                label = "P-Yes"
            elif lowered in positive_list and answer == ["no"]:
                label = "P-No"
            elif lowered in death_list and answer == ["yes"]:
                label = "D-Yes"
            elif lowered in death_list and answer == ["no"]:
                label = "D-No"
            else:
                label = "Irr"
            # each token becomes a [token, label] pair, in place
            sent[idx] = [token, label]

    ## Shuffle and split into train data and dev data
    token_data = shuffle(token_data, random_state=6)
    train_data, dev_data = train_test_split(token_data,
                                            test_size=0.3,
                                            random_state=616)
    print(
        f"The number of sentences in training data: {len(train_data)}; The number of sentences in dev data: {len(dev_data)};"
    )

    ############# Fit A CRF Model And Predict ###############
    condition_to_func = {
        "base": my_features,
        "include_neighbors": neighbor_features
    }
    for cond, func in condition_to_func.items():
        # initialize
        crf = CRFTagger(feature_func=func)
        crf.train(train_data, 'model.tagger')

        # Output: one "token<TAB>gold<TAB>prediction" line per token
        filename = cond + "_final_output.tsv"
        with open(filename, 'w', encoding="utf-8") as pred_file:
            for sent in dev_data:
                sent_words = [item[0] for item in sent]
                gold_tags = [item[1] for item in sent]
                with_tags = crf.tag(sent_words)
                for gold, (original_word, tag_prediction) in zip(gold_tags, with_tags):
                    pred_file.write(f"{original_word}\t{gold}\t{tag_prediction}\n")
                # add an empty line after each sentence
                pred_file.write("\n")

    ############# Evaluation ###############
    # Label counts do not depend on the condition: compute them once.  Only
    # the label column is counted, which also tolerates empty sentences.
    train_label_counts = Counter(tag for sent in train_data for _, tag in sent)

    ## Extract Data with Meaning Labels
    cond_list = ['base', 'include_neighbors']
    covid_event = ['Test Positive', 'Death Case']
    column_name = ['token', 'label', 'prediction']
    for cond in cond_list:
        filename = cond + "_final_output.tsv"
        D_data = []
        P_data = []
        with open(filename, encoding="utf-8") as fd:
            rd = csv.reader(fd, delimiter="\t", quotechar='"')
            for row in rd:
                # blank separator lines give rows shorter than two cells
                if len(row) > 1:
                    if row[1] in ['P-Yes', 'P-No']:
                        P_data.append(row)
                    elif row[1] in ['D-Yes', 'D-No']:
                        D_data.append(row)
        P_df = pd.DataFrame(P_data, columns=column_name)
        D_df = pd.DataFrame(D_data, columns=column_name)
        # DataFrame.append was removed in pandas 2.0
        Total_df = pd.concat([P_df, D_df])

        # Accuracy
        ## Overall Accuracy
        T_a = accuracy_score(Total_df['label'], Total_df['prediction'])

        ## Accuracy, Precision, Recall and F-1 for the two events
        accuracy = []
        precision = []
        recall = []
        f1 = []
        for df in [P_df, D_df]:
            accuracy.append(accuracy_score(df['label'], df['prediction']))
            # every row is scored (the last one used to be skipped) and
            # empty denominators give 0.0 instead of ZeroDivisionError
            event_precision, event_recall, event_f1 = _yes_scores(
                df['label'], df['prediction'])
            precision.append(event_precision)
            recall.append(event_recall)
            f1.append(event_f1)

        # Report performance
        print("condition: " + cond)
        print(f"Overall Accuracy {T_a:0.03}")
        for num, event in enumerate(covid_event):
            print(
                f"Scores for {event} : \taccuracy {accuracy[num]:0.03}\tprecision {precision[num]:0.03}\trecall {recall[num]:0.03}\tF1 {f1[num]:0.03}"
            )

        ## Baseline Performance / Confusion Matrix
        print("Confusion Matrix:")
        print(pd.crosstab(Total_df['label'], Total_df['prediction']))
        print("Training data:")
        for label in ["P-Yes", "P-No", "D-Yes", "D-No"]:
            n_label = train_label_counts[label]
            print(f"Number of {label}: {n_label}")