Example #1
 def lookup_gloss(self, gloss, gdict):
     "Gloss, Dictionary -> tuple(Gloss)"
     lookup_form = None
     try:
         if self.detone:
             bare = detone(gloss.form)
             if bare in gdict:
                 lookup_form = bare
         else:
             if gloss.form in gdict:
                 lookup_form = gloss.form
             else:
                 bare = detone(gloss.form)
                 if not gloss.form == bare and bare in gdict:
                     lookup_form = bare
         if lookup_form:
             pattern = emptyGloss._replace(ps=gloss.ps, gloss=gloss.gloss)
             return tuple([dgloss for dgloss in gdict[lookup_form] if dgloss.matches(pattern)])
         else:
             return ()
     except (KeyError, AttributeError):
         if gloss.form in gdict:
             print "PP", gloss.form, gdict[gloss.form]
         else:
             print "PN", gloss.form
         return ()
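
For context, here is a minimal sketch of the data this method operates on. It assumes, from the code alone rather than a checked API, that Gloss is daba's namedtuple with the fields form, ps, gloss and morphemes (the real class additionally provides the matches() method used above), and that gdict maps word forms to lists of Gloss entries:

from collections import namedtuple

# hypothetical stand-ins reconstructed from the example above
Gloss = namedtuple('Gloss', 'form ps gloss morphemes')
emptyGloss = Gloss(form='', ps=(), gloss='', morphemes=())

# gdict maps a (possibly detoned) form to its dictionary glosses, e.g.
# gdict = {u'jala': [Gloss(u'jàla', ('n',), 'tree', ())]}
# lookup_gloss(Gloss(u'jàla', ('n',), '', ()), gdict) then returns every
# dictionary gloss whose ps and gloss fields match the query pattern.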
Example #2
 def lookup_gloss(self, gloss, gdict):
     'Gloss, Dictionary -> tuple(Gloss)'
     lookup_form = None
     parts = None
     try:
         if self.detone:
             bare = detone(gloss.form) 
             if bare in gdict:
                 lookup_form = bare
         else:
             if gloss.form in gdict:
                 lookup_form = gloss.form
             elif '-' in gloss.form:
                 parts = gloss.form.split('-')
                 lookup_form = ''.join(parts)
             else:
                 bare = detone(gloss.form)
                 if not gloss.form == bare and bare in gdict:
                     lookup_form = bare
         if lookup_form:
             pattern = emptyGloss._replace(ps=gloss.ps, gloss=gloss.gloss)
             if parts:
                 out = []
                 for dgloss in gdict[lookup_form]:
                     if dgloss.matches(pattern) and len(dgloss.morphemes) == len(parts) and tones_match(gloss.form, dgloss.form):
                         out.append(dgloss)
                 return tuple(out)
             else:
                 return tuple([dgloss for dgloss in gdict[lookup_form] if dgloss.matches(pattern) and tones_match(gloss.form, dgloss.form)])
         else:
             return ()
     except (KeyError, AttributeError):
         if gloss.form in gdict:
             print 'PP', gloss.form, gdict[gloss.form]
         return ()
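
This variant extends Example #1 in two ways: a hyphen-segmented input form is looked up by its concatenation and must match an entry with the same number of morphemes, and tones_match (not defined in this excerpt; by its name, a tonal-compatibility check) prunes candidates whose tone marking conflicts with the input. A toy trace of the hyphen branch:

parts = u'kɔ-la'.split('-')      # [u'kɔ', u'la']
lookup_form = ''.join(parts)     # u'kɔla' -- the key used against gdict
# a candidate dgloss survives only if it matches the ps/gloss pattern,
# len(dgloss.morphemes) == len(parts), and tones_match(...) holds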
Example #3
def push_items(d, l, ps=frozenset([]), ge=''):
    for k, i in l:
        lx = i._replace(ps=set([ps]), gloss=ge)
        d.setdefault(k, []).append(lx)
        detoned = detone(k)
        if not detoned == k:
            d.setdefault(detoned, []).append(lx)
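
A self-contained sketch of what push_items builds. The detone below is a stub that merely strips combining diacritics (the real daba.orthography.detone is more involved), and Gloss is the same hypothetical namedtuple as above:

import unicodedata
from collections import namedtuple

Gloss = namedtuple('Gloss', 'form ps gloss morphemes')

def detone(s):
    # stub for illustration: drop combining marks after NFD decomposition
    return ''.join(c for c in unicodedata.normalize('NFD', s)
                   if not unicodedata.combining(c))

d = {}
push_items(d, [(u'jàla', Gloss(u'jàla', (), '', ()))], ps='n', ge='tree')
# d now holds the tagged gloss under both u'jàla' and its detoned key u'jala'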
Example #4
 def lookup_gloss(self, gloss, gdict):
     'Gloss, Dictionary -> tuple(Gloss)'
     lookup_form = None
     parts = None
     try:
         if self.detone:
             bare = detone(gloss.form)
             if bare in gdict:
                 lookup_form = bare
         else:
             if gloss.form in gdict:
                 lookup_form = gloss.form
             elif '-' in gloss.form:
                 parts = gloss.form.split('-')
                 lookup_form = ''.join(parts)
             else:
                 bare = detone(gloss.form)
                 if not gloss.form == bare and bare in gdict:
                     lookup_form = bare
         if lookup_form:
             pattern = emptyGloss._replace(ps=gloss.ps, gloss=gloss.gloss)
             if parts:
                 out = []
                 for dgloss in gdict[lookup_form]:
                     if dgloss.matches(pattern) and len(
                             dgloss.morphemes) == len(parts):
                         out.append(dgloss)
                 return tuple(out)
             else:
                 return tuple([
                     dgloss for dgloss in gdict[lookup_form]
                     if dgloss.matches(pattern)
                 ])
         else:
             return ()
     except (KeyError, AttributeError):
         if gloss.form in gdict:
             print 'PP', gloss.form, gdict[gloss.form]
         return ()
Example #5
def make_lemmafunc(args):
    if args.tonal:
        get_lemma = lambda x: dedot(x)
    elif args.nullify:
        nullify_dict = {u'ɔ': 'o', u'ɛ': 'e', u'ɲ': 'ny'}

        def get_lemma(x):
            x = detone(''.join(c for c in x if c not in '.'))
            for source, target in nullify_dict.items():
                x = x.replace(source, target)
            return x
    else:
        get_lemma = lambda x: detone(dedot(x))
    return get_lemma
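
make_lemmafunc selects a lemmatization strategy from command-line flags; dedot presumably strips the '.' morpheme separators just as detone strips tone marks (both names come from these examples, their implementations are assumed). A sketch of driving it without a full CLI:

import argparse

# simulate parsed flags: tonal keeps tones, nullify also folds ɔ/ɛ/ɲ to ASCII
args = argparse.Namespace(tonal=False, nullify=True)
get_lemma = make_lemmafunc(args)
# get_lemma(u'ɲɔ̀.gɔn'): dots removed, detoned, then u'ɲ'->'ny', u'ɔ'->'o'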
Example #6
def main():
    aparser = argparse.ArgumentParser(description='Daba disambiguator')
    aparser.add_argument('-v',
                         '--verbose',
                         help='Verbose output',
                         default=False,
                         action='store_true')
    aparser.add_argument(
        '-l',
        '--learn',
        help='Learn model from data (and save as F if provided)',
        default=None)
    aparser.add_argument('-p',
                         '--pos',
                         help='Prediction for POS',
                         default=False,
                         action='store_true')
    aparser.add_argument('-t',
                         '--tone',
                         help='Prediction for tones',
                         default=False,
                         action='store_true')
    aparser.add_argument('-r', '--root', help='Corpus root dir')
    aparser.add_argument('-f',
                         '--filelist',
                         help='Path to a list of files to learn from')
    # aparser.add_argument('-g', '--gloss', help='Prediction for gloses', default=False, action='store_true')
    aparser.add_argument(
        '-e',
        '--evalsize',
        type=int,
        default=10,
        help='Percentage of the data reserved for evaluation (default 10)')
    aparser.add_argument(
        '-d',
        '--disambiguate',
        help='Use model F to disambiguate data; each gloss list will be reordered by decreasing probability',
        default=None)
    aparser.add_argument(
        '--select',
        help='Only taken into account with -d: keep only the most likely gloss(es) in each list',
        action='store_true')
    aparser.add_argument('-i',
                         '--infile',
                         help='Input file (.html)',
                         default=sys.stdin)
    aparser.add_argument('-o',
                         '--outfile',
                         help='Output file (.html)',
                         default=sys.stdout)
    aparser.add_argument(
        '-s',
        '--store',
        help='Store tagged raw data in a file (.csv) for further research',
        default=None)

    args = aparser.parse_args()
    if args.verbose:
        print(args)

    # NB: '-g/--gloss' is commented out above, so args.gloss does not exist;
    # guard with getattr so this check cannot raise AttributeError.
    if args.learn:

        if not (args.pos or args.tone or getattr(args, 'gloss', False)):
            print('Choose pos, tone, gloss or combination of them')
            exit(0)

        print('Make list of files')
        allfiles = []
        with codecs.open(args.filelist, 'r', encoding="utf-8") as filelist:
            for line in filelist:
                allfiles.append(line.strip())
        allsents = []

        # for debugging
        # allfiles = '../corbama/sisoko-daa_ka_kore.dis.html'

        if args.tone:
            try:
                enc = encoder_tones()
            except Exception:
                enc = None
                print("Error: unable to initialize the tone encoder!")

        print('Open files and find features / supervision tags')
        for infile in allfiles:
            if infile:
                print('-', infile)
                sent = []

                html_parser = FileParser()
                html_parser.read_file(os.path.join(args.root, infile))

                for snum, sentence in enumerate(html_parser.glosses):
                    for tnum, token in enumerate(sentence[2]):
                        if token.type == 'w' or token.type == 'c':
                            tags = ''
                            if args.pos:
                                tags = '/'.join(token.gloss.ps)
                                wordform = detone(token.gloss.form)
                                sent.append((wordform, tags))
                            elif args.tone:
                                # Why not learn tonal forms that contain a vertical bar?
                                # Across the disambiguated corpora such forms occur fewer
                                # than 10 times, too rarely to bring any real improvement
                                # to the tonalization model. Nothing in the framework's
                                # design forbids adding them to the training data later
                                # to measure their contribution.
                                if '|' not in token.gloss.form:
                                    [codes, chunks] = enc.differential_encode(
                                        token.token, token.gloss.form)
                                    for chunk, code in zip(chunks, codes):
                                        try:
                                            sent.append((chunk, code))
                                        except LookupError:
                                            pass
                            """
                            elif args.gloss:
                                tags += token.gloss.gloss
                                sent.append((token.token, tags))
                            """

                    if len(sent) > 1:
                        allsents.append(sent)
                        sent = []

        if args.verbose and args.tone:
            enc.report()

        # Build the training and evaluation sets
        p = (1 - args.evalsize / 100.0)
        train_set, eval_set = sampling(allsents, p)
        print('Split the data in train (', len(train_set),
              ' sentences) / test (', len(eval_set), ' sentences)')

        print('Building classifier (CRF/NLTK)')
        # Initialization
        t1 = time.time()
        if args.tone:
            num_phases = len([False, True]) * len(mode_indicators)
            myzip = zipfile.ZipFile(args.learn + '.zip', 'w')
        else:
            num_phases = 1

        # Training
        for phase in range(num_phases):
            tagger = CRFTagger(verbose=args.verbose,
                               training_opt={'feature.minfreq': 10})
            trainer = pycrfsuite.Trainer(verbose=tagger._verbose)
            trainer.set_params(tagger._training_options)
            if num_phases > 1:
                model_name = args.learn + '.' + str(phase)
            else:
                model_name = args.learn

            # train_set : list(list((str,list(str))))
            for sent in train_set:
                tokens = unzip(sent)[0]
                labels = unzip(sent)[1]
                if num_phases > 1:
                    labels = [
                        code_dispatcher(label)[phase] for label in labels
                    ]
                features = [
                    _get_features_customised_for_tones(tokens, i)
                    for i in range(len(tokens))
                ]
                trainer.append(features, labels)
            trainer.train(model=model_name)
            if num_phases > 1:
                myzip.write(model_name)
                os.remove(model_name)
        if num_phases > 1:
            myzip.close()

        print("... done in", get_duration(t1_secs=t1, t2_secs=time.time()))

        # Evaluation
        print('Evaluating classifier')
        # gold_set, predicted_set : list(list((str, str)))
        # input_set, output_gold_set : list(list(str))
        gold_set = eval_set
        input_set = [unzip(sent)[0] for sent in gold_set]
        predicted_set = [list() for sent in gold_set]
        if num_phases > 1:
            myzip = zipfile.ZipFile(args.learn + '.zip', 'r')
        for phase in range(num_phases):
            tagger = CRFTagger(verbose=args.verbose,
                               training_opt={'feature.minfreq': 10})
            trainer = pycrfsuite.Trainer(verbose=tagger._verbose)
            trainer.set_params(tagger._training_options)
            if num_phases > 1:
                model_name = args.learn + '.' + str(phase)
                myzip.extract(model_name)
            else:
                model_name = args.learn
            tagger.set_model_file(model_name)
            for i, sent in enumerate(input_set):
                features = [
                    _get_features_customised_for_tones(sent, j)
                    for j in range(len(sent))
                ]
                labels = tagger._tagger.tag(features)
                if num_phases > 1:
                    labels = [
                        code_dispatcher(label)[phase] for label in labels
                    ]
                tagged_sent = list(zip(sent, labels))
                if not predicted_set[i]:
                    predicted_set[i] = tagged_sent
                else:
                    sent_acc, labels_acc = unzip(predicted_set[i])
                    labels_acc = [
                        label_acc + label
                        for label_acc, label in zip(labels_acc, labels)
                    ]
                    predicted_set[i] = list(zip(sent_acc, labels_acc))
            if num_phases > 1:
                os.remove(model_name)
        # close the archive only after every phase's model has been extracted
        if num_phases > 1:
            myzip.close()

        # gold_tokens, predicted_tokens : list((str,str))
        predicted_tokens = list(itertools.chain(*predicted_set))
        if num_phases > 1:
            predicted_tokens = [
                tuple([pair[0], code_resort(pair[1])])
                for pair in predicted_tokens
            ]
        gold_tokens = list(itertools.chain(*gold_set))
        # gold_tokens_eval, predicted_tokens_eval : list(str)
        if args.tone:
            gold_tokens_eval = getTag(gold_tokens)
            predicted_tokens_eval = getTag(predicted_tokens)
        else:
            gold_tokens_eval = gold_tokens
            predicted_tokens_eval = predicted_tokens

        if args.store and args.tone:
            stored_filename = args.store
            csv_export(enc, stored_filename, gold_tokens, predicted_tokens)

        print("Accuracy : {:>5.3f}".format(
            accuracy(gold_tokens_eval, predicted_tokens_eval)))

        if args.verbose and args.store:
            print(("Tagged result is exported in {}".format(args.store)))

    elif args.disambiguate and args.infile and args.outfile:
        # Read the input .html text
        html_parser = FileParser()
        tagger = CRFTagger()

        if args.pos:
            try:
                tagger.set_model_file(args.disambiguate)
            except IOError:
                print("Error : unable to open the model {} !".format(
                    args.infile))
                exit(1)
            try:
                html_parser.read_file(args.infile)
            except IOError:
                print("Error : unable to open the input file {} !".format(
                    args.infile))
                exit(1)

            # Export the disambiguation result to .html
            for snum, sentence in enumerate(html_parser.glosses):
                tokens = [token.token for token in sentence[2]]
                features = [
                    _get_features_customised_for_tones(tokens, i)
                    for i in range(len(tokens))
                ]
                tagger._tagger.set(features)
                for tnum, token in enumerate(sentence[2]):
                    options = list()
                    if token.value and len(token.value) > 2:
                        for nopt, option in enumerate(token.value[2]):
                            try:
                                tag = option.ps[0]
                            except IndexError:
                                tag = ''
                            prob = tagger._tagger.marginal(tag, tnum)
                            options.append((prob, option))
                        reordered_probs, reordered_options = unzip(
                            sorted(options, reverse=True))
                        if args.select:
                            prob_max = reordered_probs[0]
                            reordered_options = tuple([
                                reordered_options[i]
                                for i, p in enumerate(reordered_probs)
                                if p >= prob_max
                            ])
                        html_parser.glosses[snum][1][tnum] = reordered_options

        elif args.tone:
            pass

        try:
            html_parser.write(args.outfile)
        except IOError:
            print("Error : unable to create the output file {}".format(
                args.outfile))

    else:
        aparser.print_help()
    exit(0)
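
Invocations implied by the argument parser above; the script name disambiguator.py is a placeholder, not taken from the source:

# learn a POS model from the corpus files listed in files.txt
python disambiguator.py -l pos.model -p -r ../corbama -f files.txt

# disambiguate with that model, keeping only the most likely gloss per token
python disambiguator.py -d pos.model -p --select -i in.dis.html -o out.dis.html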
Example #7
 def push_items(primarykey, lemmalist):
     for key, lx in lemmalist:
         self._dict[key] = lx
         detonedkey = detone(key)
         if not detonedkey == key:
             self._dict[detonedkey] = lx
Example #8
    def __init__(self, filename, encoding='utf-8', store=True, variants=False, polisemy=False):

        self._dict = DabaDict()
        self._variants = VariantsDict()
        self._polisemy = defaultdict(ddlist)
        self.line = 0
        lemmalist = []
        key = None
        ps = ()
        ge = ''

        def parsemm(v):
            try:
                f, p, g = v.split(':')
                if p:
                    ps = tuple(p.split('/'))
                else:
                    ps = ()
                return Gloss(f, ps, g, ())
            except ValueError:
                print "Error line:", str(self.line), unicode(v).encode('utf-8')

        def normalize(value):
            return normalizeText(value.translate({ord(u'.'): None, ord(u'-'): None}).lower())

        def make_item(value):
            return [normalize(value), Gloss(form=value, ps=(), gloss="", morphemes=())]

        def push_items(primarykey, lemmalist):
            for key, lx in lemmalist:
                self._dict[key] = lx
                detonedkey = detone(key)
                if not detonedkey == key:
                    self._dict[detonedkey] = lx
        
        with codecs.open(filename, 'r', encoding=encoding) as dictfile:
            for line in dictfile:
                self.line = self.line + 1
                # end of the article/dictionary entry
                if not line or line.isspace():
                    lemmalist = [(key, item._replace(ps=ps,gloss=ge)) for key, item in lemmalist]
                    if lemmalist and not ps == ('mrph',):
                        if store:
                            push_items(key, lemmalist)
                        if variants and len(lemmalist) > 1:
                            self._variants.add(zip(*lemmalist)[1])

                    lemmalist = []
                    ps = ()
                    ge = ''
                    key = None

                elif line.startswith('\\'):
                    tag, space, value = line[1:].partition(' ')
                    value = value.strip()
                    if tag in ['lang', 'ver', 'name']:
                        self._dict.__setattr__(tag, value)
                    elif tag in ['lx', 'le', 'va', 'vc']:
                        key = normalize(value)
                        lemmalist.append(make_item(value))
                    elif tag in ['mm']:
                        lemmalist[-1][1] = lemmalist[-1][1]._replace(morphemes=lemmalist[-1][1].morphemes+(parsemm(value),))
                    elif tag in ['ps'] and not ps:
                        if value:
                            ps = tuple(value.split('/'))
                        else:
                            ps = ()
                    elif tag in ['gf', 'ge'] and not ge:
                        ge = value
                    elif tag in ['gv']:
                        if polisemy:
                            self._polisemy[key][ge].append(value)
                            dk = detone(key)
                            if not dk == key:
                                self._polisemy[dk][ge].append(value)
                else:
                    if lemmalist:
                        if store:
                            push_items(key, lemmalist)
                        if variants:
                            self._variants.add(zip(*lemmalist)[1])

            if not self._dict.attributed():
                print r"Dictionary does not contain obligatory \lang, \name or \ver fields.\
                        Please specify them and try to load again."
                print self._dict.lang, self._dict.name, self._dict.ver
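
The loader consumes Toolbox/SFM-style dictionary articles separated by blank lines; a minimal record of the shape the tag handlers above expect (values invented for illustration; the \mm value follows the form:ps:gloss split performed by parsemm):

\lx jàla
\ps n
\ge tree
\mm jà:n:big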
Example #9
    def __init__(self,
                 filename,
                 encoding='utf-8',
                 store=True,
                 variants=False,
                 polisemy=False):

        self._dict = DabaDict()
        self._variants = VariantsDict()
        self._polisemy = defaultdict(ddlist)
        self.line = 0
        lemmalist = []
        key = None
        ps = ()
        ge = ''
        # track whether \gf / \ge have been seen in the current record
        seengf = False
        seenge = False

        def parsemm(v):
            try:
                f, p, g = v.split(':')
                if p:
                    ps = tuple(p.split('/'))
                else:
                    ps = ()
                return Gloss(f, ps, g, ())
            except ValueError:
                print "Error line:", str(self.line), unicode(v).encode('utf-8')

        def normalize(value):
            return normalizeText(
                value.translate({
                    ord(u'.'): None,
                    ord(u'-'): None
                }).lower())

        def make_item(value):
            return [
                normalize(value),
                Gloss(form=value, ps=(), gloss="", morphemes=())
            ]

        def push_items(primarykey, lemmalist):
            for key, lx in lemmalist:
                self._dict[key] = lx
                detonedkey = detone(key)
                if not detonedkey == key:
                    self._dict[detonedkey] = lx

        def process_record(lemmalist):
            lemmalist = [(key, item._replace(ps=ps, gloss=ge))
                         for key, item in lemmalist]
            if lemmalist and not ps == ('mrph', ):
                if store:
                    push_items(key, lemmalist)
                if variants and len(lemmalist) > 1:
                    self._variants.add(zip(*lemmalist)[1])

        with codecs.open(filename, 'r', encoding=encoding) as dictfile:
            for line in dictfile:
                self.line = self.line + 1
                # end of the article/dictionary entry
                if not line or line.isspace():
                    process_record(lemmalist)
                    lemmalist = []
                    ps = ()
                    ge = ''
                    key = None
                    seengf = False
                    seenge = False
                elif line.startswith('\\'):
                    tag, space, value = line[1:].partition(' ')
                    value = value.strip()
                    if tag in ['lang', 'ver', 'name']:
                        self._dict.__setattr__(tag, value)
                    elif tag in ['lx', 'le', 'va', 'vc']:
                        key = normalize(value)
                        lemmalist.append(make_item(value))
                    elif tag in ['mm']:
                        lemmalist[-1][1] = lemmalist[-1][1]._replace(
                            morphemes=lemmalist[-1][1].morphemes +
                            (parsemm(value), ))
                    elif tag in ['ps'] and not ps:
                        if value:
                            ps = tuple(value.split('/'))
                        else:
                            ps = ()
                    elif tag in ['gf'] and not seengf:
                        ge = value
                        seengf = True
                    elif tag in ['ge'] and not seenge:
                        if not seengf:
                            ge = value
                            seenge = True
                    elif tag in ['gv']:
                        if polisemy:
                            self._polisemy[key][ge].append(value)
                            dk = detone(key)
                            if not dk == key:
                                self._polisemy[dk][ge].append(value)
            else:
                # for-else: the loop ended without break; flush the last record
                process_record(lemmalist)

            if not self._dict.attributed():
                print r"Dictionary does not contain obligatory \lang, \name or \ver fields.\
                        Please specify them and try to load again."

                print self._dict.lang, self._dict.name, self._dict.ver
Example #10
 def get_lemma(x):
     x = detone(''.join(c for c in x if c not in '.'))
     for source, target in nullify_dict.items():
         x = x.replace(source, target)
     return x
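
Read inside-out: the dots are removed first, detone strips the tone marks, and then each nullify_dict substitution applies. Assuming nullify_dict = {u'ɔ': 'o', u'ɛ': 'e', u'ɲ': 'ny'} as in the surrounding examples:

# get_lemma(u'ɲɔ̀.gɔn')
#   -> u'ɲɔ̀gɔn'   (dots removed)
#   -> u'ɲɔgɔn'    (detoned)
#   -> u'nyogon'   (nullified)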
Example #11
def main():
    aparser = argparse.ArgumentParser(description='Lexicon printer for TreeTagger training')
    aparser.add_argument("-r", "--runtimedir", help="Runtime dir with binary saved dictionaries")
    aparser.add_argument("-t", "--tonal", action="store_true", help="Preserve tones on word forms")
    aparser.add_argument("-j", "--join", action="store_true", help="Join all sources")
    aparser.add_argument("-p", "--plain", action="store_true", help="Output plain lists of tokens")
    aparser.add_argument("-c", "--corpus", default=None, help="Corpus root")
    aparser.add_argument("-g", "--glob", default="*.pars.html", help="Filename pattern for search in the corpus dir")
    args = aparser.parse_args()

    #locale.setlocale(locale.LC_ALL, 'bm_ML')

    if args.join:
        globaldict = defaultdict(list)

    if args.corpus:
        seentokens = set()
        parsfiles = []
        for root, dirnames, filenames in os.walk(args.corpus):
            for filename in fnmatch.filter(filenames, args.glob):
                parsfile = os.path.join(root, filename)
                reader = formats.HtmlReader(parsfile)
                lastpunct = None
                for token in reader:
                    if token.type == 'w':
                        if lastpunct:
                            print_line(lastpunct.value, [' '.join([lastpunct.type, lastpunct.value])])
                            lastpunct = None
                        form = dedot(token.glosslist[0].form).lower()
                        if not args.tonal:
                            form = detone(form)
                        else:
                            # FIXME: unsupported tonal for corpus
                            pass
                        if args.plain:
                            result = make_taglist(token.glosslist)
                            print_line(form, result)
                        else:
                            if form not in seentokens:
                                result = make_taglist(token.glosslist)
                                seentokens.add(form)
                                if args.join:
                                    globaldict[form].extend(result)
                                else:
                                    print_line(form, result)
                    elif token.type == 'c':
                        lastpunct = token
                    elif token.type == 's':
                        if lastpunct:
                            print_line(lastpunct.value, [' '.join(['SENT', lastpunct.value])])
                            lastpunct = None


    if args.runtimedir:
        seenkeys = set()
        dictionary = mparser.DictLoader(runtimedir=args.runtimedir).dictionary
        for form in dictionary:
            if ' ' not in form:
                if not args.tonal:
                    form = detone(form)
                if args.plain:
                    for gloss in dictionary[form]:
                        print gloss
                        result = make_taglist([gloss], formforlemma=True)
                        for lemma in result:
                            print_line(form, [lemma])
                else:
                    if args.corpus and form in seentokens:
                        continue
                    if form not in seenkeys:
                        glosses = dictionary[form]
                        result = make_taglist(glosses, formforlemma=True)
                        seenkeys.add(form)
                        if args.join:
                            globaldict[form].extend(result)
                        else:
                            print_line(form, result)

    if args.join:
        for form, result in globaldict.iteritems():
            print_line(form, result)
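
Note that when both -c and -r are given, corpus forms take precedence: any form recorded in seentokens is skipped when the runtime dictionaries are printed. Typical invocations (the script name print_lexicon.py is a placeholder):

# plain token/tag lines from a parsed corpus, detoned
python print_lexicon.py -c ../corbama -p

# merge corpus and runtime dictionaries into one joined lexicon
python print_lexicon.py -c ../corbama -r run/ -j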
Example #12
def main():
    aparser = argparse.ArgumentParser(description='Daba disambiguator')
    aparser.add_argument('-v', '--verbose', help='Verbose output', default=False, action='store_true')
    aparser.add_argument('-l', '--learn', help='Learn model from data (and save as F if provided)', default=None)
    aparser.add_argument('-p', '--pos', help='Prediction for POS', default=False, action='store_true')
    aparser.add_argument('-t', '--tone', help='Prediction for tones', default=False, action='store_true')
    aparser.add_argument('-r', '--root', help='Corpus root dir')
    aparser.add_argument('-f', '--filelist', help='Path to a list of files to learn from')
    # aparser.add_argument('-g', '--gloss', help='Prediction for gloses', default=False, action='store_true')
    aparser.add_argument('-e', '--evalsize', type=int, default=10,
                         help='Percentage of the data reserved for evaluation (default 10)')
    aparser.add_argument('-d', '--disambiguate', help='Use model F to disambiguate data; each gloss list will be reordered by decreasing probability', default=None)
    aparser.add_argument('--select', help='Only taken into account with -d: keep only the most likely gloss(es) in each list', action='store_true')
    aparser.add_argument('-i', '--infile', help='Input file (.html)', default=sys.stdin)
    aparser.add_argument('-o', '--outfile', help='Output file (.html)', default=sys.stdout)
    aparser.add_argument('-s', '--store', help='Store tagged raw data in a file (.csv) for further research', default=None)

    args = aparser.parse_args()
    if args.verbose:
        print args

    # NB: '-g/--gloss' is commented out above, so args.gloss does not exist;
    # guard with getattr so this check cannot raise AttributeError.
    if args.learn:

        if not (args.pos or args.tone or getattr(args, 'gloss', False)):
            print 'Choose pos, tone, gloss or combination of them'
            exit(0)

        print 'Make list of files'
        allfiles = []
        with codecs.open(args.filelist, 'r', encoding="utf-8") as filelist:
            for line in filelist:
                allfiles.append(line.strip())
        allsents = []

        # for debugging
        # allfiles = '../corbama/sisoko-daa_ka_kore.dis.html'

        if args.tone:
            try:
                enc = encoder_tones()
            except Exception:
                enc = None
                print "Error: unable to initialize the tone encoder!"

        print 'Open files and find features / supervision tags'
        for infile in allfiles:
        if infile:
                print '-', infile
                sent = []

                html_parser = FileParser()
                html_parser.read_file(os.path.join(args.root, infile))

                for snum, sentence in enumerate(html_parser.glosses):
                    for tnum, token in enumerate(sentence[2]):
                        if token.type == 'w' or token.type == 'c':
                            tags = ''
                            if args.pos:
                                tags = '/'.join(token.gloss.ps).encode('utf-8')
                                wordform = detone(token.gloss.form)
                                sent.append((wordform, tags))
                            elif args.tone:
                                # Why not learn tonal forms that contain a vertical bar?
                                # Across the disambiguated corpora such forms occur fewer
                                # than 10 times, too rarely to bring any real improvement
                                # to the tonalization model. Nothing in the framework's
                                # design forbids adding them to the training data later
                                # to measure their contribution.
                                if '|' not in token.gloss.form:
                                    [codes, chunks] = enc.differential_encode(token.token, token.gloss.form)
                                    for chunk, code in zip(chunks, codes):
                                        try:
                                            sent.append((chunk, code.encode('utf-8')))
                                        except LookupError:
                                            pass
                            """
                            elif args.gloss:
                                tags += token.gloss.gloss.encode('utf-8')
                                sent.append((token.token, tags))
                            """

                    if len(sent) > 1:
                        allsents.append(sent)
                        sent = []

        if args.verbose and args.tone:
            enc.report()

        # Build the training and evaluation sets
        p = (1 - args.evalsize / 100.0)
        train_set, eval_set = sampling(allsents, p)
        print 'Split the data in train (', len(train_set), ' sentences) / test (', len(eval_set), ' sentences)'

        print 'Building classifier (CRF/NLTK)'
        # Initialization
        t1 = time.time()
        if args.tone:
            num_phases = len([False, True]) * len(mode_indicators)
            myzip = zipfile.ZipFile(args.learn + '.zip', 'w')
        else:
            num_phases = 1

        # Training
        for phase in range(num_phases):
            tagger = CRFTagger(verbose=args.verbose, training_opt={'feature.minfreq': 10})
            trainer = pycrfsuite.Trainer(verbose=tagger._verbose)
            trainer.set_params(tagger._training_options)
            if num_phases > 1:
                model_name = args.learn + '.' + str(phase)
            else:
                model_name = args.learn

            # train_set : list(list((str,list(str))))
            for sent in train_set:
                tokens = unzip(sent)[0]
                labels = unzip(sent)[1]
                if num_phases > 1:
                    labels = [code_dispatcher(label.decode('utf-8'))[phase].encode('utf-8') for label in labels]
                features = [_get_features_customised_for_tones(tokens, i) for i in range(len(tokens))]
                trainer.append(features, labels)
            trainer.train(model=model_name)
            if num_phases > 1:
                myzip.write(model_name)
                os.remove(model_name)
        if num_phases > 1:
            myzip.close()

        print "... done in", get_duration(t1_secs=t1, t2_secs=time.time())

        # Evaluation
        print 'Evaluating classifier'
        # gold_set, predicted_set : list(list((str, str)))
        # input_set, output_gold_set : list(list(str))
        gold_set = eval_set
        input_set = [unzip(sent)[0] for sent in gold_set]
        predicted_set = [list() for sent in gold_set]
        if num_phases > 1:
            myzip = zipfile.ZipFile(args.learn + '.zip', 'r')
        for phase in range(num_phases):
            tagger = CRFTagger(verbose=args.verbose, training_opt={'feature.minfreq': 10})
            trainer = pycrfsuite.Trainer(verbose=tagger._verbose)
            trainer.set_params(tagger._training_options)
            if num_phases > 1:
                model_name = args.learn + '.' + str(phase)
                myzip.extract(model_name)
            else:
                model_name = args.learn
            tagger.set_model_file(model_name)
            for i, sent in enumerate(input_set):
                features = [_get_features_customised_for_tones(sent,j) for j in range(len(sent))]
                labels = tagger._tagger.tag(features)
                if num_phases > 1:
                    labels = [code_dispatcher(label.decode('utf-8'))[phase].encode('utf-8') for label in labels]
                tagged_sent = list(zip(sent, labels))
                if not predicted_set[i]:
                    predicted_set[i] = tagged_sent
                else:
                    sent_acc, labels_acc = unzip(predicted_set[i])
                    labels_acc = [label_acc + label for label_acc, label in zip(labels_acc, labels)]
                    predicted_set[i] = list(zip(sent_acc, labels_acc))
            if num_phases > 1:
                os.remove(model_name)
        # close the archive only after every phase's model has been extracted
        if num_phases > 1:
            myzip.close()

        # gold_tokens, predicted_tokens : list((str,str))
        predicted_tokens = list(itertools.chain(*predicted_set))
        if num_phases > 1:
            predicted_tokens = [
                tuple([pair[0], code_resort(pair[1].decode('utf-8')).encode('utf-8')])
                for pair in predicted_tokens]
        gold_tokens = list(itertools.chain(*gold_set))
        # gold_tokens_eval, predicted_tokens_eval : list(str)
        if args.tone:
            gold_tokens_eval = getTag(gold_tokens)
            predicted_tokens_eval = getTag(predicted_tokens)
        else:
            gold_tokens_eval = gold_tokens
            predicted_tokens_eval = predicted_tokens

        if args.store and args.tone:
            stored_filename = args.store
            csv_export(enc, stored_filename, gold_tokens, predicted_tokens)

        print "Accuracy : {:>5.3f}".format(accuracy(gold_tokens_eval, predicted_tokens_eval))

        if args.verbose and args.store:
            print ("Tagged result is exported in {}".format(args.store))

    elif args.disambiguate and args.infile and args.outfile:
        # Read the input .html text
        html_parser = FileParser()
        tagger = CRFTagger()

        if args.pos:
            try:
                tagger.set_model_file(args.disambiguate)
            except IOError:
                print "Error : unable to open the model {} !".format(args.infile)
                exit(1)
            try:
                html_parser.read_file(args.infile)
            except IOError:
                print "Error : unable to open the input file {} !".format(args.infile)
                exit(1)

            # Export the disambiguation result to .html
            for snum, sentence in enumerate(html_parser.glosses):
                tokens = [token.token for token in sentence[2]]
                features = [_get_features_customised_for_tones(tokens, i) for i in range(len(tokens))]
                tagger._tagger.set(features)
                for tnum, token in enumerate(sentence[2]):
                    options = list()
                    if token.value and len(token.value) > 2:
                        for nopt, option in enumerate(token.value[2]):
                            try:
                                tag = option.ps[0]
                            except IndexError:
                                tag = ''
                            prob = tagger._tagger.marginal(tag, tnum)
                            options.append((prob, option))
                        reordered_probs, reordered_options = unzip(sorted(options, reverse=True))
                        if args.select:
                            prob_max = reordered_probs[0]
                            reordered_options = tuple([
                                reordered_options[i]
                                for i, p in enumerate(reordered_probs)
                                if p >= prob_max])
                        html_parser.glosses[snum][1][tnum] = reordered_options

        elif args.tone:
            pass

        try:
            html_parser.write(args.outfile)
        except IOError:
            print "Error: unable to create the output file {}".format(args.outfile)

    else:
        aparser.print_help()
    exit(0)