Example #1
    def __init__(self):

        freeling.util_init_locale("default")

        # Create options set for maco analyzer
        op = freeling.maco_options(LANG)
        op.PunctuationFile = DATA + "common/punct.dat"
        op.DictionaryFile = DATA + LANG + "/es-ar/dicc.src"
        op.AffixFile = DATA + LANG + "/afixos.dat"
        op.LocutionsFile = DATA + LANG + "/locucions.dat"
        op.NPdataFile = DATA + LANG + "/np.dat"
        op.QuantitiesFile = DATA + LANG + "/quantities.dat"
        op.ProbabilityFile = DATA + LANG + "/probabilitats.dat"

        # Create analyzers
        self.tk = freeling.tokenizer(DATA + LANG + "/tokenizer.dat")
        self.sp = freeling.splitter(DATA + LANG + "/splitter.dat")
        self.mf = freeling.maco(op)

        # create tagger and alternatives
        self.tg = freeling.hmm_tagger(DATA + LANG + "/tagger.dat", True, 2)
        self.alts_ort = freeling.alternatives(DATA + LANG +
                                              "/alternatives-ort.dat")

        # known words
        self.wknown = []

        self.sid = self.sp.open_session()
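
Example #1 builds its analyzers but never runs them. A minimal usage sketch (a hypothetical tag_text helper; `a` is an instance of the class above, and the session-based splitter API is assumed):

def tag_text(a, text):
    # tokenize raw text, then split into sentences within the open session
    lw = a.tk.tokenize(text)
    ls = a.sp.split(a.sid, lw, True)
    # morphology, orthographic alternatives, then PoS tagging
    ls = a.mf.analyze(ls)
    ls = a.alts_ort.analyze(ls)
    ls = a.tg.analyze(ls)
    # collect (form, lemma, tag) triples
    return [(w.get_form(), w.get_lemma(), w.get_tag())
            for s in ls for w in s.get_words()]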
Example #2
    def inicia(self):
        FREELINGDIR = "/usr/local"

        DATA = FREELINGDIR + "/share/freeling/"
        LANG = "es"

        freeling.util_init_locale("default")
        # create options set for maco analyzer. Default values are Ok, except for data files.
        op = freeling.maco_options("es")
        op.set_active_modules(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0)
        op.set_data_files("", DATA + LANG + "/locucions.dat",
                          DATA + LANG + "/quantities.dat",
                          DATA + LANG + "/afixos.dat",
                          DATA + LANG + "/probabilitats.dat",
                          DATA + LANG + "/dicc.src", DATA + LANG + "/np.dat",
                          DATA + "common/punct.dat",
                          DATA + LANG + "/corrector/corrector.dat")

        # create analyzers
        self.tk = freeling.tokenizer(DATA + LANG + "/tokenizer.dat")
        self.sp = freeling.splitter(DATA + LANG + "/splitter.dat")
        self.mf = freeling.maco(op)

        self.tg = freeling.hmm_tagger("es", DATA + LANG + "/tagger.dat", 1, 2)
        self.sen = freeling.senses(DATA + LANG + "/senses.dat")
        ner = freeling.ner(DATA + LANG + "/ner/ner-ab.dat")

        self.parser = freeling.chart_parser(DATA + LANG +
                                            "/chunker/grammar-chunk.dat")
        self.dep = freeling.dep_txala(DATA + LANG + "/dep/dependences.dat",
                                      self.parser.get_start_symbol())
Example #3
    def __init__(self, language, freeling_dir, options):
        self.logger = logging.getLogger(__name__)
        self.logger.info('FREELING_DIR: %s', freeling_dir)
        self.logger.info('LANGUAGE: %s', language)
        self.logger.info('MACO_OPTIONS: %s', options)

        freeling.util_init_locale("default")

        # create options set for maco analyzer. Default values are Ok, except for data files.
        data_dir = freeling_dir + "/share/freeling/"
        maco_opts = freeling.maco_options(language)
        maco_opts.set_data_files("",
                                 data_dir + "common/punct.dat",
                                 data_dir + language + "/dicc.src",
                                 data_dir + language + "/afixos.dat",
                                 "",
                                 data_dir + language + "/locucions.dat",
                                 data_dir + language + "/np.dat",
                                 data_dir + language + "/quantities.dat",
                                 data_dir + language + "/probabilitats.dat")

        # create analyzers
        self.tokenizer = freeling.tokenizer(data_dir + language + "/tokenizer.dat")
        self.splitter = freeling.splitter(data_dir + language + "/splitter.dat")
        self.maco = freeling.maco(maco_opts)
        self.maco.set_active_options(*options)
Example #4
def tagger(file_url, out_name):
    freeling.util_init_locale("default")
    ipath = "/usr/local"
    lpath = ipath + "/share/freeling/" + "es" + "/"
    tk = freeling.tokenizer(lpath + "tokenizer.dat")
    sp = freeling.splitter(lpath + "splitter.dat")
    morfo = freeling.maco(my_maco_options("es", lpath))
    morfo.set_active_options(
        False,  # UserMap
        True,   # NumbersDetection
        True,   # PunctuationDetection
        True,   # DatesDetection
        True,   # DictionarySearch
        True,   # AffixAnalysis
        False,  # CompoundAnalysis
        True,   # RetokContractions
        True,   # MultiwordsDetection
        True,   # NERecognition
        False,  # QuantitiesDetection
        True)   # ProbabilityAssignment

    tagger = freeling.hmm_tagger(lpath + "tagger.dat", True, 2)
    with open(file_url, "r") as file:
        text = file.read()
    lw = tk.tokenize(text)
    ls = sp.split(lw)
    ls = morfo.analyze(ls)
    ls = tagger.analyze(ls)
    ProcessSentences(ls, out_name)
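
Examples #4 and #12 call a my_maco_options helper that is not defined in either snippet. A plausible sketch, assuming the attribute-style option setup shown in Examples #1 and #23 and data files living under lpath:

def my_maco_options(lang, lpath):
    # hypothetical reconstruction: point each maco submodule at its data file
    opt = freeling.maco_options(lang)
    opt.DictionaryFile = lpath + "dicc.src"
    opt.AffixFile = lpath + "afixos.dat"
    opt.LocutionsFile = lpath + "locucions.dat"
    opt.NPdataFile = lpath + "np.dat"
    opt.ProbabilityFile = lpath + "probabilitats.dat"
    opt.PunctuationFile = lpath + "../common/punct.dat"
    return opt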
Example #5
def prepare_freeling():    
    # Freeling:
    # https://github.com/TALP-UPC/FreeLing
    # (you may download binary at releases there)
    # (GPP: I'm using 4.0)
    
    # Make sure that the directory containing libfreeling.so (FREELINGDIR/lib) is
    # in your LD_LIBRARY_PATH.
    
    # Make sure that freeling.py and _freeling.so are in the same directory as this one.
    # example of freeling's python API is at: https://github.com/TALP-UPC/FreeLing/tree/master/APIs/python
    
    # Change directories for your location
    FREELINGDIR = "/usr/local"; 
    DATA = FREELINGDIR+"/share/freeling/";
    
    LANG="pt";
    
    freeling.util_init_locale("default");
    
    # create options set for maco analyzer. Default values are Ok, except for data files.
    op= freeling.maco_options("pt");
    op.set_data_files( "", 
                       DATA + "common/punct.dat",
                       DATA + LANG + "/dicc.src",
                       DATA + LANG + "/afixos.dat",
                       "",
                       DATA + LANG + "/locucions.dat", 
                       DATA + LANG + "/np.dat",
                       "", # there's not "quantitites.dat" for pt 
                       DATA + LANG + "/probabilitats.dat");
    
    # create analyzers
    tk=freeling.tokenizer(DATA+LANG+"/tokenizer.dat");
    sp=freeling.splitter(DATA+LANG+"/splitter.dat");
    sid=sp.open_session();
    mf=freeling.maco(op);
    
    # activate mmorpho odules to be used in next call
    mf.set_active_options(False, True, True, True,  # select which among created 
                          True, True, False, True,  # submodules are to be used. 
                          True, True, True, True ); # default: all created submodules are used
    
    # create tagger, sense anotator, and ukb
    
    tg=freeling.hmm_tagger(DATA+LANG+"/tagger.dat",True,2);
    sen=freeling.senses(DATA+LANG+"/senses.dat");
    parser= freeling.chart_parser(DATA+LANG+"/chunker/grammar-chunk.dat");
    ukb = freeling.ukb(DATA+LANG+"/ukb.dat");
    outputter = freeling.output_conll('./output_conll.dat')
    return tk, sp, sid, mf, tg, sen, parser, ukb, outputter
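
The tuple returned by prepare_freeling() is meant to be unpacked and chained. A hedged usage sketch (the sample sentence and the PrintResults call on a sentence list are assumptions):

tk, sp, sid, mf, tg, sen, parser, ukb, outputter = prepare_freeling()
lw = tk.tokenize("O menino comeu a maçã.")
ls = sp.split(sid, lw, True)
ls = mf.analyze(ls)   # morphology
ls = tg.analyze(ls)   # PoS tagging
ls = sen.analyze(ls)  # sense annotation
ls = ukb.analyze(ls)  # UKB word sense disambiguation
print(outputter.PrintResults(ls))
sp.close_session(sid)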
Example #6
def setup_tools(config_fn):
    """Setup Freeling tools according to a config file.

        The tools are returned as a dictionary with the following keys:
        tk : The tokenizer
        sp : The sentence splitter
        pos : The part of speech tagger
        mf : The morphological analysis tools (freeling.maco)
        wsd : word sense tagger
    """
    config = configparser.ConfigParser()
    config.read(config_fn)
    language = config['wsd']['language']
    data = config['freeling']['datadir']
    data_l = data + '/' + language

    tools = {}
    freeling.util_init_locale("default")
    tools['tk'] = freeling.tokenizer(data_l + "/tokenizer.dat")
    tools['sp'] = freeling.splitter(data_l + "/splitter.dat")
    tools['pos'] = freeling.hmm_tagger(language, data_l + "/tagger.dat",
            True, # Retokenize
            2)    # Force selecting one PoS tag after retokenization
    op = freeling.maco_options(language)
    op.set_active_modules(
        0, # UserMap (for analysis of domain-specific tokens)
        1, # AffixAnalysis
        1, # MultiwordsDetection
        1, # NumbersDetection
        1, # PunctuationDetection
        0, # DatesDetection, gives problems with words like "Monday"
        1, # QuantitiesDetection
        1, # DictionarySearch
        1, # ProbabilityAssignment (Essential for PoS)
        1, # OrthographicCorrection (Misspelling etc.)
        0) # NERecognition (Named Entity Recognition)
    op.set_data_files(
        "",
        data_l+"/locucions.dat",
        data_l+"/quantities.dat",
        data_l+"/afixos.dat",
        data_l+"/probabilitats.dat",
        data_l+"/dicc.src",
        data_l+"/np.dat",
        data+"/common/punct.dat",
        data_l+"/corrector/corrector.dat");
    tools['mf'] = freeling.maco(op)
    tools['wsd'] = freeling.ukb_wrap(data_l+'/ukb.dat')

    return tools
Example #7
    def __init__(self, text):
        super().__init__(text)
        freeling.util_init_locale("default")
        self.la = freeling.lang_ident(DATA + "common/lang_ident/ident.dat")
        op = freeling.maco_options("es")
        op.set_data_files(
            "",
            DATA + "common/punct.dat",
            DATA + LANG + "/dicc.src",
            DATA + LANG + "/afixos.dat",
            "",
            DATA + LANG + "/locucions.dat",
            DATA + LANG + "/np.dat",
            DATA + LANG + "/quantities.dat",
            DATA + LANG + "/probabilitats.dat"
        )

        # create analyzers
        self.tk = freeling.tokenizer(DATA + LANG + "/tokenizer.dat")
        self.sp = freeling.splitter(DATA + LANG + "/splitter.dat")
        self.sid = self.sp.open_session()
        self.mf = freeling.maco(op)

        # activate morpho modules to be used in next call
        self.mf.set_active_options(
            False,  # umap User map module
            True,  # num Number Detection
            True,  # pun Punctuation Detection
            True,  # dat Date Detection
            True,  # dic Dictionary Search
            True,  # aff
            False,  # com
            True,  # rtk
            True,  # mw Multiword Recognition
            True,  # ner Named Entity Recognition
            True,  # qt Quantity Recognition
            True  # prb Probability Assignment And Guesser
        )  # default: all created submodules are used

        # create tagger, sense annotator, and parsers
        self.tg = freeling.hmm_tagger(DATA + LANG + "/tagger.dat", True, 2)
        self.sen = freeling.senses(DATA + LANG + "/senses.dat")
        self.parser = freeling.chart_parser(DATA + LANG + "/chunker/grammar-chunk.dat")
        self.dep = freeling.dep_txala(DATA + LANG + "/dep_txala/dependences.dat", self.parser.get_start_symbol())
Example #8
    def inicializa(self):

        FREELINGDIR = "/usr/local"

        DATA = FREELINGDIR + "/share/freeling/"
        LANG = self.lang

        freeling.util_init_locale("default")

        # create language analyzer
        self.la = freeling.lang_ident(DATA + "common/lang_ident/ident.dat")

        # options for the maco analyzer
        op = freeling.maco_options("es")
        op.set_active_modules(0, 1, 1, 1, 1, 1, 1, 1, 1, 1)
        op.set_data_files("", DATA + LANG + "/locucions.dat",
                          DATA + LANG + "/quantities.dat",
                          DATA + LANG + "/afixos.dat",
                          DATA + LANG + "/probabilitats.dat",
                          DATA + LANG + "/dicc.src", DATA + LANG + "/np.dat",
                          DATA + "common/punct.dat",
                          DATA + LANG + "/corrector/corrector.dat")

        # create analyzers
        self.tk = freeling.tokenizer(DATA + LANG + "/tokenizer.dat")
        self.sp = freeling.splitter(DATA + LANG + "/splitter.dat")
        self.mf = freeling.maco(op)

        self.tg = freeling.hmm_tagger(DATA + LANG + "/tagger.dat", 1, 2)
        self.sen = freeling.senses(DATA + LANG + "/senses.dat")
        self.nec = freeling.nec(DATA + LANG + "/nerc/nec/nec-ab-rich.dat")
        # self.ner=freeling.nec(DATA+LANG+"/ner/ner-ab.dat");

        self.parser = freeling.chart_parser(DATA + LANG +
                                            "/chunker/grammar-chunk.dat")
        self.dep = freeling.dep_txala(DATA + LANG + "/dep/dependences.dat",
                                      self.parser.get_start_symbol())

        con_data = {'user': '******', 'password': '******',
                    'host': '127.0.0.1', 'database': 'agiria',
                    'raise_on_warnings': True, 'autocommit': True,
                    'buffered': True}

        self.con = my.connect(**con_data)
Example #10
    def config_files(self, lang, data_dir, data_dir_common):

        data_dir += lang + "/"
        data_conf = data_dir + "nerc/nec/nec.cfg"

        opt = freeling.maco_options(lang)

        # (usr, pun, dic, aff, comp, loc, nps, qty, prb)
        opt.set_data_files("",
                           data_dir_common + "punct.dat",
                           data_dir + "dicc.src",
                           data_dir + "afixos.dat",
                           data_dir + "compounds.dat",
                           data_dir + "locucions.dat",
                           data_dir + "np.dat",
                           data_dir + "quantities.dat",
                           data_dir + "probabilitats.dat")

        self.mf = freeling.maco(opt)

        # (umap, num, pun, dat, dic, aff, comp, rtk, mw, ner, qt, prb)
        # (0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0)
        self.mf.set_active_options(False, True, True, True, False, True, True, True, True, True, True, True)

        self.tk = freeling.tokenizer(data_dir + "tokenizer.dat")
        self.sp = freeling.splitter(data_dir + "splitter.dat")

        self.tg = freeling.hmm_tagger(data_dir + "tagger.dat", True, 2)
        self.sen = freeling.senses(data_dir + "senses.dat")

        self.parser = freeling.chart_parser(data_dir + "chunker/grammar-chunk.dat")

        self.dep = freeling.dep_txala(data_dir + "dep_txala/dependences.dat",
                                      self.parser.get_start_symbol())

        self.nec = freeling.nec(data_conf)
Example #11
    def __init__(self):
        print("Inicializando Nombres")
        print(str(datetime.time(datetime.now())))
        FREELINGDIR = "/usr/local"
        DATA = FREELINGDIR + "/share/freeling/"
        LANG = "es"
        freeling.util_init_locale("default")
        op = freeling.maco_options("es")
        op.set_active_modules(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0)
        op.set_data_files("", DATA + LANG + "/locucions.dat",
                          DATA + LANG + "/quantities.dat",
                          DATA + LANG + "/afixos.dat",
                          DATA + LANG + "/probabilitats.dat",
                          DATA + LANG + "/dicc.src", DATA + LANG + "/np.dat",
                          DATA + "common/punct.dat",
                          DATA + LANG + "/corrector/corrector.dat")

        # create analyzers
        self.tk = freeling.tokenizer(DATA + LANG + "/tokenizer.dat")
        self.sp = freeling.splitter(DATA + LANG + "/splitter.dat")
        self.mf = freeling.maco(op)
        self.tg = freeling.hmm_tagger("es", DATA + LANG + "/tagger.dat", 1, 2)

        # self.sen=freeling.senses(DATA+LANG+"/senses.dat");
        # self.ner=freeling.ner(DATA+LANG+"/ner/ner-ab.dat");
        self.nec = freeling.nec(DATA + LANG + "/nec/nec-ab.dat")

        # self.parser= freeling.chart_parser(DATA+LANG+"/chunker/grammar-chunk.dat");

        self.pondera = {
            'titulo': 3,
            'intro': 2,
            'texto': 1
        }  # weighting given to each NER according to its source type
        self.indice = 0  # index representing the proportion of shared NERs out of all NERs in a news item
        self.con = my.connect(**con_data)
        self.ultimo = 0
        self.minimo_comun = 0.45  # fraction two news items must share to be considered related
        self.minimo_palabras = 14  # minimum number of (weighted) words needed to enter a relation
        # load into no_incluir the words that must not be treated as NERs
        self.cur1 = self.con.cursor()
        texto = "Select nombre from no_nombres order by nombre"
        try:
            self.cur1.execute(texto)
        except my.Error as err:
            print("Error seleccionando nombres de tabla no_nombres",
                  format(err))
        self.no_incluir = []  # list of words to omit from identified NERs
        for nombre in self.cur1:
            try:
                nombre = str(nombre[0]).upper()
                nombre = nombre[2:-1]  # strip the bytes-literal b'...' wrapper

                nombre = nombre.replace('\\XC3\\XA1', 'Á')
                nombre = nombre.replace('\\XC3\\X81', 'Á')
                nombre = nombre.replace('\\XC3\\XA9', 'É')
                nombre = nombre.replace('\\XC3\\XAD', 'Í')
                nombre = nombre.replace('\\XC3\\X8D', 'Í')
                nombre = nombre.replace('\\XC3\\XB3', 'Ó')
                nombre = nombre.replace('\\XC3\\X93', 'Ó')
                nombre = nombre.replace('\\XC3\\XBA', 'Ú')
                nombre = nombre.replace('\\XC3\\XBC', 'Ü')
                nombre = nombre.replace('\\XC3\\XB1', 'Ñ')
                nombre = nombre.replace('\\XC3\\X91', 'Ñ')
                nombre = nombre.replace('\\XC2\\XBA', 'º')
                nombre = nombre.replace('\\XC4\\X82\\XC4\\X84', 'ĂĄ')

                self.no_incluir.append(nombre)
            except:
                print("Error incluyendo no_nombres en lista")
        self.no_incluir = sorted(set(self.no_incluir))
        # corefs from the coref table
        self.cur1 = self.con.cursor()
        texto = "Select original, coref from coref order by original"
        try:
            self.cur1.execute(texto)
        except my.Error as err:
            print("Error seleccionando corefs", format(err))
        self.corefs = {}  # dict of coref substitutions to apply to NERs
        for original, coref in self.cur1:
            self.corefs[original] = coref

        print("Inicialización terminada", str(datetime.time(datetime.now())))
Example #12
app = Flask(__name__, static_url_path='/static')
freeling.util_init_locale("default")

# Create options set for maco analyzer. Default values are Ok, except for data files.
op = freeling.maco_options(LANG)
op.set_active_modules(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)
op.set_data_files("", DATA + LANG + "/locucions.dat",
                  DATA + LANG + "/quantities.dat", DATA + LANG + "/afixos.dat",
                  DATA + LANG + "/probabilitats.dat",
                  DATA + LANG + "/dicc.src", DATA + LANG + "/np.dat",
                  DATA + "common/punct.dat",
                  DATA + LANG + "/corrector/corrector.dat")

# create analyzers
tk = freeling.tokenizer(DATA + LANG + "/tokenizer.dat")
sp = freeling.splitter(DATA + LANG + "/splitter.dat")
mf = freeling.maco(op)


def decode_tag(tag):
    """
       Función para decodificar y extender las etiquetas
       generadas por Freeling en codificación EAGLES
    """
    categoria = tag[0]
    decoded = "Esta palabra pertenece a la categoría {} ".format(
        EAGLES_DICT[categoria]['Categoria'])

    atributos = tag[1:] if len(tag) > 1 else []
    aux = ''
Example #13
# set locale to a UTF8-compatible locale
freeling.util_init_locale("default")

# requested language (the stock demo reads this from arg1, defaulting to English; hardcoded here)
lang = "es"

# installation path (the stock demo reads this from arg2, defaulting to /usr/local; hardcoded here)
#ipath = "/usr/local/Cellar/freeling/4.0_4";
ipath = "/usr/local/Cellar/freeling/4.1_3"

# path to language data
lpath = ipath + "/share/freeling/" + lang + "/"

# create analyzers
tk = freeling.tokenizer(lpath + "tokenizer.dat")
sp = freeling.splitter(lpath + "splitter.dat")

# create the analyzer with the required set of maco_options
morfo = freeling.maco(my_maco_options(lang, lpath))
#  then, (de)activate required modules
morfo.set_active_options(
    False,  # UserMap
    True,  # NumbersDetection,
    True,  # PunctuationDetection,
    True,  # DatesDetection,
    True,  # DictionarySearch,
    True,  # AffixAnalysis,
    False,  # CompoundAnalysis,
    True,  # RetokContractions,
    True,  # MultiwordsDetection,
    True,  # NERecognition,
    False,  # QuantitiesDetection,
    True)  # ProbabilityAssignment
Example #14
    def __init__(self):

        lang = 'fr'
        ComplexityLanguage.__init__(self, lang)

        ## Modify this line to be your FreeLing installation directory
        FREELINGDIR = "/home/garciacumbreras18/dist/freeling"
        DATA = FREELINGDIR + "/data/"
        CLASSDIR = ""
        self.lang = lang
        freeling.util_init_locale("default")

        # create language analyzer
        self.la = freeling.lang_ident(DATA + "common/lang_ident/ident.dat")

        # create options set for maco analyzer. Default values are Ok, except for data files.
        op = freeling.maco_options(lang)
        op.set_data_files(
            "", DATA + "common/punct.dat", DATA + lang + "/dicc.src",
            DATA + lang + "/afixos.dat", "", DATA + lang + "/locucions.dat",
            DATA + lang + "/np.dat", DATA + lang + "/quantities.dat",
            DATA + lang + "/probabilitats.dat")

        # create analyzers
        self.tk = freeling.tokenizer(DATA + lang + "/tokenizer.dat")
        self.sp = freeling.splitter(DATA + lang + "/splitter.dat")
        self.mf = freeling.maco(op)

        # activate morpho modules to be used in next call
        self.mf.set_active_options(
            False,
            True,
            True,
            True,  # select which among created 
            True,
            True,
            False,
            True,  # submodules are to be used. 
            True,
            True,
            True,
            True)  # default: all created submodules are used

        # create tagger and sense annotator
        self.tg = freeling.hmm_tagger(DATA + lang + "/tagger.dat", True, 2)
        self.sen = freeling.senses(DATA + lang + "/senses.dat")

        f = open(CLASSDIR + '/home/garciacumbreras18/DaleChall.txt')
        lines = f.readlines()
        f.close()

        listDaleChall = []
        for l in lines:
            data = l.strip().split()
            listDaleChall += data
        self.listDaleChall = listDaleChall
        """
        config es una lista de valores booleanos que activa o desactivan el cálculo de una medida
        config = [
            True|False,         # KANDEL MODELS
            True|False,         # DALE CHALL
            True|False,         # SOL
            ]
        """
        self.config += [True, True, True]
        self.metricsStr.extend(['KANDEL-MODELS', 'DALE CHALL', 'SOL'])

        self.configExtend += [True, True]
        self.metricsStrExtend.extend(['MEAN RARE WORDS', 'STD RARE WORDS'])
Example #16
import freeling
# code extracted from https://gist.github.com/arademaker/dffb8de093502b153e85#file-processing-py-L50
FREELINGDIR = '/usr/local'
DATA = FREELINGDIR + '/share/freeling/'
LANGUAGE = 'en'

freeling.util_init_locale('default')
option = freeling.maco_options(LANGUAGE)
option.set_data_files(
    "", DATA + "common/punct.dat", DATA + LANGUAGE + "/dicc.src",
    DATA + LANGUAGE + "/afixos.dat", "", DATA + LANGUAGE + "/locucions.dat",
    DATA + LANGUAGE + "/np.dat", DATA + LANGUAGE + "/quantities.dat",
    DATA + LANGUAGE + "/probabilitats.dat")
morfo = freeling.maco(option)
tokenizer = freeling.tokenizer(DATA + LANGUAGE + '/tokenizer.dat')
splitter = freeling.splitter(DATA + LANGUAGE + '/splitter.dat')
sid = splitter.open_session()
tagger = freeling.hmm_tagger(DATA + LANGUAGE + '/tagger.dat', True, 2)
parser = freeling.chart_parser(DATA + LANGUAGE + '/chunker/grammar-chunk.dat')
morfo.set_active_options(False, True, True, True, True, True, False, True,
                         True, True, True, True)
Example #17
    def __init__(self, lang='it'):

        ## Modify this line to be your FreeLing installation directory
        FREELINGDIR = "/home/garciacumbreras18/dist/freeling"
        DATA = FREELINGDIR + "/data/"

        self.DATA = DATA
        self.lang = lang
        freeling.util_init_locale("default")

        # create language analyzer
        self.la = freeling.lang_ident(DATA + "common/lang_ident/ident.dat")

        # create options set for maco analyzer. Default values are Ok, except for data files.
        op = freeling.maco_options(lang)
        op.set_data_files("", self.DATA + "common/punct.dat",
                          self.DATA + self.lang + "/dicc.src",
                          self.DATA + self.lang + "/afixos.dat", "",
                          self.DATA + self.lang + "/locucions.dat",
                          self.DATA + self.lang + "/np.dat", "",
                          self.DATA + self.lang + "/probabilitats.dat")

        # create analyzers
        self.tk = freeling.tokenizer(self.DATA + self.lang + "/tokenizer.dat")
        self.sp = freeling.splitter(self.DATA + self.lang + "/splitter.dat")
        self.mf = freeling.maco(op)

        # activate morpho modules to be used in next call
        self.mf.set_active_options(
            False,
            True,
            True,
            True,  # select which among created 
            True,
            True,
            False,
            True,  # submodules are to be used. 
            True,
            True,
            True,
            True)  # default: all created submodules are used
        # create tagger
        self.tg = freeling.hmm_tagger(self.DATA + self.lang + "/tagger.dat",
                                      True, 2)
        self.sen = freeling.senses(DATA + lang + "/senses.dat")
        """ 
        config es una lista de valores booleanos que activa o desactivan el cálculo de una medida
        config = [
            True|False,         # PUNCTUATION MARKS
            True|False,         # SCI
            True|False,         # ARI 
            True|False,         # MU
            True|False,         # Flesch-Vaca
            True|False,         # Gulpease
            ]
         Si config == None se calculan todas las métricas de complejidad soportadas
        """
        self.config = [True, True, True, True, True, True]
        self.metricsIt = [
            'AVERAGE PUNCTUATION MARKS', 'SCI', 'ARI', 'MU', 'FLESCH-VACA',
            'GULPEASE'
        ]

        self.configExtend = [True, True, True, True, True]
        self.metricsItExtend = [
            'MEAN WORDS', 'STD WORDS', 'COMPLEX SENTENCES', 'MEAN SYLLABLES',
            'STD SYLLABLES'
        ]
Example #18
    def __init__(self, text):
        super().__init__(text)
        self.stop_words = set(stopwords.words('spanish') + list(punctuation))
        self._cleaned_text = list()
        freeling.util_init_locale("default")

        # create language analyzer
        la = freeling.lang_ident(DATA + "common/lang_ident/ident.dat")

        # create options set for maco analyzer. Default values are Ok, except for data files.
        op = freeling.maco_options("es")
        op.set_data_files(
            "", DATA + "common/punct.dat", DATA + LANG + "/dicc.src",
            DATA + LANG + "/afixos.dat", "", DATA + LANG + "/locucions.dat",
            DATA + LANG + "/np.dat", DATA + LANG + "/quantities.dat",
            DATA + LANG + "/probabilitats.dat")

        # create analyzers
        tk = freeling.tokenizer(DATA + LANG + "/tokenizer.dat")
        sp = freeling.splitter(DATA + LANG + "/splitter.dat")
        sid = sp.open_session()
        mf = freeling.maco(op)

        # activate morpho modules to be used in next call
        mf.set_active_options(
            True,
            True,
            True,
            True,  # select which among created
            True,
            True,
            True,
            True,  # submodules are to be used.
            True,
            True,
            True,
            True)  # default: all created submodules are used

        # create tagger, sense annotator, and parsers
        tg = freeling.hmm_tagger(DATA + LANG + "/tagger.dat", True, 2)
        sen = freeling.senses(DATA + LANG + "/senses.dat")
        parser = freeling.chart_parser(DATA + LANG +
                                       "/chunker/grammar-chunk.dat")

        l = tk.tokenize(self.text)
        ls = sp.split(sid, l, False)

        ls = mf.analyze(ls)
        ls = tg.analyze(ls)
        ls = sen.analyze(ls)
        ls = parser.analyze(ls)

        for s in ls:
            ws = s.get_words()
            for w in ws:
                # Remove closed-class words: adpositions, interjections, conjunctions, punctuation, determiners and pronouns
                tag = w.get_tag()
                word = w.get_form()
                if tag.startswith("S") or \
                    tag.startswith("I") or \
                    tag.startswith("C") or \
                    tag.startswith("F") or \
                    tag.startswith("D") or \
                    tag.startswith("P"):
                    pass
                else:
                    self._cleaned_text.append("{}-{}".format(word, tag))
Example #20
def build_freeling(lang):
    ##### Build resources
    FREELINGDIR = "/usr/local"

    DATA = FREELINGDIR + "/share/freeling/"
    LANG_ES = "es"
    LANG_EN = "en"

    freeling.util_init_locale("default")

    if lang == 'es':
        ##### Build Spanish analyzers
        op = freeling.maco_options("es")
        op.set_data_files(
            "",
            DATA + "common/punct.dat",
            DATA + "es" + "/dicc.src",
            DATA + "es" + "/afixos.dat",
            "",
            # "data/locutions_es_processed.dat",
            "",
            DATA + "es" + "/np.dat",
            DATA + "es" + "/quantities.dat",
            DATA + "es" + "/probabilitats.dat")
        op.MultiwordsDetection = True
        # create analyzers
        tk = freeling.tokenizer(DATA + "es" + "/tokenizer.dat")
        sp = freeling.splitter(DATA + "es" + "/splitter.dat")
        sid = sp.open_session()
        mf = freeling.maco(op)
        # activate morpho modules to be used in next call
        mf.set_active_options(
            False,
            True,
            True,
            True,  # select which among created 
            True,
            True,
            False,
            True,  # submodules are to be used. 
            True,
            True,
            True,
            True)  # default: all created submodules are used
        # create tagger
        tg = freeling.hmm_tagger(DATA + "es" + "/tagger.dat", True, 2)

    elif lang == 'en':
        ##### Build English analyzers
        op = freeling.maco_options("en")
        op.set_data_files(
            "",
            DATA + "common/punct.dat",
            DATA + "en" + "/dicc.src",
            DATA + "en" + "/afixos.dat",
            "",
            # "data/locutions_en_processed.dat",
            "",
            DATA + "en" + "/np.dat",
            DATA + "en" + "/quantities.dat",
            DATA + "en" + "/probabilitats.dat")
        # create analyzers
        tk = freeling.tokenizer(DATA + "en" + "/tokenizer.dat")
        sp = freeling.splitter(DATA + "en" + "/splitter.dat")
        sid = sp.open_session()
        mf = freeling.maco(op)
        # activate morpho modules to be used in next call
        mf.set_active_options(
            False,
            True,
            True,
            True,  # select which among created 
            True,
            True,
            False,
            True,  # submodules are to be used. 
            True,
            True,
            True,
            True)  # default: all created submodules are used
        # create tagger
        tg = freeling.hmm_tagger(DATA + "en" + "/tagger.dat", True, 2)

    return tk, sp, sid, mf, tg
Example #21
    def fullParsing(self, text, sentimentText):

        ## Modify this line to be your FreeLing installation directory
        FREELINGDIR = "/usr/local"

        DATA = FREELINGDIR + "/share/freeling/"
        LANG = "es"

        freeling.util_init_locale("default")

        # create language analyzer
        la = freeling.lang_ident(DATA + "common/lang_ident/ident.dat")

        # create options set for maco analyzer. Default values are Ok, except for data files.
        op = freeling.maco_options("es")
        op.set_data_files(
            "", DATA + "common/punct.dat", DATA + LANG + "/dicc.src",
            DATA + LANG + "/afixos.dat", "", DATA + LANG + "/locucions.dat",
            DATA + LANG + "/np.dat", DATA + LANG + "/quantities.dat",
            DATA + LANG + "/probabilitats.dat")

        # create analyzers
        tk = freeling.tokenizer(DATA + LANG + "/tokenizer.dat")
        sp = freeling.splitter(DATA + LANG + "/splitter.dat")
        sid = sp.open_session()
        mf = freeling.maco(op)

        # activate morpho modules to be used in next call
        mf.set_active_options(
            False,
            True,
            True,
            True,  # select which among created 
            True,
            True,
            False,
            True,  # submodules are to be used. 
            True,
            True,
            True,
            True)
        # default: all created submodules are used

        # create tagger, sense annotator, and parsers
        tg = freeling.hmm_tagger(DATA + LANG + "/tagger.dat", True, 2)
        sen = freeling.senses(DATA + LANG + "/senses.dat")
        parser = freeling.chart_parser(DATA + LANG +
                                       "/chunker/grammar-chunk.dat")
        dep = freeling.dep_txala(DATA + LANG + "/dep_txala/dependences.dat",
                                 parser.get_start_symbol())

        # split the target into a token list
        #print(sentimentText)
        sentimentText += '.'
        if sentimentText[0] == '@':
            sentimentText = sentimentText[1:]
        target = tk.tokenize(sentimentText)
        targets = sp.split(sid, target, True)

        targets = mf.analyze(targets)
        targets = parser.analyze(targets)
        targets = dep.analyze(targets)

        for s in targets:
            targetr = s.get_parse_tree()
            targetList = self.getTreeAsList(targetr, 0)
            del targetList[-1]
        #print(targetList)

        # process input text
        lin = text
        if lin[0] == '@':
            lin = lin[1:]

        #while (lin) :

        l = tk.tokenize(lin)
        ls = sp.split(sid, l, True)

        ls = mf.analyze(ls)
        ls = parser.analyze(ls)
        ls = dep.analyze(ls)

        finalType = None
        finalList = None

        ## output results
        for s in ls:
            tr = s.get_parse_tree()
            #self.printTree(tr, 0);
            wordType, wordList = self.getTypeNode(tr, 0, targetList)
            if finalType is None:
                if wordType is not None:
                    finalType = wordType
                    finalList = wordList
        # clean up
        sp.close_session(sid)

        return finalType, finalList
Example #22
LANG="es";

freeling.util_init_locale("default");

op= freeling.maco_options("es");
op.set_data_files( "",
                   DATA + "common/punct.dat",
                   DATA + LANG + "/dicc.src",
                   DATA + LANG + "/afixos.dat",
                   "",
                   DATA + LANG + "/locucions.dat", 
                   DATA + LANG + "/np.dat",
                   DATA + LANG + "/quantities.dat",
                   DATA + LANG + "/probabilitats.dat");

tk=freeling.tokenizer(DATA+LANG+"/tokenizer.dat");
sp=freeling.splitter(DATA+LANG+"/splitter.dat");
sid=sp.open_session();
mf=freeling.maco(op);

mf.set_active_options(False, False, True, False,
                      True, True, False, True,
                      False, True, False, True )

tg=freeling.hmm_tagger(DATA+LANG+"/tagger.dat",True,2)
sen=freeling.senses(DATA+LANG+"/senses.dat")
parser= freeling.chart_parser(DATA+LANG+"/chunker/grammar-chunk.dat")
dep=freeling.dep_txala(DATA+LANG+"/dep_txala/dependences.dat", parser.get_start_symbol())


process_file(input_training_file, output_training_file, [sid, tk, sp, mf, tg, sen, parser, dep])
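
process_file is not defined in this snippet. A hypothetical sketch consistent with the analyzer list it receives (the line-per-input loop and the output format are assumptions):

def process_file(in_path, out_path, analyzers):
    sid, tk, sp, mf, tg, sen, parser, dep = analyzers
    with open(in_path) as fin, open(out_path, "w") as fout:
        for line in fin:
            # run the same pipeline the snippet builds above
            ls = sp.split(sid, tk.tokenize(line), True)
            ls = mf.analyze(ls)
            ls = tg.analyze(ls)
            ls = sen.analyze(ls)
            ls = parser.analyze(ls)
            ls = dep.analyze(ls)
            for s in ls:
                for w in s.get_words():
                    fout.write("%s %s %s\n" %
                               (w.get_form(), w.get_lemma(), w.get_tag()))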
Example #23
    def tag(self):

        try:
            styles = self._styles.get()
            ppf = self._ppf.get()
            only_completes = self._only_completes.get() == 1
            webanno = self._webanno.get() == 1
        except:
            messagebox.showerror(
                title="Ungültige Eingabe",
                message=
                """Bitte überprüfe, dass es sich bei deiner Eingabe in "Anzahl Sätze pro Datei" um eine ganzzahlige Zahl handelt."""
            )
            return None

        self._info.set("Starting...")
        self.root.update()

        # headers for the tsv
        if webanno:
            metadata_header = "webanno.custom.Metadata | Metadatavalue"
            lemma_header = "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma | value"
            pos_header = "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS | PosValue"
            new_pos_header = "webanno.custom.NewPOS | SavePOSValue"
            morpho_header = "webanno.custom.Morpho | MorphoValue"
            comment_header = "webanno.custom.Comments | Commentvalue"
            dep_header = "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency | DependencyType | AttachTo=de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS"
            hashtag = " # "

        # this needs to point to the freeling install directory
        FREELINGDIR = "/usr/local"
        DATA = FREELINGDIR + "/share/freeling/"
        LANG = "es"
        PATH = DATA + LANG + "/"

        freeling.util_init_locale("default")

        # create tokenizer and splitter
        tk = freeling.tokenizer(PATH + "tokenizer.dat")
        sp = freeling.splitter("RoSeData/no_splitter.dat")
        # a splitter is necessary for the process, but our data is already
        # split; no_splitter.dat tells the splitter to never split
        sid = sp.open_session()

        # create options set for maco analyzer. Default values are Ok, except for data files.
        op = freeling.maco_options("es")
        op.UserMapFile = ""
        op.LocutionsFile = PATH + "locucions.dat"
        op.AffixFile = PATH + "afixos.dat"
        op.ProbabilityFile = PATH + "probabilitats.dat"
        op.DictionaryFile = PATH + "dicc.src"
        op.NPdataFile = PATH + "np.dat"
        op.PunctuationFile = PATH + "../common/punct.dat"

        mf = freeling.maco(op)

        # activate morpho modules to be used in next call
        mf.set_active_options(
            False,  # UserMap
            True,   # NumbersDetection
            True,   # PunctuationDetection
            True,   # DatesDetection
            True,   # DictionarySearch
            True,   # AffixAnalysis
            False,  # CompoundAnalysis
            True,   # RetokContractions
            True,   # MultiwordsDetection
            True,   # NERecognition
            False,  # QuantitiesDetection
            True)   # ProbabilityAssignment

        # create tagger
        self._info.set("Generiere Tagger...")
        self.root.update()
        tg = freeling.hmm_tagger(PATH + "tagger.dat", True, 2)

        # create sense annotator and disambiguator
        self._info.set("Generiere sense disambiguator...")
        self.root.update()
        sen = freeling.senses(PATH + "senses.dat")
        wsd = freeling.ukb(PATH + "ukb.dat")

        # create parser
        self._info.set("Generiere dependency parser...")
        self.root.update()
        parser = freeling.dep_treeler(PATH + "dep_treeler/dependences.dat")

        # keep track of how many sentences were counted
        sent_counter = 0

        # keep track of documents created
        doc_counter = 0

        webanno_sent_counter = 0

        outputter = freeling.output_conll()

        # Write headers
        outf = open("output/" + self._outF.get() + ".xml",
                    encoding='utf-8',
                    mode='w')
        outf.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n")
        outf.write("<corpus>\n")
        # Start Tagging Process
        try:
            iterate_docs = ET.iterparse(self._indir.get(),
                                        events=("end", ),
                                        tag="document")
        except:
            messagebox.showerror(
                title="Ungültiger Dateipfad",
                message=
                "Unter dem angegebenen Dateipfad konnte keine XMl-Datei gefunden werden."
            )
            self._info.set("Process stopped.")
            self.root.update()
            return None
        for action, doc in iterate_docs:  # iterate all fileElems
            if True:  # filter in case you only want certain docs
                self._info.set("Dokument {} wird bearbeitet...".format(
                    doc.attrib["file"]))
                self.root.update()
                # filter out all unwanted phrases
                if styles == 'all' and only_completes:
                    phrases = doc.xpath('phrase[contains(@complete,"yes")]')
                elif styles == 'all':
                    phrases = doc.xpath('phrase')
                elif only_completes:
                    phrases = doc.xpath(
                        'phrase[contains(@complete,"yes") and contains(@style,"'
                        + styles + '")]')
                else:
                    phrases = doc.xpath('phrase[contains(@style,"' + styles +
                                        '")]')
                for phrase in phrases:
                    phrasetext = phrase.text
                    lw = tk.tokenize(phrasetext)
                    ls = sp.split(sid, lw, True)
                    ls = mf.analyze(ls)
                    ls = tg.analyze(ls)
                    ls = sen.analyze(ls)
                    wsdis = wsd.analyze(ls)
                    dep = parser.analyze(wsdis)
                    if webanno:
                        # open a new tsv file if number of phrases is reached
                        if sent_counter % ppf == 0:
                            if doc_counter != 0:
                                conllout.close()
                            doc_counter += 1
                            conllout = open(self._outF.get() + '-' +
                                            str(doc_counter) + '.tsv',
                                            encoding='utf-8',
                                            mode='w')
                            tsvwriter = csv.writer(conllout, delimiter='\t')
                            # implement headers
                            tsvwriter.writerow([
                                hashtag + metadata_header + hashtag +
                                lemma_header + hashtag + pos_header + hashtag +
                                new_pos_header + hashtag + morpho_header +
                                hashtag + comment_header + hashtag + dep_header
                            ])
                            webanno_sent_counter = 0
                        if webanno_sent_counter != 0:
                            tsvwriter.writerow([])
                        tsvwriter.writerow(
                            ["#id=" + str(webanno_sent_counter)])
                    word_counter = 1
                    sent_counter += 1
                    self._info2.set(
                        str(sent_counter) + " Sätze wurden analysiert!")
                    self.root.update()
                    conllstr = outputter.PrintResults(dep)
                    tokens_in_sent = conllstr.splitlines()

                    # a clunky way to get the treedata
                    depdict = {}
                    for token in tokens_in_sent:
                        if len(token) > 1:
                            elements = token.split()
                            depdict[elements[0]] = [
                                elements[1], elements[9], elements[10]
                            ]

                    for sentence in ls:
                        sent_all_info = []  #only needed for the AfterFilter

                        for word in sentence.get_words():
                            dictentry = depdict[str(word_counter)]
                            if dictentry[0] != word.get_form():
                                print(
                                    "An error occured! Please check this phrase:",
                                    phrasetext)
                            if dictentry[1] == "0":
                                dictentry[1] = str(word_counter)
                            # we give the metadata to the phrase by storing it as a layer in the first token
                            if word_counter == 1:
                                doc = phrase.getparent()
                                docname = doc.attrib["file"]
                                webanno_metadata = os.path.basename(
                                    self._indir.get()
                                ) + ", " + docname + ", " + phrase.attrib["id"]
                            else:
                                webanno_metadata = "_"
                            tokenElem = ET.SubElement(phrase,
                                                      'token',
                                                      id=str(word_counter),
                                                      lemma=word.get_lemma(),
                                                      pos=word.get_tag(),
                                                      dep_tag=dictentry[2],
                                                      dep_parent=dictentry[1])
                            tokenElem.text = word.get_form()
                            if webanno:
                                #save all info as a tuple similar to webanno/conll-Format
                                all_info = (word.get_form(), webanno_metadata,
                                            word.get_lemma(), word.get_tag(),
                                            dictentry[2], dictentry[1])
                                sent_all_info.append(all_info)
                            word_counter += 1

                        if webanno:
                            allowed = self._AfterFilter(
                                sent_all_info)  #filter the phrases
                            if allowed:
                                webanno_sent_counter += 1
                                this_word_counter = 1
                                # finally write the phrases to the tsv
                                for element in sent_all_info:
                                    tsvwriter.writerow([
                                        str(webanno_sent_counter) + "-" +
                                        str(this_word_counter), element[0],
                                        element[1], element[2], element[3],
                                        "_", "_", "O", element[4],
                                        str(webanno_sent_counter) + "-" +
                                        element[5]
                                    ])
                                    this_word_counter += 1
                # write docElem
                docString = ET.tostring(doc,
                                        encoding='unicode',
                                        pretty_print=True)
                outf.write(docString)
            doc.clear()
            # Also eliminate now-empty references from the root node to elem
            for ancestor in doc.xpath('ancestor-or-self::*'):
                while ancestor.getprevious() is not None:
                    del ancestor.getparent()[0]
            doc.getparent().remove(doc)

        outf.write("</corpus>")
        outf.close()
        del iterate_docs

        if webanno:
            conllout.close()

        sp.close_session(sid)

        self._info.set("Tagging erfolgreich beendet.")
        self.root.update()