Code example #1
    def eval(self, raw_text_file, gold_conllu_file):
        input_string = ""
        useSpaces = " "  # separator appended after each raw line; cleared below for space-free scripts
        lines = []

        with fopen(raw_text_file, "r") as file:
            lines = file.readlines()
            
        # analyze use of spaces in first part of the file
        test = ""
        cnt = 0
        while True:
            test = test + lines[cnt]
            # print(lines[cnt])
            cnt += 1
            if cnt >= len(lines) or cnt > 5:
                break

        if float(test.count(' ')) / float(len(test)) < 0.02:
            useSpaces = ""
        # print (str(float(test.count(' '))/float(len(test))))

        i = -1
        input_string = ""
        sentences = []
        while i < len(lines) - 1:
            i += 1
            input_string = input_string + lines[i].replace("\r", "").replace("\n", "").strip() + useSpaces
            if lines[i].strip() == "" or i == len(lines) - 1:  # end of block
                if input_string.strip() != "":
                    sentences += self.tokenizer.tokenize(input_string)
                input_string = ""

        with fopen(self.tokenizer.config.base + "-temporary.conllu", 'w') as file:
            for sentence in sentences:
                # print ("Sentence has entries: "+str(len(sentence)))
                for entry in sentence:
                    line = "\t".join([str(entry.index), entry.word, entry.lemma, entry.upos,
                                      entry.xpos, entry.attrs, str(entry.head), entry.label,
                                      entry.deps, entry.space_after]) + "\n"
                    file.write(line)

                file.write("\n")

        # run eval script
        metrics = conll_eval(self.tokenizer.config.base + "-temporary.conllu", gold_conllu_file)

        return metrics["Tokens"].f1 * 100., metrics["Sentences"].f1 * 100.
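
Every example on this page goes through a fopen helper rather than the built-in open. Its definition is not shown here; purely as a point of reference, a minimal Python 2/3-compatible wrapper with the same call shape (a mode plus an optional encoding, binary modes passed through untouched) might look like the sketch below. This is an assumption made for readability, not NLP-Cube's actual implementation.

# Hypothetical sketch of an fopen-style helper; the project's real io utility may differ.
import codecs
import sys

def fopen(filename, mode="r", encoding="utf-8"):
    if "b" in mode:
        # binary modes need no text decoding on either Python version
        return open(filename, mode)
    if sys.version_info[0] == 2:
        return codecs.open(filename, mode, encoding=encoding)
    return open(filename, mode, encoding=encoding)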
Code example #2
def parse_test(params):
    if params.test == "parser":
        print ("Running " + params.test)
        print ("==PARAMETERS==")
        print ("EMBEDDINGS: " + params.embeddings)
        print ("MODEL FILE: " + params.model_base)
        print ("DECODER: " + params.decoder)
        print ("OUTPUT: " + params.output_file)
        print ("CONFIG FILE: " + str(params.config))
        print ("==============\n")

        testset = Dataset(params.test_file)
        encodings = Encodings()
        encodings.load(params.model_base + ".encodings")
        encodings.update_wordlist(testset)
        print ("Updated word list: " + str(len(encodings.word_list)))
        config = ParserConfig(filename=params.config)
        embeddings = WordEmbeddings()
        embeddings.read_from_file(params.embeddings, encodings.word_list)
        parser = BDRNNParser(config, encodings, embeddings)
        parser.load(params.model_base + ".bestUAS")
        if params.decoder == 'mst':
            print ("!!!!!!!!!!!!!!!!!!!!!!!!!USING MST DECODER")
            from graph.decoders import MSTDecoder
            parser.decoder = MSTDecoder()
        f = fopen(params.output_file, "w")
        last_proc = 0
        index = 0
        for seq in testset.sequences:
            index += 1
            # int() keeps the 5%-step progress report working under Python 3's true division
            proc = int(index * 100 / len(testset.sequences))
            if proc % 5 == 0 and proc != last_proc:
                last_proc = proc
                sys.stdout.write(" " + str(proc))
                sys.stdout.flush()

            rez = parser.tag(seq)
            iSeq = 0
            iRez = 0
            while iSeq < len(seq):
                while seq[iSeq].is_compound_entry:
                    iSeq += 1
                seq[iSeq].xpos = rez[iRez].xpos
                seq[iSeq].upos = rez[iRez].upos
                seq[iSeq].attrs = rez[iRez].attrs
                seq[iSeq].head = rez[iRez].head
                seq[iSeq].label = rez[iRez].label
                seq[iSeq].lemma = rez[iRez].lemma
                iSeq += 1
                iRez += 1

            for entry in seq:
                f.write("\t".join([str(entry.index), str(entry.word), str(entry.lemma),
                                   str(entry.upos), str(entry.xpos), str(entry.attrs),
                                   str(entry.head), str(entry.label), str(entry.deps),
                                   str(entry.space_after)]) + "\n")
            f.write("\n")

        f.close()
        sys.stdout.write("\n")
Code example #3
File: encodings.py Project: silviupanaite/NLP-Cube
    def save(self, filename):    
        f = fopen(filename, "w")
        f.write("LABELS " + str(len(self.label2int)) + "\n")
        for label in self.label2int:
            f.write(str(label) + "\t" + str(self.label2int[label]) + "\n")
        f.write("CHARACTERS " + str(len(self.char2int)) + "\n")
        for character in self.char2int:
            if sys.version_info[0] == 2:
                f.write(character.encode('utf-8') + "\t" + str(self.char2int[character]) + "\n")
            else:
                f.write(character + "\t" + str(self.char2int[character]) + "\n")
        f.write("WORDS " + str(len(self.word2int)) + "\n")
        for word in self.word2int:
            if sys.version_info[0] == 2:
                f.write(word.encode('utf-8') + "\t" + str(self.word2int[word]) + "\n")
            else:
                f.write(word + "\t" + str(self.word2int[word]) + "\n")

        f.write("UPOS " + str(len(self.upos2int)) + "\n")
        for label in self.upos2int:
            f.write(label + "\t" + str(self.upos2int[label]) + "\n")
        f.write("XPOS " + str(len(self.xpos2int)) + "\n")
        for label in self.xpos2int:
            f.write(label + "\t" + str(self.xpos2int[label]) + "\n")
        f.write("ATTRS " + str(len(self.attrs2int)) + "\n")
        for label in self.attrs2int:
            f.write(label + "\t" + str(self.attrs2int[label]) + "\n")
        f.close()
Code example #4
 def write(self, filename):
     with fopen(filename, 'w') as file:
         for sequence in self.sequences:
             for entry in sequence:
                 file.write(str(entry.index))
                 file.write("\t")
                 if isinstance(entry.word, str):
                     file.write(entry.word)
                 else:
                     file.write(entry.word.encode('utf-8'))
                 file.write("\t")
                 if isinstance(entry.lemma, str):
                     file.write(entry.lemma)
                 else:
                     file.write(entry.lemma.encode('utf-8'))
                 file.write("\t")
                 file.write(entry.upos)
                 file.write("\t")
                 file.write(entry.xpos)
                 file.write("\t")
                 file.write(entry.attrs)
                 file.write("\t")
                 file.write(str(entry.head))
                 file.write("\t")
                 file.write(entry.label)
                 file.write("\t")
                 file.write(entry.deps)
                 file.write("\t")
                 file.write(entry.space_after)
                 file.write("\n")
             file.write("\n")
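
The ten tab-separated fields written per entry line up with the CoNLL-U columns ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS and MISC, with attrs, label and space_after filling the FEATS, DEPREL and MISC slots. For illustration only, a single output row could look like this:

# Illustrative row only, shaped like the output of the write() loop above
example_row = "1\tDer\tder\tDET\tART\tCase=Nom\t2\tdet\t_\t_\n"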
Code example #5
    def version_to_donwload(self, lang_code, check_for_latest=True):
        """
        Returns the version of the language models that need to be downloaded,
        or None if there's nothing to be done.
        """
        lang_models = os.path.join(self.disk_path, lang_code)
        lang_models_version = os.path.join(lang_models, 'VERSION')

        # Get current version (if any).
        current_version = None
        if os.path.exists(lang_models):
            with fopen(lang_models_version) as fd:
                current_version = fd.read().strip('\n')

        # Get the latest version.
        latest_versions = self.get_latest_model_versions()
        latest_version = latest_versions.get(lang_code)

        if check_for_latest:
            if not latest_version:
                if not current_version:
                    raise ValueError(
                        'No remote version found for {}!'.format(lang_code))

                print('No remote version found for {}, using the local '
                      'version {}'.format(lang_code, current_version))
                return

            if current_version and current_version >= latest_version:
                return

            return latest_version

        if not current_version:
            return latest_version
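
Going by the docstring alone, a caller treats None as "nothing to download" and any other return value as the version to fetch. A hypothetical sketch of that decision (store and download_model are stand-ins, not the project's actual API; the method name is kept exactly as it appears in the source):

# Hypothetical caller; `store` is whatever object exposes version_to_donwload,
# and download_model is an invented stand-in for the real download step.
version = store.version_to_donwload("en", check_for_latest=True)
if version is not None:
    store.download_model("en", version)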
Code example #6
    def __init__(self, file=None):
        if file is not None:
            sys.stdout.write("Reading " + file + "... ")
            sys.stdout.flush()
            with fopen(file, "r") as f:
                lines = f.readlines()

            self.sequences = self._make_sequences(lines)
            sys.stdout.write("found " + str(len(self.sequences)) +
                             " sequences\n")
Code example #7
    def _download_facebook_embeddings(self, lang_code):
        """
        Download Facebook embeddings for the provided lang_code.
        """
        name = self.EMBEDDINGS_NAME.format(lang_code)
        embeddings_url = self.FACEBOOK_EMBEDDINGS_URL + name
        embeddings_path = os.path.join(self.disk_path, lang_code, name)

        request = requests.get(embeddings_url)
        with fopen(embeddings_path, 'wb') as fd:
            fd.write(request.content)
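
Note that requests.get here buffers the whole embeddings file in memory before it is written out. For large files a streamed download, as code example #12 below does with iter_content, keeps memory use flat. A minimal standalone sketch of that pattern:

import requests

def download_to_file(url, path, chunk_size=4096 * 16):
    # stream the response to disk chunk by chunk instead of holding it all in memory
    r = requests.get(url, stream=True)
    with open(path, "wb") as fd:
        for chunk in r.iter_content(chunk_size):
            if chunk:
                fd.write(chunk)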
Code example #8
File: embeddings.py Project: silviupanaite/NLP-Cube
    def read_from_file(self, word_embeddings_file, word_list, full_load=False):
        self.word2vec = {}
        self.num_embeddings = 0
        if word_list is None and not full_load:
            self.cache_only = True
        f = fopen(word_embeddings_file, "r")
        first_line = True
        while True:
            ofs = f.tell()
            line = f.readline()
            if line == '':
                break
                # print ofs
            line = line.replace("\n", "").replace("\r", "")
            if first_line:
                first_line = False
            else:
                self.num_embeddings += 1
                if self.verbose:
                    if self.num_embeddings % 10000 == 0:
                        sys.stdout.write("  Scanned " +
                                         str(self.num_embeddings) +
                                         " word embeddings and added " +
                                         str(len(self.word2vec)) + "  \n")
                parts = line.split(" ")
                if sys.version_info[0] == 2:
                    word = parts[0].decode('utf-8')
                else:
                    word = parts[0]
                if self.cache_only:
                    self.word2ofs[word] = ofs
                elif full_load or word in word_list:
                    embeddings = [float(0)] * (len(parts) - 2)

                    for zz in range(len(parts) - 2):
                        embeddings[zz] = float(parts[zz + 1])
                    self.word2vec[word] = embeddings
                self.word_embeddings_size = len(parts) - 2
        f.close()
        if self.cache_only:
            self.file_pointer = fopen(word_embeddings_file, "r")
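
In cache-only mode the method records only file offsets in word2ofs and keeps a second handle in file_pointer, so individual vectors can be read on demand later. How that lookup is exposed is not shown in this excerpt; a hypothetical accessor consistent with the fields above might read:

# Hypothetical on-demand lookup using the offsets recorded in cache-only mode;
# the project's real accessor may be named and structured differently.
def get_vector(self, word):
    if word in self.word2vec:
        return self.word2vec[word]
    if self.cache_only and word in self.word2ofs:
        self.file_pointer.seek(self.word2ofs[word])
        parts = self.file_pointer.readline().replace("\n", "").replace("\r", "").split(" ")
        return [float(p) for p in parts[1:1 + self.word_embeddings_size]]
    return None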
Code example #9
File: model_store.py Project: silviupanaite/NLP-Cube
 def save(self, filename):
     assert (filename.endswith("metadata.json"))
     obj = {}
     obj["language"] = self.language
     obj["language_code"] = self.language_code
     obj["model_version"] = self.model_version
     obj["embeddings_remote_link"] = self.embeddings_remote_link
     obj["embeddings_file_name"] = self.embeddings_file_name
     obj["token_delimiter"] = self.token_delimiter
     obj["model_build_date"] = self.model_build_date
     obj["model_build_source"] = self.model_build_source
     obj["notes"] = self.notes
     # use a context manager so the metadata file is flushed and closed deterministically
     with fopen(filename, "w") as f:
         json.dump(obj, f, indent=4, sort_keys=True)
Code example #10
File: model_store.py Project: silviupanaite/NLP-Cube
 def read(self, filename):
     assert (filename.endswith("metadata.json"))
     with fopen(filename, "r") as f:
         data = json.load(f)
     if sys.version_info[0] == 2:
         items = data.iteritems()
     else:
         items = data.items()
     for key, value in items:
         if key == "model_version":  # safety check: always store the version as a float
             self.__dict__[key] = float(value)
         else:
             self.__dict__[key] = value
Code example #11
File: lemmatizers.py Project: silviupanaite/NLP-Cube
 def load_dict(self, path):
     #print ("Loading lemma dictionary")
     with fopen(path, "r") as f:
         lines = f.readlines()
         for line in lines:
             parts = line.strip().split('\t')
             if len(parts) == 5:
                 if sys.version_info[0] == 2:
                     word = unicode(parts[0],
                                    'utf-8').lower().encode('utf-8')
                 else:
                     word = parts[0].lower()
                 upos = parts[1]
                 key = word + '\t' + upos
                 self.word2lemma[key] = parts[4]
Code example #12
File: model_store.py Project: silviupanaite/NLP-Cube
 def _download_with_progress_bar(self, url, local_filename):
     r = requests.get(url, stream=True)
     total_size = int(r.headers['Content-Length'].strip())
     current_size = 0
     #request_content = []
     f = fopen(local_filename, 'wb')
     for buf in r.iter_content(4096 * 16):
         if buf:
             #request_content.append(buf)
             f.write(buf)
             current_size += len(buf)
             done = int(40 * current_size / total_size)
             sys.stdout.write(
                 "\r[%s%s] %3.1f%%, downloading %.2f/%.2f MB ..." %
                 ('=' * done, ' ' *
                  (40 - done), 100 * current_size / total_size,
                  current_size / 1024 / 1024, total_size / 1024 / 1024))
             sys.stdout.flush()
     #return b"".join(request_content)
     f.close()
Code example #13
    def eval(self, dataset, filename=None):
        total_bleu = 0.0
        last_proc = 0
        iSeq = 0
        if filename is not None:
            f = fopen(filename, "w", encoding="utf-8")

        for seq in dataset.sequences:
            proc = int((iSeq + 1) * 100 / len(dataset.sequences))
            if proc % 5 == 0 and proc != last_proc:
                last_proc = proc
                sys.stdout.write(" " + str(proc))
                sys.stdout.flush()
            iSeq += 1

            hyp = self.translator.translate(seq.src)
            ref = [entry.word for entry in seq.dst]
            hyp = list(hyp)
            ref = list(ref)
            # print "hyp=",hyp
            # print "ref=",ref
            # print "\n\n\n\n"
            # sys.stdout.flush()
            if filename is not None:
                for entry in seq.src:
                    f.write(entry.word + " ")
                f.write("\n")
                for entry in seq.dst:
                    f.write(entry.word + " ")
                f.write("\n")
                for word in hyp:
                    # the file was opened with encoding="utf-8", so write the text directly
                    # (writing encoded bytes would mix bytes and str on Python 3)
                    f.write(word + " ")
                f.write("\n\n")

            if len(ref) >= 4 and len(hyp) >= 4:
                score = nltk.translate.bleu_score.sentence_bleu([ref], hyp)
                total_bleu += score
        if filename is not None:
            f.close()
        return total_bleu / len(dataset.sequences)
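
For reference, NLTK's sentence_bleu takes a list of reference token lists plus a single hypothesis token list, which is exactly the shape of ref and hyp above. A self-contained call, with made-up tokens:

from nltk.translate.bleu_score import sentence_bleu

ref = ["the", "cat", "sat", "on", "the", "mat"]
hyp = ["the", "cat", "is", "on", "the", "mat"]
score = sentence_bleu([ref], hyp)  # one list of reference token lists, one hypothesis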
Code example #14
File: config.py Project: silviupanaite/NLP-Cube
 def save(self, filename):
     """Save configuration to file."""
     sorted_dict = collections.OrderedDict(sorted(self.__dict__.items()))  # sort dictionary
     if sys.version_info[0] == 2:
         config = ConfigParser.ConfigParser()
     else:
         config = configparser.ConfigParser()
     config.add_section(self.__config__)  # write header
     if sys.version_info[0] == 2:
         items = sorted_dict.iteritems()
     else:
         items = sorted_dict.items()
     for k, v in items:
         if not k.startswith("_"):  # write only non-private properties
             if isinstance(v, float):  # if we are dealing with a float
                 str_v = str(v)
                 if "e" not in str_v and "." not in str_v:  # stop possible confusion with an int by appending a ".0"
                     v = str_v + ".0"
             v = str(v)
             config.set(self.__config__, k, v)
     with fopen(filename, 'w') as cfgfile:
         config.write(cfgfile)
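
The ".0" suffix matters because ConfigParser stores every value as a string, so the type has to be recovered by parsing when the file is read back; "2" would come back as an int while "2.0" stays a float. A standalone illustration of that round trip (not NLP-Cube's loader):

# Illustration only: recovering a number's type from its ConfigParser string form.
def parse_number(raw):
    return float(raw) if ("." in raw or "e" in raw) else int(raw)

print(type(parse_number("2")))    # <class 'int'>
print(type(parse_number("2.0")))  # <class 'float'>, thanks to the appended ".0"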
Code example #15
File: encodings.py Project: silviupanaite/NLP-Cube
    def load(self, filename):
        # We only read character2int, labels, holistic words and label2int here. word_list should be recomputed for every dataset (if deemed necessary)
        with fopen(filename, "r") as f:
            line = f.readline()

            num_labels = int(line.split(" ")[1])
            if self.verbose:
                print ("Loading labels " + str(num_labels))
            self.labels = [""] * num_labels
            for _ in range(num_labels):
                line = f.readline()
                parts = line.split("\t")
                key = parts[0]
                value = int(parts[1])
                self.label2int[key] = value
                self.labels[value] = key

            line = f.readline()
            num_characters = int(line.split(" ")[1])
            self.characters = [""] * num_characters
            if self.verbose:
                print ("Loading characters " + str(num_characters))
            for _ in range(num_characters):
                line = f.readline()
                parts = line.split("\t")
                if sys.version_info[0] == 2:
                    key = parts[0].decode('utf-8')
                else:
                    key = parts[0]
                value = int(parts[1])
                self.char2int[key] = value
                self.characters[value] = key
            line = f.readline()
            num_words = int(line.split(" ")[1])
            if self.verbose:
                print ("Loading words " + str(num_words))
            for _x in range(num_words):
                line = f.readline()
                parts = line.split("\t")
                if sys.version_info[0] == 2:
                    key = parts[0].decode('utf-8')
                else:
                    key = parts[0]
                value = int(parts[1])
                self.word2int[key] = value

            # morphological attributes
            line = f.readline()
            num_labels = int(line.split(" ")[1])
            if self.verbose:
                print ("Loading upos " + str(num_labels))
            self.upos_list = [""] * num_labels
            for _ in range(num_labels):
                line = f.readline()
                parts = line.split("\t")
                key = parts[0]
                value = int(parts[1])
                self.upos2int[key] = value
                self.upos_list[value] = key

            line = f.readline()
            num_labels = int(line.split(" ")[1])
            self.xpos_list = [""] * num_labels
            if self.verbose:
                print ("Loading xpos " + str(num_labels))
            for _ in range(num_labels):
                line = f.readline()
                parts = line.split("\t")
                key = parts[0]
                value = int(parts[1])
                self.xpos2int[key] = value
                self.xpos_list[value] = key

            line = f.readline()
            num_labels = int(line.split(" ")[1])
            self.attrs_list = [""] * num_labels
            if self.verbose:
                print ("Loading attrs " + str(num_labels))
            for _ in range(num_labels):
                line = f.readline()
                parts = line.split("\t")
                key = parts[0]
                value = int(parts[1])
                self.attrs2int[key] = value
                self.attrs_list[value] = key
Code example #16
File: main.py Project: adobe/NLP-Cube
def parse_run(params):
    sys.stdout.write("\nINPUT FILE: " + params.input_file)
    sys.stdout.write("\nOUTPUT FILE: " + params.output_file)
    sys.stdout.write("\nMODELS FILE: " + params.models + "\n")
    sys.stdout.flush()

    components = params.run.split(",")
    tokenize = "tokenizer" in components
    compound = "compound" in components
    lemmatize = "lemmatizer" in components
    tag = "tagger" in components
    parse = "parser" in components

    # common elements load
    sys.stdout.write("\nLoading embeddings : " + params.embeddings + " ...\n")
    embeddings = WordEmbeddings()
    embeddings.read_from_file(params.embeddings, None)

    encodings = None
    if tokenize:
        if not os.path.isfile(
                os.path.join(params.models, "tokenizer-tok.bestAcc")):
            sys.stdout.write(
                "\n\tTokenizer model not found! (" +
                os.path.join(params.models, "tokenizer-tok.bestAcc") + ")")
            sys.stdout.flush()
            sys.exit(1)
        sys.stdout.write("\n\tTokenization enabled.\n")
        tokenizer_encodings = Encodings(verbose=False)
        tokenizer_encodings.load(
            os.path.join(params.models, "tokenizer.encodings"))
    if compound:
        if not os.path.isfile(os.path.join(params.models, "compound.bestAcc")):
            sys.stdout.write("\n\tCompound word expander model not found!")
            sys.stdout.flush()
            sys.exit(1)
        sys.stdout.write("\n\tCompound word expander enabled.\n")
        if encodings is None:
            encodings = Encodings(verbose=False)
            encodings.load(os.path.join(params.models, "lemmatizer.encodings"))
    if lemmatize:
        if not os.path.isfile(os.path.join(
                params.models, "lemmatizer.bestACC")) and not os.path.isfile(
                    os.path.join(params.models, "lemmatizer.bestAcc")):
            sys.stdout.write("\n\tLemmatization model not found!")
            sys.stdout.flush()
            sys.exit(1)
        sys.stdout.write("\n\tLemmatization enabled.\n")
        if encodings is None:
            encodings = Encodings(verbose=False)
            encodings.load(os.path.join(params.models, "lemmatizer.encodings"))
    if tag:
        if not os.path.isfile(os.path.join(params.models,
                                           "tagger.bestOVERALL")):
            sys.stdout.write("\n\tTagger model not found!")
            sys.stdout.flush()
            sys.exit(1)
        sys.stdout.write("\n\tTagger enabled.\n")
        if encodings is None:
            encodings = Encodings(verbose=False)
            encodings.load(os.path.join(params.models, "tagger.encodings"))
    if parse:
        if not os.path.isfile(os.path.join(params.models, "parser.bestUAS")):
            sys.stdout.write("\n\tParser model not found!")
            sys.stdout.flush()
            sys.exit(1)
        sys.stdout.write("\n\tParser enabled.\n")
        if encodings is None:
            encodings = Encodings(verbose=False)
            encodings.load(os.path.join(params.models, "parser.encodings"))

    sequences = None
    if tokenize:
        sys.stdout.write("\nTokenizing " + params.input_file + " ... \n\t")
        sys.stdout.flush()

        from io_utils.config import TieredTokenizerConfig
        from generic_networks.tokenizers import TieredTokenizer
        config = TieredTokenizerConfig(
            os.path.join(params.models, "tokenizer.conf"))
        tokenizer_object = TieredTokenizer(config,
                                           tokenizer_encodings,
                                           embeddings,
                                           runtime=True)
        tokenizer_object.load(os.path.join(params.models, "tokenizer"))

        with fopen(params.input_file, 'r') as file:
            lines = file.readlines()
        # analyze use of spaces in first part of the file
        test = ""
        useSpaces = " "
        cnt = 0
        while True:
            test = test + lines[cnt]
            # print(lines[cnt])
            # advance the counter before the bounds check so lines[cnt] never goes out of range
            cnt += 1
            if cnt >= len(lines) or cnt > 5:
                break
        if float(test.count(' ')) / float(len(test)) < 0.02:
            useSpaces = ""
        # print (str(float(test.count(' '))/float(len(test))))
        i = -1
        input_string = ""
        sequences = []
        while i < len(lines) - 1:
            i += 1
            input_string = input_string + lines[i].replace("\r", "").replace(
                "\n", "").strip() + useSpaces
            if lines[i].strip() == "" or i == len(lines) - 1:  # end of block
                if input_string.strip() != "":
                    sequences += tokenizer_object.tokenize(input_string)
                input_string = ""

        del tokenizer_object  # free memory
    else:
        ds = Dataset(params.input_file)
        sequences = ds.sequences
    sys.stdout.write(" done\n")
    sys.stdout.flush()

    if compound:
        sys.stdout.write("\nCompound word expanding " + params.input_file +
                         " ... \n\t")
        sys.stdout.flush()
        from generic_networks.token_expanders import CompoundWordExpander
        from io_utils.config import CompoundWordConfig
        config = CompoundWordConfig(
            os.path.join(params.models, "compound.conf"))
        compoundwordexpander_object = CompoundWordExpander(config,
                                                           encodings,
                                                           embeddings,
                                                           runtime=True)
        compoundwordexpander_object.load(
            os.path.join(params.models, "compound.bestAcc"))
        sequences = compoundwordexpander_object.expand_sequences(sequences)
        del compoundwordexpander_object  # free memory
        sys.stdout.write(" done\n")
        sys.stdout.flush()

    if parse:
        sys.stdout.write("\nParsing " + params.input_file + " ... \n\t")
        sys.stdout.flush()
        from io_utils.config import ParserConfig
        from generic_networks.parsers import BDRNNParser
        config = ParserConfig(os.path.join(params.models, "parser.conf"))
        parser_object = BDRNNParser(config,
                                    encodings,
                                    embeddings,
                                    runtime=True)
        parser_object.load(os.path.join(params.models, "parser.bestUAS"))
        sequences = parser_object.parse_sequences(sequences)
        del parser_object  # free memory
        sys.stdout.write(" done\n")
        sys.stdout.flush()

    if tag:
        sys.stdout.write("\nTagging " + params.input_file + " ... \n\t")
        sys.stdout.flush()
        from io_utils.config import TaggerConfig
        from generic_networks.taggers import BDRNNTagger
        config = TaggerConfig(os.path.join(params.models, "tagger.conf"))
        tagger_object_UPOS = BDRNNTagger(config,
                                         encodings,
                                         embeddings,
                                         runtime=True)
        tagger_object_UPOS.load(os.path.join(params.models, "tagger.bestUPOS"))
        tagger_object_XPOS = BDRNNTagger(config,
                                         encodings,
                                         embeddings,
                                         runtime=True)
        tagger_object_XPOS.load(os.path.join(params.models, "tagger.bestXPOS"))
        tagger_object_ATTRS = BDRNNTagger(config,
                                          encodings,
                                          embeddings,
                                          runtime=True)
        tagger_object_ATTRS.load(
            os.path.join(params.models, "tagger.bestATTRS"))

        new_sequences = []
        for sequence in sequences:
            new_sequence = copy.deepcopy(sequence)
            predicted_tags_UPOS = tagger_object_UPOS.tag(new_sequence)
            predicted_tags_XPOS = tagger_object_XPOS.tag(new_sequence)
            predicted_tags_ATTRS = tagger_object_ATTRS.tag(new_sequence)
            for entryIndex in range(len(sequence)):
                new_sequence[entryIndex].upos = predicted_tags_UPOS[
                    entryIndex][0]
                new_sequence[entryIndex].xpos = predicted_tags_XPOS[
                    entryIndex][1]
                new_sequence[entryIndex].attrs = predicted_tags_ATTRS[
                    entryIndex][2]
            new_sequences.append(new_sequence)
        sequences = copy.deepcopy(new_sequences)
        del tagger_object_UPOS  # free memory
        del tagger_object_XPOS  # free memory
        del tagger_object_ATTRS  # free memory
        sys.stdout.write(" done\n")
        sys.stdout.flush()

    if lemmatize:
        sys.stdout.write("\nLemmatizing " + params.input_file + " ... \n\t")
        sys.stdout.flush()
        from generic_networks.lemmatizers import FSTLemmatizer
        from io_utils.config import LemmatizerConfig
        config = LemmatizerConfig(
            os.path.join(params.models, "lemmatizer.conf"))
        lemmatizer_object = FSTLemmatizer(config,
                                          encodings,
                                          embeddings,
                                          runtime=True)
        if os.path.isfile(os.path.join(params.models, "lemmatizer.bestACC")):
            lemmatizer_object.load(
                os.path.join(params.models, "lemmatizer.bestACC"))
        else:
            lemmatizer_object.load(
                os.path.join(params.models, "lemmatizer.bestAcc"))
        sequences = lemmatizer_object.lemmatize_sequences(sequences)
        del lemmatizer_object  # free memory
        sys.stdout.write(" done\n")
        sys.stdout.flush()

    output_dataset = Dataset()
    output_dataset.sequences = sequences
    output_dataset.write(params.output_file)