import copy
import os
import sys

# NOTE: the exact module paths below are assumed from the io_utils /
# generic_networks package layout referenced elsewhere in this file.
from io_utils.conll import Dataset
from io_utils.embeddings import WordEmbeddings
from io_utils.encodings import Encodings


def parse_run(params):
    sys.stdout.write("\nINPUT FILE: " + params.input_file)
    sys.stdout.write("\nOUTPUT FILE: " + params.output_file)
    sys.stdout.write("\nMODELS FILE: " + params.models + "\n")
    sys.stdout.flush()

    # Select pipeline components from the comma-separated --run value.
    components = params.run.split(",")
    tokenize = "tokenizer" in components
    compound = "compound" in components
    lemmatize = "lemmatizer" in components
    tag = "tagger" in components
    parse = "parser" in components

    # common elements load
    sys.stdout.write("\nLoading embeddings: " + params.embeddings + " ...\n")
    embeddings = WordEmbeddings()
    embeddings.read_from_file(params.embeddings, None)

    encodings = None
    if tokenize:
        if not os.path.isfile(os.path.join(params.models, "tokenizer-tok.bestAcc")):
            sys.stdout.write("\n\tTokenizer model not found! ("
                             + os.path.join(params.models, "tokenizer-tok.bestAcc") + ")")
            sys.stdout.flush()
            sys.exit(1)
        sys.stdout.write("\n\tTokenization enabled.\n")
        tokenizer_encodings = Encodings(verbose=False)
        tokenizer_encodings.load(os.path.join(params.models, "tokenizer.encodings"))

    if compound:
        if not os.path.isfile(os.path.join(params.models, "compound.bestAcc")):
            sys.stdout.write("\n\tCompound word expander model not found!")
            sys.stdout.flush()
            sys.exit(1)
        sys.stdout.write("\n\tCompound word expander enabled.\n")
        if encodings is None:
            encodings = Encodings(verbose=False)
            encodings.load(os.path.join(params.models, "lemmatizer.encodings"))

    if lemmatize:
        if not os.path.isfile(os.path.join(params.models, "lemmatizer.bestACC")):
            sys.stdout.write("\n\tLemmatization model not found!")
            sys.stdout.flush()
            sys.exit(1)
        sys.stdout.write("\n\tLemmatization enabled.\n")
        if encodings is None:
            encodings = Encodings(verbose=False)
            encodings.load(os.path.join(params.models, "lemmatizer.encodings"))

    if tag:
        if not os.path.isfile(os.path.join(params.models, "tagger.bestOVERALL")):
            sys.stdout.write("\n\tTagger model not found!")
            sys.stdout.flush()
            sys.exit(1)
        sys.stdout.write("\n\tTagger enabled.\n")
        if encodings is None:
            encodings = Encodings(verbose=False)
            encodings.load(os.path.join(params.models, "tagger.encodings"))

    if parse:
        if not os.path.isfile(os.path.join(params.models, "parser.bestUAS")):
            sys.stdout.write("\n\tParser model not found!")
            sys.stdout.flush()
            sys.exit(1)
        sys.stdout.write("\n\tParser enabled.\n")
        if encodings is None:
            encodings = Encodings(verbose=False)
            encodings.load(os.path.join(params.models, "parser.encodings"))

    sequences = None
    if tokenize:
        sys.stdout.write("\nTokenizing " + params.input_file + " ... \n\t")
        sys.stdout.flush()

        from io_utils.config import TieredTokenizerConfig
        from generic_networks.tokenizers import TieredTokenizer
        config = TieredTokenizerConfig(os.path.join(params.models, "tokenizer.conf"))
        tokenizer_object = TieredTokenizer(config, tokenizer_encodings, embeddings, runtime=True)
        tokenizer_object.load(os.path.join(params.models, "tokenizer"))

        with open(params.input_file, 'r') as file:
            lines = file.readlines()

        # Analyze use of spaces in the first few lines of the file
        # (guarding against short or empty files).
        test = ""
        useSpaces = " "
        cnt = 0
        while cnt < len(lines) and cnt <= 6:
            test = test + lines[cnt]
            cnt += 1
        if len(test) > 0 and float(test.count(' ')) / float(len(test)) < 0.02:
            useSpaces = ""

        # Concatenate lines into blocks (separated by blank lines) and tokenize each block.
        i = -1
        input_string = ""
        sequences = []
        while i < len(lines) - 1:
            i += 1
            input_string = input_string + lines[i].replace("\r", "").replace("\n", "").strip() + useSpaces
            if lines[i].strip() == "" or i == len(lines) - 1:  # end of block
                if input_string.strip() != "":
                    sequences += tokenizer_object.tokenize(input_string)
                input_string = ""
        del tokenizer_object  # free memory
    else:
        ds = Dataset(params.input_file)
        sequences = ds.sequences
    sys.stdout.write(" done\n")
    sys.stdout.flush()

    if compound:
        sys.stdout.write("\nCompound word expanding " + params.input_file + " ... \n\t")
        sys.stdout.flush()

        from generic_networks.token_expanders import CompoundWordExpander
        from io_utils.config import CompoundWordConfig
        config = CompoundWordConfig(os.path.join(params.models, "compound.conf"))
        compoundwordexpander_object = CompoundWordExpander(config, encodings, embeddings, runtime=True)
        compoundwordexpander_object.load(os.path.join(params.models, "compound.bestAcc"))
        sequences = compoundwordexpander_object.expand_sequences(sequences)
        del compoundwordexpander_object  # free memory
        sys.stdout.write(" done\n")
        sys.stdout.flush()

    if parse:
        sys.stdout.write("\nParsing " + params.input_file + " ... \n\t")
        sys.stdout.flush()

        from io_utils.config import ParserConfig
        from generic_networks.parsers import BDRNNParser
        config = ParserConfig(os.path.join(params.models, "parser.conf"))
        parser_object = BDRNNParser(config, encodings, embeddings, runtime=True)
        parser_object.load(os.path.join(params.models, "parser.bestUAS"))
        sequences = parser_object.parse_sequences(sequences)
        del parser_object  # free memory
        sys.stdout.write(" done\n")
        sys.stdout.flush()

    if tag:
        sys.stdout.write("\nTagging " + params.input_file + " ... \n\t")
        sys.stdout.flush()

        from io_utils.config import TaggerConfig
        from generic_networks.taggers import BDRNNTagger
        config = TaggerConfig(os.path.join(params.models, "tagger.conf"))

        # Three specialised taggers (UPOS, XPOS, ATTRS) share the same
        # configuration, encodings and embeddings.
        tagger_object_UPOS = BDRNNTagger(config, encodings, embeddings, runtime=True)
        tagger_object_UPOS.load(os.path.join(params.models, "tagger.bestUPOS"))
        tagger_object_XPOS = BDRNNTagger(config, encodings, embeddings, runtime=True)
        tagger_object_XPOS.load(os.path.join(params.models, "tagger.bestXPOS"))
        tagger_object_ATTRS = BDRNNTagger(config, encodings, embeddings, runtime=True)
        tagger_object_ATTRS.load(os.path.join(params.models, "tagger.bestATTRS"))

        new_sequences = []
        for sequence in sequences:
            new_sequence = copy.deepcopy(sequence)
            predicted_tags_UPOS = tagger_object_UPOS.tag(new_sequence)
            predicted_tags_XPOS = tagger_object_XPOS.tag(new_sequence)
            predicted_tags_ATTRS = tagger_object_ATTRS.tag(new_sequence)
            for entryIndex in range(len(sequence)):
                new_sequence[entryIndex].upos = predicted_tags_UPOS[entryIndex][0]
                new_sequence[entryIndex].xpos = predicted_tags_XPOS[entryIndex][1]
                new_sequence[entryIndex].attrs = predicted_tags_ATTRS[entryIndex][2]
            new_sequences.append(new_sequence)
        sequences = copy.deepcopy(new_sequences)
        del tagger_object_UPOS  # free memory
        del tagger_object_XPOS  # free memory
        del tagger_object_ATTRS  # free memory
        sys.stdout.write(" done\n")
        sys.stdout.flush()

    if lemmatize:
        sys.stdout.write("\nLemmatizing " + params.input_file + " ... \n\t")
        sys.stdout.flush()

        from generic_networks.lemmatizers import FSTLemmatizer
        from io_utils.config import LemmatizerConfig
        config = LemmatizerConfig(os.path.join(params.models, "lemmatizer.conf"))
        lemmatizer_object = FSTLemmatizer(config, encodings, embeddings, runtime=True)
        lemmatizer_object.load(os.path.join(params.models, "lemmatizer.bestACC"))
        sequences = lemmatizer_object.lemmatize_sequences(sequences)
        del lemmatizer_object  # free memory
        sys.stdout.write(" done\n")
        sys.stdout.flush()

    output_dataset = Dataset()
    output_dataset.sequences = sequences
    output_dataset.write(params.output_file)
        # Excerpt from inside a Cube pipeline method: `pipeline`, `sequences`,
        # `self.models` and `self.lemmatizer_enabled` are provided by the enclosing class.
        new_sequences = []
        for sequence in sequences:
            new_sequence = copy.deepcopy(sequence)
            # The three tagger models specialise in UPOS, XPOS and ATTRS respectively.
            predicted_tags_UPOS = self.models[PipelineComponents.TAGGER][0].tag(new_sequence)
            predicted_tags_XPOS = self.models[PipelineComponents.TAGGER][1].tag(new_sequence)
            predicted_tags_ATTRS = self.models[PipelineComponents.TAGGER][2].tag(new_sequence)
            for entryIndex in range(len(sequence)):
                new_sequence[entryIndex].upos = predicted_tags_UPOS[entryIndex][0]
                new_sequence[entryIndex].xpos = predicted_tags_XPOS[entryIndex][1]
                new_sequence[entryIndex].attrs = predicted_tags_ATTRS[entryIndex][2]
            new_sequences.append(new_sequence)
        sequences = new_sequences

        if PipelineComponents.LEMMATIZER in pipeline and self.lemmatizer_enabled:
            sequences = self.models[PipelineComponents.LEMMATIZER].lemmatize_sequences(sequences)

        return sequences


if __name__ == "__main__":
    cube = Cube()
    cube.load('ro')
    sequences = cube.process_text(text="ana are mere dar nu are pere și mănâncă miere.")
    sys.stdout.write("\n\n\n")

    from io_utils.conll import Dataset
    ds = Dataset()
    ds.sequences = sequences
    ds.write_stdout()
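# For reference while reading the excerpt above: PipelineComponents is assumed
# here to be a plain namespace of component identifiers, roughly as sketched
# below. The real definition lives in the Cube API module and may differ, so
# this sketch uses a distinct name to avoid clashing with it.
class _PipelineComponentsSketch(object):
    TOKENIZER = "tokenizer"
    COMPOUND = "compound"
    TAGGER = "tagger"
    PARSER = "parser"
    LEMMATIZER = "lemmatizer"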