Exemple #1
0
def _require_model_file(path, message):
    """Write *message* to stdout and exit with status 1 if *path* is not a file."""
    if not os.path.isfile(path):
        sys.stdout.write(message)
        sys.stdout.flush()
        sys.exit(1)


def _load_encodings(models_dir, file_name):
    """Create an Encodings object and load *file_name* from *models_dir*."""
    component_encodings = Encodings(verbose=False)
    component_encodings.load(os.path.join(models_dir, file_name))
    return component_encodings


def _detect_line_joiner(lines, sample_size=7, min_space_ratio=0.02):
    """Return the string used to glue consecutive input lines together.

    The first *sample_size* lines are inspected; when fewer than
    *min_space_ratio* of their characters are spaces, lines are joined with
    the empty string, otherwise with a single space. Inputs shorter than
    *sample_size* lines (including an empty input) are handled without
    indexing past the end of *lines* or dividing by zero.
    """
    sample = "".join(lines[:sample_size])
    if not sample:
        # Nothing to measure: keep the default single-space joiner.
        return " "
    if float(sample.count(" ")) / float(len(sample)) < min_space_ratio:
        return ""
    return " "


def parse_run(params):
    """Run the requested pipeline components over an input file.

    Attributes read from *params*:
        input_file: path of the text (or dataset) file to process.
        output_file: path the resulting dataset is written to.
        models: folder containing the model, config and encodings files.
        run: comma-separated component names; recognised values are
            "tokenizer", "compound", "lemmatizer", "tagger" and "parser".
        embeddings: path of the word-embeddings file.

    Every enabled component is checked for its model file up front; when a
    file is missing a message is printed and the process exits with status 1.
    Each component uses the encodings file that carries its own name.

    When the tokenizer is enabled the input is read as raw text and split
    into blank-line-separated blocks; otherwise it is loaded via ``Dataset``.
    Processing order is: tokenize, expand compounds, parse, tag, lemmatize.
    """
    sys.stdout.write("\nINPUT FILE: " + params.input_file)
    sys.stdout.write("\nOUTPUT FILE: " + params.output_file)
    sys.stdout.write("\nMODELS FILE: " + params.models + "\n")
    sys.stdout.flush()

    components = params.run.split(",")
    tokenize = "tokenizer" in components
    compound = "compound" in components
    lemmatize = "lemmatizer" in components
    tag = "tagger" in components
    parse = "parser" in components

    # common elements load
    sys.stdout.write("\nLoading embeddings : " + params.embeddings + " ...\n")
    embeddings = WordEmbeddings()
    embeddings.read_from_file(params.embeddings, None)

    # Validate every requested component before any model is loaded, and
    # load one encodings object per component (a shared object would feed a
    # model the encodings of whichever component happened to come first).
    if tokenize:
        tokenizer_model = os.path.join(params.models, "tokenizer-tok.bestAcc")
        _require_model_file(
            tokenizer_model,
            "\n\tTokenizer model not found! (" + tokenizer_model + ")")
        sys.stdout.write("\n\tTokenization enabled.\n")
        tokenizer_encodings = _load_encodings(params.models,
                                              "tokenizer.encodings")
    if compound:
        _require_model_file(
            os.path.join(params.models, "compound.bestAcc"),
            "\n\tCompound word expander model not found!")
        sys.stdout.write("\n\tCompound word expander enabled.\n")
        compound_encodings = _load_encodings(params.models,
                                             "compound.encodings")
    if lemmatize:
        _require_model_file(
            os.path.join(params.models, "lemmatizer.bestACC"),
            "\n\tLemmatization model not found!")
        sys.stdout.write("\n\tLemmatization enabled.\n")
        lemmatizer_encodings = _load_encodings(params.models,
                                               "lemmatizer.encodings")
    if tag:
        # NOTE(review): the check looks for "tagger.bestOVERALL" while the
        # files loaded further down are bestUPOS/bestXPOS/bestATTRS --
        # confirm which file is guaranteed to exist in a model folder.
        _require_model_file(
            os.path.join(params.models, "tagger.bestOVERALL"),
            "\n\tTagger model not found!")
        sys.stdout.write("\n\tTagger enabled.\n")
        tagger_encodings = _load_encodings(params.models, "tagger.encodings")
    if parse:
        _require_model_file(
            os.path.join(params.models, "parser.bestUAS"),
            "\n\tParser model not found!")
        sys.stdout.write("\n\tParser enabled.\n")
        parser_encodings = _load_encodings(params.models, "parser.encodings")

    if tokenize:
        sys.stdout.write("\nTokenizing " + params.input_file + " ... \n\t")
        sys.stdout.flush()

        from io_utils.config import TieredTokenizerConfig
        from generic_networks.tokenizers import TieredTokenizer
        config = TieredTokenizerConfig(
            os.path.join(params.models, "tokenizer.conf"))
        tokenizer_object = TieredTokenizer(config,
                                           tokenizer_encodings,
                                           embeddings,
                                           runtime=True)
        tokenizer_object.load(os.path.join(params.models, "tokenizer"))

        with open(params.input_file, 'r') as file:
            lines = file.readlines()

        # analyze use of spaces in first part of the file
        joiner = _detect_line_joiner(lines)

        sequences = []
        block_parts = []
        last_index = len(lines) - 1
        for index, line in enumerate(lines):
            block_parts.append(
                line.replace("\r", "").replace("\n", "").strip() + joiner)
            # A blank line or the end of the file closes the current block.
            if line.strip() == "" or index == last_index:
                block_text = "".join(block_parts)
                if block_text.strip() != "":
                    sequences += tokenizer_object.tokenize(block_text)
                block_parts = []

        del tokenizer_object  # free memory
    else:
        ds = Dataset(params.input_file)
        sequences = ds.sequences
    sys.stdout.write(" done\n")
    sys.stdout.flush()

    if compound:
        sys.stdout.write("\nCompound word expanding " + params.input_file +
                         " ... \n\t")
        sys.stdout.flush()
        from generic_networks.token_expanders import CompoundWordExpander
        from io_utils.config import CompoundWordConfig
        config = CompoundWordConfig(
            os.path.join(params.models, "compound.conf"))
        compoundwordexpander_object = CompoundWordExpander(config,
                                                           compound_encodings,
                                                           embeddings,
                                                           runtime=True)
        compoundwordexpander_object.load(
            os.path.join(params.models, "compound.bestAcc"))
        sequences = compoundwordexpander_object.expand_sequences(sequences)
        del compoundwordexpander_object  # free memory
        sys.stdout.write(" done\n")
        sys.stdout.flush()

    if parse:
        sys.stdout.write("\nParsing " + params.input_file + " ... \n\t")
        sys.stdout.flush()
        from io_utils.config import ParserConfig
        from generic_networks.parsers import BDRNNParser
        config = ParserConfig(os.path.join(params.models, "parser.conf"))
        parser_object = BDRNNParser(config,
                                    parser_encodings,
                                    embeddings,
                                    runtime=True)
        parser_object.load(os.path.join(params.models, "parser.bestUAS"))
        sequences = parser_object.parse_sequences(sequences)
        del parser_object  # free memory
        sys.stdout.write(" done\n")
        sys.stdout.flush()

    if tag:
        sys.stdout.write("\nTagging " + params.input_file + " ... \n\t")
        sys.stdout.flush()
        from io_utils.config import TaggerConfig
        from generic_networks.taggers import BDRNNTagger
        config = TaggerConfig(os.path.join(params.models, "tagger.conf"))
        tagger_object_UPOS = BDRNNTagger(config,
                                         tagger_encodings,
                                         embeddings,
                                         runtime=True)
        tagger_object_UPOS.load(os.path.join(params.models, "tagger.bestUPOS"))
        tagger_object_XPOS = BDRNNTagger(config,
                                         tagger_encodings,
                                         embeddings,
                                         runtime=True)
        tagger_object_XPOS.load(os.path.join(params.models, "tagger.bestXPOS"))
        tagger_object_ATTRS = BDRNNTagger(config,
                                          tagger_encodings,
                                          embeddings,
                                          runtime=True)
        tagger_object_ATTRS.load(
            os.path.join(params.models, "tagger.bestATTRS"))

        new_sequences = []
        for sequence in sequences:
            # Tag a private copy so the input sequence is left untouched.
            new_sequence = copy.deepcopy(sequence)
            predicted_tags_UPOS = tagger_object_UPOS.tag(new_sequence)
            predicted_tags_XPOS = tagger_object_XPOS.tag(new_sequence)
            predicted_tags_ATTRS = tagger_object_ATTRS.tag(new_sequence)
            # Each tagger's prediction is indexed per entry; slot 0 is taken
            # from the UPOS model, slot 1 from XPOS, slot 2 from ATTRS.
            for entryIndex in range(len(sequence)):
                new_sequence[entryIndex].upos = predicted_tags_UPOS[
                    entryIndex][0]
                new_sequence[entryIndex].xpos = predicted_tags_XPOS[
                    entryIndex][1]
                new_sequence[entryIndex].attrs = predicted_tags_ATTRS[
                    entryIndex][2]
            new_sequences.append(new_sequence)
        # The entries are already fresh copies; no second deep copy needed.
        sequences = new_sequences
        del tagger_object_UPOS  # free memory
        del tagger_object_XPOS  # free memory
        del tagger_object_ATTRS  # free memory
        sys.stdout.write(" done\n")
        sys.stdout.flush()

    if lemmatize:
        sys.stdout.write("\nLemmatizing " + params.input_file + " ... \n\t")
        sys.stdout.flush()
        from generic_networks.lemmatizers import FSTLemmatizer
        from io_utils.config import LemmatizerConfig
        config = LemmatizerConfig(
            os.path.join(params.models, "lemmatizer.conf"))
        lemmatizer_object = FSTLemmatizer(config,
                                          lemmatizer_encodings,
                                          embeddings,
                                          runtime=True)
        lemmatizer_object.load(
            os.path.join(params.models, "lemmatizer.bestACC"))
        sequences = lemmatizer_object.lemmatize_sequences(sequences)
        del lemmatizer_object  # free memory
        sys.stdout.write(" done\n")
        sys.stdout.flush()

    output_dataset = Dataset()
    output_dataset.sequences = sequences
    output_dataset.write(params.output_file)
Exemple #2
0
class Cube(object):
    """Text-processing pipeline assembled from per-language models.

    An instance starts empty; ``load`` attaches the components found in the
    model folder, and calling the instance on a string runs them in order.
    """

    def __init__(self, verbose=False):
        """
        Create an empty instance for Cube
        Before it can be used, you must call @method load with @param language_code set to your target language
        """
        self._loaded = False
        self._verbose = verbose
        self._tokenizer = None  # tokenizer object, default is None
        self._compound_word_expander = None  # compound word expander, default is None
        self._lemmatizer = None  # lemmatizer object, default is None
        self._parser = None  # parser object, default is None
        self._tagger = None  # list of three tagger objects once loaded, default is None
        self.embeddings = None  # ?? needed?
        self.metadata = ModelMetadata()
        self._model_repository = "models"
        self._embeddings_repository = os.path.join("models", "embeddings")

    def _reset_components(self):
        """Detach every pipeline component and mark the instance as unloaded."""
        self._loaded = False
        self._tokenizer = None
        self._compound_word_expander = None
        self._lemmatizer = None
        self._parser = None
        self._tagger = None

    def load(self,
             language_code,
             version="latest",
             tokenization=True,
             compound_word_expanding=False,
             tagging=True,
             lemmatization=True,
             parsing=True):
        """
        Loads the pipeline with all available models for the target language.

        Components attached by an earlier call to load are discarded once the
        new model's embeddings have been read, so a component that is disabled
        or unavailable for the new model is never left over from a previous one.

        @param language_code: Target language code. See http://opensource.adobe.com/NLP-Cube/ for available languages and their codes
        @param version: "latest" to get the latest version, or other specific version in like "1.0", "2.1", etc .
        @param tokenization: load the tokenizer if its model file is present
        @param compound_word_expanding: load the compound word expander if its model file is present
        @param tagging: load the taggers if the tagger model file is present
        @param lemmatization: load the lemmatizer if its model file is present; this also loads the taggers
        @param parsing: load the parser if its model file is present
        """
        # Initialize a ModelStore object
        model_store_object = ModelStore(disk_path=self._model_repository)

        # Find a local model or download it if it does not exist, returning the local model folder path
        model_folder_path = model_store_object.find(lang_code=language_code,
                                                    version=version,
                                                    verbose=self._verbose)

        # Load metadata from the model
        self.metadata.read(os.path.join(model_folder_path, "metadata.json"))

        # Load embeddings
        embeddings = WordEmbeddings(verbose=False)
        if self._verbose:
            sys.stdout.write('\tLoading embeddings... \n')
        embeddings.read_from_file(os.path.join(
            self._embeddings_repository, self.metadata.embeddings_file_name),
                                  None,
                                  full_load=False)

        # Drop whatever a previous load() attached; from here on only the
        # components of the model being loaded may be present.
        self._reset_components()

        # 1. Load tokenizer
        if tokenization:
            if not os.path.isfile(
                    os.path.join(model_folder_path, 'tokenizer-tok.bestAcc')):
                sys.stdout.write(
                    '\tTokenization is not available on this model. \n')
            else:
                if self._verbose:
                    sys.stdout.write('\tLoading tokenization model ...\n')
                tokenizer_encodings = Encodings(verbose=False)
                tokenizer_encodings.load(
                    os.path.join(model_folder_path, 'tokenizer.encodings'))
                config = TieredTokenizerConfig(
                    os.path.join(model_folder_path, 'tokenizer.conf'))
                self._tokenizer = TieredTokenizer(config,
                                                  tokenizer_encodings,
                                                  embeddings,
                                                  runtime=True)
                self._tokenizer.load(
                    os.path.join(model_folder_path, 'tokenizer'))

        # 3. Load compound
        if compound_word_expanding:
            if not os.path.isfile(
                    os.path.join(model_folder_path, 'compound.bestAcc')):
                if self._verbose:  # suppress warning here because many languages do not have compound words
                    sys.stdout.write(
                        '\tCompound word expansion is not available on this model. \n'
                    )
            else:
                if self._verbose:
                    sys.stdout.write(
                        '\tLoading compound word expander model ...\n')
                compound_encodings = Encodings(verbose=False)
                compound_encodings.load(
                    os.path.join(model_folder_path, 'compound.encodings'))
                config = CompoundWordConfig(
                    os.path.join(model_folder_path, 'compound.conf'))
                self._compound_word_expander = CompoundWordExpander(
                    config, compound_encodings, embeddings, runtime=True)
                self._compound_word_expander.load(
                    os.path.join(model_folder_path, 'compound.bestAcc'))

        # 4. Load lemmatizer
        if lemmatization:
            if not os.path.isfile(
                    os.path.join(model_folder_path, 'lemmatizer.bestACC')):
                sys.stdout.write(
                    '\tLemmatizer is not available on this model. \n')
            else:
                if self._verbose:
                    sys.stdout.write('\tLoading lemmatization model ...\n')
                lemmatizer_encodings = Encodings(verbose=False)
                lemmatizer_encodings.load(
                    os.path.join(model_folder_path, 'lemmatizer.encodings'))
                config = LemmatizerConfig(
                    os.path.join(model_folder_path, 'lemmatizer.conf'))
                self._lemmatizer = FSTLemmatizer(config,
                                                 lemmatizer_encodings,
                                                 embeddings,
                                                 runtime=True)
                self._lemmatizer.load(
                    os.path.join(model_folder_path, 'lemmatizer.bestACC'))

        # 5. Load taggers
        if tagging or lemmatization:  # we need tagging for lemmatization
            if not os.path.isfile(
                    os.path.join(model_folder_path, 'tagger.bestUPOS')):
                sys.stdout.write(
                    '\tTagging is not available on this model. \n')
                if lemmatization:
                    sys.stdout.write(
                        '\t\tDisabling the lemmatization model due to missing tagger. \n'
                    )
                    self._lemmatizer = None
            else:
                if self._verbose:
                    sys.stdout.write('\tLoading tagger model ...\n')
                tagger_encodings = Encodings(verbose=False)
                tagger_encodings.load(
                    os.path.join(model_folder_path, 'tagger.encodings'))
                config = TaggerConfig(
                    os.path.join(model_folder_path, 'tagger.conf'))
                # One tagger per checkpoint: index 0 = UPOS, 1 = XPOS, 2 = ATTRS.
                self._tagger = [None, None, None]
                self._tagger[0] = BDRNNTagger(config,
                                              tagger_encodings,
                                              embeddings,
                                              runtime=True)
                self._tagger[0].load(
                    os.path.join(model_folder_path, 'tagger.bestUPOS'))
                self._tagger[1] = BDRNNTagger(config,
                                              tagger_encodings,
                                              embeddings,
                                              runtime=True)
                self._tagger[1].load(
                    os.path.join(model_folder_path, 'tagger.bestXPOS'))
                self._tagger[2] = BDRNNTagger(config,
                                              tagger_encodings,
                                              embeddings,
                                              runtime=True)
                self._tagger[2].load(
                    os.path.join(model_folder_path, 'tagger.bestATTRS'))

        # 6. Load parser
        if parsing:
            if not os.path.isfile(
                    os.path.join(model_folder_path, 'parser.bestUAS')):
                sys.stdout.write(
                    '\tParsing is not available on this model... \n')
            else:
                if self._verbose:
                    sys.stdout.write('\tLoading parser model ...\n')
                parser_encodings = Encodings(verbose=False)
                parser_encodings.load(
                    os.path.join(model_folder_path, 'parser.encodings'))
                config = ParserConfig(
                    os.path.join(model_folder_path, 'parser.conf'))
                self._parser = BDRNNParser(config,
                                           parser_encodings,
                                           embeddings,
                                           runtime=True)
                self._parser.load(
                    os.path.join(model_folder_path, 'parser.bestUAS'))

        self._loaded = True
        if self._verbose:
            sys.stdout.write('Model loading complete.\n\n')

    def __call__(self, text):
        """
        Run the loaded pipeline on a text and return the resulting sequences.

        The text is split on newlines and each line is tokenized separately;
        the resulting sequences are then passed through the compound word
        expander, parser, taggers and lemmatizer, skipping any component that
        is not loaded. Without a tokenizer the result is an empty list.

        @param text: input string
        @return: list of sequences produced by the last component that ran
        @raise Exception: if no model has been loaded yet
        """
        if not self._loaded:
            raise Exception(
                "Cube object is initialized but no model is loaded (eg.: call cube.load('en') )"
            )

        sequences = []
        if self._tokenizer:
            # split text by lines
            input_lines = text.split("\n")
            for input_line in input_lines:
                sequences += self._tokenizer.tokenize(input_line)

        if self._compound_word_expander:
            sequences = self._compound_word_expander.expand_sequences(
                sequences)

        if self._parser:
            sequences = self._parser.parse_sequences(sequences)

        # Only the taggers themselves can run the tagging pass; guarding on
        # the lemmatizer as well would index into a missing tagger list.
        if self._tagger:
            import copy
            new_sequences = []
            for sequence in sequences:
                # Tag a private copy so the incoming sequence stays untouched.
                new_sequence = copy.deepcopy(sequence)
                predicted_tags_UPOS = self._tagger[0].tag(new_sequence)
                predicted_tags_XPOS = self._tagger[1].tag(new_sequence)
                predicted_tags_ATTRS = self._tagger[2].tag(new_sequence)
                for entryIndex in range(len(sequence)):
                    new_sequence[entryIndex].upos = predicted_tags_UPOS[
                        entryIndex][0]
                    new_sequence[entryIndex].xpos = predicted_tags_XPOS[
                        entryIndex][1]
                    new_sequence[entryIndex].attrs = predicted_tags_ATTRS[
                        entryIndex][2]
                new_sequences.append(new_sequence)
            sequences = new_sequences

        if self._lemmatizer:
            sequences = self._lemmatizer.lemmatize_sequences(sequences)

        return sequences