Example #1
    def __init__(self,
                 language,
                 nlpwrapper=u"nltk",
                 downloader=u"youtube",
                 parameters=PARAMETERS,
                 verbose=VERBOSE):
        self.language = language
        self.nlpwrapper = nlpwrapper
        self.downloader = downloader
        self.parameters = parameters
        self.verbose = verbose
        self.nlpe = NLPEngine(preload=[(self.language, self.nlpwrapper)])
        self.train_data = None
        self.trainer = None
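The constructor stores its configuration and eagerly preloads an NLP engine for the chosen language and wrapper. A minimal instantiation sketch, assuming the enclosing CRFTrainer class (shown in full in Example #2) and its NLPEngine dependency are importable; u"en" is a placeholder language code:

# hypothetical usage sketch; CRFTrainer and NLPEngine are assumed importable
trainer = CRFTrainer(language=u"en", nlpwrapper=u"nltk")
print(trainer.train_data)  # None until load_data() is called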
Example #2
class CRFTrainer(object):
    """
    TBW
    """

    PARAMETERS = {
        # algorithms: lbfgs, l2sgd, ap, pa, arow
        "algorithm": "arow",
        "max_iterations": 5000,
        # "feature.possible_states": True,
        # "feature.possible_transitions": True,
    }
    """ Parameters for the trainer from pycrfsuite """

    VERBOSE = False
    """ Verbosity of the trainer """
    def __init__(self,
                 language,
                 nlpwrapper=u"nltk",
                 downloader=u"youtube",
                 parameters=PARAMETERS,
                 verbose=VERBOSE):
        self.language = language
        self.nlpwrapper = nlpwrapper
        self.downloader = downloader
        self.parameters = parameters
        self.verbose = verbose
        self.nlpe = NLPEngine(preload=[(self.language, self.nlpwrapper)])
        self.train_data = None
        self.trainer = None

    @property
    def train_data_cc(self):
        if self.train_data is None:
            return 0
        return len(self.train_data)

    @property
    def train_data_lines(self):
        if self.train_data is None:
            return 0
        return sum(len(cc) for cc in self.train_data)

    def _read_files(self, input_file_paths):
        #
        # NOTE in theory we could account for things like the position
        #      of a token w.r.t. the whole document or the sentence it belongs
        #      however here we make an assumption of independence
        #      of the split point from those facts
        #
        examples = []
        for ifp in input_file_paths:
            print(u".")
            if os.path.isfile(ifp):
                doc = Downloader.read_closed_captions(
                    ifp, {u"downloader": u"youtube"})
                self.nlpe.analyze(doc, wrapper=self.nlpwrapper)
                for sentence in doc.sentences:
                    features = sentence_to_features(sentence)
                    labels = sentence_to_labels(sentence)
                    example = (sentence, features, labels)
                    examples.append(example)
        return examples

    def load_data(self, obj):
        """
        TBW
        """
        if isinstance(obj, list):
            # parse the given list of files
            input_file_paths = obj
            self.train_data = self._read_files(input_file_paths)
        else:
            # try loading from pickle
            input_file_path = obj
            with io.open(input_file_path, "rb") as pickle_file:
                self.train_data = pickle.load(pickle_file)

    def dump_data(self, dump_file_path):
        """
        TBW
        """
        pickle.dump(self.train_data, io.open(dump_file_path, "wb"))

    def train(self, model_file_path):
        """
        Train the CRF model and serialize it to the given file path.
        """
        # create a trainer object
        import pycrfsuite
        self.trainer = pycrfsuite.Trainer(
            algorithm=self.parameters["algorithm"], verbose=self.verbose)
        # apply the remaining configured parameters (e.g., max_iterations);
        # "algorithm" is filtered out since it is not a crfsuite parameter
        self.trainer.set_params({
            k: v for k, v in self.parameters.items() if k != "algorithm"
        })

        # append training data
        for sentence, features, labels in self.train_data:
            # each example is a triplet (sentence, features, labels)
            # where:
            # - sentence is a TokenizedSentenceSpan
            # - features is a list of dicts
            # - labels is a list of labels (e.g., LABEL_MIDDLE or LABEL_LAST)
            self.trainer.append(features, labels)

        # do the actual training
        self.trainer.train(model_file_path)

        # return the path to the model file
        return model_file_path

    def trainer_info(self):
        """
        Return information about the trained model, or None if no
        trainer has been created yet.
        """
        if self.trainer is None:
            return None
        return self.trainer.info()
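A hedged end-to-end sketch of driving this class; all file names below are placeholders, and CRFTrainer with its NLP dependencies is assumed importable:

# hypothetical usage sketch for Example #2; file names are placeholders
trainer = CRFTrainer(language=u"en")
trainer.load_data([u"cc_01.txt", u"cc_02.txt"])  # parse caption files
print(trainer.train_data_cc)                     # number of parsed examples
trainer.dump_data(u"train_data.pickle")          # cache the parsed data
trainer.load_data(u"train_data.pickle")          # ...or reload it later
model_path = trainer.train(u"model.crfsuite")    # fit and serialize the model
print(trainer.trainer_info())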
Example #3
class CRFTrainer(object):
    """
    TBW
    """

    PARAMETERS = {
        "max_iterations": 50,
        "feature.possible_transitions": True,
    }
    """ Parameters for the trainer from pycrfsuite """

    VERBOSE = False
    """ Verbosity of the trainer """

    LABEL_NOT_LAST = u"_"
    """ Label for a token that is not the last of a line """

    LABEL_LAST = u"E"
    """ Label for a token that is the last of a line """
    def __init__(self,
                 language,
                 nlpwrapper=u"pattern",
                 downloader=u"youtube",
                 parameters=PARAMETERS,
                 verbose=VERBOSE):
        self.language = language
        self.nlpwrapper = nlpwrapper
        self.downloader = downloader
        self.parameters = parameters
        self.verbose = verbose
        self.nlpe = NLPEngine(preload=[(self.language, self.nlpwrapper)])
        self.train_data = None
        self.trainer = None

    def _read_files(self, input_file_paths):
        def _annotated_sentence_to_lines(tokens):
            # group tokens into lines, splitting on special tokens
            # (e.g., end-of-line markers)
            lines = []
            cl = []  # tokens of the current line
            for t in tokens:
                if t.is_special:
                    if len(cl) > 0:
                        lines.append(cl)
                        cl = []
                else:
                    cl.append(t)
            if len(cl) > 0:
                lines.append(cl)
            return lines

        examples = []
        for ifp in input_file_paths:
            print(u".")
            if os.path.isfile(ifp):
                doc = Downloader.read_closed_captions(
                    ifp, {u"downloader": u"youtube"})
                self.nlpe.analyze(doc, wrapper=self.nlpwrapper)
                for sentence in doc.sentences:
                    # print(sentence.string(eol=u"|", eos=u"").strip())
                    # sentence is a Span object
                    # sentence.elements is a list of Token objects
                    lines = _annotated_sentence_to_lines(sentence.elements)
                    for line in lines:
                        # all tokens get "add" label,
                        # except the last one, which gets the "end" label
                        labels = [self.LABEL_NOT_LAST] * len(line)
                        labels[-1] = self.LABEL_LAST
                        # convert the list of
                        features = tokens_to_features(line)
                        example = (features, labels)
                        # print(example)
                        examples.append(example)
        return examples

    def load_data(self, obj):
        """
        TBW
        """
        if isinstance(obj, list):
            # parse the given list of files
            input_file_paths = obj
            self.train_data = self._read_files(input_file_paths)
        else:
            # try loading from pickle
            input_file_path = obj
            with io.open(input_file_path, "rb") as pickle_file:
                self.train_data = pickle.load(pickle_file)

    def dump_data(self, dump_file_path):
        """
        TBW
        """
        pickle.dump(self.train_data, io.open(dump_file_path, "wb"))

    def train(self, model_file_path):
        """
        Train the CRF model and serialize it to the given file path.
        """
        # create a trainer object
        import pycrfsuite
        self.trainer = pycrfsuite.Trainer(verbose=self.verbose)
        # apply the configured training parameters
        self.trainer.set_params(self.parameters)

        # append training data
        for feature_seq, label_seq in self.train_data:
            self.trainer.append(feature_seq, label_seq)

        # do the actual training
        self.trainer.train(model_file_path)

        # return the path to the model file
        return model_file_path

    def trainer_info(self):
        """
        Return information about the trained model, or None if no
        trainer has been created yet.
        """
        if self.trainer is None:
            return None
        return self.trainer.info()
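Once trained, the model can be applied with pycrfsuite.Tagger. A minimal inference sketch, assuming tokens_to_features is the same featurizer used at training time and line_tokens is a list of Token objects:

import pycrfsuite

# hypothetical inference sketch; the model path and line_tokens are placeholders
tagger = pycrfsuite.Tagger()
tagger.open(u"model.crfsuite")
features = tokens_to_features(line_tokens)  # same features as in training
labels = tagger.tag(features)               # e.g. [u"_", u"_", u"E", u"_"]
# tokens tagged u"E" are predicted to end a subtitle line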