import io
import os
import pickle

import pycrfsuite

# NOTE: NLPEngine, Downloader, sentence_to_features, sentence_to_labels,
# and tokens_to_features are assumed to be provided elsewhere in this package


class CRFTrainer(object):
    """
    Train a pycrfsuite CRF model that predicts the line break points
    inside closed caption sentences.
    """

    PARAMETERS = {
        # algorithms: lbfgs, l2sgd, ap, pa, arow
        "algorithm": "arow",
        "max_iterations": 5000,
        # "feature.possible_states": True,
        # "feature.possible_transitions": True,
    }
    """ Parameters for the trainer from pycrfsuite """

    VERBOSE = False
    """ Verbosity of the trainer """

    def __init__(self, language, nlpwrapper=u"nltk", downloader=u"youtube", parameters=PARAMETERS, verbose=VERBOSE):
        self.language = language
        self.nlpwrapper = nlpwrapper
        self.downloader = downloader
        self.parameters = parameters
        self.verbose = verbose
        self.nlpe = NLPEngine(preload=[(self.language, self.nlpwrapper)])
        self.train_data = None
        self.trainer = None

    @property
    def train_data_cc(self):
        if self.train_data is None:
            return 0
        return len(self.train_data)

    @property
    def train_data_lines(self):
        if self.train_data is None:
            return 0
        return sum(len(cc) for cc in self.train_data)

    def _read_files(self, input_file_paths):
        #
        # NOTE in theory we could account for things like the position
        # of a token w.r.t. the whole document or the sentence it belongs to;
        # however, here we assume that the split point is independent
        # of those factors
        #
        examples = []
        for ifp in input_file_paths:
            print(u".")
            if os.path.isfile(ifp):
                doc = Downloader.read_closed_captions(
                    ifp, {u"downloader": self.downloader})
                self.nlpe.analyze(doc, wrapper=self.nlpwrapper)
                for sentence in doc.sentences:
                    features = sentence_to_features(sentence)
                    labels = sentence_to_labels(sentence)
                    example = (sentence, features, labels)
                    examples.append(example)
        return examples

    def load_data(self, obj):
        """
        Load training data, either by parsing a given list of
        closed caption files or by reading a pickled dump.
        """
        if isinstance(obj, list):
            # parse the given list of files
            input_file_paths = obj
            self.train_data = self._read_files(input_file_paths)
        else:
            # try loading from pickle
            input_file_path = obj
            with io.open(input_file_path, "rb") as input_file:
                self.train_data = pickle.load(input_file)

    def dump_data(self, dump_file_path):
        """
        Pickle the current training data to the given file path.
        """
        with io.open(dump_file_path, "wb") as dump_file:
            pickle.dump(self.train_data, dump_file)

    def train(self, model_file_path):
        """
        Train a CRF model and write it to the given file path.
        """
        # create a trainer object
        self.trainer = pycrfsuite.Trainer(
            algorithm=self.parameters["algorithm"],
            verbose=self.verbose)
        # pass the remaining parameters on to pycrfsuite
        self.trainer.set_params({
            k: v for k, v in self.parameters.items() if k != "algorithm"
        })
        # append training data
        for sentence, features, labels in self.train_data:
            # each example is a triplet (sentence, features, labels)
            # where:
            # - sentence is a TokenizedSentenceSpan
            # - features is a list of dicts
            # - labels is a list of labels (e.g., LABEL_MIDDLE or LABEL_LAST)
            self.trainer.append(features, labels)
        # do the actual training
        self.trainer.train(model_file_path)
        # return the path to the model file
        return model_file_path

    def trainer_info(self):
        if self.trainer is None:
            return None
        return self.trainer.info()
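
# A minimal usage sketch of the trainer above; the caption file names and
# the output paths are hypothetical placeholders, and the feature dicts
# actually appended depend on the project-specific sentence_to_features:
#
#   trainer = CRFTrainer(language=u"en", nlpwrapper=u"nltk")
#   trainer.load_data([u"cc1.srt", u"cc2.srt"])    # parse caption files
#   print(trainer.train_data_cc, trainer.train_data_lines)
#   trainer.dump_data(u"train_data.pickle")        # cache parsed examples
#   trainer.train(u"model.crfsuite")               # train and save the model
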
class CRFTrainer(object):
    """
    Train a pycrfsuite CRF model that labels each token of a sentence
    as either the last token of a line or not.
    """

    PARAMETERS = {
        "max_iterations": 50,
        "feature.possible_transitions": True,
    }
    """ Parameters for the trainer from pycrfsuite """

    VERBOSE = False
    """ Verbosity of the trainer """

    LABEL_NOT_LAST = u"_"
    """ Label for a token that is not the last of a line """

    LABEL_LAST = u"E"
    """ Label for a token that is the last of a line """

    def __init__(self, language, nlpwrapper=u"pattern", downloader=u"youtube", parameters=PARAMETERS, verbose=VERBOSE):
        self.language = language
        self.nlpwrapper = nlpwrapper
        self.downloader = downloader
        self.parameters = parameters
        self.verbose = verbose
        self.nlpe = NLPEngine(preload=[(self.language, self.nlpwrapper)])
        self.train_data = None
        self.trainer = None

    def _read_files(self, input_file_paths):
        def _annotated_sentence_to_lines(tokens):
            # group tokens into lines, splitting at special tokens
            # (which mark the line breaks of the original captions)
            lines = []
            cl = []
            for t in tokens:
                if t.is_special:
                    if len(cl) > 0:
                        lines.append(cl)
                    cl = []
                else:
                    cl.append(t)
            if len(cl) > 0:
                lines.append(cl)
            return lines

        examples = []
        for ifp in input_file_paths:
            print(u".")
            if os.path.isfile(ifp):
                doc = Downloader.read_closed_captions(
                    ifp, {u"downloader": self.downloader})
                self.nlpe.analyze(doc, wrapper=self.nlpwrapper)
                for sentence in doc.sentences:
                    # print(sentence.string(eol=u"|", eos=u"").strip())
                    # sentence is a Span object
                    # sentence.elements is a list of Token objects
                    lines = _annotated_sentence_to_lines(sentence.elements)
                    for line in lines:
                        # all tokens get the "not last" label,
                        # except the last one, which gets the "last" label
                        labels = [self.LABEL_NOT_LAST] * len(line)
                        labels[-1] = self.LABEL_LAST
                        # convert the list of tokens into a list of feature dicts
                        features = tokens_to_features(line)
                        example = (features, labels)
                        # print(example)
                        examples.append(example)
        return examples

    def load_data(self, obj):
        """
        Load training data, either by parsing a given list of
        closed caption files or by reading a pickled dump.
        """
        if isinstance(obj, list):
            # parse the given list of files
            input_file_paths = obj
            self.train_data = self._read_files(input_file_paths)
        else:
            # try loading from pickle
            input_file_path = obj
            with io.open(input_file_path, "rb") as input_file:
                self.train_data = pickle.load(input_file)

    def dump_data(self, dump_file_path):
        """
        Pickle the current training data to the given file path.
        """
        with io.open(dump_file_path, "wb") as dump_file:
            pickle.dump(self.train_data, dump_file)

    def train(self, model_file_path):
        """
        Train a CRF model and write it to the given file path.
        """
        # create a trainer object
        self.trainer = pycrfsuite.Trainer(verbose=self.verbose)
        # pass the training parameters on to pycrfsuite
        self.trainer.set_params(self.parameters)
        # append training data
        for feature_seq, label_seq in self.train_data:
            self.trainer.append(feature_seq, label_seq)
        # do the actual training
        self.trainer.train(model_file_path)
        # return the path to the model file
        return model_file_path

    def trainer_info(self):
        if self.trainer is None:
            return None
        return self.trainer.info()
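
# A minimal end-to-end sketch, assuming the caption files below exist; the
# paths are hypothetical placeholders. pycrfsuite.Tagger is the standard
# python-crfsuite API for loading a trained model and tagging new sequences.
if __name__ == "__main__":
    trainer = CRFTrainer(language=u"en")
    trainer.load_data([u"cc1.srt", u"cc2.srt"])
    model_path = trainer.train(u"model.crfsuite")

    # tag the first training example with the freshly trained model
    tagger = pycrfsuite.Tagger()
    tagger.open(model_path)
    feature_seq, label_seq = trainer.train_data[0]
    predicted = tagger.tag(feature_seq)
    print(list(zip(label_seq, predicted)))
    tagger.close()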