Example #1
 def save_to_file(self, model_file):
     """\
     Save the model to a pickle file or stream (supports GZip compression).
     """
     log_info('Saving model to file ' + str(model_file))
     fh = file_stream(model_file, mode='wb', encoding=None)
     pickle.Pickler(fh, pickle.HIGHEST_PROTOCOL).dump(self)
     fh.close()
     log_info('Model successfully saved.')
Example #3
 def load_from_file(model_file):
     """\
     Load the model from a pickle file or stream
     (supports GZip compression).
     """
     log_info('Loading model from file ' + str(model_file))
     fh = file_stream(model_file, mode='rb', encoding=None)
     unpickler = pickle.Unpickler(fh)
     model = unpickler.load()
     fh.close()
     log_info('Model loaded successfully.')
     return model
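A minimal usage sketch of the save/load round trip shown above, assuming both methods belong to the Model class that appears in the later examples; the file name is hypothetical, and the GZip support mentioned in the docstrings is assumed to be selected by file_stream from the extension:

    model.save_to_file('model.pickle.gz')
    restored = Model.load_from_file('model.pickle.gz')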
Example #4
 def load_training_set(self, filename, encoding='UTF-8'):
     """\
     Load the given training data set into memory and strip it if
     configured to via the train_part parameter.
     """
     log_info('Loading training data set from ' + str(filename) + '...')
     train = DataSet()
     train.load_from_arff(filename, encoding)
     if self.train_part < 1:
         train = train.subset(0, int(round(self.train_part * len(train))),
                              copy=False)
     return train
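For illustration, a sketch of what the train_part stripping amounts to, with hypothetical numbers (the model variable and file path are assumptions):

    # With self.train_part == 0.8 and 1000 instances in the ARFF file, the
    # subset call keeps the first int(round(0.8 * 1000)) == 800 instances.
    train = model.load_training_set('train.arff.gz', encoding='UTF-8')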
Example #6
 def load_training_set(self, filename, encoding='UTF-8'):
     """\
     Load the given training data set into memory and strip it if
     configured to via the train_part parameter.
     """
     log_info('Loading training data set from ' + str(filename) + '...')
     train = DataSet()
     train.load_from_arff(filename, encoding)
     if self.train_part < 1:
         train = train.subset(0,
                              int(round(self.train_part * len(train))),
                              copy=False)
     return train
Example #7
 def train_on_data(self, train):
     """\
     Train model on the specified training data set (which must be a loaded
     DataSet object).
     """
     log_info('Preparing data set...')
     self.data_headers = train.get_headers()
     train_vect = self.__vectorize(train)
     train_classes = self.get_classes(train)
     # if all the training data have the same class, use a dummy classifier
     if train.get_attrib(self.class_attr).num_values == 1:
         self.feature_filter = None
         self.classifier = DummyClassifier()
     # filter features
     log_info('Filtering...')
     train_filt = self.__filter_features(train_vect, train_classes)
     # train the classifier
     log_info('Training...')
     if self.use_weights:
         self.classifier.fit(train_filt,
                             train_classes,
                             sample_weight=train.inst_weights)
     else:
         self.classifier.fit(train_filt, train_classes)
     self.classifier_trained = True
     log_info('Training done.')
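A hedged sketch of chaining the calls shown in these examples on one model object (the model variable and file names are hypothetical; only the signatures shown above are used):

    train = model.load_training_set('train.arff.gz')
    model.train_on_data(train)
    model.save_to_file('model.pickle.gz')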
Example #8
 def train_on_data(self, train):
     """\
     Train model on the specified training data set (which must be a loaded
     DataSet object).
     """
     log_info('Preparing data set...')
     self.data_headers = train.get_headers()
     train_vect = self.__vectorize(train)
     train_classes = self.get_classes(train)
     # if all the training data have the same class, use a dummy classifier
     if train.get_attrib(self.class_attr).num_values == 1:
         self.feature_filter = None
         self.classifier = DummyClassifier()
     # filter features
     log_info('Filtering...')
     train_filt = self.__filter_features(train_vect, train_classes)
     # train the classifier
     log_info('Training...')
     if self.use_weights:
         self.classifier.fit(train_filt, train_classes,
                             sample_weight=train.inst_weights)
     else:
         self.classifier.fit(train_filt, train_classes)
     self.classifier_trained = True
     log_info('Training done.')
Example #9
 def apply_to(self, filename=None, string=None, language=None, selector=None):
     """
     Apply the whole scenario to a file or to a string (which should be readable by
     the first block of the scenario). If processing a string, return the result.
     """
     if filename is not None:
         # the first block is supposed to be a reader which creates the document
         log_info('Processing ' + filename)
         log_info('Applying block 1/' + str(len(self.blocks)) + ': ' +
                  self.blocks[0].__class__.__name__)
         doc = self.blocks[0].process_document(filename)
         # apply all other blocks
         for block_no, block in enumerate(self.blocks[1:], start=2):
             log_info('Applying block ' + str(block_no) + '/' +
                      str(len(self.blocks)) + ': ' + block.__class__.__name__)
             block.process_document(doc)
     elif string is not None:
         # check if we know the target language and selector
         language = language or self.global_args.get('language')
         selector = selector or self.global_args.get('selector', '')
         # the first block is supposed to be a reader which creates the document
         fh = StringIO(string)
         doc = self.blocks[0].process_document(fh)
         # apply all other blocks
         for block_no, block in enumerate(self.blocks[1:], start=2):
             log_info('Applying block ' + str(block_no) + '/' +
                      str(len(self.blocks)) + ': ' + block.__class__.__name__)
             block.process_document(doc)
         # return the text of all bundles for the specified sentence
         return "\n".join([b.get_zone(language, selector).sentence
                           for b in doc.bundles])
     else:
         raise ScenarioException('Filename or input string must be set!')
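A minimal sketch of applying a loaded scenario to a string, assuming scenario is an instance of the class defining apply_to; the sentence, language and selector values are hypothetical (both fall back to global_args when omitted):

    result = scenario.apply_to(string='This is a test sentence.',
                               language='en', selector='')
    print(result)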
Example #10
 def process_document(self, filename):
     """\
     Read a Tecto-Template file and return its contents as
     a Document object.
     """
     fh = file_stream(filename, encoding=self.encoding)
     doc = Document(filename)
     for line in fh:
         bundle = doc.create_bundle()
         zone = bundle.create_zone(self.language, self.selector)
         ttree = zone.create_ttree()
         self.parse_line(line, ttree)
         log_info('Parsed a tree with %d nodes.' %
                  len(ttree.get_descendants()))
     fh.close()
     return doc
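A usage sketch, assuming reader is an instance of the block defining this method; the input path is hypothetical:

    # One bundle (holding one t-tree) is created per line of the input file.
    doc = reader.process_document('sentences.tecto')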
Example #11
 def load_blocks(self):
     "Load all blocks into memory, finding and creating class objects."
     self.blocks = []
     for block_no, block_data in enumerate(self.scenario_data, start=1):
         # create the block name and import it
         if '.' in block_data["block"]:
             class_subpath, class_name = block_data["block"].rsplit('.', 1)
             class_subpath += '.'
         else:
             class_subpath, class_name = '', block_data["block"]
         class_package = 'pytreex.block.' + class_subpath + class_name.lower()
         log_info('Loading block ' + str(block_no) + '/' +
                  str(len(self.scenario_data)) + ': ' + class_name)
         exec('import ' + class_package)
         class_obj = getattr(sys.modules[class_package], class_name)
         # create the block object
         args = self.global_args.copy()
         args.update(block_data.get("args", {}))
         self.blocks.append(class_obj(self, args))
         # load models etc.
         self.blocks[-1].load()
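For reference, a sketch of the scenario_data shape this loop expects; the block names and arguments are hypothetical, chosen only to illustrate the import logic above:

    scenario_data = [
        {"block": "read.TectoTemplates",
         "args": {"language": "en", "selector": ""}},
        {"block": "write.Yaml"},
    ]
    # "read.TectoTemplates" resolves to module pytreex.block.read.tectotemplates
    # and class TectoTemplates; "write.Yaml" to pytreex.block.write.yaml / Yaml.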
Example #12
 def load_blocks(self):
     "Load all blocks into memory, finding and creating class objects."
     self.blocks = []
     for block_no, block_data in enumerate(self.scenario_data, start=1):
         # create the block name and import it
         if '.' in block_data["block"]:
             class_subpath, class_name = block_data["block"].rsplit('.', 1)
             class_subpath += '.'
         else:
             class_subpath, class_name = '', block_data["block"]
         class_package = 'pytreex.block.' + class_subpath + class_name.lower()
         log_info('Loading block ' + str(block_no) + '/' +
                  str(len(self.scenario_data)) + ': ' + class_name)
         exec('import ' + class_package)
         class_obj = getattr(sys.modules[class_package], class_name)
         # create the block object
         args = self.global_args.copy()
         args.update(block_data.get("args", {}))
         self.blocks.append(class_obj(self, args))
         # load models etc.
         self.blocks[-1].load()
Example #13
 def run_on_cluster(self):
     # split input files for different jobs
     job_files = [self.input_files[i::self.jobs] for i in xrange(self.jobs)]
     jobs = [Job(name=self.JOB_NAME_PREFIX + self.scenario.name)]
     work_dir = jobs[0].work_dir
     for jobnum in xrange(1, self.jobs):
         jobs.append(Job(name=self.JOB_NAME_PREFIX + self.scenario.name +
                         '-' + str(jobnum).zfill(2), work_dir=work_dir))
     log_info('Creating jobs ...')
     for job, files in zip(jobs, job_files):
         job.header += "from treex.core.run import Run\n"
         args = [self.scenario.file_path] + [os.path.abspath(file_path) for file_path in files]
         job.code = "run = Run(" + str(args) + ")\nrun.run()\n"
     log_info('Submitting jobs ...')
     for job in jobs:
         job.submit()
     log_info('Waiting for jobs ...')
     for job in jobs:
         job.wait(poll_delay=10)
     log_info('All jobs done.')
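For reference, a sketch of the script a single job ends up executing, assembled from the job.header and job.code strings above; the scenario path and input documents are hypothetical:

    from treex.core.run import Run

    run = Run(['/work/scenario.yaml', '/data/doc01.treex', '/data/doc02.treex'])
    run.run()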
Example #14
 def process_subtree(self, amrnode):
     # progress depth-first
     for child in amrnode.get_children():
         self.process_subtree(child)
     # #Separ is "+"
     if amrnode.concept == '#Separ':
         val = 0
         for child in amrnode.get_children():
             num = self.get_numeric_value(child)
             if num is None:
                 continue
             val += num
             self.rehang_children_and_remove(child)
         amrnode.concept = str(val)
         log_info('Separ: ' + amrnode.concept)
         return
     # / is "/"
     if amrnode.concept in ['/', '#Slash']:
         children = amrnode.get_children(ordered=True)
         if len(children) == 2 and all(
             [self.get_numeric_value(c) is not None for c in children]):
             val = self.get_numeric_value(children[0]) / float(
                 self.get_numeric_value(children[1]))
             amrnode.concept = str(val)
             log_info('/: ' + amrnode.concept)
             self.rehang_children_and_remove(children[0])
             self.rehang_children_and_remove(children[1])
         return
     # check if we are a number, normalize our concept name
     val = self.get_numeric_value(amrnode)
     if val is not None:
         # any numeric children = '*'
         for child in amrnode.get_children(preceding_only=True):
             num = self.get_numeric_value(child)
             if num is not None:
                 val *= num
                 self.rehang_children_and_remove(child)
                 log_info('Number child: ' + str(num))
         log_info('Number: ' + amrnode.concept)
         amrnode.concept = str(val)
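A worked illustration of the three rewrites above, on hypothetical subtrees:

    # '#Separ' node with numeric children 20 and 3    -> concept '23'   (20 + 3)
    # '#Slash' node with ordered children 3 and 4     -> concept '0.75' (3 / 4.0)
    # number node 2 with preceding numeric child 100  -> concept '200'  (2 * 100)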
Example #15
 def apply_to(self,
              filename=None,
              string=None,
              language=None,
              selector=None):
     """
     Apply the whole scenario to a file or to a string (which should be readable by
     the first block of the scenario). If processing a string, return the result.
     """
     if filename is not None:
         # the first block is supposed to be a reader which creates the document
         log_info('Processing ' + filename)
         log_info('Applying block 1/' + str(len(self.blocks)) + ': ' +
                  self.blocks[0].__class__.__name__)
         doc = self.blocks[0].process_document(filename)
         # apply all other blocks
         for block_no, block in enumerate(self.blocks[1:], start=2):
             log_info('Applying block ' + str(block_no) + '/' +
                      str(len(self.blocks)) + ': ' +
                      block.__class__.__name__)
             block.process_document(doc)
     elif string is not None:
         # check if we know the target language and selector
         language = language or self.global_args.get('language')
         selector = selector or self.global_args.get('selector', '')
         # the first block is supposed to be a reader which creates the document
         fh = StringIO(string)
         doc = self.blocks[0].process_document(fh)
         # apply all other blocks
         for block_no, block in enumerate(self.blocks[1:], start=2):
             log_info('Applying block ' + str(block_no) + '/' +
                      str(len(self.blocks)) + ': ' +
                      block.__class__.__name__)
             block.process_document(doc)
         # return the text of all bundles for the specified sentence
         return "\n".join(
             [b.get_zone(language, selector).sentence for b in doc.bundles])
     else:
         raise ScenarioException('Filename or input string must be set!')
Example #16
 def process_subtree(self, amrnode):
     # progress depth-first
     for child in amrnode.get_children():
         self.process_subtree(child)
     # #Separ is "+"
     if amrnode.concept == '#Separ':
         val = 0
         for child in amrnode.get_children():
             num = self.get_numeric_value(child)
             if num is None:
                 continue
             val += num
             self.rehang_children_and_remove(child)
         amrnode.concept = unicode(val)
         log_info('Separ: ' + amrnode.concept)
         return
     # / is "/"
     if amrnode.concept in ['/', '#Slash']:
         children = amrnode.get_children(ordered=True)
         if len(children) == 2 and all([self.get_numeric_value(c) is not None for c in children]):
             val = self.get_numeric_value(children[0]) / float(self.get_numeric_value(children[1]))
             amrnode.concept = unicode(val)
             log_info('/: ' + amrnode.concept)
             self.rehang_children_and_remove(children[0])
             self.rehang_children_and_remove(children[1])
         return
     # check if we are a number, normalize our concept name
     val = self.get_numeric_value(amrnode)
     if val is not None:
         # any numeric children = '*'
         for child in amrnode.get_children(preceding_only=True):
             num = self.get_numeric_value(child)
             if num is not None:
                 val *= num
                 self.rehang_children_and_remove(child)
                 log_info('Number child: ' + str(num))
         log_info('Number: ' + amrnode.concept)
         amrnode.concept = unicode(val)
Example #17
 def run_on_cluster(self):
     # split input files for different jobs
     job_files = [self.input_files[i::self.jobs] for i in xrange(self.jobs)]
     jobs = [Job(name=self.JOB_NAME_PREFIX + self.scenario.name)]
     work_dir = jobs[0].work_dir
     for jobnum in xrange(1, self.jobs):
         jobs.append(
             Job(name=self.JOB_NAME_PREFIX + self.scenario.name + '-' +
                 str(jobnum).zfill(2),
                 work_dir=work_dir))
     log_info('Creating jobs ...')
     for job, files in zip(jobs, job_files):
         job.header += "from treex.core.run import Run\n"
         args = ([self.scenario.file_path] +
                 [os.path.abspath(file_path) for file_path in files])
         job.code = "run = Run(" + str(args) + ")\nrun.run()\n"
     log_info('Submitting jobs ...')
     for job in jobs:
         job.submit()
     log_info('Waiting for jobs ...')
     for job in jobs:
         job.wait(poll_delay=10)
     log_info('All jobs done.')
Example #18
 def train(self, train_file, work_dir, memory=8, encoding='UTF-8'):
     """\
     Read training data, split them and train the individual models
     (in cluster jobs).
     """
     # load the entire data set
     train = self.load_training_set(train_file, encoding)
     self.data_headers = train.get_headers()
     # train a backoff model
     log_info('Training a backoff model...')
     self.backoff_model = self.__train_backoff_model(train)
     # split it
     log_info('Split...')
     train_split = train.split(eval(self.divide_func), keep_copy=False)
     jobs = []
     model_files = {}
     # save training files and create training jobs
     for key, subset in train_split.iteritems():
         fn = re.sub(r'(.arff(.gz)?)?$', '-' + key + '.arff.gz', train_file)
         fn = os.path.join(work_dir, os.path.basename(fn))
         subset.save_to_arff(fn, encoding)
         job, model_file = Model.create_training_job(self.config, work_dir,
                                                     fn, memory=memory,
                                                     encoding=encoding)
         jobs.append(job)
         model_files[key] = model_file
     # submit the training jobs and wait for all of them
     log_info('Submitting training jobs...')
     for job in jobs:
         job.submit()
     log_info('Waiting for jobs...')
     for job in jobs:
         job.wait()
     # load all models
     log_info('Training complete. Assembling model files...')
     for key, model_file in model_files.iteritems():
         self.models[key] = Model.load_from_file(model_file)
     self.trained = True
     log_info('Training done.')
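A minimal top-level call sketch, assuming model is an instance of the class defining train(); the paths are hypothetical:

    # Splits the data by divide_func, trains one sub-model per split in a
    # cluster job, then loads the resulting model files back into self.models.
    model.train('train.arff.gz', work_dir='/tmp/model-parts', memory=8)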
Example #19
 def train(self, train_file, work_dir, memory=8, encoding='UTF-8'):
     """\
     Read training data, split them and train the individual models
     (in cluster jobs).
     """
     # load the entire data set
     train = self.load_training_set(train_file, encoding)
     self.data_headers = train.get_headers()
     # train a backoff model
     log_info('Training a backoff model...')
     self.backoff_model = self.__train_backoff_model(train)
     # split it
     log_info('Split...')
     train_split = train.split(eval(self.divide_func), keep_copy=False)
     jobs = []
     model_files = {}
     # save training files and create training jobs
     for key, subset in train_split.iteritems():
         fn = re.sub(r'(.arff(.gz)?)?$', '-' + key + '.arff.gz', train_file)
         fn = os.path.join(work_dir, os.path.basename(fn))
         subset.save_to_arff(fn, encoding)
         job, model_file = Model.create_training_job(self.config,
                                                     work_dir,
                                                     fn,
                                                     memory=memory,
                                                     encoding=encoding)
         jobs.append(job)
         model_files[key] = model_file
     # submit the training jobs and wait for all of them
     log_info('Submitting training jobs...')
     for job in jobs:
         job.submit()
     log_info('Waiting for jobs...')
     for job in jobs:
         job.wait()
     # load all models
     log_info('Training complete. Assembling model files...')
     for key, model_file in model_files.iteritems():
         self.models[key] = Model.load_from_file(model_file)
     self.trained = True
     log_info('Training done.')