def run_sequential(self):
    """Run the export job sequentially (no RDD): stream segments for the
    job's language pair / pivot / domain out of the ES database and hand
    them to Task.save_segments.
    """
    job_params = self.job['params']
    # init access to ES DB
    db = TMDbApi()
    lang_pair = (job_params['slang'], job_params['tlang'])
    segment_stream = db.generate(lang_pair, job_params['plang'], job_params['domain'])
    Task.save_segments(segment_stream)
def run_sequential(self):
    """Parse the job's TMX file and persist every segment it yields.

    Reads the input file, domain, optional language pairs and the job's
    username from the job description, then feeds the parser output to
    Task.save_segments.
    """
    job_params = self.job['params']
    tmx = TMXParser(
        job_params['file'],
        domain=job_params['domain'],
        lang_pairs=job_params.get('lang_pairs', []),
        username=self.job['username'],
    )
    Task.save_segments(tmx.parse())
        # NOTE(review): fragment — the enclosing `def` line (and the origin of
        # `pos` and `tag`) is outside this view; indentation is reconstructed.
        # Builds an IOB-style tag string: each plain token from `pos` is
        # emitted as "token/O", except that when the marker `tag` is seen the
        # previous token is relabeled "/B-T" and the next plain token is
        # emitted as "token/I-T".
        iob = []
        is_inside = False
        for w in pos:
            if not re.search("<.*>", w):
                # Plain token (no angle-bracket markup).
                if is_inside:
                    # First token after the marker tag: inside the span.
                    iob.append("{}/I-T".format(w))
                    is_inside = False
                else:
                    iob.append("{}/O".format(w))
            elif iob and w == tag:
                # Marker tag seen: promote the preceding token to span-begin.
                iob[-1] = iob[-1].replace('/O', '/B-T')
                is_inside = True
            else:
                # Other tags - skip them
                pass
        return " ".join(iob)

    def is_self_closing_tag(self, tag):
        # True (match object) when `tag` is a self-closing tag like "<br/>";
        # re.match anchors at the start of the string only.
        return re.match('<[^<>]+/>', tag)


if __name__ == "__main__":
    from Config.Config import G_CONFIG
    G_CONFIG.config_logging()
    task = Task(sys.argv[1])
    # Launch RDD parallel processing
    task.get_rdd().mapPartitionsWithIndex(PosTagTask(task)).foreachPartition(
        Task.save_segments)
    task.finalize()
# Explicit sign? sign = diff.startswith(('-', '+')) diff = float(diff) # Make sure comparison is done on the requested order if diff < 0: diff = -diff counts = counts[::-1] # Actual comparison d = counts[1] - counts[0] if not sign: d = abs(d) #logging.debug("RULE: {}, COUNTS: {}, DIFF: {}, SIGN: {}, ACTUAL DIFF: {}".format(self.name, counts, diff, sign, d)) if percent: if not counts[0]: return True # avoid division by zero return (d / counts[0]) * 100 >= diff else: return d >= diff if __name__ == "__main__": from Config.Config import G_CONFIG G_CONFIG.config_logging() task = Task(sys.argv[1]) # Launch RDD parallel processing #task.get_rdd().mapPartitionsWithIndex(CleanTask(task)).foreachPartition(Task.save_segments) # Run sequentiak Task.maintain_segments(CleanTask(task), task.get_langs(), task.job['params']['filter']) task.finalize()
            # NOTE(review): fragment — the enclosing `def` and the loop that
            # fills `batch_mget` start before this view; indentation is
            # reconstructed. Yields segments in batches of BATCH_SIZE.
            if len(batch_mget) >= self.BATCH_SIZE:
                # Full batch buffered: fetch and yield its segments, then reset.
                for segment in db._generate_batch(batch_mget, self.domains):
                    yield segment
                batch_mget = []
        # Generate segments for remaining incomplete batch
        for segment in db._generate_batch(batch_mget, self.domains):
            yield segment

    def run_sequential(self):
        """Run the generate job sequentially: stream segments for the job's
        language pair / pivot / domain from ES into Task.save_segments."""
        params = self.job['params']
        # init access to ES DB
        db = TMDbApi()
        Task.save_segments(
            db.generate((params['slang'], params['tlang']), params['plang'], params['domain']))


if __name__ == "__main__":
    from Config.Config import G_CONFIG
    G_CONFIG.config_logging()
    task = Task(sys.argv[1])
    #task.get_rdd_generate().mapPartitionsWithIndex(GenerateTask(task)).foreachPartition(Task.save_segments)
    rdd = task.get_rdd_generate().mapPartitionsWithIndex(GenerateTask(task))
    Task.save_segments( rdd.toLocalIterator() ) # save partitions sequentially as we have already bulk parallelization in save_segments()
    #task.run_sequential()
    task.finalize()
def run_sequential(self):
    """Delete the segments selected by the job's filter.

    Delegates to Task.delete_segments with this task's languages, the
    job's filter, and the duplicates-only switch.
    """
    job_params = self.job['params']
    Task.delete_segments(self, self.langs, job_params['filter'], job_params['duplicates_only'])