Ejemplo n.º 1
0
 def run_sequential(self):
     """Generate segments through TMDbApi and hand them to Task.save_segments().

     Reads the 'slang', 'tlang', 'plang' and 'domain' entries of
     self.job['params']; the first two are passed to generate() as a
     (slang, tlang) tuple.
     """
     job_params = self.job['params']
     # Open access to the ES DB
     api = TMDbApi()
     lang_pair = (job_params['slang'], job_params['tlang'])
     segments = api.generate(lang_pair, job_params['plang'],
                             job_params['domain'])
     Task.save_segments(segments)
Ejemplo n.º 2
0
 def run_sequential(self):
     """Parse the job's TMX file and hand the result to Task.save_segments().

     The parser is built from self.job['params'] ('file', 'domain' and the
     optional 'lang_pairs', which defaults to an empty list) plus
     self.job['username'].
     """
     job_params = self.job['params']
     tmx_file = job_params['file']
     parser_options = {
         'domain': job_params['domain'],
         'lang_pairs': job_params.get('lang_pairs', []),
         'username': self.job['username'],
     }
     tmx_parser = TMXParser(tmx_file, **parser_options)
     parsed = tmx_parser.parse()
     Task.save_segments(parsed)
Ejemplo n.º 3
0
        iob = []
        is_inside = False
        for w in pos:
            if not re.search("<.*>", w):
                if is_inside:
                    iob.append("{}/I-T".format(w))
                    is_inside = False
                else:
                    iob.append("{}/O".format(w))
            elif iob and w == tag:
                iob[-1] = iob[-1].replace('/O', '/B-T')
                is_inside = True
            else:
                # Other tags - skip them
                pass
        return " ".join(iob)

    def is_self_closing_tag(self, tag):
        """Check whether tag starts with a self-closing tag such as ``<x/>``.

        Returns the regex match object (truthy) when the beginning of tag
        matches, otherwise None.
        """
        self_closing = re.compile('<[^<>]+/>')
        return self_closing.match(tag)


if __name__ == "__main__":
    # Script entry point: the first command-line argument is passed to Task().
    from Config.Config import G_CONFIG

    G_CONFIG.config_logging()

    task = Task(sys.argv[1])
    # Launch RDD parallel processing: run PosTagTask over every partition,
    # then save each resulting partition with Task.save_segments.
    tagged_rdd = task.get_rdd().mapPartitionsWithIndex(PosTagTask(task))
    tagged_rdd.foreachPartition(Task.save_segments)
    task.finalize()
Ejemplo n.º 4
0
        # Explicit sign?
        sign = diff.startswith(('-', '+'))

        diff = float(diff)
        # Make sure comparison is done on the requested order
        if diff < 0:
            diff = -diff
            counts = counts[::-1]
        # Actual comparison
        d = counts[1] - counts[0]
        if not sign: d = abs(d)
        #logging.debug("RULE: {}, COUNTS: {}, DIFF: {}, SIGN: {}, ACTUAL DIFF: {}".format(self.name, counts, diff, sign, d))
        if percent:
            if not counts[0]: return True  # avoid division by zero
            return (d / counts[0]) * 100 >= diff
        else:
            return d >= diff


if __name__ == "__main__":
    # Script entry point: the first command-line argument is passed to Task().
    from Config.Config import G_CONFIG

    G_CONFIG.config_logging()

    task = Task(sys.argv[1])
    # RDD parallel processing (currently disabled):
    #task.get_rdd().mapPartitionsWithIndex(CleanTask(task)).foreachPartition(Task.save_segments)
    # Run sequentially
    cleaner = CleanTask(task)
    segment_filter = task.job['params']['filter']
    Task.maintain_segments(cleaner, task.get_langs(), segment_filter)
    task.finalize()
Ejemplo n.º 5
0
            if len(batch_mget) >= self.BATCH_SIZE:
                for segment in db._generate_batch(batch_mget, self.domains):
                    yield segment
                batch_mget = []
        # Generate segments for remaining incomplete batch
        for segment in db._generate_batch(batch_mget, self.domains):
            yield segment

    def run_sequential(self):
        """Generate segments through TMDbApi and hand them to Task.save_segments().

        Reads the 'slang', 'tlang', 'plang' and 'domain' entries of
        self.job['params']; the first two are passed to generate() as a
        (slang, tlang) tuple.
        """
        job_params = self.job['params']
        # Open access to the ES DB
        api = TMDbApi()
        lang_pair = (job_params['slang'], job_params['tlang'])
        segments = api.generate(lang_pair, job_params['plang'],
                                job_params['domain'])
        Task.save_segments(segments)


if __name__ == "__main__":
    # Script entry point: the first command-line argument is passed to Task().
    from Config.Config import G_CONFIG

    G_CONFIG.config_logging()

    task = Task(sys.argv[1])
    # Fully parallel variant (currently disabled):
    #task.get_rdd_generate().mapPartitionsWithIndex(GenerateTask(task)).foreachPartition(Task.save_segments)
    generated_rdd = task.get_rdd_generate().mapPartitionsWithIndex(
        GenerateTask(task))
    # Save partitions sequentially, as save_segments() already does bulk
    # parallelization itself.
    local_segments = generated_rdd.toLocalIterator()
    Task.save_segments(local_segments)

    #task.run_sequential()
    task.finalize()
Ejemplo n.º 6
0
 def run_sequential(self):
     """Delete segments by delegating to Task.delete_segments().

     Passes this task, self.langs, and the 'filter' and 'duplicates_only'
     entries of self.job['params'].
     """
     langs = self.langs
     segment_filter = self.job['params']['filter']
     duplicates_only = self.job['params']['duplicates_only']
     Task.delete_segments(self, langs, segment_filter, duplicates_only)