Code example #1
File: nih_dedupe_task.py  Project: hmessafi/nesta
    def prepare(self):
        if self.test:
            self.process_batch_size = 1000
            logging.warning("Batch size restricted to "
                            f"{self.process_batch_size}"
                            " while in test mode")

        es_mode = 'dev' if self.test else 'prod'
        es, es_config = setup_es(es_mode,
                                 self.test,
                                 self.drop_and_recreate,
                                 dataset='nih',
                                 aliases='health_scanner',
                                 increment_version=True)

        # Count articles from the old index
        _old_config = es_config.copy()
        _old_config['index'] = es_config['old_index']
        logging.info(f"Collected article IDs...")
        _ids = get_es_ids(es, _old_config, size=10000)
        logging.info(f"Collected {len(_ids)} IDs")
        done_ids = get_es_ids(es, es_config, size=10000)

        # Generate the job params
        job_params = []
        batches = split_batches(_ids, self.process_batch_size)
        for count, batch in enumerate(batches, 1):
            # Magical '0.3' is the lower end of the deduplication
            # fraction found by inspection
            done = sum(_id in done_ids for _id in batch) / len(batch) > 0.3
            # write batch of ids to s3
            batch_file = ''
            if not done:
                batch_file = put_s3_batch(batch, self.intermediate_bucket,
                                          self.routine_id)
            params = {
                "batch_file": batch_file,
                "config": 'mysqldb.config',
                "bucket": self.intermediate_bucket,
                "done": done,
                'outinfo': es_config['host'],
                'out_port': es_config['port'],
                'out_index': es_config['index'],
                'in_index': es_config['old_index'],
                'out_type': es_config['type'],
                'aws_auth_region': es_config['region'],
                'entity_type': 'paper',
                'test': self.test,
                'routine_id': self.routine_id
            }

            job_params.append(params)
            if self.test and count > 1:
                logging.warning("Breaking after 2 batches "
                                "while in test mode.")
                logging.warning(job_params)
                break
        logging.info("Batch preparation completed, "
                     f"with {len(job_params)} batches")
        return job_params
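
This `prepare` method, like the ones that follow, relies on a `split_batches` helper to chunk the collected IDs before writing them to S3. The helper itself is not part of these snippets; a minimal sketch, assuming it does nothing more than yield fixed-size chunks from any iterable, might look like this:

def split_batches(iterable, batch_size):
    """Yield successive lists of at most `batch_size` items.

    Sketch only; the real nesta helper may differ in detail.
    """
    batch = []
    for item in iterable:
        batch.append(item)
        if len(batch) == batch_size:
            yield batch
            batch = []
    if batch:  # final, possibly short, batch
        yield batch

Written over a generic iterable, it also accepts the sets and generators that the later examples pass in, not just lists.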
Code example #2
File: estask.py  Project: yitzikc/nesta
    def prepare(self):
        '''Chunk up elasticsearch data, and submit batch
        jobs over those chunks.'''
        if self.test:
            self.process_batch_size = 1000
            logging.warning("Batch size restricted to "
                            f"{self.process_batch_size}"
                            " while in test mode")

        # Setup elasticsearch and extract all ids
        es_mode = 'dev' if self.test else 'prod'
        es, es_config = setup_es(es_mode,
                                 self.test,
                                 drop_and_recreate=False,
                                 dataset=self.dataset,
                                 increment_version=False)
        ids = get_es_ids(es, es_config, size=10000)  # All ids in this index
        ids = ids - self._done_ids  # Don't repeat done ids

        # Override the default index if specified
        es_config['index'] = (self.index if self.index is not None else
                              es_config['index'])

        # Generate the job params
        job_params = []
        batches = split_batches(ids, self.process_batch_size)
        for count, batch in enumerate(batches, 1):
            done = False  # Already taken care of with _done_ids
            # write batch of ids to s3
            batch_file = ''
            if not done:
                batch_file = put_s3_batch(batch, self.intermediate_bucket,
                                          self.routine_id)
            params = {
                "batch_file": batch_file,
                "config": self.sql_config_filename,
                "bucket": self.intermediate_bucket,
                "done": done,
                "count": len(ids),
                'outinfo': es_config['host'],
                'out_port': es_config['port'],
                'index': es_config['index'],
                'out_type': es_config['type'],
                'aws_auth_region': es_config['region'],
                'test': self.test,
                'routine_id': self.routine_id,
                'entity_type': self.entity_type,
                **self.kwargs
            }
            job_params.append(params)
            # Test mode
            if self.test and count > 1:
                logging.warning("Breaking after 2 batches "
                                "while in test mode.")
                logging.warning(job_params)
                break
        # Done
        logging.info("Batch preparation completed, "
                     f"with {len(job_params)} batches")
        return job_params
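
The other shared helper is `put_s3_batch`, which stores one batch of IDs in the intermediate bucket and returns the key that the batch job later reads back. It is not shown in these snippets either; a hedged sketch with boto3, assuming the batch is serialised as a JSON list under a key derived from the job name, could be:

import json
import uuid

import boto3


def put_s3_batch(batch, bucket, job_name):
    """Write a batch of IDs to S3 and return the object key.

    Sketch only: the key naming and JSON serialisation here are
    assumptions, not the actual nesta implementation.
    """
    key = f"{job_name}-{uuid.uuid4()}"
    boto3.resource('s3').Object(bucket, key).put(Body=json.dumps(list(batch)))
    return key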
Code example #3
    def prepare(self):
        if self.test:
            self.process_batch_size = 100
        # MySQL setup
        database = 'dev' if self.test else 'production'
        engine = get_mysql_engine(self.db_config_env, 'mysqldb', database)

        # Subtract off all done ids
        Base.metadata.create_all(engine)
        with db_session(engine) as session:
            result = session.query(Project.rcn).all()
            done_rcn = {r[0] for r in result}

        # Get all possible ids (or "RCN" in Cordis-speak)
        nrows = 1000 if self.test else None
        all_rcn = set(
            get_framework_ids('fp7', nrows=nrows) +
            get_framework_ids('h2020', nrows=nrows))
        all_rcn = all_rcn - done_rcn

        # Generate the job params
        batches = split_batches(all_rcn, self.process_batch_size)
        params = [{
            "batch_file": put_s3_batch(batch, self.intermediate_bucket,
                                       self.routine_id),
            "config": 'mysqldb.config',
            "db_name": database,
            "bucket": self.intermediate_bucket,
            "outinfo": 'dummy',
            "done": False,
            'test': self.test
        } for batch in batches]
        return params
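
Code examples #3 to #6 talk to MySQL through `get_mysql_engine` and a `db_session` context manager, both defined elsewhere in the project. A minimal sketch of the `db_session` pattern in plain SQLAlchemy, assuming it simply commits on success and rolls back on error, is:

from contextlib import contextmanager

from sqlalchemy.orm import sessionmaker


@contextmanager
def db_session(engine):
    """Yield a SQLAlchemy session bound to `engine`.

    Sketch of the standard pattern; the nesta helper may layer
    logging or retries on top of this.
    """
    session = sessionmaker(bind=engine)()
    try:
        yield session
        session.commit()
    except Exception:
        session.rollback()
        raise
    finally:
        session.close()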
Code example #4
    def prepare(self):
        '''Prepare the batch job parameters'''
        db = 'dev' if self.test else 'production'
        engine = get_mysql_engine(self.db_config_env, 'mysqldb', db)

        with db_session(engine) as session:
            # keep rows with a non-null abstract, de-duplicated on abstract text
            results = (session
                       .query(Projects.id, func.length(Projects.abstractText))
                       .filter(Projects.abstractText.isnot(None))
                       .distinct(Projects.abstractText)
                       .all())

        # Keep documents at least as long as the 10th-percentile length.
        perc = np.percentile([r[1] for r in results], 10)
        all_ids = [r.id for r in results if r[1] >= perc]

        job_params = []
        for count, batch in enumerate(split_batches(all_ids,
                                                    self.process_batch_size),
                                      start=1):
            # write batch of ids to s3
            key = f'text2vec-{self.routine_id}-{self.date}-{count}'
            batch_file = put_s3_batch(batch, self.intermediate_bucket, key)
            done = key in DONE_KEYS
            params = {
                "config": "mysqldb.config",
                "bucket": self.intermediate_bucket,
                "batch_file": batch_file,
                "db_name": db,
                "done": done,
                'outinfo': f"s3://{self.intermediate_bucket}/{key}",  # mark as done
                'test': self.test,
            }
            job_params.append(params)
            logging.info(params)
        return job_params
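
The 10th-percentile cut in code example #4 is easy to verify in isolation. A toy run of the same NumPy call, using made-up (id, length) pairs in place of the real query results:

import numpy as np

# Hypothetical (id, abstract_length) rows standing in for the query results
results = [(1, 5), (2, 40), (3, 80), (4, 120), (5, 300),
           (6, 15), (7, 60), (8, 90), (9, 200), (10, 250)]

# 10th percentile of the lengths; anything at least this long is kept
perc = np.percentile([length for _, length in results], 10)
kept_ids = [doc_id for doc_id, length in results if length >= perc]

print(perc)      # 14.0 for these toy numbers
print(kept_ids)  # id 1 (length 5) is the only document dropped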
Code example #5
    def prepare(self):
        if self.test:
            self.process_batch_size = 1000
            logging.warning("Batch size restricted to "
                            f"{self.process_batch_size}"
                            " while in test mode")

        # MySQL setup
        self.database = 'dev' if self.test else 'production'
        engine = get_mysql_engine(self.db_config_env, 'mysqldb', self.database)

        # Elasticsearch setup
        es_mode = 'dev' if self.test else 'prod'
        es, es_config = setup_es(es_mode,
                                 self.test,
                                 self.drop_and_recreate,
                                 dataset='crunchbase',
                                 aliases='health_scanner')

        # Get set of existing ids from elasticsearch via scroll
        scanner = scan(es,
                       query={"_source": False},
                       index=es_config['index'],
                       doc_type=es_config['type'])
        existing_ids = {s['_id'] for s in scanner}
        logging.info(f"Collected {len(existing_ids)} existing in "
                     "Elasticsearch")

        # Get set of all organisations from mysql
        all_orgs = list(all_org_ids(engine))
        logging.info(f"{len(all_orgs)} organisations in MySQL")

        # Remove previously processed
        orgs_to_process = list(org for org in all_orgs
                               if org not in existing_ids)
        logging.info(f"{len(orgs_to_process)} to be processed")

        job_params = []
        for count, batch in enumerate(
                split_batches(orgs_to_process, self.process_batch_size), 1):
            logging.info(f"Processing batch {count} with size {len(batch)}")

            # write batch of ids to s3
            batch_file = put_s3_batch(batch, self.intermediate_bucket,
                                      'crunchbase_to_es')
            params = {
                "batch_file": batch_file,
                "config": 'mysqldb.config',
                "db_name": self.database,
                "bucket": self.intermediate_bucket,
                "done": False,
                'outinfo': es_config['host'],
                'out_port': es_config['port'],
                'out_index': es_config['index'],
                'out_type': es_config['type'],
                'aws_auth_region': es_config['region'],
                'entity_type': 'company',
                "test": self.test
            }

            logging.info(params)
            job_params.append(params)
            if self.test and count > 1:
                logging.warning("Breaking after 2 batches while in "
                                "test mode.")
                break

        logging.warning("Batch preparation completed, "
                        f"with {len(job_params)} batches")
        return job_params
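
Code example #5 collects existing IDs with a raw `scan`, while the other examples call a `get_es_ids` helper for the same job. If that helper is just a thin wrapper over the same scroll machinery, a sketch of it (the `size` handling and the config keys used are assumptions) could be:

from elasticsearch.helpers import scan


def get_es_ids(es, es_config, size=10000):
    """Return the set of all document _ids in the configured index.

    Sketch only: assumes es_config carries an 'index' key and that
    `size` is simply the scroll page size.
    """
    scanner = scan(es,
                   query={"_source": False},  # ids only, skip the bodies
                   index=es_config['index'],
                   size=size)
    return {hit['_id'] for hit in scanner}

Returning a set is what lets code example #2 subtract `self._done_ids` directly and code example #6 test membership cheaply.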
Code example #6
File: sql2estask.py  Project: yitzikc/nesta
    def prepare(self):
        if self.test:
            self.process_batch_size = 1000
            logging.warning("Batch size restricted to "
                            f"{self.process_batch_size}"
                            " while in test mode")

        # MySQL setup
        database = 'dev' if self.test else 'production'
        engine = get_mysql_engine(self.db_config_env, self.db_section,
                                  database)

        # Elasticsearch setup
        es_mode = 'dev' if self.test else 'prod'
        es, es_config = setup_es(es_mode,
                                 self.test,
                                 self.drop_and_recreate,
                                 dataset=self.dataset,
                                 aliases=self.aliases)

        # Get set of existing ids from elasticsearch via scroll
        existing_ids = get_es_ids(es, es_config)
        logging.info(f"Collected {len(existing_ids)} existing in "
                     "Elasticsearch")

        # Get set of all organisations from mysql
        with db_session(engine) as session:
            result = session.query(self.id_field).all()
            all_ids = {r[0] for r in result}
        logging.info(f"{len(all_ids)} organisations in MySQL")

        # Remove previously processed
        ids_to_process = (org for org in all_ids if org not in existing_ids)

        job_params = []
        for count, batch in enumerate(
                split_batches(ids_to_process, self.process_batch_size), 1):
            # write batch of ids to s3
            batch_file = put_s3_batch(batch, self.intermediate_bucket,
                                      self.routine_id)
            params = {
                "batch_file": batch_file,
                "config": 'mysqldb.config',
                "db_name": database,
                "bucket": self.intermediate_bucket,
                "done": False,
                'outinfo': es_config['host'],
                'out_port': es_config['port'],
                'out_index': es_config['index'],
                'out_type': es_config['type'],
                'aws_auth_region': es_config['region'],
                'entity_type': self.entity_type,
                'test': self.test,
                'routine_id': self.routine_id
            }
            params.update(self.kwargs)

            logging.info(params)
            job_params.append(params)
            if self.test and count > 1:
                logging.warning("Breaking after 2 batches while in "
                                "test mode.")
                logging.warning(job_params)
                break

        logging.warning("Batch preparation completed, "
                        f"with {len(job_params)} batches")
        return job_params
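
On the worker side, each dict in `job_params` parameterises one batch job, which first pulls its slice of IDs back out of S3 before writing documents to the index named in 'out_index'. That code is not part of this section; a hedged sketch of just the read-back step, mirroring the JSON format assumed in the `put_s3_batch` sketch above, might be:

import json

import boto3


def load_batch_ids(bucket, batch_file):
    """Fetch the list of IDs that `prepare` wrote for this batch.

    Sketch only: matches the assumed JSON serialisation of the
    earlier put_s3_batch sketch, not the actual nesta run code.
    """
    obj = boto3.resource('s3').Object(bucket, batch_file)
    return json.loads(obj.get()['Body'].read())

A job handed params from code example #6 would call load_batch_ids(params['bucket'], params['batch_file']) and skip the work entirely whenever params['done'] is true.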