def combine(self, job_params):
        '''Combine the outputs from the batch jobs'''

        # Retrieve the batched data
        country_data = defaultdict(dict)
        n_rows = 0
        for i, params in enumerate(job_params):
            print(i, " of ", len(job_params))
            _body = s3.S3Target(params["outinfo"]).open("rb")
            _country_data = json.loads(_body.read().decode('utf-8'))
            for country, data in _country_data.items():
                for var_name, data_row in data.items():
                    n_rows += 1
                    country_data[country][var_name] = data_row
        print(f"Got {n_rows} rows of data")

        # Merge with metadata, then flatten and clean
        country_metadata = get_worldbank_resource("countries")
        flat_country_data = flatten_country_data(country_data,
                                                 country_metadata)
        cleaned_data = clean_variable_names(flat_country_data)

        # Commit the data
        engine = get_mysql_engine("MYSQLDB", "mysqldb",
                                  self.db_config['database'])
        Base.metadata.create_all(engine)
        Session = sessionmaker(engine)
        session = Session()
        for row in cleaned_data:
            country = WorldbankCountry(**row)
            session.add(country)
        session.commit()
        session.close()
        self.output().touch()
    def requires(self):
        '''Collects the database configurations and executes the central task.'''
        logging.getLogger().setLevel(logging.INFO)
        _routine_id = f"{self.date}-{self.iso2}-{self.category}-{self.production}"

        engine = get_mysql_engine("MYSQLDB", "mysqldb",
                                  "production" if self.production else "dev")
        Base.metadata.create_all(engine)

        yield GroupDetailsTask(
            iso2=self.iso2,
            category=self.category,
            _routine_id=_routine_id,
            batchable=BATCHABLE.format("group_details"),
            env_files=[
                find_filepath_from_pathstub("/nesta/nesta"),
                find_filepath_from_pathstub("/config/mysqldb.config")
            ],
            job_def="py36_amzn1_image",
            job_name="GroupDetails-%s" % _routine_id,
            job_queue="HighPriority",
            region_name="eu-west-2",
            poll_time=10,
            max_live_jobs=100,
            test=(not self.production))
class TestWiktionaryNgram(unittest.TestCase):
    '''Check that the WiktionaryNgram ORM works as expected'''
    engine = get_mysql_engine("MYSQLDBCONF", "mysqldb")
    Session = sessionmaker(engine)

    def setUp(self):
        '''Create the temporary table'''
        Base.metadata.create_all(self.engine)

    def tearDown(self):
        '''Drop the temporary table'''
        Base.metadata.drop_all(self.engine)

    def test_good_relation(self):
        session = self.Session()
        ngram = WiktionaryNgram(ngram="something")
        new_ngram = WiktionaryNgram(ngram="something")

        # Add the group and member
        session.add(ngram)
        session.commit()

        # Shouldn't be able to add duplicate data
        del ngram
        session.add(new_ngram)
        self.assertRaises(IntegrityError, session.commit)
        session.rollback()
        session.close()
Example #4
def run():
    table_name = os.environ["BATCHPAR_table_name"]
    url = os.environ["BATCHPAR_url"]
    db_name = os.environ["BATCHPAR_db_name"]
    s3_path = os.environ["BATCHPAR_outinfo"]

    # Setup the database connectors
    engine = get_mysql_engine("BATCHPAR_config", "mysqldb", db_name)
    try_until_allowed(Base.metadata.create_all, engine)
    _class = get_class_by_tablename(Base, table_name)
    Session = try_until_allowed(sessionmaker, engine)
    session = try_until_allowed(Session)

    # Commit the data
    all_pks = set()
    objs = []
    pkey_cols = _class.__table__.primary_key.columns
    for row in iterrows(url):
        if len(row) == 0:
            continue
        if session.query(exists(_class, **row)).scalar():
            continue
        pk = tuple([row[pkey.name] for pkey in pkey_cols])
        if pk in all_pks:
            continue
        all_pks.add(pk)
        objs.append(_class(**row))
    session.bulk_save_objects(objs)
    session.commit()
    session.close()

    # Mark the task as done
    s3 = boto3.resource('s3')
    s3_obj = s3.Object(*parse_s3_path(s3_path))
    s3_obj.put(Body="")
Example #5
def run():
    batch_file = os.environ['BATCHPAR_batch_file']
    db = os.environ['BATCHPAR_db_name']
    bucket = os.environ['BATCHPAR_bucket']

    # database setup
    engine = get_mysql_engine('BATCHPAR_config', 'mysqldb', db)

    # collect data
    target = f"s3://{bucket}/{batch_file}"
    df = pd.read_json(target, orient='records')
    logging.info(f"{len(df)} locations to geocode")

    # append country iso codes and continent
    df = country_iso_code_dataframe(df)
    logging.info("Country ISO codes appended")

    # geocode, appending latitude and longitude columns, using the q= query method
    df = geocode_batch_dataframe(df, query_method='query_only')
    logging.info("Geocoding complete")

    # remove city and country columns and append done column
    df = df.drop(['city', 'country'], axis=1)
    df['done'] = True

    # convert to list of dict and output to database
    rows = df.to_dict(orient='records')

    logging.info(f"Writing {len(rows)} rows to database")
    with db_session(engine) as session:
        session.bulk_update_mappings(Geographic, rows)
    logging.warning("Batch task complete")
Example #6
def load_arxiv_categories(db_config, db, bucket, cat_file):
    """Loads a file of categories and descriptions into mysql from a csv file on s3.

    Args:
        db_config (str): environmental variable pointing to mysql config file
        db (str): config header to use from the mysql config file
        bucket (str): s3 bucket where the csv is held
        cat_file (str): path to the file on s3
    """
    target = f's3://{bucket}/{cat_file}'
    categories = pd.read_csv(target)

    # Setup the database connectors
    engine = get_mysql_engine(db_config, "mysqldb", db)
    try_until_allowed(Base.metadata.create_all, engine)
    Session = try_until_allowed(sessionmaker, engine)
    session = try_until_allowed(Session)

    logging.info(
        f'found {session.query(Category).count()} existing categories')
    for idx, data in categories.iterrows():
        if not _category_exists(session, data['id']):
            _add_category(session,
                          cat_id=data['id'],
                          description=data['description'])
    session.close()
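
try_until_allowed appears throughout these snippets, wrapping Base.metadata.create_all and sessionmaker so that jobs tolerate a temporarily unreachable database. A plausible retry loop, sketched under the assumption that only SQLAlchemy's OperationalError should trigger a retry:

import time
import logging

from sqlalchemy.exc import OperationalError

def try_until_allowed(f, *args, **kwargs):
    """Call f(*args, **kwargs), retrying while the database refuses connections."""
    while True:
        try:
            return f(*args, **kwargs)
        except OperationalError:
            logging.warning("Database connection not allowed yet, retrying in 5 seconds")
            time.sleep(5)
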
Example #7
    def run(self):
        """Apply health labels using model."""
        # database setup
        database = 'dev' if self.test else 'production'
        logging.warning(f"Using {database} database")
        self.engine = get_mysql_engine(self.db_config_env, 'mysqldb', database)
        try_until_allowed(Base.metadata.create_all, self.engine)

        # collect and unpickle models from s3
        logging.info("Collecting models from S3")
        s3 = boto3.resource('s3')
        vectoriser_obj = s3.Object(self.bucket, self.vectoriser_key)
        vectoriser = pickle.loads(
            vectoriser_obj.get()['Body']._raw_stream.read())
        classifier_obj = s3.Object(self.bucket, self.classifier_key)
        classifier = pickle.loads(
            classifier_obj.get()['Body']._raw_stream.read())

        # retrieve organisations and categories
        nrows = 1000 if self.test else None
        logging.info("Collecting organisations from database")
        with db_session(self.engine) as session:
            orgs = (session.query(Organization.id).filter(
                Organization.is_health.is_(None)).limit(nrows).all())

        for batch_count, batch in enumerate(
                split_batches(orgs, self.insert_batch_size), 1):
            batch_orgs_with_cats = []
            for (org_id, ) in batch:
                with db_session(self.engine) as session:
                    categories = (session.query(
                        OrganizationCategory.category_name).filter(
                            OrganizationCategory.organization_id ==
                            org_id).all())
                # categories should be a list of str, comma separated: ['cat,cat,cat', 'cat,cat']
                categories = ','.join(cat_name for (cat_name, ) in categories)
                batch_orgs_with_cats.append({
                    'id': org_id,
                    'categories': categories
                })

            logging.debug(
                f"{len(batch_orgs_with_cats)} organisations retrieved from database"
            )

            logging.debug("Predicting health flags")
            batch_orgs_with_flag = predict_health_flag(batch_orgs_with_cats,
                                                       vectoriser, classifier)

            logging.debug(
                f"{len(batch_orgs_with_flag)} organisations to update")
            with db_session(self.engine) as session:
                session.bulk_update_mappings(Organization,
                                             batch_orgs_with_flag)
            logging.info(
                f"{batch_count} batches health labeled and written to db")

        # mark as done
        logging.warning("Task complete")
        self.output().touch()
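
split_batches is used here and in several later examples to chunk rows before bulk_update_mappings or insert_data. A minimal generator consistent with the assumed signature split_batches(iterable, batch_size):

def split_batches(iterable, batch_size):
    """Yield successive lists of at most batch_size items (assumed behaviour)."""
    batch = []
    for item in iterable:
        batch.append(item)
        if len(batch) == batch_size:
            yield batch
            batch = []
    if batch:
        yield batch

# e.g. list(split_batches(range(5), 2)) -> [[0, 1], [2, 3], [4]]
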
Example #8
def extract_data(limit=None, db='patstat_2019_05_13'):
    '''Get all EU patents, grouped and aggregated by their doc families'''
    engine = get_mysql_engine('MYSQLDB', 'mysqldb', db)
    session = generate_temp_tables(engine, limit=limit)
    dfs = temp_tables_to_dfs(engine, limit=limit)
    session.close()
    del session
    return concat_dfs(dfs)
Example #9
def run():
    test = literal_eval(os.environ["BATCHPAR_test"])
    bucket = os.environ['BATCHPAR_bucket']
    batch_file = os.environ['BATCHPAR_batch_file']

    db_name = os.environ["BATCHPAR_db_name"]
    es_host = os.environ['BATCHPAR_outinfo']
    es_port = int(os.environ['BATCHPAR_out_port'])
    es_index = os.environ['BATCHPAR_out_index']
    es_type = os.environ['BATCHPAR_out_type']
    entity_type = os.environ["BATCHPAR_entity_type"]
    aws_auth_region = os.environ["BATCHPAR_aws_auth_region"]

    # database setup
    logging.info('Retrieving engine connection')
    engine = get_mysql_engine("BATCHPAR_config", "mysqldb", db_name)

    # es setup
    logging.info('Connecting to ES')
    strans_kwargs = {
        'filename': 'eurito/cordis-eu.json',
        'from_key': 'tier_0',
        'to_key': 'tier_1',
        'ignore': ['id']
    }
    es = ElasticsearchPlus(hosts=es_host,
                           port=es_port,
                           aws_auth_region=aws_auth_region,
                           no_commit=("AWSBATCHTEST" in os.environ),
                           entity_type=entity_type,
                           strans_kwargs=strans_kwargs,
                           null_empty_str=True,
                           coordinates_as_floats=True,
                           listify_terms=True,
                           do_sort=False,
                           ngram_fields=['textBody_description_project'])

    # collect file
    logging.info('Retrieving project ids')
    s3 = boto3.resource('s3')
    obj = s3.Object(bucket, batch_file)
    project_ids = json.loads(obj.get()['Body']._raw_stream.read())
    logging.info(f"{len(project_ids)} project IDs " "retrieved from s3")

    # Index each matching Cordis project into Elasticsearch
    logging.info('Processing rows')
    with db_session(engine) as session:
        for count, obj in enumerate((session.query(Project).filter(
                Project.rcn.in_(project_ids)).all())):
            row = object_to_dict(obj)
            row = reformat_row(row)
            es.index(index=es_index,
                     doc_type=es_type,
                     id=row.pop('rcn'),
                     body=row)
            if not count % 1000:
                logging.info(f"{count} rows loaded to " "elasticsearch")
Example #10
    def run(self):
        # database setup
        database = 'dev' if self.test else 'production'
        logging.info(f"Using {database} database")
        self.engine = get_mysql_engine(self.db_config_env, 'mysqldb', database)

        # s3 setup
        s3 = boto3.resource('s3')
        intermediate_file = s3.Object(BUCKET, f"mag_estimate_{database}.json")

        eu = get_eu_countries()

        with db_session(self.engine) as session:
            eu_grid_ids = {
                i.id
                for i in (session.query(Institute.id).filter(
                    Institute.country.in_(eu)).all())
            }
            logging.info(f"{len(eu_grid_ids):,} EU institutes in GRID")

        # collect previous and exclude
        try:
            previous = json.loads(
                intermediate_file.get()['Body']._raw_stream.read())

            done_institutes = set(previous['institutes'])
            logging.info(
                f"{len(done_institutes)} previously processed institutes retrieved"
            )
            eu_grid_ids = eu_grid_ids - done_institutes
            logging.info(f"{len(eu_grid_ids)} to process")

            paper_ids = set(previous['paper_ids'])
            logging.info(
                f"{len(paper_ids)} previously processed papers retrieved")
        except ClientError:
            logging.info("Unable to load previous file, starting from scratch")
            done_institutes = set()
            paper_ids = set()

        limit = 100 if self.test else None
        save_every = 50 if self.test else 1000000

        total = count_papers(eu_grid_ids,
                             done_institutes,
                             paper_ids,
                             intermediate_file,
                             save_every=save_every,
                             limit=limit)

        # mark as done
        logging.info("Task complete")
        logging.info(f"Total EU papers found: {total:,}")
        self.output().touch()
Example #11
 def __init__(self, config_filepath=None, database="dev"):
     if config_filepath is not None:
         os.environ["MYSQLDBCONF"] = config_filepath
     engine = get_mysql_engine("MYSQLDBCONF", "mysqldb", database=database)
     Session = sessionmaker(engine)
     Base.metadata.create_all(engine)
     session = Session()
     # Split out n-grams by size (speeds up the extraction later)
     self.ngrams = defaultdict(set)
     for row in session.query(WiktionaryNgram).all():
         size = row.ngram.count("_") + 1
         self.ngrams[size].add(row.ngram)
Example #12
 def __enter__(self):
     '''Set up the database connection, session and query stub.'''
     engine = get_mysql_engine("MYSQLDBCONF",
                               "mysqldb",
                               database=self.database)
     engine.execution_options(stream_results=True)
     Session = sessionmaker(engine)
     Base.metadata.create_all(engine)
     self.session = Session()
     self.query_stub = self.session.query(Abstracts).order_by(
         Abstracts.application_id)
     return self
Example #13
    def run(self):
        db = 'production' if not self.test else 'dev'

        keys = self.get_abstract_file_keys(bucket, key_prefix)
        
        engine = get_mysql_engine(self.db_config_env, 'mysqldb', db)
        with db_session(engine) as session:
            
            # projects present in the Projects table (used to filter orphan mesh results)
            existing_projects = set()
            projects = session.query(Projects.application_id).distinct()
            for p in projects:
                existing_projects.add(int(p.application_id))

            # projects which already have mesh terms linked
            projects_done = set()
            projects_mesh = session.query(ProjectMeshTerms.project_id).distinct()
            for p in projects_mesh:
                projects_done.add(int(p.project_id))
            
            mesh_term_ids = {int(m.id) for m in session.query(MeshTerms.id).all()}

        logging.info('Inserting associations')
        
        for key_count, key in enumerate(keys):
            if self.test and (key_count > 2):
                continue
            # collect mesh results from the s3 file and group them by project id
            # each project id has a set of mesh terms and corresponding term ids
            df_mesh = retrieve_mesh_terms(bucket, key)
            project_terms = self.format_mesh_terms(df_mesh)
            # go through documents
            for project_count, (project_id, terms) in enumerate(project_terms.items()):
                rows = []
                if self.test and (project_count > 2):
                    continue
                if (project_id in projects_done) or (project_id not in existing_projects):
                    continue

                for term, term_id in zip(terms['terms'], terms['ids']):
                    term_id = int(term_id)
                    # add term to mesh term table if not present
                    if term_id not in mesh_term_ids:
                        objs = insert_data(
                                self.db_config_env, 'mysqldb', db, Base, MeshTerms, 
                                [{'id': term_id, 'term': term}], low_memory=True)
                        mesh_term_ids.update({term_id})
                    # prepare row to be added to project-mesh_term link table
                    rows.append({'project_id': project_id, 'mesh_term_id': term_id})
                # insert rows into the link table
                insert_data(self.db_config_env, 'mysqldb', db, Base, 
                        ProjectMeshTerms, rows, low_memory=True)
        self.output().touch()  # mark the task as done
Example #14
    def run(self):
        """Collect and process organizations, categories and long descriptions."""

        # database setup
        database = 'dev' if self.test else 'production'
        logging.warning(f"Using {database} database")
        self.engine = get_mysql_engine(self.db_config_env, 'mysqldb', database)
        try_until_allowed(Base.metadata.create_all, self.engine)
        limit = 2000 if self.test else None
        batch_size = 30 if self.test else 1000

        with db_session(self.engine) as session:
            all_orgs = session.query(
                Organisation.id, Organisation.addresses).limit(limit).all()
            existing_org_location_ids = session.query(
                OrganisationLocation.id).all()
        logging.info(f"{len(all_orgs)} organisations retrieved from database")
        logging.info(
            f"{len(existing_org_location_ids)} organisations have previously been processed"
        )

        # convert to a list of dictionaries with the nested addresses unpacked
        orgs = get_orgs_to_process(all_orgs, existing_org_location_ids)
        logging.info(f"{len(orgs)} new organisations to geocode")

        total_batches = ceil(len(orgs) / batch_size)
        logging.info(f"{total_batches} batches")
        completed_batches = 0
        for batch in split_batches(orgs, batch_size=batch_size):
            # geocode first to add missing country for UK
            batch = map(geocode_uk_with_postcode, batch)
            batch = map(add_country_details, batch)

            # remove data not in OrganisationLocation columns
            org_location_cols = OrganisationLocation.__table__.columns.keys()
            batch = [{k: v
                      for k, v in org.items() if k in org_location_cols}
                     for org in batch]

            insert_data(self.db_config_env, 'mysqldb', database, Base,
                        OrganisationLocation, batch)
            completed_batches += 1
            logging.info(
                f"Completed {completed_batches} of {total_batches} batches")

            if self.test and completed_batches > 1:
                logging.warning("Breaking after 2 batches in test mode")
                break

        # mark as done
        logging.warning("Finished task")
        self.output().touch()
Example #15
    def run(self):
        limit = 100 if self.test else None
        flush_freq = 33 if self.test else 5000

        # Get connection settings
        engine = get_mysql_engine('MYSQLDB', 'nesta',
                                  'dev' if self.test else 'production')
        conf = get_config('neo4j.config', 'neo4j')
        gkwargs = dict(host=conf['host'], secure=True,
                       auth=(conf['user'], conf['password']))

        # Drop all neo4j data in advance
        # (WARNING: this is a hack in lieu of proper db staging/versioning)
        with graph_session(**gkwargs) as tx:
            logging.info('Dropping all previous data')
            tx.graph.delete_all()
            for constraint in tx.run('CALL db.constraints'):
                logging.info(f'Dropping constraint {constraint[0]}')
                tx.run(f'DROP {constraint[0]}')

        # Iterate over all tables in the ORM
        for tablename, table in Base.metadata.tables.items():
            entity_name = _extract_name(tablename)
            logging.info(f'\tProcessing {entity_name}')
            orm, parent_orm, rel_name = prepare_base_entities(table)
            # Insert data to neo4j in one session per table,
            # to enable constraint and relationship lookups
            # after insertion            
            irow = 0
            uninterrupted = False
            while not uninterrupted:
                uninterrupted = True
                with graph_session(**gkwargs) as tx:
                    # Iterate over rows in the database
                    for db, orm_instance in db_session_query(query=orm,
                                                             engine=engine,
                                                             limit=limit, 
                                                             offset=irow):
                        irow += 1
                        if irow == limit:
                            break
                        # Convert the ORM row to a neo4j object, and insert
                        orm_to_neo4j(session=db, transaction=tx,
                                     orm_instance=orm_instance,
                                     parent_orm=parent_orm,
                                     rel_name=rel_name)
                        if (irow % flush_freq) == 0:
                            logging.info(f'\t\tFlushing at row {irow}')
                            uninterrupted = False
                            break
        # Confirm the task is finished
        self.output().touch()
Example #16
def run():
    batch_file = os.environ['BATCHPAR_batch_file']
    bucket = os.environ['BATCHPAR_bucket']
    db_name = os.environ['BATCHPAR_db_name']
    db_env = "BATCHPAR_config"
    db_section = "mysqldb"

    # Setup the database connectors
    engine = get_mysql_engine(db_env, db_section, db_name)
    try_until_allowed(Base.metadata.create_all, engine)

    # Retrieve RCNs to iterate over
    s3 = boto3.resource('s3')
    obj = s3.Object(bucket, batch_file)
    all_rcn = json.loads(obj.get()['Body']._raw_stream.read())
    logging.info(f"{len(all_rcn)} project RCNs retrieved from s3")

    # Retrieve all topics
    data = defaultdict(list)
    for i, rcn in enumerate(all_rcn):
        logging.info(i)
        project, orgs, reports, pubs = fetch_data(rcn)
        if project is None:
            continue
        _topics = project.pop('topics')
        _calls = project.pop('proposal_call')
        # NB: Order below matters due to FK constraints!
        data['projects'].append(project)
        data['reports'] += prepare_data(reports, rcn)
        data['publications'] += prepare_data(pubs, rcn)
        data['organisations'] += extract_core_orgs(orgs, rcn)
        data['project_organisations'] += prepare_data(orgs, rcn)
        for topics, project_topics in split_links(_topics, rcn):
            data['topics'].append(topics)
            data['project_topics'].append(project_topics)
        for calls, project_calls in split_links(_calls, rcn):
            data['proposal_calls'].append(calls)
            data['project_proposal_calls'].append(project_calls)

    # Pipe the data to the db
    for table_prefix, rows in data.items():
        table_name = f'cordis_{table_prefix}'
        logging.info(table_name)
        _class = get_class_by_tablename(Base, table_name)
        insert_data(db_env,
                    db_section,
                    db_name,
                    Base,
                    _class,
                    rows,
                    low_memory=True)
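
get_class_by_tablename resolves the ORM class for each cordis_* table name before insertion. One way such a lookup could work, assuming a SQLAlchemy 1.4+ declarative Base (hypothetical; the real helper may use an older registry attribute):

def get_class_by_tablename(Base, table_name):
    """Return the ORM class whose __tablename__ matches table_name (assumed behaviour)."""
    for mapper in Base.registry.mappers:
        cls = mapper.class_
        if getattr(cls, "__tablename__", None) == table_name:
            return cls
    raise NameError(f"No ORM class found for table '{table_name}'")
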
Example #17
    def run(self):
        '''Extract the topics of interest'''
        database = 'dev' if self.test else 'production'
        engine = get_mysql_engine(self.db_config_env, 'mysqldb', database)
        members_limit = get_members_by_percentile(engine,
                                                  perc=self.members_perc)
        topics = get_core_topics(engine,
                                 core_categories=self.core_categories,
                                 members_limit=members_limit,
                                 perc=self.topic_perc)

        # Write the intermediate output
        with self.output().open('wb') as outstream:
            outstream.write(json.dumps(list(topics)).encode('utf8'))
Example #18
    def prepare(self):
        # mysql setup
        db = 'production' if not self.test else 'dev'
        engine = get_mysql_engine(MYSQLDB_ENV, "mysqldb", db)
        Session = sessionmaker(bind=engine)
        session = Session()
        project_query = session.query(Projects)

        # elasticsearch setup
        es_mode = 'dev' if self.test else 'prod'
        es, es_config = setup_es(es_mode,
                                 self.test,
                                 self.drop_and_recreate,
                                 dataset='nih',
                                 aliases='health_scanner')

        batches = self.batch_limits(project_query, BATCH_SIZE)
        job_params = []
        for start, end in batches:
            params = {
                'start_index': start,
                'end_index': end,
                'config': "mysqldb.config",
                'db': db,
                'outinfo': es_config['host'],
                'out_port': es_config['port'],
                'out_index': es_config['index'],
                'out_type': es_config['type'],
                'aws_auth_region': es_config['region'],
                'done': es.exists(index=es_config['index'],
                                  doc_type=es_config['type'],
                                  id=end),
                'entity_type': 'paper'
            }
            job_params.append(params)
        return job_params
Example #19
class TestPatstat(unittest.TestCase):
    '''Check that the Patstat EU ORM works as expected'''
    engine = get_mysql_engine("MYSQLDBCONF", "mysqldb")
    Session = sessionmaker(engine)

    def setUp(self):
        '''Create the temporary table'''
        Base.metadata.create_all(self.engine)

    def tearDown(self):
        '''Drop the temporary table'''
        Base.metadata.drop_all(self.engine)

    def test_build(self):
        pass
Example #20
def run():
    test = literal_eval(os.environ["BATCHPAR_test"])
    db_name = os.environ["BATCHPAR_db_name"]
    batch_size = int(os.environ["BATCHPAR_batch_size"])  # example parameter
    s3_path = os.environ["BATCHPAR_outinfo"]
    start_string = os.environ["BATCHPAR_start_string"]  # example parameter
    offset = int(os.environ["BATCHPAR_offset"])

    # reduce records in test mode
    if test:
        limit = 50
        logging.info(f"Limiting to {limit} rows in test mode")
    else:
        limit = batch_size

    logging.info(f"Processing {offset} - {offset + limit}")

    # database setup
    logging.info(f"Using {db_name} database")
    engine = get_mysql_engine("BATCHPAR_config", "mysqldb", db_name)
    try_until_allowed(Base.metadata.create_all, engine)

    with db_session(engine) as session:
        # consider moving this query and the one from the prepare step into a package
        batch_records = (session.query(MyTable.id, MyTable.name)
                         .filter(MyTable.founded_on > '2007-01-01')
                         .offset(offset)
                         .limit(limit)
                         .all())  # materialise the rows before the session closes

    # process and insert data
    processed_batch = []
    for row in batch_records:
        processed_row = some_func(start_string=start_string, row=row)
        processed_batch.append(processed_row)

    logging.info(f"Inserting {len(processed_batch)} rows")
    insert_data("BATCHPAR_config",
                'mysqldb',
                db_name,
                Base,
                MyOtherTable,
                processed_batch,
                low_memory=True)

    logging.info(f"Marking task as done to {s3_path}")
    s3 = boto3.resource('s3')
    s3_obj = s3.Object(*parse_s3_path(s3_path))
    s3_obj.put(Body="")

    logging.info("Batch job complete.")
Example #21
    def run(self):
        # database setup
        database = 'dev' if self.test else 'production'
        logging.warning(f"Using {database} database")
        self.engine = get_mysql_engine(self.db_config_env, 'mysqldb', database)

        # collect mesh terms from S3
        bucket = 'innovation-mapping-general'
        key = 'crunchbase_descriptions/crunchbase_descriptions_mesh.txt'
        mesh_terms = retrieve_mesh_terms(bucket, key)
        mesh_terms = format_mesh_terms(mesh_terms)  # {org_id: ['term1', 'term2'], ...}
        logging.info(f"File contains {len(mesh_terms)} orgs with mesh terms")

        logging.info("Extracting previously processed orgs")
        with db_session(self.engine) as session:
            all_orgs = session.query(Organization.id,
                                     Organization.mesh_terms).all()
        processed_orgs = {
            org_id
            for (org_id, mesh_terms) in all_orgs if mesh_terms is not None
        }
        all_orgs = {org_id for (org_id, _) in all_orgs}
        logging.info(f"{len(all_orgs)} total orgs in database")
        logging.info(f"{len(processed_orgs)} previously processed orgs")

        # reformat for batch insert, removing not found and previously processed terms
        meshed_orgs = [{
            'id': org_id,
            'mesh_terms': '|'.join(terms)
        } for org_id, terms in mesh_terms.items()
                       if org_id in all_orgs and org_id not in processed_orgs]

        logging.info(f"{len(meshed_orgs)} organisations to update in database")

        for count, batch in enumerate(
                split_batches(meshed_orgs, self.insert_batch_size), 1):
            with db_session(self.engine) as session:
                session.bulk_update_mappings(Organization, batch)
            logging.info(
                f"{count} batch{'es' if count > 1 else ''} written to db")
            if self.test and count > 1:
                logging.info("Breaking after 2 batches while in test mode")
                break

        # mark as done
        logging.warning("Task complete")
        self.output().touch()
Example #22
    def run(self):
        # database setup
        database = 'dev' if self.test else 'production'
        logging.warning(f"Using {database} database")
        self.engine = get_mysql_engine(self.db_config_env, 'mysqldb', database)

        # collect file
        logging.info(f"Collecting org_parents from crunchbase tar")
        org_parents = get_files_from_tar(['org_parents'])[0]
        logging.info(f"{len(org_parents)} parent ids in crunchbase export")

        # collect previously processed orgs
        logging.info("Extracting previously processed organisations")
        with db_session(self.engine) as session:
            processed_orgs = session.query(Organization.id,
                                           Organization.parent_id).all()
        all_orgs = {org for (org, _) in processed_orgs}
        logging.info(f"{len(all_orgs)} total orgs in database")
        processed_orgs = {
            org
            for (org, parent_id) in processed_orgs if parent_id is not None
        }
        logging.info(f"{len(processed_orgs)} previously processed orgs")

        # reformat into a list of dicts, removing orgs that already have a parent_id
        # or are missing from the database
        org_parents = org_parents[['uuid', 'parent_uuid']]
        org_parents.columns = ['id', 'parent_id']
        org_parents = org_parents[org_parents['id'].isin(all_orgs)]
        org_parents = org_parents[~org_parents['id'].isin(processed_orgs)]
        org_parents = org_parents.to_dict(orient='records')
        logging.info(f"{len(org_parents)} organisations to update in MYSQL")

        # insert parent_ids into db in batches
        for count, batch in enumerate(
                split_batches(org_parents, self.insert_batch_size), 1):
            with db_session(self.engine) as session:
                session.bulk_update_mappings(Organization, batch)
            logging.info(
                f"{count} batch{'es' if count > 1 else ''} written to db")
            if self.test and count > 1:
                logging.info("Breaking after 2 batches while in test mode")
                break

        # mark as done
        logging.warning("Task complete")
        self.output().touch()
Example #23
    def prepare(self):
        """Prepare the batch job parameters"""
        # database setup
        database = 'dev' if self.test else 'production'
        logging.info(f"Using {database} database")
        self.engine = get_mysql_engine(self.db_config_env, 'mysqldb', database)
        if self.test and database == 'dev':
            logging.warning('Dropping tables')
            Base.metadata.drop_all(self.engine)
        try_until_allowed(Base.metadata.create_all, self.engine)

        with db_session(self.engine) as session:
            if self.test:
                logging.info("Adding test data")
                test_data = []
                for i in range(1000):
                    test_data.append(MyTable(id=i, founded_on='2009-01-01'))
                session.add_all(test_data)
                session.commit()

            logging.info('Retrieving list of records to process')
            total_records = (session
                             .query(MyTable.id)
                             .filter(MyTable.founded_on > '2007-01-01')
                             .count())

        job_params = []  # dictionaries of environmental variables for each batch
        # potential method of generating batches:
        for count, offset in enumerate(range(0, total_records, self.batch_size)):
            key = f"{self.date}_batch_{offset}_{database}"
            done = key in DONE_KEYS
            params = {"config": "mysqldb.config",
                      "db_name": database,
                      "test": self.test,
                      "outinfo": f"s3://{self.intermediate_bucket}/{key}",
                      "done": done,
                      "batch_size": self.batch_size,  # example parameter
                      "start_string": self.start_string,  # example parameter
                      "offset": offset}
            job_params.append(params)
            logging.info(params)

            if self.test and count == TEST_BATCHES:
                logging.info(f"Only {TEST_BATCHES} batches created in test mode")
                break

        return job_params
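
Each dictionary returned by prepare presumably reaches the matching batch job as environment variables with a BATCHPAR_ prefix, which is how the run() batchables above read their parameters. A hypothetical illustration of that convention:

import os

# Hypothetical: export one batch's parameters the way the run() examples expect them
params = {"config": "mysqldb.config", "db_name": "dev", "batch_size": 1000, "offset": 0}
for key, value in params.items():
    os.environ[f"BATCHPAR_{key}"] = str(value)  # assumption: values are stringified

assert os.environ["BATCHPAR_db_name"] == "dev"
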
Example #24
    def test_object_to_dict(self):
        parents = [{
            "_id": 10,
            "_another_id": 2,
            "some_field": 20
        }, {
            "_id": 20,
            "_another_id": 2,
            "some_field": 20
        }]
        _parents = insert_data("MYSQLDBCONF", "mysqldb", "production_tests",
                               Base, DummyModel, parents)
        assert len(parents) == len(_parents)

        children = [{
            "_id": 10,
            "parent_id": 10
        }, {
            "_id": 10,
            "parent_id": 20
        }, {
            "_id": 20,
            "parent_id": 20
        }, {
            "_id": 30,
            "parent_id": 20
        }]
        _children = insert_data("MYSQLDBCONF", "mysqldb", "production_tests",
                                Base, DummyChild, children)
        assert len(children) == len(_children)

        # Re-retrieve parents from the database
        found_children = set()
        engine = get_mysql_engine("MYSQLDBCONF", "mysqldb")
        with db_session(engine) as session:
            for p in session.query(DummyModel).all():
                row = object_to_dict(p)
                assert type(row) is dict
                assert len(row['children']) > 0
                _found_children = set(
                    (c['_id'], c['parent_id']) for c in row['children'])
                found_children = found_children.union(_found_children)
                _row = object_to_dict(p, shallow=True)
                assert 'children' not in _row
                del row['children']
                assert row == _row
            assert len(found_children) == len(children) == len(_children)
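
The test above pins down the assumed behaviour of object_to_dict: a flat dict of column values, plus nested child rows unless shallow=True. A sketch consistent with that contract, assuming list-style relationships only:

from sqlalchemy import inspect

def object_to_dict(obj, shallow=False):
    """Convert an ORM instance to a dict of its column values; include one level of
    list-style relationships unless shallow=True (assumed behaviour)."""
    mapper = inspect(obj).mapper
    row = {col.key: getattr(obj, col.key) for col in mapper.column_attrs}
    if not shallow:
        for rel in mapper.relationships:
            row[rel.key] = [
                {col.key: getattr(child, col.key)
                 for col in inspect(child).mapper.column_attrs}
                for child in getattr(obj, rel.key)
            ]
    return row
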
Example #25
class TestMeetup(unittest.TestCase):
    '''Currently just a placeholder test to check that the schema compiles'''
    engine = get_mysql_engine("MYSQLDBCONF", "mysqldb")
    Session = sessionmaker(engine)

    def setUp(self):
        '''Create the temporary table'''
        Base.metadata.create_all(self.engine)

    def tearDown(self):
        '''Drop the temporary table'''
        Base.metadata.drop_all(self.engine)        

    def test_constraints(self):
        '''Placeholder in case any constraints are added'''
        session = self.Session()
        session.close()
Example #26
class TestRun(TestCase):

    engine = get_mysql_engine("MYSQLDBCONF", "mysqldb")
    Session = sessionmaker(engine)

    def setUp(self):
        '''Create the temporary table'''
        Base.metadata.create_all(self.engine)

    def tearDown(self):
        '''Drop the temporary table'''
        Base.metadata.drop_all(self.engine)

    @mock.patch.dict(os.environ, environ)
    @mock.patch('nesta.core.batchables.meetup.members_groups.run.boto3')
    def test_members_groups(self, boto3):
        n = members_groups.run.run()
        self.assertGreater(n, 0)
Example #27
    def prepare(self):
        """Copies any new city/county combinations from the input table into the
        geographic_data table. All rows which have previously not been processed will
        be split into batches.

        Returns:
            (:obj:`list` of :obj:`dict`) job parameters for each of the batch tasks
        """
        # set up database connectors
        self.database = 'dev' if self.test else 'production'
        self.engine = get_mysql_engine(self.db_config_env, "mysqldb",
                                       self.database)
        try_until_allowed(Base.metadata.create_all, self.engine)

        # s3 setup
        self.s3 = boto3.resource('s3')

        # identify new locations in the input table and copy them to the geographic table
        if self.location_key_col is not None:
            self._insert_new_locations()
        else:
            self._insert_new_locations_no_id()

        # create batches from all locations which have not previously been coded
        job_params = []
        uncoded_locations = self._get_uncoded()
        if uncoded_locations:
            for batch_file in self._create_batches(uncoded_locations):
                params = {
                    "batch_file": batch_file,
                    "config": 'mysqldb.config',
                    "db_name": self.database,
                    "bucket": self.intermediate_bucket,
                    "done": False,
                    "outinfo": '',
                    "test": self.test
                }
                job_params.append(params)
                logging.info(params)
            logging.info(f"{len(job_params)} batches to run")
        else:
            logging.warning(f"no new locations to geocode")

        return job_params
Example #28
    def test_db_session_query(self):
        parents = [{
            "_id": i,
            "_another_id": i,
            "some_field": 20
        } for i in range(0, 1000)]
        _parents = insert_data("MYSQLDBCONF", "mysqldb", "production_tests",
                               Base, DummyModel, parents)

        # Re-retrieve parents from the database
        engine = get_mysql_engine("MYSQLDBCONF", "mysqldb")

        # Test for limit = 3
        limit = 3
        old_db = mock.MagicMock()
        old_db.is_active = False
        n_rows = 0
        for db, row in db_session_query(query=DummyModel,
                                        engine=engine,
                                        chunksize=10,
                                        limit=limit):
            assert type(row) is DummyModel
            if old_db != db:
                assert len(old_db.transaction._connections) == 0
                assert len(db.transaction._connections) > 0
            old_db = db
            n_rows += 1
        assert n_rows == limit

        # Test for limit = None
        old_db = mock.MagicMock()
        old_db.is_active = False
        n_rows = 0
        for db, row in db_session_query(query=DummyModel,
                                        engine=engine,
                                        chunksize=100,
                                        limit=None):
            assert type(row) is DummyModel
            if old_db != db:
                assert len(old_db.transaction._connections) == 0
                assert len(db.transaction._connections) > 0
            old_db = db
            n_rows += 1
        assert n_rows == len(parents) == 1000
Example #29
def run():
    test = literal_eval(os.environ["BATCHPAR_test"])
    db_name = os.environ["BATCHPAR_db_name"]
    table = os.environ["BATCHPAR_table"]
    batch_size = int(os.environ["BATCHPAR_batch_size"])
    s3_path = os.environ["BATCHPAR_outinfo"]

    logging.warning(f"Processing {table} file")

    # database setup
    engine = get_mysql_engine("BATCHPAR_config", "mysqldb", db_name)
    try_until_allowed(Base.metadata.create_all, engine)
    table_name = f"crunchbase_{table}"
    table_class = get_class_by_tablename(Base, table_name)

    # collect file
    nrows = 1000 if test else None
    df = get_files_from_tar([table], nrows=nrows)[0]
    logging.warning(f"{len(df)} rows in file")

    # get primary key fields and set of all those already existing in the db
    pk_cols = list(table_class.__table__.primary_key.columns)
    pk_names = [pk.name for pk in pk_cols]
    with db_session(engine) as session:
        existing_rows = set(session.query(*pk_cols).all())

    # process and insert data
    processed_rows = process_non_orgs(df, existing_rows, pk_names)
    for batch in split_batches(processed_rows, batch_size):
        insert_data("BATCHPAR_config",
                    'mysqldb',
                    db_name,
                    Base,
                    table_class,
                    batch,
                    low_memory=True)

    logging.warning(f"Marking task as done to {s3_path}")
    s3 = boto3.resource('s3')
    s3_obj = s3.Object(*parse_s3_path(s3_path))
    s3_obj.put(Body="")

    logging.warning("Batch job complete.")
Example #30
def run():
    test = literal_eval(os.environ["BATCHPAR_test"])
    db_name = os.environ["BATCHPAR_db_name"]
    bucket = os.environ['BATCHPAR_bucket']
    batch_file = os.environ['BATCHPAR_batch_file']
    outinfo = os.environ["BATCHPAR_outinfo"]
    output_bucket = 'clio-text2vec'

    # reduce records in test mode
    if test:
        limit = 50
        logging.info(f"Limiting to {limit} rows in test mode")
    else:
        limit = None
    # database setup
    logging.info(f"Using {db_name} database")

    # Get IDs from S3
    s3 = boto3.resource('s3')
    obj = s3.Object(bucket, batch_file)
    ids = json.loads(obj.get()['Body']._raw_stream.read())
    logging.info(f"{len(ids)} article IDs retrieved from s3")

    # Connect to SQL
    engine = get_mysql_engine("BATCHPAR_config", "mysqldb", db_name)
    with db_session(engine) as session:
        batch_records = (
            session.query(Projects.abstractText).filter(Projects.id.in_(ids))
            # .limit(limit)
            .all())

    # Process and insert data
    vectors = docs2vectors([batch.abstractText for batch in batch_records])
    processed_batch = {
        id_: vector.tolist()
        for id_, vector in zip(ids, vectors)
    }
    logging.info(f"Inserting {len(processed_batch)} rows")

    # Store batched vectors in S3
    s3 = boto3.resource('s3')
    obj = s3.Object(output_bucket, f'{outinfo}.json')
    obj.put(Body=json.dumps(processed_batch))