def output(self): '''Points to the output database engine''' db_config = misctools.get_config("mysqldb.config", "mysqldb") db_config["database"] = "production" if self.production else "dev" db_config["table"] = "UK Geography Lookup (dummy) " update_id = db_config["table"] + str(self.date) return MySqlTarget(update_id=update_id, **db_config)
def output(self): '''Points to the input database target''' update_id = "NihProcessData-%s" % self._routine_id db_config = misctools.get_config("mysqldb.config", "mysqldb") db_config["database"] = "production" if not self.test else "dev" db_config["table"] = "NIH process DUMMY" # Note, not a real table return MySqlTarget(update_id=update_id, **db_config)
def output(self): '''Points to the output database engine''' db_config = misctools.get_config(self.db_config_path, "mysqldb") db_config["database"] = "production" if not self.test else "dev" db_config["table"] = "NIH <dummy>" # Note, not a real table update_id = "NihCollectData_{}".format(self.date) return MySqlTarget(update_id=update_id, **db_config)
def output(self): '''Points to the output database engine''' db_config = misctools.get_config(self.db_config_path, "mysqldb") db_config["database"] = 'dev' if self.test else 'production' db_config["table"] = "MAG <dummy>" # Note, not a real table update_id = "MagCollectSparql_{}".format(self.date) return MySqlTarget(update_id=update_id, **db_config)
def process_config(conf_prefix, test=False):
    """Fetch the configuration and geography codes for a NOMIS dataset,
    based on a configuration file.

    Args:
        conf_prefix (str): Configuration file name prefix, such that a
            configuration file exists in the global config file directory
            (see :obj:`get_config`) of the form
            'official_data/{conf_prefix}.config'.
        test (bool): If :obj:`True`, keep only the first geography found
            for each geography type.

    Returns:
        (:obj:`tuple`): config, geogs_list, dataset_id, date_format
    """
    # Get the configuration
    config = get_config(f"official_data/{conf_prefix}.config", "nomis")
    dataset_id = config.pop("dataset")
    date_format = config.pop("date_format")
    # Iterate over NOMIS geography codes for this dataset
    geogs_list = []
    for geo_type in config.pop("geography_type").split(","):
        geographies = find_geographies(geo_type, dataset_id)
        if test:
            geographies = [geographies[0]]
        geogs_list.append(geographies)
    return config, geogs_list, dataset_id, date_format
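# Hedged usage sketch (not taken from the source): one way the tuple returned
# by process_config might be consumed. The config prefix "uk_employment" and
# the printed summary are assumptions for illustration only; a matching
# 'official_data/uk_employment.config' file would need to exist.
if __name__ == "__main__":
    config, geogs_list, dataset_id, date_format = process_config("uk_employment",
                                                                  test=True)
    for geographies in geogs_list:
        # Each element holds the NOMIS geography codes found for one
        # geography type listed in the configuration file
        print(dataset_id, date_format, len(geographies))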
def requires(self):
    logging.getLogger().setLevel(logging.INFO)
    db_config = misctools.get_config("mysqldb.config", "mysqldb")
    db_config["database"] = "production" if self.production else "dev"
    db_config["table"] = "worldbank_countries"
    variable_codes = ["SP.RUR.TOTL.ZS", "SP.URB.TOTL.IN.ZS",
                      "SP.POP.DPND", "SP.POP.TOTL",
                      "SP.DYN.LE00.IN", "SP.DYN.IMRT.IN",
                      "BAR.NOED.25UP.ZS", "BAR.TER.CMPT.25UP.ZS",
                      "NYGDPMKTPSAKD", "SI.POV.NAHC", "SI.POV.GINI"]
    job_name = (f"Worldbank-{self.date}-"
                f"{'_'.join(variable_codes).replace('.', '_')}-"
                f"{self.production}")[0:120]
    yield WorldbankTask(date=self.date,
                        db_config=db_config,
                        variable_codes=variable_codes,
                        batchable=find_filepath_from_pathstub("core/batchables/collect_worldbank/"),
                        env_files=[find_filepath_from_pathstub("/nesta/nesta"),
                                   find_filepath_from_pathstub("/config/mysqldb.config")],
                        job_def="py36_amzn1_image",
                        job_name=job_name,
                        job_queue="HighPriority",
                        region_name="eu-west-2",
                        poll_time=10,
                        max_live_jobs=200,
                        test=(not self.production))
def output(self): '''Points to the output database engine''' db_config = misctools.get_config(self.db_config_path, "mysqldb") db_config["database"] = 'dev' if self.test else 'production' db_config["table"] = "arXlive topics <dummy>" # Note, not a real table update_id = "ArxivTopicTask_{}_{}".format(self.date, self.test) return MySqlTarget(update_id=update_id, **db_config)
def output(self): """Points to the output database engine where the task is marked as done.""" db_config = get_config(os.environ["MYSQLDB"], "mysqldb") db_config["database"] = 'dev' if self.test else 'production' db_config["table"] = "Example <dummy>" # Note, not a real table update_id = "MyTaskWhichNeedsAName_{}".format(self.date) return MySqlTarget(update_id=update_id, **db_config)
def requires(self): db_config = misctools.get_config("mysqldb.config", "mysqldb") db_config["database"] = "production" if self.production else "dev" db_config["table"] = "wiktionary_ngrams" yield CollectNgramTask(date=self.date, db_config=db_config, test=not self.production)
def output(self):
    '''Points to the DB target'''
    update_id = "meetup_group_details-%s" % self._routine_id
    db_config = get_config("mysqldb.config", "mysqldb")
    db_config["database"] = "production" if not self.test else "dev"
    db_config["table"] = "meetup_groups"
    return MySqlTarget(update_id=update_id, **db_config)
def output(self): '''Points to the output database engine''' db_config = get_config(self.db_config_path, "mysqldb") db_config["database"] = 'dev' if self.test else 'production' db_config["table"] = "arXlive <dummy>" # NB: not a real table update_id = "ArxivAnalysis_{}_{}".format(self.date, self.test) return mysqldb.MySqlTarget(update_id=update_id, **db_config)
def output(self):
    '''Points to the output database engine'''
    db_conf = get_config(self.db_config_path, "mysqldb")
    db_conf["database"] = 'dev' if self.test else 'production'
    db_conf["table"] = "CordisCollect <dummy>"  # not a real table
    update_id = self.job_name
    return MySqlTarget(update_id=update_id, **db_conf)
def output(self): '''Points to the output database engine''' db_config = get_config(os.environ[self.db_config_env], "mysqldb") db_config["database"] = 'dev' if self.test else 'production' db_config["table"] = "Crunchbase <dummy>" # Note, not a real table update_id = "CrunchbaseGeocodeFundingRound_{}".format(self.date) return MySqlTarget(update_id=update_id, **db_config)
def output(self): '''Points to the output database engine''' self.db_config_path = os.environ[self.db_config_env] db_config = get_config(self.db_config_path, "mysqldb") db_config["database"] = 'dev' if self.test else 'production' db_config["table"] = f"{self.routine_id} <dummy>" # Not a real table update_id = f"{self.routine_id}_{self.date}" return MySqlTarget(update_id=update_id, **db_config)
def output(self): '''Points to the output database engine''' db_config = misctools.get_config(self.db_config_path, "mysqldb") db_config["database"] = ("production" if not self.test else "dev") db_config["table"] = "es2es <dummy>" # NB, not a real tbl update_id = "Es2Es_{}_{}_{}".format(self.date, self.origin_index, self.test) return MySqlTarget(update_id=update_id, **db_config)
def output(self): '''Points to the output database engine''' db_config = get_config(os.environ[self.db_config_env], "mysqldb") db_config["database"] = 'dev' if self.test else 'production' db_config[ "table"] = f"BatchGeocode{self._routine_id} <dummy>" # Note, not a real table return MySqlTarget(update_id=f"BatchGeocode-{self._routine_id}", **db_config)
def output(self): '''Points to the output database engine''' db_config = misctools.get_config("mysqldb.config", "mysqldb") db_config["database"] = "production" if self.production else "dev" db_config["table"] = "nomis (dummy) " update_id = (f"{db_config['table']} {self.date} " f"{self.config_name} {self.production}") return MySqlTarget(update_id=update_id, **db_config)
def output(self):
    '''Points to the output database engine'''
    db_config_path = os.environ['MYSQLDB']
    db_config = misctools.get_config(db_config_path, "mysqldb")
    db_config["database"] = 'dev' if self.test else 'production'
    db_config["table"] = "EURITO_patstat_pre"  # Note, not a real table
    update_id = "EURITO_patstat_pre_{}".format(self.date)
    return MySqlTarget(update_id=update_id, **db_config)
def output(self): """Points to the output database engine where the task is marked as done. - For luigi updates table""" db_config_path = os.environ[self.db_config_env] db_config = misctools.get_config(db_config_path, "mysqldb") db_config["database"] = 'dev' if self.test else 'production' db_config["table"] = "text2vec <dummy>" # Note, not a real table update_id = "text2vectors_{}".format(self.date) return MySqlTarget(update_id=update_id, **db_config)
def output(self): '''Points to the output database engine''' db_config = get_config('mysqldb.config', "mysqldb") db_config["database"] = "production" if self.production else "dev" db_config[ "table"] = f"Clio{self.dataset} <dummy>" # Note, not a real table update_id = f"Clio{self.dataset}_{self.date}" return MySqlTarget(update_id=update_id, **db_config)
def output(self): """Points to the output database engine""" self.db_config_path = os.environ[self.db_config_env] db_config = get_config(self.db_config_path, "mysqldb") db_config["database"] = 'dev' if self.test else 'production' db_config["table"] = "Crunchbase <dummy>" # Note, not a real table update_id = "CrunchbaseCollectOrgData_{}".format(self.date) return MySqlTarget(update_id=update_id, **db_config)
def output(self):
    '''Points to the output database engine where the task is marked as done.
    The luigi_table_updates table exists in test and production databases.
    '''
    db_config = get_config(os.environ["MYSQLDB"], 'mysqldb')
    db_config["database"] = 'dev' if self.test else 'production'
    db_config["table"] = "Example <dummy>"  # Note, not a real table
    update_id = "SimpleTask_{}".format(self.date)
    return MySqlTarget(update_id=update_id, **db_config)
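# Hedged sketch (an assumption, not taken from the source): the output()
# target above is typically marked as complete at the end of the task's
# run() method, e.g.:
def run(self):
    # ... do the actual work of the task here ...
    # touch() writes the update_id into the luigi updates table, so Luigi
    # treats the task as done and will not re-run it for the same date
    self.output().touch()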
def output(self): '''Points to the output database engine''' self.db_config_path = os.environ[self.db_config_env] db_config = get_config(self.db_config_path, "mysqldb") db_config["database"] = 'dev' if self.test else 'production' db_config[ "table"] = "Crunchbase to Elasticsearch <dummy>" # Note, not a real table update_id = "CrunchbaseToElasticsearch_{}".format(self.date) return MySqlTarget(update_id=update_id, **db_config)
def run(self):
    limit = 100 if self.test else None
    flush_freq = 33 if self.test else 5000
    # Get connection settings
    engine = get_mysql_engine('MYSQLDB', 'nesta',
                              'dev' if self.test else 'production')
    conf = get_config('neo4j.config', 'neo4j')
    gkwargs = dict(host=conf['host'], secure=True,
                   auth=(conf['user'], conf['password']))
    # Drop all neo4j data in advance
    # (WARNING: this is a hack in lieu of proper db staging/versioning)
    with graph_session(**gkwargs) as tx:
        logging.info('Dropping all previous data')
        tx.graph.delete_all()
        for constraint in tx.run('CALL db.constraints'):
            logging.info(f'Dropping constraint {constraint[0]}')
            tx.run(f'DROP {constraint[0]}')
    # Iterate over all tables in the ORM
    for tablename, table in Base.metadata.tables.items():
        entity_name = _extract_name(tablename)
        logging.info(f'\tProcessing {entity_name}')
        orm, parent_orm, rel_name = prepare_base_entities(table)
        # Insert data to neo4j in one session per table,
        # to enable constraint and relationship lookups
        # after insertion
        irow = 0
        uninterrupted = False
        while not uninterrupted:
            uninterrupted = True
            with graph_session(**gkwargs) as tx:
                # Iterate over rows in the database
                for db, orm_instance in db_session_query(query=orm,
                                                         engine=engine,
                                                         limit=limit,
                                                         offset=irow):
                    irow += 1
                    if irow == limit:
                        break
                    # Convert the ORM row to a neo4j object, and insert
                    orm_to_neo4j(session=db, transaction=tx,
                                 orm_instance=orm_instance,
                                 parent_orm=parent_orm,
                                 rel_name=rel_name)
                    if (irow % flush_freq) == 0:
                        logging.info(f'\t\tFlushing at row {irow}')
                        uninterrupted = False
                        break
    # Confirm the task is finished
    self.output().touch()
def run(self):
    # Get connection settings
    conf = get_config('neo4j.config', 'neo4j')
    gkwargs = dict(host=conf['host'], secure=True,
                   auth=(conf['user'], conf['password']))
    igr = ig.Graph()
    with graph_session(**gkwargs) as tx:
        graph = tx.graph
        logging.info('Getting relationships list')
        # Find all (up to 30000) relationships in the graph
        all_rels = list(graph.relationships.match().limit(30000))
        logging.info("Found %d relationships", len(all_rels))
        # Create a tuple list (edge list)
        tuplelist = list()
        for index, rel in enumerate(all_rels):
            if index % 1000 == 0:
                logging.debug(index)
            # What is a better way of changing the main graph?
            start_index, igr = centrality_utils.get_index(rel.start_node,
                                                          graph, igr)
            target_index, igr = centrality_utils.get_index(rel.end_node,
                                                           graph, igr)
            tuplelist.append((start_index, target_index))
        igr.add_edges(tuplelist)
        density = igr.density(loops=False)
        logging.info("Density: %s", density)
        betw = igr.betweenness(vertices=None, directed=False,
                               cutoff=3, weights=None, nobigint=True)
        logging.info("Betweenness: %s", betw)
        centrality_utils.add_betw_property(graph, igr, betw)
        logging.debug('Writing to DB complete')
    # Mark as done
    logging.info("Task complete")
    self.output().touch()
def requires(self):
    '''Collects the database configurations and executes the central task.'''
    db_config = misctools.get_config("mysqldb.config", "mysqldb")
    db_config["database"] = "dev"
    # Prepare the input DB config
    in_db_config = db_config.copy()
    in_db_config["table"] = "muppets_input"
    # Prepare the output DB config
    out_db_config = db_config.copy()
    out_db_config["table"] = "muppets_output"
    yield SomeTask(date=self.date,
                   max_age=40,
                   in_db_config=in_db_config,
                   out_db_config=out_db_config)
def run(self):
    pp = pprint.PrettyPrinter(indent=4, width=100)
    mag_config = misctools.get_config(self.mag_config_path, 'mag')
    mag_subscription_key = mag_config['subscription_key']

    # database setup
    database = 'dev' if self.test else 'production'
    logging.warning(f"Using {database} database")
    self.engine = get_mysql_engine(self.db_config_env, 'mysqldb', database)
    Base.metadata.create_all(self.engine)

    with db_session(self.engine) as session:
        paper_fields = ["Id", "Ti", "F.FId", "CC",
                        "AA.AuN", "AA.AuId", "AA.AfN", "AA.AfId", "AA.S"]
        author_mapping = {'AuN': 'author_name',
                          'AuId': 'author_id',
                          'AfN': 'author_affiliation',
                          'AfId': 'author_affiliation_id',
                          'S': 'author_order'}
        field_mapping = {'Id': 'mag_id',
                         'Ti': 'title',
                         'F': 'fields_of_study',
                         'AA': 'mag_authors',
                         'CC': 'citation_count',
                         'logprob': 'mag_match_prob'}

        logging.info("Querying database for articles without fields of study")
        arxiv_ids_to_process = {a.id for a in (session
                                               .query(Article)
                                               .filter(~Article.fields_of_study.any())
                                               .all())}
        total_arxiv_ids_to_process = len(arxiv_ids_to_process)
        logging.info(f"{total_arxiv_ids_to_process} articles to process")

        all_articles_to_update = BatchWriter(self.insert_batch_size,
                                             update_existing_articles,
                                             self.engine)

        batched_titles = BatchedTitles(arxiv_ids_to_process, 10000, session)
        batch_field_of_study_ids = set()

        for count, expr in enumerate(build_expr(batched_titles, 'Ti'), 1):
            logging.debug(pp.pformat(expr))
            expr_length = len(expr.split(','))
            logging.info(f"Querying MAG for {expr_length} titles")
            total_arxiv_ids_to_process -= expr_length
            batch_data = query_mag_api(expr, paper_fields, mag_subscription_key)
            logging.debug(pp.pformat(batch_data))
            returned_entities = batch_data['entities']
            logging.info(f"{len(returned_entities)} entities returned from MAG "
                         "(potentially including duplicates)")

            # dedupe response, keeping the entity with the highest logprob
            deduped_mag_ids = dedupe_entities(returned_entities)
            logging.info(f"{len(deduped_mag_ids)} entities after deduplication")
            missing_articles = expr_length - len(deduped_mag_ids)
            if missing_articles != 0:
                logging.info(f"{missing_articles} titles not found in MAG")

            batch_article_data = []
            for row in returned_entities:
                # exclude duplicate titles
                if row['Id'] not in deduped_mag_ids:
                    continue

                # renaming and reformatting
                for code, description in field_mapping.items():
                    try:
                        row[description] = row.pop(code)
                    except KeyError:
                        pass

                for author in row.get('mag_authors', []):
                    for code, description in author_mapping.items():
                        try:
                            author[description] = author.pop(code)
                        except KeyError:
                            pass

                if row.get('citation_count', None) is not None:
                    row['citation_count_updated'] = date.today()

                # reformat fos_ids out of dictionaries
                try:
                    row['fields_of_study'] = {f['FId'] for f
                                              in row.pop('fields_of_study')}
                except KeyError:
                    row['fields_of_study'] = []
                batch_field_of_study_ids.update(row['fields_of_study'])

                # get list of ids which share the same title
                try:
                    matching_articles = batched_titles[row['title']]
                except KeyError:
                    logging.warning("Returned title not found in original data: "
                                    f"{row['title']}")
                    continue

                # drop unnecessary fields
                for f in ['prob', 'title']:
                    del row[f]

                # add each matching article for this title to the batch
                for article_id in matching_articles:
                    batch_article_data.append({**row, 'id': article_id})

            # check fields of study are in the database
            batch_field_of_study_ids = {fos_id for article in batch_article_data
                                        for fos_id in article['fields_of_study']}
            logging.debug('Checking fields of study exist in db')
            found_fos_ids = {fos.id for fos
                             in (session
                                 .query(FieldOfStudy)
                                 .filter(FieldOfStudy.id.in_(batch_field_of_study_ids))
                                 .all())}

            missing_fos_ids = batch_field_of_study_ids - found_fos_ids
            if missing_fos_ids:
                # query mag for details if not found
                update_field_of_study_ids(mag_subscription_key,
                                          session, missing_fos_ids)

            # add this batch to the queue
            all_articles_to_update.extend(batch_article_data)

            logging.info(f"Batch {count} done. {total_arxiv_ids_to_process} "
                         "articles left to process")
            if self.test and count == 2:
                logging.warning("Exiting after 2 batches in test mode")
                break

        # pick up any left over in the batch
        if all_articles_to_update:
            all_articles_to_update.write()

    # mark as done
    logging.warning("Task complete")
    self.output().touch()
def output(self): '''Points to the input database target''' db_config = get_config("mysqldb.config", "mysqldb") db_config["database"] = "production" if not self.test else "dev" db_config["table"] = "gtr_table" return MySqlTarget(update_id=self.job_name, **db_config)
def output(self): db_config = get_config(os.environ[self.db_config_env], "mysqldb") db_config['database'] = 'dev' if self.test else 'production' db_config['table'] = "MeshTerms <dummy>" update_id = "NihJoinMeshTerms_{}".format(self.date) return MySqlTarget(update_id=update_id, **db_config)
def run(self): """Write data to ElasticSearch if required""" if not self.write_es: return self.cherry_picked = (f'gtr/{self.date}/'.encode('utf-8') + b'COREX_TOPIC_MODEL.n_hidden_140-0.' b'VECTORIZER.binary_True.' b'min_df_0-001.' b'text_field_abstractText' b'.NGRAM.TEST_False.json') if self.cherry_picked is None: # Read the topics data file_ptr = self.input().open("rb") path = file_ptr.read() file_ptr.close() else: path = self.cherry_picked file_io_topics = s3.S3Target( f's3://clio-data/{path.decode("utf-8")}').open("rb") topic_json = json.load(file_io_topics) file_io_topics.close() topic_lookup = topic_json['data']['topic_names'] topic_json = {row['id']: row for row in topic_json['data']['rows']} # Read the raw data file_io_input = s3.S3Target(self.s3_path_in).open("rb") dirty_json = json.load(file_io_input) file_io_input.close() uid, cleaned_json, fields = clean(dirty_json, self.dataset) # Assign topics n_topics, n_found = 0, 0 for row in cleaned_json: id_ = row[f'id_of_{self.dataset}'] if id_ not in topic_json: continue topics = [ k for k, v in topic_json[id_].items() if k != 'id' and v >= 0.2 ] n_found += 1 if len(topics) > 0: n_topics += 1 row[f"terms_topics_{self.dataset}"] = topics logging.info(f'{n_found} documents processed from a possible ' f'{len(cleaned_json)}, of which ' f'{n_topics} have been assigned topics.') fields.add(f"terms_topics_{self.dataset}") fields.add("terms_of_countryTags") fields.add("type_of_entity") # Prepare connection to ES prod_label = '' if self.production else '_dev' es_config = get_config('elasticsearch.config', 'clio') es_config['index'] = f"clio_{self.dataset}{prod_label}" aws_auth_region = es_config.pop('region') es = ElasticsearchPlus(hosts=es_config['host'], port=int(es_config['port']), use_ssl=True, entity_type=self.dataset, aws_auth_region=aws_auth_region, country_detection=True, caps_to_camel_case=True) # Dynamically generate the mapping based on a template with open("clio_mapping.json") as f: mapping = json.load(f) for f in fields: kwargs = {} _type = "text" if f.startswith("terms"): kwargs = { "fields": { "keyword": { "type": "keyword" } }, "analyzer": "terms_analyzer" } elif not f.startswith("textBody"): _type = "keyword" mapping["mappings"]["_doc"]["properties"][f] = dict(type=_type, **kwargs) # Drop, create and send data if es.indices.exists(index=es_config['index']): es.indices.delete(index=es_config['index']) es.indices.create(index=es_config['index'], body=mapping) for id_, row in zip(uid, cleaned_json): es.index(index=es_config['index'], doc_type=es_config['type'], id=id_, body=row) # Drop, create and send data es = ElasticsearchPlus(hosts=es_config['host'], port=int(es_config['port']), use_ssl=True, entity_type='topics', aws_auth_region=aws_auth_region, country_detection=False, caps_to_camel_case=False) topic_idx = f"{es_config['index']}_topics" if es.indices.exists(index=topic_idx): es.indices.delete(index=topic_idx) es.indices.create(index=topic_idx) es.index(index=topic_idx, doc_type=es_config['type'], id='topics', body=topic_lookup) # Touch the checkpoint self.output().touch()