Example #1
 def output(self):
     '''Points to the output database engine'''
     db_config = misctools.get_config("mysqldb.config", "mysqldb")
     db_config["database"] = "production" if self.production else "dev"
     db_config["table"] = "UK Geography Lookup (dummy) "
     update_id = db_config["table"] + str(self.date)
     return MySqlTarget(update_id=update_id, **db_config)
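These output() snippets all follow the same luigi convention: the task counts as "done" once a row keyed by update_id has been written to the luigi updates table via MySqlTarget.touch(). Below is a minimal sketch of a complete task built around the Example #1 target; it assumes nesta's luigihacks module layout, and DummyGeographyTask is a hypothetical name.

import luigi
from nesta.core.luigihacks import misctools
from nesta.core.luigihacks.mysqldb import MySqlTarget

class DummyGeographyTask(luigi.Task):  # hypothetical task name
    date = luigi.DateParameter()
    production = luigi.BoolParameter(default=False)

    def output(self):
        '''Points to the output database engine'''
        db_config = misctools.get_config("mysqldb.config", "mysqldb")
        db_config["database"] = "production" if self.production else "dev"
        db_config["table"] = "UK Geography Lookup (dummy) "
        update_id = db_config["table"] + str(self.date)
        return MySqlTarget(update_id=update_id, **db_config)

    def run(self):
        # ... collect and write the data here ...
        self.output().touch()  # mark the task as done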
Example #2
 def output(self):
     '''Points to the input database target'''
     update_id = "NihProcessData-%s" % self._routine_id
     db_config = misctools.get_config("mysqldb.config", "mysqldb")
     db_config["database"] = "production" if not self.test else "dev"
     db_config["table"] = "NIH process DUMMY"  # Note, not a real table
     return MySqlTarget(update_id=update_id, **db_config)
Example #3
 def output(self):
     '''Points to the output database engine'''
     db_config = misctools.get_config(self.db_config_path, "mysqldb")
     db_config["database"] = "production" if not self.test else "dev"
     db_config["table"] = "NIH <dummy>"  # Note, not a real table
     update_id = "NihCollectData_{}".format(self.date)
     return MySqlTarget(update_id=update_id, **db_config)
Example #4
 def output(self):
     '''Points to the output database engine'''
     db_config = misctools.get_config(self.db_config_path, "mysqldb")
     db_config["database"] = 'dev' if self.test else 'production'
     db_config["table"] = "MAG <dummy>"  # Note, not a real table
     update_id = "MagCollectSparql_{}".format(self.date)
     return MySqlTarget(update_id=update_id, **db_config)
Example #5
def process_config(conf_prefix, test=False):
    """Fetch a NOMIS dataset from the API based on a configuration file.

    Args:
        conf_prefix (str): Configuration file name prefix, such that
                           a configuration file exists in the global
                           config file directory (see :obj:`get_config`)
                           of the form
                           'official_data/{conf_prefix}.config'
    Returns:
        df (:obj:`pd.DataFrame`): Dataframe containing NOMIS data.
    """
    # Get the configuration
    config = get_config(f"official_data/{conf_prefix}.config", "nomis")
    #logging.debug("\tGot config")
    dataset_id = config.pop("dataset")
    date_format = config.pop("date_format")
    # Iterate over NOMIS geography codes for this dataset
    geogs_list = []
    for geo_type in config.pop("geography_type").split(","):
        #logging.debug(f"\t{geo_type}")
        geographies = find_geographies(geo_type, dataset_id)
        if test:
            geographies = [geographies[0]]
        #logging.debug(f"\tGot {len(geographies)} geographies")
        geogs_list.append(geographies)
    return config, geogs_list, dataset_id, date_format
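A usage sketch for process_config, assuming get_config and find_geographies are importable alongside it; the "population" prefix is hypothetical, i.e. it presumes a file official_data/population.config exists.

# Hypothetical config prefix, for illustration only
config, geogs_list, dataset_id, date_format = process_config("population",
                                                             test=True)
for geographies in geogs_list:  # one list per geography type
    print(dataset_id, date_format, len(geographies))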
Example #6
    def requires(self):
        logging.getLogger().setLevel(logging.INFO)

        db_config = misctools.get_config("mysqldb.config", "mysqldb")
        db_config["database"] = "production" if self.production else "dev"
        db_config["table"] = "worldbank_countries"

        variable_codes = [
            "SP.RUR.TOTL.ZS", "SP.URB.TOTL.IN.ZS"
            "SP.POP.DPND", "SP.POP.TOTL", "SP.DYN.LE00.IN", "SP.DYN.IMRT.IN",
            "BAR.NOED.25UP.ZS", "BAR.TER.CMPT.25UP.ZS", "NYGDPMKTPSAKD",
            "SI.POV.NAHC", "SI.POV.GINI"
        ]

        job_name = (f"Worldbank-{self.date}-"
                    f"{'_'.join(variable_codes).replace('.','_')}-"
                    f"{self.production}")[0:120]

        yield WorldbankTask(
            date=self.date,
            db_config=db_config,
            variable_codes=variable_codes,
            batchable=find_filepath_from_pathstub(
                "core/batchables/collect_worldbank/"),
            env_files=[
                find_filepath_from_pathstub("/nesta/nesta"),
                find_filepath_from_pathstub("/config/mysqldb.config")
            ],
            job_def="py36_amzn1_image",
            job_name=job_name,
            job_queue="HighPriority",
            region_name="eu-west-2",
            poll_time=10,
            max_live_jobs=200,
            test=(not self.production))
Example #7
 def output(self):
     '''Points to the output database engine'''
     db_config = misctools.get_config(self.db_config_path, "mysqldb")
     db_config["database"] = 'dev' if self.test else 'production'
     db_config["table"] = "arXlive topics <dummy>"  # Note, not a real table
     update_id = "ArxivTopicTask_{}_{}".format(self.date, self.test)
     return MySqlTarget(update_id=update_id, **db_config)
Example #8
 def output(self):
     """Points to the output database engine where the task is marked as done."""
     db_config = get_config(os.environ["MYSQLDB"], "mysqldb")
     db_config["database"] = 'dev' if self.test else 'production'
     db_config["table"] = "Example <dummy>"  # Note, not a real table
     update_id = "MyTaskWhichNeedsAName_{}".format(self.date)
     return MySqlTarget(update_id=update_id, **db_config)
Example #9
 def requires(self):
     db_config = misctools.get_config("mysqldb.config", "mysqldb")
     db_config["database"] = "production" if self.production else "dev"
     db_config["table"] = "wiktionary_ngrams"
     yield CollectNgramTask(date=self.date,
                            db_config=db_config,
                            test=not self.production)
Example #10
 def output(self):
     '''Points to the DB target'''
     update_id = "meetup_group_details-%s" % self._routine_id
     db_config = get_config("mysqldb.config", "mysqldb")
     db_config["database"] = "production" if not self.test else "dev"
     db_config["table"] = "meetup_groups"
     return MySqlTarget(update_id=update_id, **db_config)
Example #11
 def output(self):
     '''Points to the output database engine'''
     db_config = get_config(self.db_config_path, "mysqldb")
     db_config["database"] = 'dev' if self.test else 'production'
     db_config["table"] = "arXlive <dummy>"  # NB: not a real table
     update_id = "ArxivAnalysis_{}_{}".format(self.date, self.test)
     return mysqldb.MySqlTarget(update_id=update_id, **db_config)
Example #12
 def output(self):
     '''Points to the output database engine'''
     db_conf = get_config(self.db_config_path, "mysqldb")
     db_conf["database"] = 'dev' if self.test else 'production'
     db_conf["table"] = "CordisCollect <dummy>"  # not a real table
     update_id = self.job_name
     return MySqlTarget(update_id=update_id, **db_conf)
Example #13
 def output(self):
     '''Points to the output database engine'''
     db_config = get_config(os.environ[self.db_config_env], "mysqldb")
     db_config["database"] = 'dev' if self.test else 'production'
     db_config["table"] = "Crunchbase <dummy>"  # Note, not a real table
     update_id = "CrunchbaseGeocodeFundingRound_{}".format(self.date)
     return MySqlTarget(update_id=update_id, **db_config)
Example #14
 def output(self):
     '''Points to the output database engine'''
     self.db_config_path = os.environ[self.db_config_env]
     db_config = get_config(self.db_config_path, "mysqldb")
     db_config["database"] = 'dev' if self.test else 'production'
     db_config["table"] = f"{self.routine_id} <dummy>"  # Not a real table
     update_id = f"{self.routine_id}_{self.date}"
     return MySqlTarget(update_id=update_id, **db_config)
Example #15
 def output(self):
     '''Points to the output database engine'''
     db_config = misctools.get_config(self.db_config_path, "mysqldb")
     db_config["database"] = ("production" if not self.test else "dev")
     db_config["table"] = "es2es <dummy>"  # NB, not a real tbl
     update_id = "Es2Es_{}_{}_{}".format(self.date, self.origin_index,
                                         self.test)
     return MySqlTarget(update_id=update_id, **db_config)
Example #16
 def output(self):
     '''Points to the output database engine'''
     db_config = get_config(os.environ[self.db_config_env], "mysqldb")
     db_config["database"] = 'dev' if self.test else 'production'
     db_config["table"] = f"BatchGeocode{self._routine_id} <dummy>"  # Note, not a real table
     return MySqlTarget(update_id=f"BatchGeocode-{self._routine_id}",
                        **db_config)
Example #17
 def output(self):
     '''Points to the output database engine'''
     db_config = misctools.get_config("mysqldb.config", "mysqldb")
     db_config["database"] = "production" if self.production else "dev"
     db_config["table"] = "nomis (dummy) "
     update_id = (f"{db_config['table']} {self.date} "
                  f"{self.config_name} {self.production}")
     return MySqlTarget(update_id=update_id, **db_config)
Example #18
 def output(self):
     '''Points to the output database engine'''
     db_config_path = os.environ['MYSQLDB']
     db_config = misctools.get_config(db_config_path, "mysqldb")
     db_config["database"] = 'dev' if self.test else 'production'
     db_config["table"] = "EURITO_patstat_pre"  # Note, not a real table
     update_id = "EURITO_patstat_pre_{}".format(self.date)
     return MySqlTarget(update_id=update_id, **db_config)
Example #19
 def output(self):
     """Points to the output database engine where the task is marked as done. - For luigi updates table"""
     db_config_path = os.environ[self.db_config_env]
     db_config = misctools.get_config(db_config_path, "mysqldb")
     db_config["database"] = 'dev' if self.test else 'production'
     db_config["table"] = "text2vec <dummy>"  # Note, not a real table
     update_id = "text2vectors_{}".format(self.date)
     return MySqlTarget(update_id=update_id, **db_config)
Example #20
 def output(self):
     '''Points to the output database engine'''
     db_config = get_config('mysqldb.config', "mysqldb")
     db_config["database"] = "production" if self.production else "dev"
     db_config["table"] = f"Clio{self.dataset} <dummy>"  # Note, not a real table
     update_id = f"Clio{self.dataset}_{self.date}"
     return MySqlTarget(update_id=update_id, **db_config)
Example #21
 def output(self):
     """Points to the output database engine"""
     self.db_config_path = os.environ[self.db_config_env]
     db_config = get_config(self.db_config_path, "mysqldb")
     db_config["database"] = 'dev' if self.test else 'production'
     db_config["table"] = "Crunchbase <dummy>"  # Note, not a real table
     update_id = "CrunchbaseCollectOrgData_{}".format(self.date)
     return MySqlTarget(update_id=update_id, **db_config)
Example #22
 def output(self):
     '''Points to the output database engine where the task is marked as done.
     The luigi_table_updates table exists in test and production databases.
     '''
     db_config = get_config(os.environ["MYSQLDB"], 'mysqldb')
     db_config["database"] = 'dev' if self.test else 'production'
     db_config["table"] = "Example <dummy>"  # Note, not a real table
     update_id = "SimpleTask_{}".format(self.date)
     return MySqlTarget(update_id=update_id, **db_config)
Example #23
 def output(self):
     '''Points to the output database engine'''
     self.db_config_path = os.environ[self.db_config_env]
     db_config = get_config(self.db_config_path, "mysqldb")
     db_config["database"] = 'dev' if self.test else 'production'
     db_config["table"] = "Crunchbase to Elasticsearch <dummy>"  # Note, not a real table
     update_id = "CrunchbaseToElasticsearch_{}".format(self.date)
     return MySqlTarget(update_id=update_id, **db_config)
Example #24
    def run(self):
        limit = 100 if self.test else None
        flush_freq = 33 if self.test else 5000

        # Get connection settings
        engine = get_mysql_engine('MYSQLDB', 'nesta',
                                  'dev' if self.test else 'production')
        conf = get_config('neo4j.config', 'neo4j')
        gkwargs = dict(host=conf['host'], secure=True,
                       auth=(conf['user'], conf['password']))

        # Drop all neo4j data in advance
        # (WARNING: this is a hack in lieu of proper db staging/versioning)
        with graph_session(**gkwargs) as tx:
            logging.info('Dropping all previous data')
            tx.graph.delete_all()
            for constraint in tx.run('CALL db.constraints'):
                logging.info(f'Dropping constraint {constraint[0]}')
                tx.run(f'DROP {constraint[0]}')

        # Iterate over all tables in the ORM
        for tablename, table in Base.metadata.tables.items():
            entity_name = _extract_name(tablename)
            logging.info(f'\tProcessing {entity_name}')
            orm, parent_orm, rel_name = prepare_base_entities(table)
            # Insert data into neo4j in one session per table,
            # to enable constraint and relationship lookups
            # after insertion
            irow = 0
            uninterrupted = False
            while not uninterrupted:
                uninterrupted = True
                with graph_session(**gkwargs) as tx:
                    # Iterate over rows in the database
                    for db, orm_instance in db_session_query(query=orm,
                                                             engine=engine,
                                                             limit=limit, 
                                                             offset=irow):
                        irow += 1
                        if irow == limit:
                            break
                        # Convert the ORM row to a neo4j object, and insert
                        orm_to_neo4j(session=db, transaction=tx,
                                     orm_instance=orm_instance,
                                     parent_orm=parent_orm,
                                     rel_name=rel_name)
                        if (irow % flush_freq) == 0:
                            logging.info(f'\t\tFlushing at row {irow}')
                            uninterrupted = False
                            break
        # Confirm the task is finished
        self.output().touch()
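The while/for structure above is the subtle part: the graph session is deliberately closed every flush_freq rows (to flush the writes), then reopened and resumed from the saved offset, until one full pass completes uninterrupted. The same pattern in isolation, as a runnable toy sketch with no neo4j dependency:

def process_in_chunks(rows, flush_freq=5000):
    """Toy version of the resume-and-flush loop in Example #24."""
    irow = 0
    uninterrupted = False
    while not uninterrupted:
        uninterrupted = True
        # (a fresh graph session would be opened here)
        for row in rows[irow:]:
            irow += 1
            if irow % flush_freq == 0:
                uninterrupted = False  # leave the session to flush
                break
    return irow

assert process_in_chunks(list(range(12000))) == 12000  # 3 passes: 5000, 5000, 2000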
Example #25
    def run(self):

        # Get connection settings
        conf = get_config('neo4j.config', 'neo4j')
        gkwargs = dict(host=conf['host'],
                       secure=True,
                       auth=(conf['user'], conf['password']))

        igr = ig.Graph()
        with graph_session(**gkwargs) as tx:
            graph = tx.graph
            logging.info('getting relationships list')
            all_rels = list(graph.relationships.match().limit(
                30000))  # fetch up to 30,000 relationships

            # Build the edge list as (start, end) vertex index tuples
            logging.info("found %d relationships", len(all_rels))
            tuplelist = list()
            for index, rel in enumerate(all_rels):
                if index % 1000 == 0:
                    logging.debug(index)
                # get_index adds the node to igr if missing, returning its index
                start_index, igr = centrality_utils.get_index(
                    rel.start_node, graph, igr)
                target_index, igr = centrality_utils.get_index(
                    rel.end_node, graph, igr)
                rel_tuple = (start_index, target_index)
                tuplelist.append(rel_tuple)

            igr.add_edges(tuplelist)

            density = igr.density(loops=False)

            logging.info("density:", density)

            betw = igr.betweenness(vertices=None,
                                   directed=False,
                                   cutoff=3,
                                   weights=None,
                                   nobigint=True)

            logging.info("betweenness:", betw)

            centrality_utils.add_betw_property(graph, igr, betw)

        logging.debug('Writing to DB complete')

        # mark as done
        logging.info("Task complete")
        self.output().touch()
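For reference, the igraph calls used above, run on a self-contained toy graph (a three-node path) instead of data pulled from neo4j:

import igraph as ig

igr = ig.Graph()
igr.add_vertices(3)
igr.add_edges([(0, 1), (1, 2)])          # the (start, end) tuple list
print(igr.density(loops=False))          # 2 of 3 possible edges ~ 0.667
print(igr.betweenness(directed=False))   # [0.0, 1.0, 0.0]: the middle node bridges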
Example #26
    def requires(self):
        '''Collects the database configurations
        and executes the central task.'''
        db_config = misctools.get_config("mysqldb.config", "mysqldb")
        db_config["database"] = "dev"

        # Prepare the input DB config
        in_db_config = db_config.copy()
        in_db_config["table"] = "muppets_input"

        # Prepare the output DB config
        out_db_config = db_config.copy()
        out_db_config["table"] = "muppets_output"

        yield SomeTask(date=self.date, max_age=40,
                       in_db_config=in_db_config,
                       out_db_config=out_db_config)
Example #27
    def run(self):
        pp = pprint.PrettyPrinter(indent=4, width=100)
        mag_config = misctools.get_config(self.mag_config_path, 'mag')
        mag_subscription_key = mag_config['subscription_key']

        # database setup
        database = 'dev' if self.test else 'production'
        logging.warning(f"Using {database} database")
        self.engine = get_mysql_engine(self.db_config_env, 'mysqldb', database)
        Base.metadata.create_all(self.engine)

        with db_session(self.engine) as session:
            paper_fields = [
                "Id", "Ti", "F.FId", "CC", "AA.AuN", "AA.AuId", "AA.AfN",
                "AA.AfId", "AA.S"
            ]

            author_mapping = {
                'AuN': 'author_name',
                'AuId': 'author_id',
                'AfN': 'author_affiliation',
                'AfId': 'author_affiliation_id',
                'S': 'author_order'
            }

            field_mapping = {
                'Id': 'mag_id',
                'Ti': 'title',
                'F': 'fields_of_study',
                'AA': 'mag_authors',
                'CC': 'citation_count',
                'logprob': 'mag_match_prob'
            }

            logging.info(
                "Querying database for articles without fields of study")
            arxiv_ids_to_process = {
                a.id
                for a in (session.query(Article).filter(
                    ~Article.fields_of_study.any()).all())
            }
            total_arxiv_ids_to_process = len(arxiv_ids_to_process)
            logging.info(f"{total_arxiv_ids_to_process} articles to process")

            all_articles_to_update = BatchWriter(self.insert_batch_size,
                                                 update_existing_articles,
                                                 self.engine)

            batched_titles = BatchedTitles(arxiv_ids_to_process, 10000,
                                           session)
            batch_field_of_study_ids = set()

            for count, expr in enumerate(build_expr(batched_titles, 'Ti'), 1):
                logging.debug(pp.pformat(expr))
                expr_length = len(expr.split(','))
                logging.info(f"Querying MAG for {expr_length} titles")
                total_arxiv_ids_to_process -= expr_length
                batch_data = query_mag_api(expr, paper_fields,
                                           mag_subscription_key)
                logging.debug(pp.pformat(batch_data))

                returned_entities = batch_data['entities']
                logging.info(
                    f"{len(returned_entities)} entities returned from MAG (potentially including duplicates)"
                )

                # dedupe response keeping the entity with the highest logprob
                deduped_mag_ids = dedupe_entities(returned_entities)
                logging.info(
                    f"{len(deduped_mag_ids)} entities after deduplication")

                missing_articles = expr_length - len(deduped_mag_ids)
                if missing_articles != 0:
                    logging.info(f"{missing_articles} titles not found in MAG")

                batch_article_data = []

                for row in returned_entities:
                    # exclude duplicate titles
                    if row['Id'] not in deduped_mag_ids:
                        continue

                    # renaming and reformatting
                    for code, description in field_mapping.items():
                        try:
                            row[description] = row.pop(code)
                        except KeyError:
                            pass

                    for author in row.get('mag_authors', []):
                        for code, description in author_mapping.items():
                            try:
                                author[description] = author.pop(code)
                            except KeyError:
                                pass

                    if row.get('citation_count', None) is not None:
                        row['citation_count_updated'] = date.today()

                    # reformat fos_ids out of dictionaries
                    try:
                        row['fields_of_study'] = {
                            f['FId']
                            for f in row.pop('fields_of_study')
                        }
                    except KeyError:
                        row['fields_of_study'] = []
                    batch_field_of_study_ids.update(row['fields_of_study'])

                    # get list of ids which share the same title
                    try:
                        matching_articles = batched_titles[row['title']]
                    except KeyError:
                        logging.warning(
                            f"Returned title not found in original data: {row['title']}"
                        )
                        continue

                    # drop unnecessary fields
                    for f in ['prob', 'title']:
                        del row[f]

                    # add each matching article for this title to the batch
                    for article_id in matching_articles:
                        batch_article_data.append({**row, 'id': article_id})

                # check fields of study are in database
                batch_field_of_study_ids = {
                    fos_id
                    for article in batch_article_data
                    for fos_id in article['fields_of_study']
                }
                logging.debug('Checking fields of study exist in db')
                found_fos_ids = {
                    fos.id
                    for fos in (session.query(FieldOfStudy).filter(
                        FieldOfStudy.id.in_(batch_field_of_study_ids)).all())
                }

                missing_fos_ids = batch_field_of_study_ids - found_fos_ids
                if missing_fos_ids:
                    #  query mag for details if not found
                    update_field_of_study_ids(mag_subscription_key, session,
                                              missing_fos_ids)

                # add this batch to the queue
                all_articles_to_update.extend(batch_article_data)

                logging.info(
                    f"Batch {count} done. {total_arxiv_ids_to_process} articles left to process"
                )
                if self.test and count == 2:
                    logging.warning("Exiting after 2 batches in test mode")
                    break

            # pick up any left over in the batch
            if all_articles_to_update:
                all_articles_to_update.write()

        # mark as done
        logging.warning("Task complete")
        self.output().touch()
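The pop-and-rename pattern used twice above (for field_mapping and author_mapping) is easier to see in isolation; the row below is fabricated for illustration:

# Fabricated MAG-style row, renamed in place via pop()
row = {'Id': 123, 'Ti': 'a title', 'CC': 7}
field_mapping = {'Id': 'mag_id', 'Ti': 'title',
                 'CC': 'citation_count', 'logprob': 'mag_match_prob'}
for code, description in field_mapping.items():
    try:
        row[description] = row.pop(code)
    except KeyError:
        pass  # this field was absent from the row
print(row)  # {'mag_id': 123, 'title': 'a title', 'citation_count': 7}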
Example #28
 def output(self):
     '''Points to the input database target'''
     db_config = get_config("mysqldb.config", "mysqldb")
     db_config["database"] = "production" if not self.test else "dev"
     db_config["table"] = "gtr_table"
     return MySqlTarget(update_id=self.job_name, **db_config)
Example #29
 def output(self):
     db_config = get_config(os.environ[self.db_config_env], "mysqldb")
     db_config['database'] = 'dev' if self.test else 'production'
     db_config['table'] = "MeshTerms <dummy>"
     update_id = "NihJoinMeshTerms_{}".format(self.date)
     return MySqlTarget(update_id=update_id, **db_config)
Example #30
    def run(self):
        """Write data to ElasticSearch if required"""
        if not self.write_es:
            return

        # NB: the hard-coded cherry-pick below makes the fallback branch unreachable
        self.cherry_picked = (f'gtr/{self.date}/'.encode('utf-8') +
                              b'COREX_TOPIC_MODEL.n_hidden_140-0.'
                              b'VECTORIZER.binary_True.'
                              b'min_df_0-001.'
                              b'text_field_abstractText'
                              b'.NGRAM.TEST_False.json')
        if self.cherry_picked is None:
            # Read the topics data
            file_ptr = self.input().open("rb")
            path = file_ptr.read()
            file_ptr.close()
        else:
            path = self.cherry_picked

        file_io_topics = s3.S3Target(
            f's3://clio-data/{path.decode("utf-8")}').open("rb")

        topic_json = json.load(file_io_topics)
        file_io_topics.close()
        topic_lookup = topic_json['data']['topic_names']
        topic_json = {row['id']: row for row in topic_json['data']['rows']}

        # Read the raw data
        file_io_input = s3.S3Target(self.s3_path_in).open("rb")
        dirty_json = json.load(file_io_input)
        file_io_input.close()
        uid, cleaned_json, fields = clean(dirty_json, self.dataset)

        # Assign topics
        n_topics, n_found = 0, 0
        for row in cleaned_json:
            id_ = row[f'id_of_{self.dataset}']
            if id_ not in topic_json:
                continue
            topics = [
                k for k, v in topic_json[id_].items() if k != 'id' and v >= 0.2
            ]
            n_found += 1
            if len(topics) > 0:
                n_topics += 1
            row[f"terms_topics_{self.dataset}"] = topics
        logging.info(f'{n_found} documents processed from a possible '
                     f'{len(cleaned_json)}, of which '
                     f'{n_topics} have been assigned topics.')
        fields.add(f"terms_topics_{self.dataset}")
        fields.add("terms_of_countryTags")
        fields.add("type_of_entity")

        # Prepare connection to ES
        prod_label = '' if self.production else '_dev'
        es_config = get_config('elasticsearch.config', 'clio')
        es_config['index'] = f"clio_{self.dataset}{prod_label}"
        aws_auth_region = es_config.pop('region')
        es = ElasticsearchPlus(hosts=es_config['host'],
                               port=int(es_config['port']),
                               use_ssl=True,
                               entity_type=self.dataset,
                               aws_auth_region=aws_auth_region,
                               country_detection=True,
                               caps_to_camel_case=True)

        # Dynamically generate the mapping based on a template
        with open("clio_mapping.json") as f:
            mapping = json.load(f)
        for field in fields:  # avoid shadowing the file handle 'f' above
            kwargs = {}
            _type = "text"
            if field.startswith("terms"):
                kwargs = {
                    "fields": {
                        "keyword": {
                            "type": "keyword"
                        }
                    },
                    "analyzer": "terms_analyzer"
                }
            elif not field.startswith("textBody"):
                _type = "keyword"
            mapping["mappings"]["_doc"]["properties"][field] = dict(type=_type,
                                                                    **kwargs)

        # Drop, create and send data
        if es.indices.exists(index=es_config['index']):
            es.indices.delete(index=es_config['index'])
        es.indices.create(index=es_config['index'], body=mapping)
        for id_, row in zip(uid, cleaned_json):
            es.index(index=es_config['index'],
                     doc_type=es_config['type'],
                     id=id_,
                     body=row)

        # Drop, create and send the topics lookup
        es = ElasticsearchPlus(hosts=es_config['host'],
                               port=int(es_config['port']),
                               use_ssl=True,
                               entity_type='topics',
                               aws_auth_region=aws_auth_region,
                               country_detection=False,
                               caps_to_camel_case=False)
        topic_idx = f"{es_config['index']}_topics"
        if es.indices.exists(index=topic_idx):
            es.indices.delete(index=topic_idx)
        es.indices.create(index=topic_idx)
        es.index(index=topic_idx,
                 doc_type=es_config['type'],
                 id='topics',
                 body=topic_lookup)

        # Touch the checkpoint
        self.output().touch()
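The dynamic mapping loop above compresses three cases into a few lines; restated standalone below, with hypothetical field names illustrating each case:

def property_for(field):
    """Mirror of the mapping rules in Example #30."""
    if field.startswith("terms"):
        return {"type": "text",
                "fields": {"keyword": {"type": "keyword"}},
                "analyzer": "terms_analyzer"}
    if field.startswith("textBody"):
        return {"type": "text"}
    return {"type": "keyword"}

print(property_for("terms_topics_gtr"))   # analysed text, plus keyword subfield
print(property_for("textBody_abstract"))  # free text
print(property_for("title_of_project"))   # exact-match keyword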