Beispiel #1
0
    def run(self):
        """Write the topic-tagged documents to Elasticsearch, if required.

        Reads the topic-model output from S3 (currently via a hard-coded
        "cherry-picked" key), joins the topics onto the cleaned input data,
        rebuilds the Elasticsearch index from the ``clio_mapping.json``
        template, indexes every document plus a companion document holding
        the topic-name lookup, then touches the task checkpoint.
        """
        if not self.write_es:
            return

        # NOTE(review): this hard-coded key makes the `is None` branch below
        # dead code; it looks like a deliberate override of the upstream
        # target — confirm before removing either path.
        self.cherry_picked = (f'gtr/{self.date}/'.encode('utf-8') +
                              b'COREX_TOPIC_MODEL.n_hidden_140-0.'
                              b'VECTORIZER.binary_True.'
                              b'min_df_0-001.'
                              b'text_field_abstractText'
                              b'.NGRAM.TEST_False.json')
        if self.cherry_picked is None:
            # Read the topics data path from the upstream task's output
            file_ptr = self.input().open("rb")
            path = file_ptr.read()
            file_ptr.close()
        else:
            path = self.cherry_picked

        file_io_topics = s3.S3Target(
            f's3://clio-data/{path.decode("utf-8")}').open("rb")

        # Topic names, plus one row of topic weights per document id
        topic_json = json.load(file_io_topics)
        file_io_topics.close()
        topic_lookup = topic_json['data']['topic_names']
        topic_json = {row['id']: row for row in topic_json['data']['rows']}

        # Read and clean the raw data
        file_io_input = s3.S3Target(self.s3_path_in).open("rb")
        dirty_json = json.load(file_io_input)
        file_io_input.close()
        uid, cleaned_json, fields = clean(dirty_json, self.dataset)

        # Assign topics: keep every topic whose weight is at least 0.2
        n_topics, n_found = 0, 0
        for row in cleaned_json:
            id_ = row[f'id_of_{self.dataset}']
            if id_ not in topic_json:
                continue
            topics = [
                k for k, v in topic_json[id_].items() if k != 'id' and v >= 0.2
            ]
            n_found += 1
            if len(topics) > 0:
                n_topics += 1
            row[f"terms_topics_{self.dataset}"] = topics
        logging.info(f'{n_found} documents processed from a possible '
                     f'{len(cleaned_json)}, of which '
                     f'{n_topics} have been assigned topics.')
        fields.add(f"terms_topics_{self.dataset}")
        fields.add("terms_of_countryTags")
        fields.add("type_of_entity")

        # Prepare connection to ES
        prod_label = '' if self.production else '_dev'
        es_config = get_config('elasticsearch.config', 'clio')
        es_config['index'] = f"clio_{self.dataset}{prod_label}"
        aws_auth_region = es_config.pop('region')
        es = ElasticsearchPlus(hosts=es_config['host'],
                               port=int(es_config['port']),
                               use_ssl=True,
                               entity_type=self.dataset,
                               aws_auth_region=aws_auth_region,
                               country_detection=True,
                               caps_to_camel_case=True)

        # Dynamically generate the mapping based on a template:
        # "terms*" fields get a keyword sub-field and a custom analyzer,
        # "textBody*" fields stay full-text, everything else is a keyword.
        with open("clio_mapping.json") as f:
            mapping = json.load(f)
        for field in fields:  # renamed from `f`: avoid shadowing the file handle
            kwargs = {}
            _type = "text"
            if field.startswith("terms"):
                kwargs = {
                    "fields": {
                        "keyword": {
                            "type": "keyword"
                        }
                    },
                    "analyzer": "terms_analyzer"
                }
            elif not field.startswith("textBody"):
                _type = "keyword"
            mapping["mappings"]["_doc"]["properties"][field] = dict(type=_type,
                                                                    **kwargs)

        # Drop, create and send data
        # NOTE(review): es_config['type'] is never assigned here, so it
        # presumably comes from the elasticsearch.config file — verify.
        if es.indices.exists(index=es_config['index']):
            es.indices.delete(index=es_config['index'])
        es.indices.create(index=es_config['index'], body=mapping)
        for id_, row in zip(uid, cleaned_json):
            es.index(index=es_config['index'],
                     doc_type=es_config['type'],
                     id=id_,
                     body=row)

        # Drop, create and send the topic-name lookup to a sibling index
        es = ElasticsearchPlus(hosts=es_config['host'],
                               port=int(es_config['port']),
                               use_ssl=True,
                               entity_type='topics',
                               aws_auth_region=aws_auth_region,
                               country_detection=False,
                               caps_to_camel_case=False)
        topic_idx = f"{es_config['index']}_topics"
        if es.indices.exists(index=topic_idx):
            es.indices.delete(index=topic_idx)
        es.indices.create(index=topic_idx)
        es.index(index=topic_idx,
                 doc_type=es_config['type'],
                 id='topics',
                 body=topic_lookup)

        # Touch the checkpoint
        self.output().touch()
Beispiel #2
0
def run():
    """Batch job: geocode and clean a slice of NIH projects and pipe them
    into Elasticsearch.

    The batch boundaries (application_id range) and all connection details
    are taken from BATCHPAR_* environment variables.
    """
    start_index = os.environ["BATCHPAR_start_index"]
    end_index = os.environ["BATCHPAR_end_index"]
    es_host = os.environ["BATCHPAR_outinfo"]
    es_port = os.environ["BATCHPAR_out_port"]
    es_index = os.environ["BATCHPAR_out_index"]
    es_type = os.environ["BATCHPAR_out_type"]
    entity_type = os.environ["BATCHPAR_entity_type"]
    db = os.environ["BATCHPAR_db"]
    aws_auth_region = os.environ["BATCHPAR_aws_auth_region"]

    # Read in the US states: state code -> full state name
    static_engine = get_mysql_engine("BATCHPAR_config", "mysqldb",
                                     "static_data")
    states_lookup = {
        row['state_code']: row['state_name']
        for _, row in pd.read_sql_table('us_states_lookup',
                                        static_engine).iterrows()
    }
    # Default lookups for non-US rows
    states_lookup[None] = None
    states_lookup[''] = None

    # Get continent lookup
    continent_lookup = get_continent_lookup()

    engine = get_mysql_engine("BATCHPAR_config", "mysqldb", db)
    Session = sessionmaker(bind=engine)
    session = Session()

    # Columns to pull for this batch of projects
    cols = [
        "application_id", "full_project_num", "fy", "org_city", "org_country",
        "org_state", "org_zipcode", "org_name", "project_start", "project_end",
        "project_terms", "project_title", "total_cost", "phr", "ic_name"
    ]
    cols_attrs = [getattr(Projects, c) for c in cols]
    batch_selection = session.query(*cols_attrs).filter(
        Projects.application_id >= start_index,
        Projects.application_id <= end_index).selectable
    df = pd.read_sql(batch_selection, session.bind)
    df.columns = [c[13::]
                  for c in df.columns]  # remove the 'nih_projects_' prefix

    # geocode the dataframe
    df = df.rename(columns={'org_city': 'city', 'org_country': 'country'})
    df = geocode_dataframe(df)

    # append iso codes for country
    df = country_iso_code_dataframe(df)

    # clean start and end dates
    for col in ["project_start", "project_end"]:
        df[col] = df[col].apply(lambda x: _extract_date(x))

    # currency is the same for the whole dataset
    df['total_cost_currency'] = 'USD'

    # output to elasticsearch
    field_null_mapping = load_json_from_pathstub("tier_1/field_null_mappings/",
                                                 "health_scanner.json")
    strans_kwargs = {
        'filename': 'nih.json',
        'from_key': 'tier_0',
        'to_key': 'tier_1',
        'ignore': ['application_id']
    }

    es = ElasticsearchPlus(
        hosts=es_host,
        port=es_port,
        aws_auth_region=aws_auth_region,
        no_commit=("AWSBATCHTEST" in os.environ),
        entity_type=entity_type,
        strans_kwargs=strans_kwargs,
        field_null_mapping=field_null_mapping,
        null_empty_str=True,
        coordinates_as_floats=True,
        country_detection=True,
        listify_terms=True,
        terms_delimiters=(";", ","),
        caps_to_camel_case=True,
        null_pairs={"currency_total_cost": "cost_total_project"})

    for _, row in df.iterrows():
        # Null values are dropped here, so keys may be entirely absent below
        doc = dict(row.loc[~pd.isnull(row)])
        if 'country' in doc:
            # 'org_state' is dropped above when null, so don't index into doc
            # directly (doc['org_state'] raised KeyError for such rows)
            org_state = doc.get('org_state', '')
            # Try to patch broken US data
            if doc['country'] == '' and org_state != '':
                doc['country'] = "United States"
                doc['continent'] = "NA"
            # .get also tolerates state codes missing from the lookup
            doc['placeName_state_organisation'] = states_lookup.get(org_state)

            # continent_lookup maps None -> None, so .get is a safe key here
            doc['placeName_continent_organisation'] = continent_lookup[
                doc.get('continent')]

        # Wrap in a list so downstream treats it as a multi-valued field
        if 'ic_name' in doc:
            doc['ic_name'] = [doc['ic_name']]

        uid = doc.pop("application_id")
        es.index(index=es_index, doc_type=es_type, id=uid, body=doc)
Beispiel #3
0
def run():
    """Batch job: enrich a set of meetup groups with core topics, MeSH terms
    and geographic data, then index them into Elasticsearch.

    All inputs (S3 locations, DB name, ES connection details) come from
    BATCHPAR_* environment variables.
    """

    # Fetch the input parameters
    s3_bucket = os.environ["BATCHPAR_bucket"]
    batch_file = os.environ["BATCHPAR_batch_file"]
    members_perc = int(os.environ["BATCHPAR_members_perc"])
    db_name = os.environ["BATCHPAR_db_name"]
    es_host = os.environ['BATCHPAR_outinfo']
    es_port = int(os.environ['BATCHPAR_out_port'])
    es_index = os.environ['BATCHPAR_out_index']
    es_type = os.environ['BATCHPAR_out_type']
    entity_type = os.environ["BATCHPAR_entity_type"]
    aws_auth_region = os.environ["BATCHPAR_aws_auth_region"]
    routine_id = os.environ["BATCHPAR_routine_id"]

    # Get continent lookup: code -> name, with a None passthrough
    url = ("https://nesta-open-data.s3.eu-west-2"
           ".amazonaws.com/rwjf-viz/continent_codes_names.json")
    continent_lookup = {row["Code"]: row["Name"]
                        for row in requests.get(url).json()}
    continent_lookup[None] = None

    # Extract the core topics
    logging.debug('Getting topics')
    s3 = boto3.resource('s3')
    topics_key = f'meetup-topics-{routine_id}.json'
    topics_obj = s3.Object(s3_bucket, topics_key)
    core_topics = set(json.loads(topics_obj.get()['Body']._raw_stream.read()))

    # Extract the group ids for this task
    ids_obj = s3.Object(s3_bucket, batch_file)
    group_ids = set(json.loads(ids_obj.get()['Body']._raw_stream.read()))

    # Extract the mesh terms for this task
    # (removed an unused s3.Object handle that duplicated this retrieval)
    df_mesh = retrieve_mesh_terms('innovation-mapping-general',
                                  'meetup_mesh/meetup_mesh_processed.txt')
    mesh_terms = format_mesh_terms(df_mesh)

    # Setup ES+
    field_null_mapping = load_json_from_pathstub(("tier_1/"
                                                  "field_null_mappings/"),
                                                 "health_scanner.json")
    strans_kwargs = {'filename': 'meetup.json',
                     'from_key': 'tier_0',
                     'to_key': 'tier_1',
                     'ignore': []}
    es = ElasticsearchPlus(hosts=es_host,
                           port=es_port,
                           aws_auth_region=aws_auth_region,
                           no_commit=("AWSBATCHTEST" in os.environ),
                           entity_type=entity_type,
                           strans_kwargs=strans_kwargs,
                           field_null_mapping=field_null_mapping,
                           null_empty_str=True,
                           coordinates_as_floats=True,
                           country_detection=True,
                           auto_translate=True)

    # Generate the lookup for geographies: id -> column dict
    engine = get_mysql_engine("BATCHPAR_config", "mysqldb", db_name)
    geo_lookup = {}
    with db_session(engine) as session:
        query_result = session.query(Geographic).all()
        for geography in query_result:
            geo_lookup[geography.id] = {k: v for k, v in
                                        geography.__dict__.items()
                                        if k in geography.__table__.columns}

    # Pipe the groups, keeping only those above the members percentile
    members_limit = get_members_by_percentile(engine, perc=members_perc)
    with db_session(engine) as session:
        query_result = (session
                        .query(Group)
                        .filter(Group.members >= members_limit)
                        .filter(Group.id.in_(group_ids))
                        .all())
        for count, group in enumerate(query_result, 1):
            row = {k: v for k, v in group.__dict__.items()
                   if k in group.__table__.columns}

            # Filter groups without the required topics
            topics = [topic['name'] for topic in group.topics
                      if topic['name'] in core_topics]
            if len(topics) == 0:
                continue

            # Assign mesh terms; mesh ids are zero-padded to 8 chars
            mesh_id = f'{row["id"]}'.zfill(8)
            row['mesh_terms'] = None
            if mesh_id in mesh_terms:
                row['mesh_terms'] = mesh_terms[mesh_id]

            # Get the geographic data for this row
            # NOTE(review): raises KeyError if the composite key is missing
            # from the Geographic table — presumably guaranteed upstream.
            country_name = country_iso_code_to_name(row['country'], iso2=True)
            geo_key = generate_composite_key(row['city'], country_name)
            geo = geo_lookup[geo_key]

            # Clean up the input data
            row['topics'] = topics
            row['urlname'] = f"https://www.meetup.com/{row['urlname']}"
            row['coordinate'] = dict(lat=geo['latitude'], lon=geo['longitude'])
            row['created'] = dt.strftime(dt.fromtimestamp(row['created']/1000),
                                         format="%Y-%m-%d")
            if row['description'] is not None:
                row['description'] = BeautifulSoup(row['description'], 'lxml').text
            row['continent'] = continent_lookup[geo['continent']]
            row['country_name'] = geo['country']
            row['continent_id'] = geo['continent']
            row['country'] = geo['country_alpha_2']
            row['iso3'] = geo['country_alpha_3']
            row['isoNumeric'] = geo['country_numeric']

            # Insert to ES
            es.index(index=es_index, doc_type=es_type,
                     id=row['id'], body=row)
            if not count % 1000:
                logging.info(f"{count} rows loaded to elasticsearch")

    logging.info("Batch job complete.")
Beispiel #4
0
def run():
    """Batch job: enrich a set of Crunchbase organisations with funding,
    category and geographic data, then index them into Elasticsearch.

    All inputs (S3 location of the org-id batch, DB name, ES connection
    details) come from BATCHPAR_* environment variables.
    """

    test = literal_eval(os.environ["BATCHPAR_test"])
    bucket = os.environ['BATCHPAR_bucket']
    batch_file = os.environ['BATCHPAR_batch_file']

    db_name = os.environ["BATCHPAR_db_name"]
    es_host = os.environ['BATCHPAR_outinfo']
    es_port = int(os.environ['BATCHPAR_out_port'])
    es_index = os.environ['BATCHPAR_out_index']
    es_type = os.environ['BATCHPAR_out_type']
    entity_type = os.environ["BATCHPAR_entity_type"]
    aws_auth_region = os.environ["BATCHPAR_aws_auth_region"]

    # database setup
    engine = get_mysql_engine("BATCHPAR_config", "mysqldb", db_name)
    static_engine = get_mysql_engine("BATCHPAR_config", "mysqldb",
                                     "static_data")
    # US state code -> full state name, plus armed-forces pseudo-states
    states_lookup = {
        row['state_code']: row['state_name']
        for _, row in pd.read_sql_table('us_states_lookup',
                                        static_engine).iterrows()
    }
    states_lookup["AE"] = "Armed Forces (Canada, Europe, Middle East)"
    states_lookup["AA"] = "Armed Forces (Americas)"
    states_lookup["AP"] = "Armed Forces (Pacific)"
    states_lookup[None] = None  # default lookup for non-US countries

    # Get continent lookup: code -> name, with a None passthrough
    url = "https://nesta-open-data.s3.eu-west-2.amazonaws.com/rwjf-viz/continent_codes_names.json"
    continent_lookup = {
        row["Code"]: row["Name"]
        for row in requests.get(url).json()
    }
    continent_lookup[None] = None

    # es setup
    field_null_mapping = load_json_from_pathstub("tier_1/field_null_mappings/",
                                                 "health_scanner.json")
    strans_kwargs = {
        'filename': 'crunchbase_organisation_members.json',
        'from_key': 'tier_0',
        'to_key': 'tier_1',
        'ignore': ['id']
    }
    es = ElasticsearchPlus(
        hosts=es_host,
        port=es_port,
        aws_auth_region=aws_auth_region,
        no_commit=("AWSBATCHTEST" in os.environ),
        entity_type=entity_type,
        strans_kwargs=strans_kwargs,
        field_null_mapping=field_null_mapping,
        null_empty_str=True,
        coordinates_as_floats=True,
        country_detection=True,
        listify_terms=True,
        terms_delimiters=("|", ),
        null_pairs={"currency_of_funding": "cost_of_funding"})

    # collect file; cap the batch in test mode
    nrows = 20 if test else None

    s3 = boto3.resource('s3')
    obj = s3.Object(bucket, batch_file)
    org_ids = json.loads(obj.get()['Body']._raw_stream.read())
    logging.info(f"{len(org_ids)} organisations retrieved from s3")

    org_fields = set(c.name for c in Organization.__table__.columns)

    geo_fields = [
        'country_alpha_2', 'country_alpha_3', 'country_numeric', 'continent',
        'latitude', 'longitude'
    ]

    # First get all funders: org id -> flat list of investor names
    investor_names = defaultdict(list)
    with db_session(engine) as session:
        rows = (session.query(Organization, FundingRound).join(
            FundingRound, Organization.id == FundingRound.company_id).filter(
                Organization.id.in_(org_ids)).all())
        for row in rows:
            _id = row.Organization.id
            _investor_names = row.FundingRound.investor_names
            investor_names[_id] += parse_investor_names(_investor_names)

    # Pipe orgs to ES
    with db_session(engine) as session:
        rows = (session.query(Organization, Geographic).join(
            Geographic, Organization.location_id == Geographic.id).filter(
                Organization.id.in_(org_ids)).limit(nrows).all())
        for count, row in enumerate(rows, 1):
            # convert sqlalchemy to dictionary
            row_combined = {
                k: v
                for k, v in row.Organization.__dict__.items()
                if k in org_fields
            }
            row_combined[
                'currency_of_funding'] = 'USD'  # all values are from 'funding_total_usd'
            row_combined.update({
                k: v
                for k, v in row.Geographic.__dict__.items() if k in geo_fields
            })
            # Deduplicate investors collected across funding rounds
            row_combined['investor_names'] = list(
                set(investor_names[row_combined['id']]))

            # reformat coordinates
            row_combined['coordinates'] = {
                'lat': row_combined.pop('latitude'),
                'lon': row_combined.pop('longitude')
            }

            # iterate through categories and groups
            row_combined['category_list'] = []
            row_combined['category_group_list'] = []
            for category in (session.query(CategoryGroup).select_from(
                    OrganizationCategory).join(CategoryGroup).filter(
                        OrganizationCategory.organization_id ==
                        row.Organization.id).all()):
                row_combined['category_list'].append(category.category_name)
                row_combined['category_group_list'] += [
                    group
                    for group in str(category.category_group_list).split('|')
                    # was `group is not 'None'`: identity comparison against a
                    # string literal, which never reliably filtered anything
                    if group != 'None'
                ]

            # Add a field for US state name
            state_code = row_combined['state_code']
            row_combined['placeName_state_organisation'] = states_lookup[
                state_code]
            continent_code = row_combined['continent']
            row_combined[
                'placeName_continent_organisation'] = continent_lookup[
                    continent_code]
            row_combined['updated_at'] = row_combined['updated_at'].strftime(
                '%Y-%m-%d %H:%M:%S')

            uid = row_combined.pop('id')
            es.index(index=es_index,
                     doc_type=es_type,
                     id=uid,
                     body=row_combined)
            if not count % 1000:
                logging.info(f"{count} rows loaded to elasticsearch")

    logging.warning("Batch job complete.")
Beispiel #5
0
def run():
    """Attach MeSH terms to a batch of abstracts and upsert them (plus any
    known duplicate documents) into Elasticsearch, merging over whatever is
    already stored for each id."""
    # Batch parameters from the environment
    s3_bucket = os.environ["BATCHPAR_s3_bucket"]
    abstract_file = os.environ["BATCHPAR_s3_key"]
    dupe_file = os.environ["BATCHPAR_dupe_file"]
    es_config = literal_eval(os.environ["BATCHPAR_outinfo"])
    database = os.environ["BATCHPAR_db"]
    entity_type = os.environ["BATCHPAR_entity_type"]

    # mysql setup
    engine = get_mysql_engine("BATCHPAR_config", "mysqldb", database)
    session = sessionmaker(bind=engine)()

    # retrieve a batch of meshed terms, keyed by document id
    mesh_terms = format_mesh_terms(retrieve_mesh_terms(s3_bucket,
                                                       abstract_file))
    logging.info(f'batch {abstract_file} contains '
                 f'{len(mesh_terms)} meshed abstracts')

    # retrieve duplicate map: doc_id -> list of duplicate doc_ids
    dupes = format_duplicate_map(retrieve_duplicate_map(s3_bucket, dupe_file))

    # Set up elastic search connection
    field_null_mapping = load_json_from_pathstub(
        "tier_1/"
        "field_null_mappings/", "health_scanner.json")
    es = ElasticsearchPlus(hosts=es_config['host'],
                           port=es_config['port'],
                           aws_auth_region=es_config['region'],
                           use_ssl=True,
                           entity_type=entity_type,
                           strans_kwargs=None,
                           field_null_mapping=field_null_mapping,
                           null_empty_str=True,
                           coordinates_as_floats=True,
                           country_detection=True,
                           listify_terms=True)
    all_es_ids = get_es_ids(es, es_config)

    # Build the document payloads, one per meshed abstract plus one per
    # known duplicate of it (duplicates are flagged as such)
    docs = []
    for doc_id, terms in mesh_terms.items():
        if doc_id not in all_es_ids:
            continue
        try:
            abstract = session.query(Abstracts).filter(
                Abstracts.application_id == doc_id).one()
        except NoResultFound:
            logging.warning(f'Not found {doc_id} in database')
            raise NoResultFound(doc_id)
        abstract_text = clean_abstract(abstract.abstract_text)
        docs.append({'doc_id': doc_id,
                     'terms_mesh_abstract': terms,
                     'textBody_abstract_project': abstract_text})
        duplicates = dupes.get(doc_id, [])
        if duplicates:
            logging.info(f'Found {len(duplicates)} duplicates')
        docs.extend({'doc_id': dupe_id,
                     'terms_mesh_abstract': terms,
                     'textBody_abstract_project': abstract_text,
                     'booleanFlag_duplicate_abstract': True}
                    for dupe_id in duplicates)

    # output to elasticsearch, merging each new doc over the existing source
    logging.warning(f'Writing {len(docs)} documents to elasticsearch')
    for doc in docs:
        uid = doc.pop("doc_id")
        # Extract existing info
        existing = es.get(es_config['index'],
                          doc_type=es_config['type'],
                          id=uid)['_source']
        # Merge existing info into new doc (new values win)
        es.index(index=es_config['index'],
                 doc_type=es_config['type'],
                 id=uid,
                 body={**existing, **doc})
Beispiel #6
0
def run():
    """Batch job: enrich a set of arXiv articles with fields of study,
    institute, NUTS-region and country metadata, then index them into
    Elasticsearch.

    All inputs (S3 location of the article-id batch, DB name, ES
    connection details) come from BATCHPAR_* environment variables.
    """
    test = literal_eval(os.environ["BATCHPAR_test"])
    bucket = os.environ['BATCHPAR_bucket']
    batch_file = os.environ['BATCHPAR_batch_file']

    db_name = os.environ["BATCHPAR_db_name"]
    es_host = os.environ['BATCHPAR_outinfo']
    es_port = int(os.environ['BATCHPAR_out_port'])
    es_index = os.environ['BATCHPAR_out_index']
    es_type = os.environ['BATCHPAR_out_type']
    entity_type = os.environ["BATCHPAR_entity_type"]
    aws_auth_region = os.environ["BATCHPAR_aws_auth_region"]

    # database setup
    logging.info('Retrieving engine connection')
    engine = get_mysql_engine("BATCHPAR_config", "mysqldb", db_name)
    logging.info('Building FOS lookup')
    fos_lookup = build_fos_lookup(engine, max_lvl=6)

    nf = NutsFinder()

    # es setup
    logging.info('Connecting to ES')
    strans_kwargs = {
        'filename': 'eurito/arxiv-eu.json',
        'from_key': 'tier_0',
        'to_key': 'tier_1',
        'ignore': ['id']
    }
    es = ElasticsearchPlus(hosts=es_host,
                           port=es_port,
                           aws_auth_region=aws_auth_region,
                           no_commit=("AWSBATCHTEST" in os.environ),
                           entity_type=entity_type,
                           strans_kwargs=strans_kwargs,
                           null_empty_str=True,
                           coordinates_as_floats=True,
                           listify_terms=True,
                           do_sort=False,
                           ngram_fields=['textBody_abstract_article'])

    # collect file
    logging.info('Retrieving article ids')
    # NOTE(review): nrows is computed but never applied to the query below —
    # possibly a missing .limit(nrows); confirm the intended test behaviour.
    nrows = 20 if test else None
    s3 = boto3.resource('s3')
    obj = s3.Object(bucket, batch_file)
    art_ids = json.loads(obj.get()['Body']._raw_stream.read())
    logging.info(f"{len(art_ids)} article IDs " "retrieved from s3")

    # Get all grid countries and the country: continent lookup.
    # Query the institute table ONCE and derive all four lookups from it,
    # instead of issuing four identical full-table queries.
    logging.info('Doing country lookup')
    country_lookup = get_country_region_lookup()
    eu_countries = get_eu_countries()
    with db_session(engine) as session:
        all_institutes = session.query(Inst).all()
        grid_regions = {
            inst.id: country_lookup[inst.country_code]
            for inst in all_institutes
            if inst.country_code is not None
        }
        grid_countries = {
            inst.id: inst.country_code
            for inst in all_institutes
            if inst.country_code is not None
        }
        grid_institutes = {inst.id: inst.name for inst in all_institutes}
        grid_latlon = {
            inst.id: (inst.latitude, inst.longitude)
            for inst in all_institutes
        }

    logging.info('Processing rows')
    with db_session(engine) as session:
        for count, obj in enumerate(
            (session.query(Art).filter(Art.id.in_(art_ids)).all())):
            row = object_to_dict(obj)
            # Extract year from date
            if row['created'] is not None:
                row['year'] = row['created'].year

            # Normalise citation count for searchkit
            if row['citation_count'] is None:
                row['citation_count'] = 0

            # Extract field of study
            row['fields_of_study'] = make_fos_tree(row['fields_of_study'],
                                                   fos_lookup)
            row['_fields_of_study'] = [
                f for fields in row['fields_of_study']['nodes'] for f in fields
                if f != []
            ]

            # Format hierarchical fields as expected by searchkit
            row['categories'] = [
                cat['description'] for cat in row.pop('categories')
            ]
            # Only keep confidently-matched institutes (score > 0.9)
            institute_matches = row.pop('institutes')
            good_institutes = [
                i['institute_id'] for i in institute_matches
                if i['matching_score'] > 0.9
            ]

            # Add NUTS regions (levels 0-3) from the institute lat/lons
            for inst_id in good_institutes:
                if inst_id not in grid_latlon:
                    continue
                lat, lon = grid_latlon[inst_id]
                if lat is None or lon is None:
                    continue
                nuts = nf.find(lat=lat, lon=lon)
                for i in range(0, 4):
                    name = f'nuts_{i}'
                    if name not in row:
                        row[name] = set()
                    for nut in nuts:
                        if nut['LEVL_CODE'] != i:
                            continue
                        row[name].add(nut['NUTS_ID'])
            # Sets are not JSON-serialisable; convert to lists
            for i in range(0, 4):
                name = f'nuts_{i}'
                if name in row:
                    row[name] = list(row[name])

            # Add other geographies
            countries = set(grid_countries[inst_id]
                            for inst_id in good_institutes
                            if inst_id in grid_countries)
            regions = set(grid_regions[inst_id] for inst_id in good_institutes
                          if inst_id in grid_countries)
            row['countries'] = list(countries)
            row['regions'] = [r for c, r in regions]
            row['is_eu'] = any(c in eu_countries for c in countries)

            # Pull out international institute info
            has_mn = any(
                is_multinational(inst, grid_countries.values())
                for inst in good_institutes)
            row['has_multinational'] = has_mn

            # Generate author & institute properties
            mag_authors = row.pop('mag_authors')
            if mag_authors is None:
                row['authors'] = None
                row['institutes'] = None
            else:
                if all('author_order' in a for a in mag_authors):
                    mag_authors = sorted(mag_authors,
                                         key=lambda a: a['author_order'])
                row['authors'] = [
                    author['author_name'].title() for author in mag_authors
                ]
                gids = [
                    author['affiliation_grid_id'] for author in mag_authors
                    if 'affiliation_grid_id' in author
                ]
                row['institutes'] = [
                    grid_institutes[g].title() for g in gids
                    if g in grid_institutes and g in good_institutes
                ]
            # Fall back to all well-matched institutes if none were resolved
            if row['institutes'] in (None, []):
                row['institutes'] = [
                    grid_institutes[g].title() for g in good_institutes
                ]

            uid = row.pop('id')
            es.index(index=es_index, doc_type=es_type, id=uid, body=row)
            if not count % 1000:
                logging.info(f"{count} rows loaded to " "elasticsearch")

    logging.warning("Batch job complete.")
Beispiel #7
0
def run():
    """Index a batch of PATSTAT patent families into Elasticsearch.

    Reads a chunk of docdb family IDs from S3 (written by the parent
    batch task), enriches each family with titles, abstracts, IPC codes,
    NACE2 codes, technology fields and person metadata from the raw
    PATSTAT database, then writes one document per family to ES.

    All configuration is taken from ``BATCHPAR_*`` environment variables.
    """
    test = literal_eval(os.environ["BATCHPAR_test"])
    bucket = os.environ['BATCHPAR_bucket']
    batch_file = os.environ['BATCHPAR_batch_file']

    db_name = os.environ["BATCHPAR_db_name"]
    es_host = os.environ['BATCHPAR_outinfo']
    es_port = int(os.environ['BATCHPAR_out_port'])
    es_index = os.environ['BATCHPAR_out_index']
    es_type = os.environ['BATCHPAR_out_type']
    entity_type = os.environ["BATCHPAR_entity_type"]
    aws_auth_region = os.environ["BATCHPAR_aws_auth_region"]

    # database setup: one engine for the pipeline DB, a separate
    # read-only engine for the raw PATSTAT dump
    logging.info('Retrieving engine connection')
    engine = get_mysql_engine("BATCHPAR_config", "mysqldb", db_name)
    _engine = get_mysql_engine("BATCHPAR_config", "readonly",
                               "patstat_2019_05_13")

    # es setup
    logging.info('Connecting to ES')
    strans_kwargs = {
        'filename': 'eurito/patstat-eu.json',
        'from_key': 'tier_0',
        'to_key': 'tier_1',
        'ignore': ['id']
    }
    es = ElasticsearchPlus(hosts=es_host,
                           port=es_port,
                           aws_auth_region=aws_auth_region,
                           no_commit=("AWSBATCHTEST" in os.environ),
                           entity_type=entity_type,
                           strans_kwargs=strans_kwargs,
                           auto_translate=True,
                           auto_translate_kwargs={'min_len': 20},
                           null_empty_str=True,
                           coordinates_as_floats=True,
                           do_sort=True,
                           ngram_fields=['textBody_abstract_patent'])

    # collect file
    logging.info('Retrieving patent family ids')
    nrows = 20 if test else None
    s3 = boto3.resource('s3')
    obj = s3.Object(bucket, batch_file)
    docdb_fam_ids = json.loads(obj.get()['Body']._raw_stream.read())
    # FIX: `nrows` was previously computed but never applied, so test
    # mode silently processed the full batch. Truncate in test mode.
    if nrows is not None:
        docdb_fam_ids = docdb_fam_ids[:nrows]
    logging.info(f"{len(docdb_fam_ids)} patent family IDs "
                 "retrieved from s3")

    eu_countries = get_eu_countries()

    logging.info('Processing rows')
    _filter = ApplnFamily.docdb_family_id.in_(docdb_fam_ids)
    with db_session(engine) as session:
        for obj in session.query(ApplnFamily).filter(_filter).all():
            row = object_to_dict(obj)
            appln_ids = row.pop('appln_id')
            # Pull all per-application metadata from the raw PATSTAT DB
            with db_session(_engine) as _session:
                _titles = metadata(Tls202ApplnTitle, _session, appln_ids)
                _abstrs = metadata(Tls203ApplnAbstr, _session, appln_ids)
                ipcs = metadata(Tls209ApplnIpc, _session, appln_ids)
                nace2s = metadata(Tls229ApplnNace2, _session, appln_ids)
                techs = metadata(Tls230ApplnTechnField, _session, appln_ids)
                # Get persons
                _pers_applns = metadata(Tls207PersAppln, _session, appln_ids)
                pers_ids = set(pa['person_id'] for pa in _pers_applns)
                persons = metadata(Tls906Person,
                                   _session,
                                   pers_ids,
                                   field_selector=Tls906Person.person_id)

            # Pick a single preferred-language title and abstract
            title = select_text(_titles, 'appln_title_lg', 'appln_title')
            abstr = select_text(_abstrs, 'appln_abstract_lg', 'appln_abstract')

            # Get names from lookups, deduplicating via sets. Only the
            # IPC prefix before the first space is kept.
            ipcs = list(set(i['ipc_class_symbol'].split()[0] for i in ipcs))
            nace2s = list(set(n['nace2_code'] for n in nace2s))
            techs = list(set(t['techn_field_nr'] for t in techs))
            ctrys = list(set(p['person_ctry_code'] for p in persons))
            nuts = list(set(p['nuts'] for p in persons))
            is_eu = any(c in eu_countries for c in ctrys)

            # Index the data, keyed on the family id
            row = dict(title=title,
                       abstract=abstr,
                       ipc=ipcs,
                       nace2=nace2s,
                       tech=techs,
                       ctry=ctrys,
                       nuts=nuts,
                       is_eu=is_eu,
                       **row)
            uid = row.pop('docdb_family_id')
            _row = es.index(index=es_index, doc_type=es_type, id=uid, body=row)

    logging.warning("Batch job complete.")
Beispiel #8
0
def run():
    """Index a batch of arXiv articles into Elasticsearch.

    Reads a chunk of article IDs from S3, then for each article:
    normalises the citation count by age, zeroes novelty for withdrawn
    or reply/comment articles, tokenises the abstract, builds the field
    of study Level 0 -> Level 1 hierarchy, attaches GRID institute and
    country information, and finally indexes the document.

    All configuration is taken from ``BATCHPAR_*`` environment variables.
    """
    test = literal_eval(os.environ["BATCHPAR_test"])
    bucket = os.environ['BATCHPAR_bucket']
    batch_file = os.environ['BATCHPAR_batch_file']

    db_name = os.environ["BATCHPAR_db_name"]
    es_host = os.environ['BATCHPAR_outinfo']
    es_port = int(os.environ['BATCHPAR_out_port'])
    es_index = os.environ['BATCHPAR_out_index']
    es_type = os.environ['BATCHPAR_out_type']
    entity_type = os.environ["BATCHPAR_entity_type"]
    aws_auth_region = os.environ["BATCHPAR_aws_auth_region"]

    # database setup
    engine = get_mysql_engine("BATCHPAR_config", "mysqldb", db_name)
    fos_lookup = build_fos_lookup(engine)

    # Setup ngrammer (reads its DB config from MYSQLDBCONF)
    os.environ['MYSQLDBCONF'] = os.environ['BATCHPAR_config']
    ngrammer = Ngrammer(database="production")

    # es setup
    strans_kwargs = {'filename': 'arxiv.json',
                     'from_key': 'tier_0',
                     'to_key': 'tier_1',
                     'ignore': ['id']}
    es = ElasticsearchPlus(hosts=es_host,
                           port=es_port,
                           aws_auth_region=aws_auth_region,
                           no_commit=("AWSBATCHTEST" in os.environ),
                           entity_type=entity_type,
                           strans_kwargs=strans_kwargs,
                           null_empty_str=True,
                           coordinates_as_floats=True,
                           listify_terms=True,
                           do_sort=False)

    # collect file
    nrows = 20 if test else None

    s3 = boto3.resource('s3')
    obj = s3.Object(bucket, batch_file)
    art_ids = json.loads(obj.get()['Body']._raw_stream.read())
    # FIX: `nrows` was previously computed but never used, so test runs
    # processed the entire batch. Truncate the IDs in test mode.
    if nrows is not None:
        art_ids = art_ids[:nrows]
    logging.info(f"{len(art_ids)} article IDs "
                 "retrieved from s3")

    # Get all grid countries
    # and country: continent lookup
    country_lookup = get_country_region_lookup()
    with db_session(engine) as session:
        grid_countries = {obj.id: country_lookup[obj.country_code]
                          for obj in session.query(Institute).all()
                          if obj.country_code is not None}
        grid_institutes = {obj.id: obj.name
                           for obj in session.query(Institute).all()}

    current_year = dt.now().year
    with db_session(engine) as session:
        for count, obj in enumerate((session.query(Article)
                                     .filter(Article.id.in_(art_ids))
                                     .all())):
            row = object_to_dict(obj)
            # Extract year from date; 1990 is only a fallback used for
            # the citation normalisation below when 'created' is missing
            year = 1990
            if row['created'] is not None:
                row['year'] = row['created'].year
                year = row['created'].year

            # Normalise citation count for searchkit
            if row['citation_count'] is None:
                row['citation_count'] = 0
            row['normalised_citation'] = row['citation_count']/np.log(current_year-year+2)

            # If abstract doesn't meet requirements, zero novelty
            # all other novelty will be assigned in a later task
            text = row['abstract'] + ' ' + row['title']
            if (len(text) < 400
                or any(x in row['abstract'].lower()
                       for x in ('withdrawn', 'arxiv administrators'))
                or any(x in row['title'].lower()
                       for x in ('reply to', 'reply on',
                                 'comment to', 'comment on',
                                 'remarks to', 'remarks on'))):
                row['novelty_of_article'] = 0

            # Flatten the ngrammer's sentence-level token lists
            processed_tokens = ngrammer.process_document(row['abstract'])
            row['tokens'] = [t.replace('_', ' ')
                             for tokens in processed_tokens
                             for t in tokens]

            # Extract field of study Level 0 --> Level 1 paths
            fos = []
            fos_objs = row.pop('fields_of_study')
            fos_ids = set(fos['id'] for fos in fos_objs)
            for f in fos_objs:
                if f['level'] > 0:
                    continue
                fos += [reversed(fos_lookup[(f['id'], cid)])
                        for cid in split_ids(f['child_ids'])
                        if cid in fos_ids]

            # Format hierarchical fields as expected by searchkit.
            # Institutes only count with a matching score above 0.9.
            cats = [(cat['description'], cat['id'].split('.')[0])
                    for cat in row.pop('categories')]
            institutes = row.pop('institutes')
            good_institutes = [i['institute_id'] for i in institutes
                               if i['matching_score'] > 0.9]
            countries = set(grid_countries[inst_id]
                            for inst_id in good_institutes
                            if inst_id in grid_countries)
            row['categories'], _, _ = hierarchy_field(cats)
            row['fos'], _, _ = hierarchy_field(fos)
            row['countries'], _, _ = hierarchy_field(countries)

            # Pull out international institute info
            has_mn = any(is_multinational(inst,
                                          grid_countries.values())
                         for inst in good_institutes)
            row['has_multinational'] = has_mn

            # Generate author & institute properties
            mag_authors = row.pop('mag_authors')
            if mag_authors is None:
                row['authors'] = None
                row['institutes'] = None
            else:
                if all('author_order' in a for a in mag_authors):
                    mag_authors = sorted(mag_authors,
                                         key=lambda a: a['author_order'])

                row['authors'] = [author['author_name'].title()
                                  for author in mag_authors]
                # Abbreviate very long author lists
                if len(row['authors']) > 10:
                    row['authors'] = [f"{row['authors'][0]}, et al"]

                gids = [author['affiliation_grid_id']
                        for author in mag_authors
                        if 'affiliation_grid_id' in author]
                row['institutes'] = [grid_institutes[g].title()
                                     for g in gids
                                     if g in grid_institutes
                                     and g in good_institutes]

            uid = row.pop('id')
            _row = es.index(index=es_index, doc_type=es_type,
                            id=uid, body=row)
            if not count % 1000:
                logging.info(f"{count} rows loaded to elasticsearch")

    logging.warning("Batch job complete.")
Beispiel #9
0
def run():
    """De-duplicate near-identical project documents and re-index them.

    For each article ID in the batch, finds its near-duplicates in the
    old index (matching on title, description and abstract text),
    collapses them into a single document (the one with the most recent
    fiscal funding year), merges all of their annual funding rows, and
    writes the result to the new index.

    All configuration is taken from ``BATCHPAR_*`` environment variables.
    """
    # Fetch the input parameters
    s3_bucket = os.environ["BATCHPAR_bucket"]
    batch_file = os.environ["BATCHPAR_batch_file"]
    es_host = os.environ['BATCHPAR_outinfo']
    es_port = int(os.environ['BATCHPAR_out_port'])
    es_new_index = os.environ['BATCHPAR_out_index']
    es_old_index = os.environ['BATCHPAR_in_index']
    es_type = os.environ['BATCHPAR_out_type']
    entity_type = os.environ["BATCHPAR_entity_type"]
    aws_auth_region = os.environ["BATCHPAR_aws_auth_region"]

    # Extract the article ids in this chunk
    s3 = boto3.resource('s3')
    ids_obj = s3.Object(s3_bucket, batch_file)
    art_ids = json.loads(ids_obj.get()['Body']._raw_stream.read())
    logging.info(f'Processing {len(art_ids)} article ids')

    field_null_mapping = load_json_from_pathstub(("tier_1/"
                                                  "field_null_mappings/"),
                                                 "health_scanner.json")
    es = ElasticsearchPlus(hosts=es_host,
                           port=es_port,
                           aws_auth_region=aws_auth_region,
                           no_commit=("AWSBATCHTEST" in os.environ),
                           entity_type=entity_type,
                           field_null_mapping=field_null_mapping,
                           send_get_body_as='POST')

    # Iterate over article IDs
    processed_ids = set()
    for _id in art_ids:
        if _id in processed_ids:  # To avoid duplicated effort
            continue

        # Collect all duplicated data together
        dupe_ids = {}  # For identifying the most recent dupe
        yearly_funds = []  # The new deduped collection of annual funds
        hits = {}
        for hit in es.near_duplicates(index=es_old_index,
                                      doc_id=_id,
                                      doc_type=es_type,
                                      fields=[
                                          "textBody_descriptive_project",
                                          "title_of_project",
                                          "textBody_abstract_project"
                                      ]):
            # Extract key values
            src = hit['_source']
            hit_id = hit['_id']
            # Record this hit
            processed_ids.add(hit_id)
            hits[hit_id] = src
            # Extract year and funding info
            yearly_funds += extract_yearly_funds(src)
            year = get_value(src, 'year_fiscal_funding')
            if year is not None:
                dupe_ids[hit_id] = year

        # Get the most recent instance of the duplicates.
        # Counter treats the dict values (years) as counts, so
        # most_common()[0] is the hit with the largest year.
        final_id = sorted(hits.keys())[-1]  # default if years are all null
        if len(dupe_ids) > 0:  # implies years are not all null
            final_id, year = Counter(dupe_ids).most_common()[0]
        body = hits[final_id]
        processed_ids = processed_ids.union(set(dupe_ids))

        # Sort and sum the funding
        yearly_funds = sorted(yearly_funds, key=lambda row: row['year'])
        sum_funding = sum(row['cost_ref'] for row in yearly_funds
                          if row['cost_ref'] is not None)

        # Add funding info and commit to the new index.
        # FIX: guard against an empty funding list, which previously
        # raised IndexError on the start-date lookup.
        body['json_funding_project'] = yearly_funds
        body['cost_total_project'] = sum_funding
        body['date_start_project'] = (yearly_funds[0]['start_date']
                                      if yearly_funds else None)  # just in case
        es.index(index=es_new_index, doc_type=es_type, id=final_id, body=body)

    logging.info(f'Processed {len(processed_ids)} ids')
    logging.info("Batch job complete.")