Example #1
0
 def test_string_date_pattern(self):
     """Month-name date strings are normalised to ISO YYYY-MM-DD,
     defaulting missing day/month components to 01."""
     cases = [
         ('Sep 21 2017', '2017-09-21'),
         ('Mar  1 2011', '2011-03-01'),
         ('Apr  7 2009', '2009-04-07'),
         ('January 2016', '2016-01-01'),
         ('Oct 2014', '2014-10-01'),
         ('2015', '2015-01-01'),
         ('6 April 2018', '2018-04-06'),
         ('8 Dec, 2010', '2010-12-08'),
     ]
     for raw, expected in cases:
         assert _extract_date(raw) == expected
Example #2
0
def run():
    """Batch job: load a slice of NIH project rows from MySQL, clean and
    geocode them, then index each row as a document in Elasticsearch.

    All configuration is passed in via BATCHPAR_* environment variables;
    the slice is bounded by application_id in
    [BATCHPAR_start_index, BATCHPAR_end_index].
    """
    start_index = os.environ["BATCHPAR_start_index"]
    end_index = os.environ["BATCHPAR_end_index"]
    es_host = os.environ["BATCHPAR_outinfo"]
    es_port = os.environ["BATCHPAR_out_port"]
    es_index = os.environ["BATCHPAR_out_index"]
    es_type = os.environ["BATCHPAR_out_type"]
    entity_type = os.environ["BATCHPAR_entity_type"]
    db = os.environ["BATCHPAR_db"]
    aws_auth_region = os.environ["BATCHPAR_aws_auth_region"]

    # Read in the US states: state_code -> state_name. Map None and ''
    # to None so rows with a missing/blank state resolve cleanly below.
    static_engine = get_mysql_engine("BATCHPAR_config", "mysqldb",
                                     "static_data")
    states_lookup = {
        row['state_code']: row['state_name']
        for _, row in pd.read_sql_table('us_states_lookup',
                                        static_engine).iterrows()
    }
    states_lookup[None] = None
    states_lookup[''] = None

    # Get continent lookup (continent code -> continent name)
    continent_lookup = get_continent_lookup()

    engine = get_mysql_engine("BATCHPAR_config", "mysqldb", db)
    Session = sessionmaker(bind=engine)
    session = Session()

    # Columns pulled from the nih_projects table for this batch
    cols = [
        "application_id", "full_project_num", "fy", "org_city", "org_country",
        "org_state", "org_zipcode", "org_name", "project_start", "project_end",
        "project_terms", "project_title", "total_cost", "phr", "ic_name"
    ]
    cols_attrs = [getattr(Projects, c) for c in cols]
    batch_selection = session.query(*cols_attrs).filter(
        Projects.application_id >= start_index,
        Projects.application_id <= end_index).selectable
    df = pd.read_sql(batch_selection, session.bind)
    # Remove the 'nih_projects_' prefix (13 characters) from column names
    df.columns = [c[13:] for c in df.columns]

    # geocode the dataframe
    df = df.rename(columns={'org_city': 'city', 'org_country': 'country'})
    df = geocode_dataframe(df)

    # append iso codes for country
    df = country_iso_code_dataframe(df)

    # clean start and end dates into ISO format
    for col in ["project_start", "project_end"]:
        df[col] = df[col].apply(_extract_date)

    # currency is the same for the whole dataset
    df['total_cost_currency'] = 'USD'

    # output to elasticsearch
    field_null_mapping = load_json_from_pathstub("tier_1/field_null_mappings/",
                                                 "health_scanner.json")
    strans_kwargs = {
        'filename': 'nih.json',
        'from_key': 'tier_0',
        'to_key': 'tier_1',
        'ignore': ['application_id']
    }

    es = ElasticsearchPlus(
        hosts=es_host,
        port=es_port,
        aws_auth_region=aws_auth_region,
        no_commit=("AWSBATCHTEST" in os.environ),
        entity_type=entity_type,
        strans_kwargs=strans_kwargs,
        field_null_mapping=field_null_mapping,
        null_empty_str=True,
        coordinates_as_floats=True,
        country_detection=True,
        listify_terms=True,
        terms_delimiters=(";", ","),
        caps_to_camel_case=True,
        null_pairs={"currency_total_cost": "cost_total_project"})

    for _, row in df.iterrows():
        # Drop null fields so they are absent from the document entirely
        doc = dict(row.loc[~pd.isnull(row)])
        if 'country' in doc:
            # 'org_state' may have been dropped above when null, so fall
            # back to '' (which states_lookup maps to None) rather than
            # risking a KeyError.
            org_state = doc.get('org_state', '')
            # Try to patch broken US data
            if doc['country'] == '' and org_state != '':
                doc['country'] = "United States"
                doc['continent'] = "NA"
            doc['placeName_state_organisation'] = states_lookup[org_state]

            # continent may be absent; the lookup maps None -> None
            continent_code = doc.get('continent')
            doc['placeName_continent_organisation'] = continent_lookup[
                continent_code]

        # ic_name is indexed as a list field
        if 'ic_name' in doc:
            doc['ic_name'] = [doc['ic_name']]

        uid = doc.pop("application_id")
        es.index(index=es_index, doc_type=es_type, id=uid, body=doc)
Example #3
0
 def test_invalid_year_returns_none(self):
     """Strings containing no parseable year yield None."""
     for raw in ('no year', 'nan', '-'):
         assert _extract_date(raw) is None
Example #4
0
 def test_valid_year_extract(self):
     """The first valid year found in free text becomes YYYY-01-01."""
     cases = [
         ('2019', '2019-01-01'),
         ('sometime in 2011', '2011-01-01'),
         ('maybe 2019 or 2020', '2019-01-01'),
     ]
     for raw, expected in cases:
         assert _extract_date(raw) == expected
Example #5
0
 def test_invalid_day_returns_year(self):
     """An out-of-range day falls back to January 1st of the year."""
     cases = [
         ('Mar 38 2001', '2001-01-01'),
         ('2000-09-40', '2000-01-01'),
         ('5/32/2017', '2017-01-01'),
     ]
     for raw, expected in cases:
         assert _extract_date(raw) == expected
Example #6
0
 def test_invalid_month_returns_year(self):
     """An unrecognisable month falls back to January 1st of the year."""
     cases = [
         ('Cat 12 2009', '2009-01-01'),
         ('2000-19-09', '2000-01-01'),
         ('20/4/2009', '2009-01-01'),
     ]
     for raw, expected in cases:
         assert _extract_date(raw) == expected
Example #7
0
 def test_slash_date_pattern(self):
     """Slash-separated dates (US and year-first) normalise to ISO."""
     cases = [
         ('5/31/2020', '2020-05-31'),
         ('11/1/2012', '2012-11-01'),
         ('1/1/2010', '2010-01-01'),
         ('2000/12/01', '2000-12-01'),
         ('1999/04/20', '1999-04-20'),
     ]
     for raw, expected in cases:
         assert _extract_date(raw) == expected
Example #8
0
 def test_dash_date_pattern(self):
     """Already-ISO dash-separated dates pass through unchanged."""
     for iso in ('2016-07-31', '2010-12-01', '2020-01-04'):
         assert _extract_date(iso) == iso