    def execute(trial=False):
        '''Retrieve some data sets and store in mongodb collections.'''
        startTime = datetime.datetime.now()

        # Set up the database connection.
        client = dml.pymongo.MongoClient()
        repo = client.repo
        repo.authenticate('ajr10_williami', 'ajr10_williami')

        # open_spaces_cambridge

        print("retrieving open spaces data from data.cambridgema.gov")

        client = sodapy.Socrata("data.cambridgema.gov", None)
        response = client.get("5ctr-ccas", limit=50)
        r = json.loads(json.dumps(response, sort_keys=True, indent=2))

        repo.dropCollection("ajr10_williami.open_spaces_cambridge")
        repo.createCollection("ajr10_williami.open_spaces_cambridge")

        print("inserting data into target: ", "open_spaces_cambridge")
        repo["ajr10_williami.open_spaces_cambridge"].insert_many(r)

        # trees_cambridge

        print("retrieving tree data from data.cambridgema.gov")

        client = sodapy.Socrata("data.cambridgema.gov", None)
        response = client.get("q83f-7quz", limit=50)
        r = json.loads(json.dumps(response, sort_keys=True, indent=2))

        repo.dropCollection("ajr10_williami.trees_cambridge")
        repo.createCollection("ajr10_williami.trees_cambridge")

        print("inserting data into target: ", "trees_cambridge")
        repo["ajr10_williami.trees_cambridge"].insert_many(r)

        # energy_cambridge

        print("retrieving energy data from data.cambridgema.gov")

        client = sodapy.Socrata("data.cambridgema.gov", None)
        response = client.get("es2i-g3p6", limit=50)
        r = json.loads(json.dumps(response, sort_keys=True, indent=2))

        repo.dropCollection("ajr10_williami.energy_cambridge")
        repo.createCollection("ajr10_williami.energy_cambridge")

        print("inserting data into target: ", "energy_cambridge")
        repo["ajr10_williami.energy_cambridge"].insert_many(r)

        # logout and return start and end times
        repo.logout()
        endTime = datetime.datetime.now()
        return {"start": startTime, "end": endTime}
Example 2
    def execute(trial=False):
        '''Retrieve some data sets (not using the API here for the sake of simplicity).'''
        startTime = datetime.datetime.now()

        # Set up the database connection.
        client = dml.pymongo.MongoClient()
        repo = client.repo
        repo.authenticate('pgr_syquiac', 'pgr_syquiac')

        # Get data for hospitals
        client = sodapy.Socrata("data.cityofboston.gov", None)
        response = client.get("u6fv-m8v4", limit=30)
        repo.dropCollection("hospitals")
        repo.createCollection("hospitals")
        repo['pgr_syquiac.hospitals'].insert_many(response)

        # Get data for CDC 500 cities
        client = sodapy.Socrata("chronicdata.cdc.gov", None)
        response = client.get("csmm-fdhi",
                              CityName="Boston",
                              GeographicLevel="Census Tract",
                              limit=5000)
        repo.dropCollection("cdc")
        repo.createCollection("cdc")
        repo['pgr_syquiac.cdc'].insert_many(response)

        # Get data for all universities in the US
        url = 'http://datamechanics.io/data/pgr_syquiac/universities.json'
        response = urllib.request.urlopen(url).read().decode("utf-8")
        r = json.loads(response)
        s = json.dumps(r, sort_keys=True, indent=2)
        repo.dropCollection("schools")
        repo.createCollection("schools")
        repo['pgr_syquiac.schools'].insert_many(r)

        # Get data for Open Swimming Pools in Boston
        client = sodapy.Socrata("data.cityofboston.gov", None)
        response = client.get("5jxx-wfpr", limit=150)
        repo.dropCollection("pools")
        repo.createCollection("pools")
        repo['pgr_syquiac.pools'].insert_many(response)

        # Get data for healthy corner stores
        client = sodapy.Socrata("data.cityofboston.gov", None)
        response = client.get("ybm6-m5qd", limit=20)
        repo.dropCollection("stores")
        repo.createCollection("stores")
        repo['pgr_syquiac.stores'].insert_many(response)

        repo.logout()

        endTime = datetime.datetime.now()

        return {"start": startTime, "end": endTime}
Example 3
    def execute(trial=False):
        startTime = datetime.datetime.now()

        client = dml.pymongo.MongoClient()
        repo = client.repo
        repo.authenticate('rengx_ztwu_lwj', 'rengx_ztwu_lwj')

        #public school
        repo.publicschool.drop()
        publicschool = repo.publicschool
        ss = sodapy.Socrata("data.cityofboston.gov",
                            "x92LG4iaFto5qWQGFk3lDdv6p",
                            username="******",
                            password="******")
        response = ss.get("492y-i77g")
        r = json.loads(json.dumps(response, sort_keys=True, indent=2))
        lis = []
        for i in range(len(r)):
            item = {}
            name = r[i]["sch_name"]
            addr = r[i]["location_location"]
            zipp = r[i]["location_zip"]
            coords = r[i]['location']['coordinates']
            #print(coords)
            xcoor = str(int(coords[1] * 1000000))
            ycoor = str(int(coords[0] * (-1000000)))
            item["x"] = xcoor
            item["y"] = ycoor
            item["name"] = name
            item["addr"] = addr
            item["zipp"] = zipp
            item["type"] = "school"
            lis.append(item)
        publicschool.insert_many(lis)

        #market
        repo.market.drop()
        market = repo.market
        ss = sodapy.Socrata("data.mass.gov",
                            "x92LG4iaFto5qWQGFk3lDdv6p",
                            username="******",
                            password="******")
        response = ss.get("66t5-f563")
        r = json.loads(json.dumps(response, sort_keys=True, indent=2))
        market_data = retrieve.selection(r)
        market.insert_many(market_data)

        repo.logout()
        endTime = datetime.datetime.now()
        #print("retrieve complete")
        return {"start": startTime, "end": endTime}
Example 4
    def execute(trial=False):
        '''Retrieve some data sets (not using the API here for the sake of simplicity).'''
        startTime = datetime.datetime.now()

        # Set up the database connection.
        client = dml.pymongo.MongoClient()
        repo = client.repo
        repo.authenticate('pt0713_silnuext', 'pt0713_silnuext')

        client = sodapy.Socrata("data.cityofboston.gov", None)
        response = []
        limits = [0, 50001, 100001, 150001, 200001, 250001]
        for limit in limits:
            response += client.get("crime", limit=50000, offset=limit)
        s = json.dumps(response, sort_keys=True, indent=2)
        repo.dropCollection("crime")
        repo.createCollection("crime")
        repo['pt0713_silnuext.crime'].insert_many(response)
        repo['pt0713_silnuext.crime'].metadata({'complete': True})
        print(repo['pt0713_silnuext.crime'].metadata())

        repo.logout()

        endTime = datetime.datetime.now()

        return {"start": startTime, "end": endTime}
Example 5
    def __init__(self, data_id=None):
        self.data_id = data_id
        self.data = None
        self.client = sodapy.Socrata(domain=lacity_url,
                                     app_token=lacity_app_token,
                                     username=lacity_user_name,
                                     password=lacity_password)
        return
Example 6
def get_socrata_client():
    return sodapy.Socrata(
        SO_WEB,
        SO_TOKEN,
        username=SO_USER,
        password=SO_PASS,
        timeout=60,
    )
Example 7
    def process_domain(self, domain):
        logger.info("Processing %s...", domain['url'])
        socrata = sodapy.Socrata(domain['url'],
                                 **domain.get('auth', {'app_token': None}))
        datasets = socrata.datasets()
        logger.info("Found %d datasets", len(datasets))
        if not datasets:
            return
        seen = set()
        for dataset in datasets:
            try:
                valid = self.process_dataset(domain, dataset)
            except Exception as e:
                sentry_sdk.capture_exception(e)
                logger.exception("Error processing dataset %s",
                                 dataset['resource']['id'])
            else:
                assert isinstance(valid, bool)
                if valid:
                    seen.add(dataset['resource']['id'])

        logger.info("Discovered %d/%d datasets", len(seen), len(datasets))

        # Clean up the datasets we didn't see
        deleted = 0
        size = 10000
        query = {
            'query': {
                'bool': {
                    'must': [
                        {
                            'term': {
                                'materialize.identifier': self.identifier,
                            },
                        },
                        {
                            'term': {
                                'materialize.socrata_domain.keyword': domain['url'],
                            },
                        },
                    ],
                },
            }
        }
        hits = self.elasticsearch.scan(
            index='datasets,pending',
            query=query,
            size=size,
            _source=['materialize.socrata_id'],
        )
        for h in hits:
            if h['_source']['materialize']['socrata_id'] not in seen:
                self.delete_dataset(full_id=h['_id'])
                deleted += 1

        if deleted:
            logger.info("Deleted %d missing datasets", deleted)
Example 8
def get_client(host="data.austintexas.gov", timeout=30):
    SOCRATA_APP_TOKEN = os.getenv("SOCRATA_APP_TOKEN")
    SOCRATA_API_KEY_ID = os.getenv("SOCRATA_API_KEY_ID")
    SOCRATA_API_KEY_SECRET = os.getenv("SOCRATA_API_KEY_SECRET")

    return sodapy.Socrata(
        host,
        SOCRATA_APP_TOKEN,
        username=SOCRATA_API_KEY_ID,
        password=SOCRATA_API_KEY_SECRET,
        timeout=timeout,
    )
Example 9
def main():
    log = structlog.get_logger()

    domain = os.environ['DOMAIN']
    app_token = os.environ['APP_TOKEN']
    dataset_id = os.environ['DATASET_ID']
    database_url = os.environ['DATABASE_URI']

    mongo_client = pymongo.MongoClient(database_url)
    client = sodapy.Socrata(domain, app_token)

    log.debug('Fetching count of dataset records')

    count = count_dataset(client, dataset_id)
    if count == None:
        log.error('Unable to query count of dataset')
        return 1

    log.info('Record count', count=count)

    db = mongo_client.permits
    index = db.all_permits

    log.debug('Fetching permits')

    for permit_set in fetch_permits(client, dataset_id, count):
        if permit_set == None:
            event = ('Excessive failures while fetching Socrata data, '
                     'exiting early')
            log.error(event)
            return 1

        log.debug('Cleaning permit set')

        permit_set = [clean_permit(permit) for permit in permit_set]

        try:
            result = index.insert_many(permit_set, ordered=False)
            insert_count = len(result.inserted_ids)
        except pymongo.errors.BulkWriteError as e:
            event = ('Error while bulk inserting data, '
                     'remaining documents will still be inserted')
            log.error(event, exc_info=True)
            insert_count = e.details.get('nInserted')
        finally:
            log.debug('bulk inserted permit data', count=insert_count)

    log.info('Finished fetching permits')

    log.debug('Total records in index table', count=index.count())

    return 0
Example 10
    def execute(trial=False):
        '''Retrieve some data sets for the MongoDB collection.'''
        startTime = datetime.datetime.now()

        # Set up the database connection.
        client = dml.pymongo.MongoClient()
        repo = client.repo
        repo.authenticate('chamathd', 'chamathd')

        # Neighborhood population data for the Boston area from collected data sources
        print("Fetching Boston population data from Data Mechanics resource")

        colName = "chamathd.neighborhood_pop_boston"

        url = "http://datamechanics.io/data/chamathd/boston_neighborhood_census.json"
        response = urllib.request.urlopen(url).read().decode("utf-8")
        
        r = json.loads(response)
        
        repo.dropCollection(colName)
        repo.createCollection(colName)

        print("Inserting JSON data into collection", colName)
        repo[colName].insert_many(r["neighborhoods"])
        print("Finished writing data to", colName)
        print()

        # Neighborhood population data for the Cambridge area from Cambridge Open Data
        print("Fetching Cambridge population data from Cambridge Open Data")

        colName = "chamathd.neighborhood_pop_cambridge"

        socrataClient = sodapy.Socrata("data.cambridgema.gov", None)
        response = socrataClient.get("vacj-bzri", limit=50)
        r = json.loads(json.dumps(response, sort_keys=True, indent=2))
        
        repo.dropCollection(colName)
        repo.createCollection(colName)

        print("Inserting JSON data into collection", colName)
        repo[colName].insert_many(r)
        print("Finished writing data to", colName)
        print()

        repo.logout()

        endTime = datetime.datetime.now()

        return {"start":startTime, "end":endTime}
Example 11
def func(page_size, pages, token, database_id):
    print("Function working, boss!")
    api_url = 'data.cityofnewyork.us'
    client = sodapy.Socrata(api_url, token)
    database = database_id
    total = client.get(database, select='COUNT(*)')

    if pages < 0:
        pages = math.ceil(int(total[0]['COUNT']) / page_size)
        return (client.get(database, limit=total[0]['COUNT']))
    listobj = []
    for i in range(pages):
        listobj.append(
            client.get(database, limit=page_size, offset=i * page_size))
    return (listobj)
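
# Hypothetical usage of func() above: fetch the first two pages of 1,000 rows
# each from a dataset. The app token and dataset id below are placeholders,
# not values taken from the original example.
pages_of_rows = func(page_size=1000, pages=2,
                     token="YOUR_APP_TOKEN", database_id="xxxx-xxxx")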
Example 12
    def get_metadatas(socrata_table, identifier_column_name, domain_column_name):
        ids = socrata_table[identifier_column_name]
        domains = socrata_table[domain_column_name]

        metadatas = []
        for id, domain in zip(ids, domains):
            client = sodapy.Socrata(domain, '44TCQSoL2igJ2376fmSMcGkkh')
            try:
                metadata = client.get_metadata(id)
                metadata['domain'] = domain
                metadatas.append(metadata)
                print("Retrieved metadata for dataset: {}".format(str(id)))
            except:
                continue
        return metadatas
Example 13
def get_data(page_size, num_pages=None, output=None):

    client = sodapy.Socrata('data.cityofnewyork.us', os.environ['APP_KEY'])
    if num_pages == None:
        r = client.get('nc67-uf89', limit=page_size)
    else:
        r = client.get('nc67-uf89',
                       limit=page_size,
                       offset=(num_pages - 1) * page_size)

    if output == None:
        print(r)
    else:
        with open(output + '.txt', 'w') as outfile:
            json.dump(r, outfile)
Example 14
def main(data_config, date_log, output_directory):
    inmates_client = sodapy.Socrata(data_config['service_url'],
                                    data_config['apptoken'],
                                    username=data_config['username'],
                                    password=data_config['password'])
    metadata = inmates_client.get_metadata(data_config['endpoint'])
    count = inmates_client.get(data_config['endpoint'],
                               query="select COUNT(*)")
    date = metadata["rowsUpdatedAt"]
    data = inmates_client.get(data_config['endpoint'],
                              content_type='json',
                              limit=count[0]['COUNT'])

    inmates_df = pd.io.json.json_normalize(data)
    inmates_df.to_feather(f'{output_directory}/{date}_inmates.feather')
    with open(date_log, 'a') as log:
        log.write(f'{date}\n')
    return ()
Example 15
def get_two_most_recent(id):
    """Query the Divvy API for the two most recent tuples
    of the specified station for prediction

    
    Args:
        id (int): Station ID in Divvy network
    
    Returns:
        pd.DataFrame: DF containing data pulled from API
    """
    client = sodapy.Socrata("data.cityofchicago.org", None)
    results = client.get("eq45-8inv", id=id, limit=2)
    results_df = pd.DataFrame.from_records(results)

    # convert numeric columns to numeric
    results_df[['id', 'percent_full', 'available_bikes'
                ]] = results_df[['id', 'percent_full',
                                 'available_bikes']].apply(pd.to_numeric,
                                                           errors='coerce')
    return results_df
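
# Hypothetical usage of get_two_most_recent() above: pull the two latest
# observations for a station; the station id below is a placeholder.
latest = get_two_most_recent(35)
print(latest[['percent_full', 'available_bikes']])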
Example 16
    def execute(trial=False):
        '''Retrieve some data sets (not using the API here for the sake of simplicity).'''
        startTime = datetime.datetime.now()

        # Set up the database connection.
        client = dml.pymongo.MongoClient()
        repo = client.repo
        repo.authenticate('jgrishey', 'jgrishey')

        client = sodapy.Socrata(
            "data.cambridgema.gov",
            dml.auth['services']['cityofcambridgedataportal']['token'])
        response = client.get("vnxa-cuyr", limit=500)

        tickets = []

        ID = 0

        for ticket in response:
            if 'location' in ticket:
                lat = ticket['location']['latitude']
                lon = ticket['location']['longitude']
                tickets.append({
                    '_id': ID,
                    'lat': lat,
                    'long': lon
                }) if (lat != '0' and lon != '0') else ()
                ID += 1

        repo.dropCollection("tickets")
        repo.createCollection("tickets")

        for ticket in tickets:
            repo['jgrishey.tickets'].insert(ticket)

        repo.logout()

        endTime = datetime.datetime.now()

        return {"start": startTime, "end": endTime}
Example 17
    def execute(trial=False):
        '''Retrieve some data sets (not using the API here for the sake of simplicity).'''
        startTime = datetime.datetime.now()

        # Set up the database connection.
        client = dml.pymongo.MongoClient()
        repo = client.repo
        repo.authenticate('jgrishey', 'jgrishey')

        client = sodapy.Socrata(
            "data.cityofboston.gov",
            dml.auth["services"]["cityofbostondataportal"]["token"])
        response = client.get("29yf-ye7n", limit=10000)

        crimes = []

        ID = 0

        for crime in response:
            lat = crime['location']['coordinates'][1]
            lon = crime['location']['coordinates'][0]
            crimes.append({
                '_id': ID,
                'lat': lat,
                'long': lon
            }) if (lat != 0 and lon != 0) else ()
            ID += 1

        repo.dropCollection("crime")
        repo.createCollection("crime")

        for crime in crimes:
            repo['jgrishey.crime'].insert(crime)

        repo.logout()

        endTime = datetime.datetime.now()

        return {"start": startTime, "end": endTime}
Example 18
def main():
    logging.info("Starting...")
    REPO = {"id": 140626918, "name": "cityofaustin/atd-data-tech"}
    WORKSPACE_ID = "5caf7dc6ecad11531cc418ef"
    SOCRATA_RESOURCE_ID = "rzwg-fyv8"
    ZENHUB_ACCESS_TOKEN = os.environ["ZENHUB_ACCESS_TOKEN"]
    GITHUB_ACCESS_TOKEN = os.environ["GITHUB_ACCESS_TOKEN"]
    SOCRATA_API_KEY_ID = os.environ["SOCRATA_API_KEY_ID"]
    SOCRATA_API_KEY_SECRET = os.environ["SOCRATA_API_KEY_SECRET"]
    SOCRATA_APP_TOKEN = os.environ["SOCRATA_APP_TOKEN"]

    issues_gh = get_github_issues(REPO["name"], GITHUB_ACCESS_TOKEN)
    issues = [issue_to_dict(issue) for issue in issues_gh]
    convert_timestamps(issues)
    zenhub_metadata = get_zenhub_metadata(WORKSPACE_ID, ZENHUB_ACCESS_TOKEN,
                                          REPO["id"])
    zenhub_metadata_index = create_zenhub_metadata_index(zenhub_metadata)

    for issue in issues:
        zenhub_meta = zenhub_metadata_index.get(issue["number"])
        if zenhub_meta:
            issue.update(zenhub_meta)

        # set pipeline for closed issues, which have no pipeline metadata
        issue["pipeline"] = ("Closed" if issue["state"] == "closed" else
                             issue.get("pipeline"))

    client = sodapy.Socrata(
        "data.austintexas.gov",
        SOCRATA_APP_TOKEN,
        username=SOCRATA_API_KEY_ID,
        password=SOCRATA_API_KEY_SECRET,
        timeout=60,
    )

    for chunk in chunks(issues, 1000):
        client.upsert(SOCRATA_RESOURCE_ID, chunk)
        logging.info(f"{len(chunk)} processed")
Example 19
    def execute(trial=False):
        '''Retrieve some data sets (not using the API here for the sake of simplicity).'''
        startTime = datetime.datetime.now()

        # Set up the database connection.
        client = dml.pymongo.MongoClient()
        repo = client.repo
        repo.authenticate('pt0713_silnuext', 'pt0713_silnuext')

        client = sodapy.Socrata("data.mass.gov", None)
        response = client.get("x99p-b88k", limit=9787, offset=0)
        s = json.dumps(response, sort_keys=True, indent=2)
        repo.dropCollection("fld")
        repo.createCollection("fld")
        repo['pt0713_silnuext.fld'].insert_many(response)
        repo['pt0713_silnuext.fld'].metadata({'complete': True})
        print(repo['pt0713_silnuext.fld'].metadata())

        repo.logout()

        endTime = datetime.datetime.now()

        return {"start": startTime, "end": endTime}
Example 20
    def execute(trial=False):
        startTime = datetime.datetime.now()
        client = dml.pymongo.MongoClient()
        repo = client.repo
        repo.authenticate("rengx_ztwu_lwj", "rengx_ztwu_lwj")
        market = repo.market
        market_find = market.find()
        market_data = []
        for i in market_find:
            market_data.append(i)

        ss = sodapy.Socrata("data.cityofboston.gov",
                            "x92LG4iaFto5qWQGFk3lDdv6p",
                            username="******",
                            password="******")
        response = ss.get("u6fv-m8v4")
        r = json.loads(json.dumps(response, sort_keys=True, indent=2))
        hospital_data = r  #26
        response = ss.get("rdqf-ter7")
        r = json.loads(json.dumps(response, sort_keys=True, indent=2))
        garden_data = r
        #184

        # get Police Station data in Boston
        response = ss.get("pyxn-r3i2")
        r = json.loads(json.dumps(response, sort_keys=True, indent=2))
        police_data = r
        accessdb = union.unionF(hospital_data, garden_data, market_data,
                                police_data)
        repo.access.drop()
        access = repo.access
        access.insert_many(accessdb)
        repo.logout()
        endTime = datetime.datetime.now()
        #print("union complete")
        return {"start": startTime, "end": endTime}
Example 21
    def __init__(self, source, dataset_id, n_types, tz, hrs, max_hrs):
        self.source = source
        self.dataset_id = dataset_id
        self.n_types = n_types
        self.tz = pytz.timezone(tz)
        self.hrs = hrs
        self.max_hrs = max_hrs

        # Preparing Socrata client
        self.client = sodapy.Socrata(source, None)

        # Preparing containers
        self.data = pd.DataFrame(columns=SocrataProvider.COLS)
        self.data_ds = ColumnDataSource(
            data={cl: []
                  for cl in SocrataProvider.COLS})
        self.data_view = CDSView(filters=[], source=self.data_ds)
        self.type_stats_ds = ColumnDataSource(data={"type": [], "counts": []})
        self.dispatch_types = []

        # Calculating start time for inital data fetch
        self.start_time = dt.datetime.now(
            self.tz) - pd.Timedelta(hours=max_hrs)
        self.fetch_data()
Example 22
    def execute(trial=False):
        '''Retrieve some data sets (not using the API here for the sake of simplicity).'''
        startTime = datetime.datetime.now()

        # Set up the database connection.
        client = dml.pymongo.MongoClient()
        repo = client.repo
        repo.authenticate('bohan_nyx_xh1994_yiran123',
                          'bohan_nyx_xh1994_yiran123')

        # city of boston crime incident July 2012 - August 2015
        #url = 'https://data.cityofboston.gov/resource/ufcx-3fdn.json'
        #response_crime = urllib.request.urlopen(url).read().decode("utf-8")
        client = sodapy.Socrata(
            "data.cityofboston.gov",
            dml.auth['services']['cityofbostondataportal']['token'])
        r = client.get("29yf-ye7n", limit=3000)
        #r = json.loads(response_crime)
        #s = json.dumps(r, sort_keys=True, indent=2)
        repo.dropCollection("crime_boston")
        repo.createCollection("crime_boston")
        repo['bohan_nyx_xh1994_yiran123.crime_boston'].insert_many(r)
        repo['bohan_nyx_xh1994_yiran123.crime_boston'].metadata(
            {'complete': True})
        #print(repo['bohan_nyx_xh1994_yiran123.crime_boston'].metadata())

        #city of boston property assessment 2014
        '''url_property = 'https://data.cityofboston.gov/resource/jsri-cpsq.json'
        response_property = urllib.request.urlopen(url_property).read().decode("utf-8")
        r = json.loads(response_property)
        s = json.dumps(r, sort_keys=True, indent=2)
        repo.dropCollection("property_boston_assessment_2014")
        repo.createCollection("property_boston_assessment_2014")
        repo['bohan_nyx_xh1994_yiran123.property_boston_assessment_2014'].insert_many(r)'''

        #Food Establishment Inspections
        #url_foodIE = 'https://data.cityofboston.gov/resource/427a-3cn5.json'
        #response_foodIE = urllib.request.urlopen(url_foodIE).read().decode("utf-8")
        client = sodapy.Socrata(
            "data.cityofboston.gov",
            dml.auth['services']['cityofbostondataportal']['token'])
        r = client.get("427a-3cn5", limit=467558)  #467558
        s = json.dumps(r, sort_keys=True, indent=2)
        repo.dropCollection("Food_Establishment_Inspections")
        repo.createCollection("Food_Establishment_Inspections")
        repo[
            'bohan_nyx_xh1994_yiran123.Food_Establishment_Inspections'].insert_many(
                r)

        #https://data.cityofboston.gov/resource/fdxy-gydq.json
        client = sodapy.Socrata(
            "data.cityofboston.gov",
            dml.auth['services']['cityofbostondataportal']['token'])
        r = client.get("fdxy-gydq", limit=3000)
        s = json.dumps(r, sort_keys=True, indent=2)
        repo.dropCollection("Active_Food_Establishment_Licenses")
        repo.createCollection("Active_Food_Establishment_Licenses")
        repo[
            'bohan_nyx_xh1994_yiran123.Active_Food_Establishment_Licenses'].insert_many(
                r)

        #entertainment Licenses
        #url_entertainmentL = 'https://data.cityofboston.gov/resource/cz6t-w69j.json'
        #response_entertainmentL = urllib.request.urlopen(url_entertainmentL).read().decode("utf-8")
        client = sodapy.Socrata(
            "data.cityofboston.gov",
            dml.auth['services']['cityofbostondataportal']['token'])
        r = client.get("cz6t-w69j", limit=5223)  #5223
        s = json.dumps(r, sort_keys=True, indent=2)
        repo.dropCollection("Entertainment_Licenses")
        repo.createCollection("Entertainment_Licenses")
        repo['bohan_nyx_xh1994_yiran123.Entertainment_Licenses'].insert_many(r)

        #mbta stops location
        '''url_stopbylocation = 'http://realtime.mbta.com/developer/api/v2/stopsbylocation'#?api_key=wX9NwuHnZU2ToO7GmGR9uw&lat=42.346961&lon=-71.076640&format=json'
        api_key_mbta = 'wX9NwuHnZU2ToO7GmGR9uw'
        mbta_api_key = dml.auth['services']['mbtadeveloperportal']['key']
        response_stopbylocation = requests.get(url_stopbylocation + '?api_key=' + mbta_api_key + '&route=' + route)
        r = json.loads(response_stopbylocation)
        s = json.dumps(r, sort_keys=True, indent=2)
        repo.dropCollection("mbta_stop_by_location")
        repo.createCollection("mbta_stop_by_location")
        repo['bohan_nyx_xh1994_yiran123.mbta_stop_by_location'].insert_many(r)'''
        #the lat and long is [4]and[5],.....stopsbylocation return a list of the stops nearest a particular location

        #TRAFFIC SIGNALS
        '''url_traffic_signal = 'http://bostonopendata-boston.opendata.arcgis.com/datasets/de08c6fe69c942509089e6db98c716a3_0.geojson'
        response_traffic_signal = urllib.request.urlopen(url_traffic_signal).read().decode("utf-8")
        r = json.loads(response_traffic_signal)
        s = json.dumps(r, sort_keys=True, indent=2)
        repo.dropCollection("TRAFFIC_SIGNALS")
        repo.createCollection("TRAFFIC_SIGNALS")
        repo['bohan_nyx_xh1994_yiran123.TRAFFIC_SIGNALS'].insert(r)'''

        url_airbnb = 'http://datamechanics.io/data/bohan_xh1994/airbnb.json'
        response_airbnb_rating = urllib.request.urlopen(
            url_airbnb).read().decode("utf-8")
        r = json.loads(response_airbnb_rating)
        s = json.dumps(r, sort_keys=True, indent=2)
        repo.dropCollection("airbnb_rating")
        repo.createCollection("airbnb_rating")
        repo['bohan_nyx_xh1994_yiran123.airbnb_rating'].insert(r)

        url_MBTA_Bus_stops = 'http://datamechanics.io/data/wuhaoyu_yiran123/MBTA_Bus_Stops.geojson'
        response_MBTA_Bus_stops = urllib.request.urlopen(
            url_MBTA_Bus_stops).read().decode("utf-8")
        r = json.loads(response_MBTA_Bus_stops)
        s = json.dumps(r, sort_keys=True, indent=2)
        repo.dropCollection("MBTA_Bus_stops")
        repo.createCollection("MBTA_Bus_stops")
        repo['bohan_nyx_xh1994_yiran123.MBTA_Bus_stops'].insert(r)
        repo.logout()

        endTime = datetime.datetime.now()

        return {"start": startTime, "end": endTime}
Example 23
#!/usr/bin/env python

'Create table of number of trips from station I to station J'

import sodapy
import pandas as pd


data_url = 'data.cityofchicago.org'
trip_data = 'fg6s-gzvg'
station_data = 'aavc-b2wj'

# Open Socrata resource
data = sodapy.Socrata(data_url, None)

# Get station information
select = 'id, station_name, total_docks, latitude, longitude'
stations_json = data.get(station_data, select=select,
                         limit=1000, order='id ASC')

# Convert to DataFrame and convert columns to numeric types
stations = pd.DataFrame(stations_json)
cols_convert = ['id', 'total_docks', 'latitude', 'longitude']
stations[cols_convert] = stations[cols_convert].apply(pd.to_numeric)
stations.set_index('id', inplace=True)

print('Retrieved {0:d} station records'.format(len(stations)))

stations.to_csv('stations.csv')

Example 24
    def execute(trial=False):
        '''Retrieve some data sets (not using the API here for the sake of simplicity).'''
        startTime = datetime.datetime.now()

        # Set up the database connection.
        client = dml.pymongo.MongoClient()
        repo = client.repo
        repo.authenticate('jw0208', "jw0208")  #username, password
        ## done----------------------------------------------------------
        url = 'http://datamechanics.io/data/jw0208/medicare.json'
        response = urllib.request.urlopen(url).read().decode("utf-8")
        r = json.loads(response)
        s = json.dumps(r, sort_keys=True, indent=2)
        repo.dropCollection("medicare")
        repo.createCollection("medicare")
        repo['jw0208.medicare'].insert_many(r)
        repo['jw0208.medicare'].metadata({'complete': True})
        #print(json.dumps(s, sort_keys=True, indent=2))

        ## done----------------------------------------------------------
        url = 'http://datamechanics.io/data/jw0208/poverty.json'
        response = urllib.request.urlopen(url).read().decode("utf-8")
        r = json.loads(response)
        s = json.dumps(r, sort_keys=True, indent=2)
        repo.dropCollection("poverty")
        repo.createCollection("poverty")
        repo['jw0208.poverty'].insert_many(r)
        repo['jw0208.poverty'].metadata({'complete': True})
        #print(json.dumps(s, sort_keys=True, indent=2))

        # ## done----------------------------------------------------------
        url = 'http://datamechanics.io/data/jw0208/education.json'
        response = urllib.request.urlopen(url).read().decode("utf-8")
        r = json.loads(response)
        s = json.dumps(r, sort_keys=True, indent=2)
        repo.dropCollection("education")
        repo.createCollection("education")
        repo['jw0208.education'].insert_many(r)
        repo['jw0208.education'].metadata({'complete': True})
        #print(repo['jw0208.education'].metadata())
        #print(json.dumps(s, sort_keys=True, indent=2))
        #         with open('education.json') as r:
        #             s = json.loads(r.read())
        #         repo.dropCollection("education")
        #         repo.createCollection("education")
        #         repo['jw0208.education'].insert_many(s)
        #         repo['jw0208.education'].metadata({'complete': True})
        #         # print(json.dumps(s, sort_keys=True, indent=2))

        ## done----------------------------------------------------------
        url = 'http://datamechanics.io/data/jw0208/income.json'
        response = urllib.request.urlopen(url).read().decode("utf-8")
        r = json.loads(response)
        s = json.dumps(r, sort_keys=True, indent=2)
        repo.dropCollection("income")
        repo.createCollection("income")
        repo['jw0208.income'].insert_many(r)
        repo['jw0208.income'].metadata({'complete': True})
        # print(json.dumps(s, sort_keys=True, indent=2))

        ## ----------------------------------------------------------
        client = sodapy.Socrata("chronicdata.cdc.gov", None)
        response = client.get("fq5d-abxc", limit=52)
        r = json.loads(
            json.dumps(response))  # round-trip through JSON to get a plain list of dicts
        s = json.dumps(r, sort_keys=True, indent=2)
        repo.dropCollection("health")
        repo.createCollection("health")
        repo['jw0208.health'].insert_many(r)
        repo['jw0208.health'].metadata({'complete': True})
        #print(json.dumps(response, sort_keys=True, indent=2))

        endTime = datetime.datetime.now()

        return {"start": startTime, "end": endTime}
Example 25
    def execute(trial = False):
        '''Retrieve some data sets (not using the API here for the sake of simplicity).'''
        startTime = datetime.datetime.now()

        # Set up the database connection.
        client = dml.pymongo.MongoClient()
        repo = client.repo
        repo.authenticate('pt0713_silnuext', 'pt0713_silnuext')
        repo.dropCollection("police_crime")
        repo.createCollection("police_crime")



        # import police districts data
        url = "http://bostonopendata-boston.opendata.arcgis.com/datasets/9a3a8c427add450eaf45a470245680fc_5.geojson"
        response = urllib.request.urlopen(url).read().decode("utf-8")
        r = json.loads(response)
        r = [r['features'][i]['properties'] for i in range(11)]

        # getting DISTRICT column from police_district dataset
        districts = project(r, lambda t: (t['DISTRICT']))
        print(districts)

        repo['pt0713_silnuext.police_crime'].insert_many(r)
        repo['pt0713_silnuext.police_crime'].metadata({'complete':True})
        print(repo['pt0713_silnuext.police_crime'].metadata())




        # import crime data      
        client1 = sodapy.Socrata("data.cityofboston.gov", None)
        response1 = []
        limits = [0, 50001, 100001, 150001, 200001, 250001]
        for limit in limits:
            response1 += client1.get("crime", limit=50000, offset=limit)
        s = json.dumps(response1, sort_keys=True, indent=2)

        # getting DISTRICT column from crime dataset
        crime_district = project(response1, lambda t: [t.get("reptdistrict")])

        # if a crime happens in police_district: count+1
        crime_in_police_district = 0
        for district in crime_district:
            if district[0] in districts:
                crime_in_police_district += 1

        print(crime_in_police_district)

        # compute the fraction of all crimes that occur within a police district
        percentage_crime_in_police_district = crime_in_police_district / len(crime_district)
        print("The percentage of crimes that happen in a police district is: ", percentage_crime_in_police_district)
        print("Thus, our assumption that crime happens less often in police districts is wrong.")

        repo['pt0713_silnuext.police_crime'].insert_many(response1)
        repo['pt0713_silnuext.police_crime'].metadata({'complete':True})
        print(repo['pt0713_silnuext.police_crime'].metadata())
        

        repo.logout()

        endTime = datetime.datetime.now()

        return {"start":startTime, "end":endTime}
Example 26
# minimum value for this clause is 1894
clause = "year > '{}'".format(minyear) + monthstring

print(clause)


# ## Step 9.
# **Using the sodapy library (recommended by the CDC), get the drought data from the CDC API.** Set the limit to something large so that we get all records (if omitted, only 1,000 are returned).

# In[20]:


# this gets the PDSI data from the CDC website using the query specified in the get() call

client = sodapy.Socrata("data.cdc.gov", None)
results = client.get("en5r-5ds4", where=clause, limit = 10000000)
pdsi = pd.DataFrame.from_records(results)

client.close()


# In[21]:


print(pdsi.head(5))
print(len(pdsi))


# ## Step 10.
# **change datatypes of all columns.**
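
# A minimal sketch of the missing Step 10 cell, assuming every column of the
# pdsi DataFrame (year, month, FIPS codes, drought index values) is meant to
# be numeric: coerce them all and inspect the resulting dtypes. The exact
# column names are not shown above, so this is an assumption, not the
# original cell.
pdsi = pdsi.apply(pd.to_numeric, errors='coerce')
print(pdsi.dtypes)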
                group="hour")

    df_r = pd.DataFrame.from_records(r).astype('int').set_index('hour')

    df_T = pd.DataFrame(df_r.transpose().to_dict())
    df_T.index = [stat_id]

    return df_T
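
# The helper above is missing its opening lines. A minimal sketch of what the
# complete function presumably looks like, assuming it counts trips leaving a
# single station grouped by hour of day; the function name, the SoQL field
# names ('start_time', 'from_station_id'), and the module-level `api` and
# `d_trips` handles defined below are assumptions, not the original code.
def hourly_trip_counts(stat_id):
    r = api.get(d_trips,
                select="date_extract_hh(start_time) AS hour, count(*) AS trips",
                where="from_station_id = {}".format(stat_id),
                group="hour")

    df_r = pd.DataFrame.from_records(r).astype('int').set_index('hour')

    df_T = pd.DataFrame(df_r.transpose().to_dict())
    df_T.index = [stat_id]

    return df_T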


data_url = 'data.cityofchicago.org'
app_token = '6xUrAQkGuIctuyUrTEdLZZFRY'
d_trips = 'fg6s-gzvg'
d_stats = 'aavc-b2wj'

api = sodapy.Socrata(data_url, app_token)

if not os.path.exists('stations_trip_counts.csv'):

    r = api.get(d_stats)

    cols = [
        'id', 'address', 'docks_in_service', 'latitude', 'longitude',
        'station_name', 'total_docks'
    ]
    df_stats = pd.DataFrame.from_records(r, columns=cols)

    # Convert data types
    df_stats[['id', 'docks_in_service', 'total_docks']] =\
        df_stats[['id', 'docks_in_service', 'total_docks']].astype(int)
Example 28
def get_dataset_descriptors(domain, token):
    client = sodapy.Socrata(domain, token)
    return client.datasets()
Example 29
    def execute(trial=False):
        startTime = datetime.datetime.now()

        client = dml.pymongo.MongoClient()
        repo = client.repo
        repo.authenticate('nyx', 'nyx')

        #Landmarks dataset
        # url = 'https://data.cityofboston.gov/resource/u6fv-m8v4.json'
        client = sodapy.Socrata("data.cityofboston.gov", None)
        # response = urllib.request.urlopen(url).read().decode("utf-8")
        r1 = client.get("u6fv-m8v4", limit=10)
        s = json.dumps(r1, sort_keys=True, indent=2)
        repo.dropCollection("landmarks")
        repo.createCollection("landmarks")
        repo['nyx.landmarks'].insert_many(r1)
        repo['nyx.landmarks'].metadata({'complete': True})
        print(repo['nyx.landmarks'].metadata())

        #Farmer markets
        client = sodapy.Socrata("data.mass.gov", None)
        #url = 'https://data.mass.gov/resource/66t5-f563.json'
        #response = urllib.request.urlopen(url).read().decode("utf-8")
        r2 = client.get("66t5-f563", limit=10)
        #r2 = json.loads(response)
        s = json.dumps(r2, sort_keys=True, indent=2)
        repo.dropCollection("fmarket")
        repo.createCollection("fmarket")
        repo['nyx.fmarket'].insert_many(r2)

        #School Gardens
        #url = 'https://data.cityofboston.gov/resource/pzcy-jpz4.json'
        client = sodapy.Socrata("data.cityofboston.gov", None)
        #response = urllib.request.urlopen(url).read().decode("utf-8")
        r3 = client.get("pzcy-jpz4", limit=10)
        s = json.dumps(r3, sort_keys=True, indent=2)
        repo.dropCollection("fmarket")
        repo.createCollection("fmarket")
        repo['nyx.fmarket'].insert_many(r2)

        #Crime dataset
        #url = 'https://data.cityofboston.gov/resource/29yf-ye7n.json'
        client = sodapy.Socrata("data.cityofboston.gov", None)
        #response = urllib.request.urlopen(url).read().decode("utf-8")
        r4 = client.get("29yf-ye7n", limit=10)
        s = json.dumps(r4, sort_keys=True, indent=2)
        repo.dropCollection("crime")
        repo.createCollection("crime")
        repo['nyx.crime'].insert_many(r4)

        #Parking Zone
        #url = 'https://data.cityofboston.gov/resource/gdnf-7hki.json'
        client = sodapy.Socrata("data.cityofboston.gov", None)
        #response = urllib.request.urlopen(url).read().decode("utf-8")
        r5 = client.get("gdnf-7hki", limit=10)
        s = json.dumps(r5, sort_keys=True, indent=2)
        repo.dropCollection("parking")
        repo.createCollection("parking")
        repo['nyx.parking'].insert_many(r5)

        #Police Station
        #url = 'https://data.cityofboston.gov/resource/pyxn-r3i2.json'
        client = sodapy.Socrata("data.cityofboston.gov", None)
        #response = urllib.request.urlopen(url).read().decode("utf-8")
        r6 = client.get("pyxn-r3i2", limit=10)
        s = json.dumps(r6, sort_keys=True, indent=2)
        repo.dropCollection("police")
        repo.createCollection("police")
        repo['nyx.police'].insert_many(r6)

        repo.logout()

        endTime = datetime.datetime.now()

        return {"start": startTime, "end": endTime}
Example 30
    def execute(trial=False):
        startTime = datetime.datetime.now()

        # Set up the database connection.
        client = dml.pymongo.MongoClient()
        repo = client.repo
        repo.authenticate('billy108_zhou13', 'billy108_zhou13')

        # get Seasonal Swimming pools data in Boston
        client = sodapy.Socrata("data.cityofboston.gov", None)
        response = client.get("xw3e-c7pz")
        r = json.loads(json.dumps(response, sort_keys=True, indent=2))
        repo.dropCollection("seasonalSwimPools")
        repo.createCollection("seasonalSwimPools")
        repo['billy108_zhou13.seasonalSwimPools'].insert_many(r)
        repo['billy108_zhou13.seasonalSwimPools'].metadata({'complete': True})
        print(repo['billy108_zhou13.seasonalSwimPools'].metadata())

        # get Community Gardens data in Boston
        client = sodapy.Socrata("data.cityofboston.gov", None)
        response = client.get("rdqf-ter7")
        r = json.loads(json.dumps(response, sort_keys=True, indent=2))
        repo.dropCollection("communityGardens")
        repo.createCollection("communityGardens")
        repo['billy108_zhou13.communityGardens'].insert_many(r)
        repo['billy108_zhou13.communityGardens'].metadata({'complete': True})
        print(repo['billy108_zhou13.communityGardens'].metadata())

        # get recreational open space data in Cambridge
        client = sodapy.Socrata("data.cambridgema.gov", None)
        response = client.get("5ctr-ccas")
        r = json.loads(json.dumps(response, sort_keys=True, indent=2))
        repo.dropCollection("openSpaceCambridge")
        repo.createCollection("openSpaceCambridge")
        repo['billy108_zhou13.openSpaceCambridge'].insert_many(r)
        repo['billy108_zhou13.openSpaceCambridge'].metadata({'complete': True})
        print(repo['billy108_zhou13.openSpaceCambridge'].metadata())

        # get recreational waterplay parks data in Cambridge
        client = sodapy.Socrata("data.cambridgema.gov", None)
        response = client.get("hv2t-vv6d")
        r = json.loads(json.dumps(response, sort_keys=True, indent=2))
        repo.dropCollection("waterplayCambridge")
        repo.createCollection("waterplayCambridge")
        repo['billy108_zhou13.waterplayCambridge'].insert_many(r)
        repo['billy108_zhou13.waterplayCambridge'].metadata({'complete': True})
        print(repo['billy108_zhou13.waterplayCambridge'].metadata())

        # Get data of Open spaces of conservation and recreation interest in Boston
        url = 'http://bostonopendata-boston.opendata.arcgis.com/datasets/2868d370c55d4d458d4ae2224ef8cddd_7.geojson'
        response = urllib.request.urlopen(url).read().decode("utf-8")
        r = json.loads(response)
        repo.dropCollection("openSpaceBoston")
        repo.createCollection("openSpaceBoston")
        repo['billy108_zhou13.openSpaceBoston'].insert_many(r['features'])
        repo['billy108_zhou13.openSpaceBoston'].metadata({'complete': True})
        print(repo['billy108_zhou13.openSpaceBoston'].metadata())

        # Get data of Community Center Pools in Boston
        url = 'http://bostonopendata-boston.opendata.arcgis.com/datasets/5575f763dbb64effa36acd67085ef3a8_0.geojson'
        response = urllib.request.urlopen(url).read().decode("utf-8")
        r = json.loads(response)
        repo.dropCollection("commCenterPools")
        repo.createCollection("commCenterPools")
        repo['billy108_zhou13.commCenterPools'].insert_many(r['features'])
        repo['billy108_zhou13.commCenterPools'].metadata({'complete': True})
        print(repo['billy108_zhou13.commCenterPools'].metadata())

        repo.logout()

        endTime = datetime.datetime.now()

        return {"start": startTime, "end": endTime}