コード例 #1
0
class SocrataClient:
    """Thin wrapper around a sodapy ``Socrata`` client with retrying reads.

    Connection settings (``DOMAIN``, ``TOKEN``, ``TIMEOUT``), the retry count
    (``ATTEMPTS``) and the year -> dataset id mapping (``DATASET_IDS``) are
    all read from the module-level ``conf`` object.
    """

    def __init__(self):
        self.client = Socrata(conf.DOMAIN, conf.TOKEN, timeout=conf.TIMEOUT)

    def __del__(self):
        # __init__ may have raised before ``client`` was assigned; finalizers
        # still run for such half-built instances, so look the attribute up
        # defensively instead of assuming it exists.
        client = getattr(self, 'client', None)
        if client is not None:
            client.close()

    def dataset_id(self, year):
        """Return the dataset id configured for ``year``."""
        return conf.DATASET_IDS[year]

    def get(self, year, **kwargs):
        """Fetch rows of the dataset for ``year``, retrying on any error.

        ``kwargs`` are forwarded unchanged to ``Socrata.get``. The call is
        tried up to ``conf.ATTEMPTS`` times; the exception raised by the
        final attempt is re-raised. If ``conf.ATTEMPTS`` is less than 1 no
        request is made and ``None`` is returned.
        """
        dataset = self.dataset_id(year)
        final_attempt = conf.ATTEMPTS - 1
        for attempt in range(conf.ATTEMPTS):
            try:
                return self.client.get(dataset, **kwargs)
            except Exception:
                if attempt == final_attempt:
                    # Bare ``raise`` keeps the original traceback intact.
                    raise

    def get_metadata(self, year):
        """Return the metadata of the dataset configured for ``year``."""
        return self.client.get_metadata(self.dataset_id(year))

    def get_datasets(self):
        '''
        Search for "MyLA311 Service Request Data" within the response
        to get the dataset ids for each year.
        '''
        return self.client.datasets()
コード例 #2
0
    def _create_foreign_tables(self, schema: str, server_id: str,
                               tables: TableInfo) -> List[MountError]:
        """Look up the requested Socrata datasets and mount them as foreign tables.

        ``self.tables`` takes precedence over the ``tables`` argument. The
        table spec is either a list of Socrata dataset ids or a mapping whose
        values carry a ``"socrata_id"`` entry at index 1.

        :param schema: Passed through to ``generate_socrata_mount_queries``.
        :param server_id: Passed through to ``generate_socrata_mount_queries``.
        :param tables: Table spec used when ``self.tables`` is empty.
        :return: Always an empty list; failures are raised instead.
        :raises RepositoryNotFoundError: If the domain/dataset cannot be found
            or the lookup returns nothing.
        """
        from sodapy import Socrata
        from psycopg2.sql import SQL

        logging.info("Getting Socrata metadata")
        client = Socrata(domain=self.params["domain"],
                         app_token=self.credentials.get("app_token"))

        # The client is only needed for the metadata lookup, so release its
        # HTTP session as soon as that is done, whether or not it succeeded.
        try:
            tables = self.tables or tables
            if isinstance(tables, list):
                sought_ids = tables
            else:
                sought_ids = [t[1]["socrata_id"] for t in tables.values()]

            try:
                datasets = client.datasets(ids=sought_ids, only=["dataset"])
            except Exception as e:
                if "Unknown response format: text/html" in str(e):
                    # If the Socrata dataset/domain isn't found, sodapy doesn't catch it directly
                    # and instead stumbles on an unexpected content-type of the 404 page it's served.
                    # We catch that and reraise a more friendly message.
                    raise RepositoryNotFoundError(
                        "Socrata domain or dataset not found!") from e
                raise
        finally:
            client.close()

        if not datasets:
            raise RepositoryNotFoundError(
                "Socrata domain or dataset not found!")

        mount_statements, mount_args = generate_socrata_mount_queries(
            sought_ids, datasets, schema, server_id, tables)

        self.engine.run_sql(SQL(";").join(mount_statements), mount_args)
        return []
コード例 #3
0
def socrata_metadata_excel(socrata_dataset, authentication_token):
    """Collect the dataset catalogue of every Socrata portal listed in a frame.

    ``socrata_dataset`` is first passed through ``remove_characters``. Each
    row of the result must provide ``website_url`` (the portal domain handed
    to ``Socrata``) and ``city`` (only printed). For every portal, the nested
    ``resource``, ``classification``, ``link``, ``metadata``, ``owner`` and
    ``permalink`` columns of its ``datasets()`` listing are expanded with
    ``pd.Series`` and joined side by side.

    :param socrata_dataset: DataFrame describing the portals to query.
    :param authentication_token: App token passed to every ``Socrata`` client.
    :return: All per-portal frames stacked row-wise; an empty DataFrame when
        there are no rows to process.
    """
    # DataFrame.info() prints its report itself and returns None, so this
    # line also prints "None"; kept to preserve the existing console output.
    print(socrata_dataset.info())
    socrata = remove_characters(socrata_dataset)

    # (nested column, label printed before its expanded column names)
    nested_columns = (
        ('resource', "Resource Columns:"),
        ('classification', "\n Classification Columns:"),
        ('link', "\n Links Columns:"),
        ('metadata', "\n Metadata Columns:"),
        ('owner', "\n Owner Columns:"),
        ('permalink', "\n Permalink Columns:"),
    )

    list_datasets = []
    for _, row in socrata.iterrows():
        website_link = getattr(row, 'website_url')
        print("Link:", website_link)
        client = Socrata(website_link, authentication_token)
        try:
            print(getattr(row, 'city'))
            dataframe = pd.DataFrame(client.datasets())
        finally:
            # One client is opened per portal; close it so sessions do not
            # pile up across rows.
            client.close()

        expanded = []
        for column, label in nested_columns:
            frame = dataframe[column].apply(pd.Series)
            print(label, frame.columns)
            expanded.append(frame)
        print(" All Function are working")
        list_datasets.append(pd.concat(expanded, axis=1))

    if not list_datasets:
        # pd.concat raises ValueError on an empty list.
        return pd.DataFrame()
    return pd.concat(list_datasets, axis=0)
コード例 #4
0
def test_get_datasets():
    """``Socrata.datasets(limit=7)`` returns a list with seven entries."""
    adapter = requests_mock.Adapter()
    client = Socrata(
        DOMAIN,
        APPTOKEN,
        session_adapter={"prefix": PREFIX, "adapter": adapter},
    )

    # Register the canned catalogue response for the ``limit=7`` query.
    setup_datasets_mock(adapter, "get_datasets.txt", 200, params={"limit": "7"})
    found = client.datasets(limit=7)

    assert isinstance(found, list)
    assert len(found) == 7
コード例 #5
0
class SocrataClient:
    """Thin wrapper around a sodapy ``Socrata`` client with retrying reads.

    All settings come from the ``'Socrata'`` section of the module-level
    ``config``: ``DOMAIN``, ``TOKEN``, ``TIMEOUT``, ``ATTEMPTS`` and one
    ``AP<year>`` entry per dataset.
    """

    def __init__(self):
        conf = config['Socrata']

        domain = conf['DOMAIN']
        token = conf['TOKEN']
        timeout = conf['TIMEOUT']

        self.client = Socrata(domain, token, timeout=timeout)
        # NOTE(review): ``attempts`` is passed to ``range()`` in ``get``, so
        # it must be an int here - confirm ``config`` does not yield strings.
        self.attempts = conf['ATTEMPTS']
        # The whole section doubles as the year lookup table (``AP<year>``).
        self.years = conf

    def __del__(self):
        # __init__ may have raised before ``client`` was assigned; finalizers
        # still run for such half-built instances, so look the attribute up
        # defensively instead of assuming it exists.
        client = getattr(self, 'client', None)
        if client is not None:
            client.close()

    def dataset_id(self, year):
        """Return the dataset id stored under the ``AP<year>`` key."""
        return self.years['AP' + str(year)]

    def get(self, year, **kwargs):
        """Fetch rows of the dataset for ``year``, retrying on any error.

        ``kwargs`` are forwarded unchanged to ``Socrata.get``. The call is
        tried up to ``self.attempts`` times; the exception raised by the
        final attempt is re-raised. If ``self.attempts`` is less than 1 no
        request is made and ``None`` is returned.
        """
        dataset = self.dataset_id(year)
        final_attempt = self.attempts - 1
        for attempt in range(self.attempts):
            try:
                return self.client.get(dataset, **kwargs)
            except Exception:
                if attempt == final_attempt:
                    # Bare ``raise`` keeps the original traceback intact.
                    raise

    def get_metadata(self, year):
        """Return the metadata of the dataset configured for ``year``."""
        return self.client.get_metadata(self.dataset_id(year))

    def get_datasets(self):
        '''
        Search for "MyLA311 Service Request Data" within the response
        to get the dataset ids for each year.
        '''
        return self.client.datasets()
コード例 #6
0
def initialize_socrata(data_portal_url, app_token=None) -> tuple:
    """Open a Socrata client for a portal and list its datasets.

    :param data_portal_url: Portal domain passed to ``Socrata``.
    :param app_token: Optional app token passed to ``Socrata``.
    :return: ``(client, datasets)`` where ``datasets`` is the result of
        ``client.datasets()``. The client is returned still open.
    """
    portal = Socrata(data_portal_url, app_token)
    return portal, portal.datasets()
コード例 #7
0
        pdb.set_trace()
        df.append(line)

# Peek at the local data file: first line, capped at 1000 characters
with open(DATA_FILE, 'r') as f:
    d = f.readline(1000)

# Pull up to 10000 rows of the dataset from the Socrata API
client = Socrata(DATA_DOMAIN, APP_TOKEN)
results = client.get(DATA_ID, limit=10000)

# Wrap the returned rows in a pandas dataframe
df = pd.DataFrame(results)
df.head()

# Catalogue of every dataset published on the domain
chi_datasets = client.datasets()

# Distinct categories across all datasets
categories = {
    category
    for dataset in chi_datasets
    for category in dataset['classification']['categories']
}

# Stream the top-level array items of test.json into a list
d_list = []
with open('test.json', 'r') as f:
    businesses = ijson.items(f, 'item')
    for business in businesses:
        d_list.append(business)
コード例 #8
0
ファイル: mount.py プロジェクト: yanyu510/splitgraph
def mount_socrata(
    mountpoint: str,
    server,
    port,
    username,
    password,
    domain: str,
    tables: Optional[Dict[str, Any]] = None,
    app_token: Optional[str] = None,
    batch_size: Optional[int] = 10000,
) -> None:
    """
    Mount a Socrata dataset.

    Mounts a remote Socrata dataset and forwards queries to it
    \b

    :param domain: Socrata domain, for example, data.albanyny.gov. Required.
    :param tables: A dictionary mapping PostgreSQL table names to Socrata table IDs. For example,
        {"salaries": "xzkq-xp2w"}. If skipped, ALL tables in the Socrata endpoint will be mounted.
    :param app_token: Socrata app token. Optional.
    :param batch_size: Amount of rows to fetch from Socrata per request (limit parameter). Maximum 50000.
    """
    # ``server``, ``port``, ``username`` and ``password`` are accepted but not
    # used anywhere in this function.
    from splitgraph.engine import get_engine
    from sodapy import Socrata
    from psycopg2.sql import Identifier, SQL

    engine = get_engine()
    logging.info("Mounting Socrata domain...")
    server_id = mountpoint + "_server"

    options: Dict[str, Optional[str]] = {
        "wrapper": "splitgraph.ingestion.socrata.fdw.SocrataForeignDataWrapper",
    }

    # Only forward the options that were actually supplied.
    if domain:
        options["domain"] = domain
    if app_token:
        options["app_token"] = app_token
    if batch_size:
        options["batch_size"] = str(batch_size)

    init_fdw(
        engine, server_id=server_id, wrapper="multicorn", server_options=options,
    )

    engine.run_sql(SQL("CREATE SCHEMA IF NOT EXISTS {}").format(Identifier(mountpoint)))

    logging.info("Getting Socrata metadata")
    client = Socrata(domain=domain, app_token=app_token)
    # Materialise the ids into a list rather than handing a live dict view
    # to the lookup and to the query generator.
    sought_ids = list(tables.values()) if tables else []

    try:
        datasets = client.datasets(ids=sought_ids, only=["dataset"])
    except Exception as e:
        if "Unknown response format: text/html" in str(e):
            # If the Socrata dataset/domain isn't found, sodapy doesn't catch it directly
            # and instead stumbles on an unexpected content-type of the 404 page it's served.
            # We catch that and reraise a more friendly message.
            raise RepositoryNotFoundError("Socrata domain or dataset not found!") from e
        # Any other failure must propagate: swallowing it would leave
        # ``datasets`` unbound and fail below with an unrelated
        # UnboundLocalError.
        raise
    finally:
        # The client is only needed for the metadata lookup.
        client.close()

    mount_statements, mount_args = generate_socrata_mount_queries(
        sought_ids, datasets, mountpoint, server_id, tables
    )

    engine.run_sql(SQL(";").join(mount_statements), mount_args)
コード例 #9
0
ファイル: try.py プロジェクト: dhairyachandra/OCEL.AI
                    'contentUrl': contentUrl})


apiSecret = '924eeda9ef2b4b5d9d41013ffdb7cd30'
from newsapi import NewsApiClient

# Init
newsapi = NewsApiClient(api_key=apiSecret)

# /v2/top-headlines
top_headlines = newsapi.get_top_headlines(q='bitcoin',
                                          sources='bbc-news,the-verge',
                                          category='business',
                                          language='en',
                                          country='us')

# /v2/everything
all_articles = newsapi.get_everything(q='bitcoin',
                                      language='en',
                                      sort_by='relevancy',
                                      page=1)
text = 'camp'
response = requests.get('http://catalog.data.gov/api/3/action/package_search?q=' +text+ '&rows=50')
response_dict = json.loads(response.content)'''

from sodapy import Socrata

# Anonymous client (no app token) for the data.kcmo.org portal
client = Socrata("data.kcmo.org", app_token=None)

# List the portal's datasets; the result is not stored
client.datasets()
コード例 #10
0
class SocrataRepository(HarvestRepository):
    """Harvest repository backed by a Socrata open-data portal.

    Dataset ids are discovered through the sodapy client's ``datasets()``
    listing; each dataset's metadata is then converted into the flat record
    dict that is handed to ``self.db.write_record``.
    """

    # Where a licence may be stored under ``metadata.custom_fields``:
    # (section name, keys tried in order). Only the first non-empty section
    # is consulted, and within it the first non-empty key wins.
    _CUSTOM_FIELD_RIGHTS = (
        ("License/Attribution", ("License URL", "License-URL")),  # Calgary
        ("Licence", ("Licence",)),  # Winnipeg
        ("Attributes", ("Licence",)),  # Strathcona
    )

    def setRepoParams(self, repoParams):
        """Apply the repository parameters and create the sodapy client."""
        self.metadataprefix = "socrata"
        super(SocrataRepository, self).setRepoParams(repoParams)
        # sodapy doesn't like http/https preceding URLs
        self.socratarepo = Socrata(self.url, self.socrata_app_token)
        self.domain_metadata = []

    def _crawl(self):
        """Register the repository and write a header for every dataset.

        Progress is logged every ``update_log_after_numitems`` headers and
        the total is logged at the end.
        """
        kwargs = {
            "repo_id": self.repository_id, "repo_url": self.url, "repo_set": self.set, "repo_name": self.name, "repo_type": "socrata",
            "enabled": self.enabled, "repo_thumbnail": self.thumbnail, "item_url_pattern": self.item_url_pattern,
            "abort_after_numerrors": self.abort_after_numerrors, "max_records_updated_per_run": self.max_records_updated_per_run,
            "update_log_after_numitems": self.update_log_after_numitems, "record_refresh_days": self.record_refresh_days,
            "repo_refresh_days": self.repo_refresh_days, "homepage_url": self.homepage_url,
            "repo_oai_name": self.repo_oai_name
        }
        self.repository_id = self.db.update_repo(**kwargs)
        records = self.socratarepo.datasets()

        item_count = 0
        for rec in records:
            self.db.write_header(rec["resource"]["id"], self.repository_id)
            item_count = item_count + 1
            # A log interval of 0 (or unset) disables progress logging
            # instead of failing on the modulo.
            if self.update_log_after_numitems and item_count % self.update_log_after_numitems == 0:
                # +0.1 keeps the rate computation away from a zero divisor.
                tdelta = time.time() - self.tstart + 0.1
                self.logger.info("Done {} item headers after {} ({:.1f} items/sec)".format(item_count, self.formatter.humanize(tdelta), item_count/tdelta) )

        self.logger.info("Found {} items in feed".format(item_count) )

    def _rights_from_custom_fields(self, socrata_record):
        """Return the licence found in ``metadata.custom_fields``, or ``""``.

        The sections listed in ``_CUSTOM_FIELD_RIGHTS`` are checked in order.
        The first section holding a truthy value decides the outcome: its
        candidate keys are tried in order and the first truthy value is
        returned. Later sections are not consulted, even when that section
        yields nothing.
        """
        metadata = socrata_record.get("metadata")
        custom_fields = metadata.get("custom_fields") if isinstance(metadata, dict) else None
        if not isinstance(custom_fields, dict):
            return ""

        for section_name, candidate_keys in self._CUSTOM_FIELD_RIGHTS:
            section = custom_fields.get(section_name)
            if not section:
                continue
            if isinstance(section, dict):
                for key in candidate_keys:
                    if section.get(key):
                        return section[key]
            return ""
        return ""

    def format_socrata_to_oai(self, socrata_record, local_identifier):
        """Convert Socrata dataset metadata into a flat record dict.

        :param socrata_record: Metadata dict for one dataset; must contain
            ``publicationDate`` (a timestamp accepted by
            ``datetime.fromtimestamp``).
        :param local_identifier: Stored as the record's ``identifier``.
        :return: Record dict. ``rights`` is present only when a licence was
            found, either in the ``license`` entry or, failing that, in the
            custom metadata fields.
        :raises KeyError: If ``publicationDate`` is missing.
        """
        record = {}

        record["title"] = socrata_record.get("name","").strip()
        record["description"] = socrata_record.get("description", "")
        record["tags"] = socrata_record.get("tags", "")
        record["identifier"] = local_identifier
        record["creator"] = socrata_record.get("attribution", self.name)
        # fromtimestamp() interprets the value in the local timezone.
        record["pub_date"] = datetime.fromtimestamp(socrata_record["publicationDate"]).strftime('%Y-%m-%d')
        record["subject"] = socrata_record.get("category", "")
        record["title_fr"] = ""
        record["series"] = ""

        rights = ""
        license_info = socrata_record.get("license")
        if license_info:
            # Winnipeg, Nova Scotia, PEI
            rights = "\n".join([
                license_info.get("name", ""),
                license_info.get("termsLink", ""),
            ]).strip()

        if rights == "See Terms of Use":
            # Calgary, Edmonton: placeholder text, not an actual licence
            rights = ""

        if not rights:
            rights = self._rights_from_custom_fields(socrata_record)

        if rights:
            record["rights"] = rights

        # Continue to default to English for our current Socrata repositories.
        # For Nova Scotia, "fra" language refers to the dataset, not the metadata.

        # language = self.default_language
        # if "metadata" in socrata_record:
        #     if "custom_fields" in socrata_record["metadata"]:
        #         if "Detailed Metadata" in socrata_record["metadata"]["custom_fields"]:
        #             if "Language" in socrata_record["metadata"]["custom_fields"]["Detailed Metadata"]:
        #                 # Nova Scotia
        #                 language = socrata_record["metadata"]["custom_fields"]["Detailed Metadata"]["Language"]
        #         elif "Dataset Information" in socrata_record["metadata"]["custom_fields"]:
        #             if "Language" in socrata_record["metadata"]["custom_fields"]["Dataset Information"]:
        #                 # Prince Edward Island
        #                 language = socrata_record["metadata"]["custom_fields"]["Dataset Information"]["Language"]
        # language = language.lower()
        #
        # if language in ["fr", "fre", "fra", "french"]:
        #     language = "fr"
        # else:
        #     language = "en"

        return record

    @rate_limited(5)
    def _update_record(self,record):
        """Fetch, convert and store the metadata for one record.

        :param record: Mapping with a ``local_identifier`` entry.
        :return: ``True`` on success, or on failure while the error count is
            still below ``abort_after_numerrors``; ``False`` once that limit
            has been reached.
        """
        # Bound up front so the failure dump below can tell "fetch failed"
        # apart from "conversion or write failed".
        socrata_record = None
        try:
            socrata_record = self.socratarepo.get_metadata(record['local_identifier'])
            oai_record = self.format_socrata_to_oai(socrata_record,record['local_identifier'])
            if oai_record:
                self.db.write_record(oai_record, self)
            return True

        except Exception as e:
            self.logger.error("Updating record {} failed: {}".format(record['local_identifier'], e))
            # The exact ``== True`` comparison is kept on purpose: the type of
            # the flag is decided outside this class.
            if self.dump_on_failure == True and socrata_record is not None:  # noqa: E712
                print(socrata_record)
            # Touch the record so we do not keep requesting it on every run
            self.db.touch_record(record)
            self.error_count = self.error_count + 1
            if self.error_count < self.abort_after_numerrors:
                return True

        return False
コード例 #11
0
class SocrataRepository(HarvestRepository):
    """Harvest repository backed by a Socrata open-data portal.

    Dataset ids are discovered through the sodapy client's ``datasets()``
    listing; each dataset's metadata is then converted into the flat record
    dict that is handed to ``self.db.write_record``.
    """

    def setRepoParams(self, repoParams):
        """Apply the repository parameters and create the sodapy client."""
        self.metadataprefix = "socrata"
        super(SocrataRepository, self).setRepoParams(repoParams)
        # sodapy doesn't like http/https preceding URLs
        self.socratarepo = Socrata(self.url, self.socrata_app_token)
        self.domain_metadata = []

    def _crawl(self):
        """Register the repository and write a header for every dataset.

        Progress is logged every ``update_log_after_numitems`` headers and
        the total is logged at the end.
        """
        kwargs = {
            "repo_id": self.repository_id, "repo_url": self.url, "repo_set": self.set, "repo_name": self.name, "repo_type": "socrata",
            "enabled": self.enabled, "repo_thumbnail": self.thumbnail, "item_url_pattern": self.item_url_pattern,
            "abort_after_numerrors": self.abort_after_numerrors, "max_records_updated_per_run": self.max_records_updated_per_run,
            "update_log_after_numitems": self.update_log_after_numitems, "record_refresh_days": self.record_refresh_days,
            "repo_refresh_days": self.repo_refresh_days, "homepage_url": self.homepage_url
        }
        self.repository_id = self.db.update_repo(**kwargs)
        records = self.socratarepo.datasets()

        item_count = 0
        for rec in records:
            self.db.write_header(rec["resource"]["id"], self.repository_id)
            item_count = item_count + 1
            # A log interval of 0 (or unset) disables progress logging
            # instead of failing on the modulo.
            if self.update_log_after_numitems and item_count % self.update_log_after_numitems == 0:
                # +0.1 keeps the rate computation away from a zero divisor.
                tdelta = time.time() - self.tstart + 0.1
                self.logger.info("Done {} item headers after {} ({:.1f} items/sec)".format(item_count, self.formatter.humanize(tdelta), item_count/tdelta) )

        self.logger.info("Found {} items in feed".format(item_count) )

    def format_socrata_to_oai(self, socrata_record, local_identifier):
        """Convert Socrata dataset metadata into a flat record dict.

        :param socrata_record: Metadata dict for one dataset; must contain
            ``name`` and ``publicationDate`` (a timestamp accepted by
            ``datetime.fromtimestamp``).
        :param local_identifier: Stored as the record's ``identifier``.
        :return: The record dict.
        :raises KeyError: If ``name`` or ``publicationDate`` is missing.
        """
        record = {}

        record["title"] = socrata_record["name"]
        record["description"] = socrata_record.get("description", "")
        record["tags"] = socrata_record.get("tags", "")
        record["identifier"] = local_identifier
        record["creator"] = socrata_record.get("attribution", self.name)
        # fromtimestamp() interprets the value in the local timezone.
        record["pub_date"] = datetime.fromtimestamp(socrata_record["publicationDate"]).strftime('%Y-%m-%d')
        record["contact"] = self.contact
        record["series"] = socrata_record.get("category", "")

        return record

    def _rate_limited(max_per_second):
        """ Decorator that make functions not be called faster than a set rate """
        import threading

        lock = threading.Lock()
        min_interval = 1.0 / float(max_per_second)
        # time.clock() no longer exists on Python 3.8+. Prefer the monotonic
        # clock for measuring intervals and fall back to wall time on
        # interpreters that lack it.
        clock = getattr(time, "monotonic", time.time)

        def decorate(func):
            # One-element list so the nested function can rebind the value.
            # None means "never called", so the first call is not delayed.
            last_time_called = [None]

            @wraps(func)
            def rate_limited_function(*args, **kwargs):
                # ``with`` guarantees the lock is released even if the sleep
                # is interrupted.
                with lock:
                    if last_time_called[0] is not None:
                        left_to_wait = min_interval - (clock() - last_time_called[0])
                        if left_to_wait > 0:
                            time.sleep(left_to_wait)

                ret = func(*args, **kwargs)
                last_time_called[0] = clock()
                return ret

            return rate_limited_function

        return decorate

    @_rate_limited(5)
    def _update_record(self,record):
        """Fetch, convert and store the metadata for one record.

        :param record: Mapping with a ``local_identifier`` entry.
        :return: ``True`` on success, or on failure while the error count is
            still below ``abort_after_numerrors``; ``False`` once that limit
            has been reached.
        """
        try:
            socrata_record = self.socratarepo.get_metadata(record['local_identifier'])
            oai_record = self.format_socrata_to_oai(socrata_record,record['local_identifier'])
            if oai_record:
                self.db.write_record(oai_record, self.repository_id, self.metadataprefix.lower(), self.domain_metadata)
            return True

        except Exception as e:
            self.logger.error("Updating record {} failed: {}".format(record['local_identifier'], e))
            # Touch the record so we do not keep requesting it on every run
            self.db.touch_record(record)
            self.error_count = self.error_count + 1
            if self.error_count < self.abort_after_numerrors:
                return True

        return False