def main(): # Select your transport with a defined url endpoint access_token = os.environ['ACCESS_TOKEN'] transport = AIOHTTPTransport( url=f"https://gis-api.aiesec.org/graphql/?access_token={access_token}") async def getData(): # Create a GraphQL client using the defined transport async with Client(transport=transport, fetch_schema_from_transport=True) as session: # Provide a GraphQL query query = gql(""" query getApplicationList ($limit: Int, $start_date: DateTime, $end_date: DateTime){ allOpportunityApplication(per_page: $limit, filters: {created_at: {from: $start_date, to: $end_date}}) { data { id status created_at date_matched date_pay_by_cash date_approved date_realized experience_start_date experience_end_date date_approval_broken nps_response_completed_at updated_at person { id full_name contact_detail { email phone } home_mc { name } home_lc { name } } host_lc { name } host_mc: home_mc { name } opportunity { id created_at title duration sub_product { name } programme { short_name_display } } standards { option } } } } """) params = { "mc_id": [518], # Bahrain's MC ID "start_date": "2021-01-01", "end_date": "", "limit": 1000 # Could be any large enough number } # Execute the query on the transport results = await session.execute(query, variable_values=params) # print(result) return results print("Executing query off of EXPA ...") apps_data = asyncio.run(getData()) print("Started preprocessing...") # Reduce the dict by 3 Levels apps_data = apps_data['allOpportunityApplication']['data'] # Flatten dictionary and compress keys apps_df = pd.json_normalize(apps_data, sep='_') """ Create new columns for easy comprehension * LC * Department * Partner_MC * Partner_LC """ new_fields = ['department', 'lc', 'partner_mc', 'partner_lc'] def generate_new_fields(row): if row['person_home_mc_name'] == 'Bahrain': values = [ 'o' + row['opportunity_programme_short_name_display'], row['person_home_lc_name'], row['host_mc_name'], row['host_lc_name'] ] else: values = [ 'i' + row['opportunity_programme_short_name_display'], row['host_lc_name'], row['person_home_mc_name'], row['person_home_lc_name'] ] return dict(zip(new_fields, values)) print("Generating new fields and tables ...") apps_df[new_fields] = apps_df.apply(lambda row: generate_new_fields(row), axis=1, result_type='expand') pointless_cols = [ 'opportunity_programme_short_name_display', 'host_mc_name', 'host_lc_name', 'person_home_mc_name', 'person_home_lc_name' ] apps_df.drop(pointless_cols, inplace=True, axis=1) """ Produce Performance Analytics Table * First convert dates from longform to YYYY-MM-DD * Retain Date, LC, Dept, PartnerMC, PartnerLC, and the Status Column like # of Applications, Accepted etc.. 
will be the aggregation """ date_cols = [ 'created_at', 'date_matched', 'date_approved', 'date_realized', 'updated_at' ] multi_indices = ['lc', 'department', 'partner_mc', 'partner_lc'] aggregration_fields = ['id', 'person_id'] # Generate table with these columns only perf_table = apps_df[aggregration_fields + date_cols + multi_indices].copy() # Ensure that dates are uniform and shortened perf_table.loc[:, date_cols] = apps_df[date_cols].applymap(lambda x: x[:-10], na_action='ignore') def get_timeseries_formetric(table: pd.DataFrame, other_fields: list, selected_date_col: str, metric_name: str) -> pd.DataFrame: table = table[[selected_date_col, *other_fields, *aggregration_fields]] _ = table.sort_values([selected_date_col, *other_fields]) _[metric_name] = 1 _.rename(columns={ selected_date_col: "date", "id": "AppID", "person_id": "PersonID" }, inplace=True) return _.dropna(axis=0) apps_per_day = get_timeseries_formetric(perf_table, multi_indices, "created_at", "Applied") acc_per_day = get_timeseries_formetric(perf_table, multi_indices, "date_matched", "Accepted") apd_per_day = get_timeseries_formetric(perf_table, multi_indices, "date_approved", "Approved") perf_analysis_df = pd.concat([apps_per_day, acc_per_day, apd_per_day]) perf_analysis_df.fillna(0, inplace=True, axis=0) # ### Push it to Google Sheets # Credentials from service account file for Google Sheets print("Creating temporary file for service account credentials...") temp = tempfile.NamedTemporaryFile() try: access_creds = os.environ['GOOGLE_CREDS'] write_base64str_obj_to_file(access_creds, temp.name) finally: gc = pygsheets.authorize(service_file=temp.name) temp.close() print("Writing to Google Sheets...") workbook = gc.open_by_key(os.environ["SPREADSHEET_ID"]) perf_worksheet = workbook.worksheet_by_title( os.environ["PerformanceSheet"]) applications_worksheet = workbook.worksheet_by_title( os.environ["ApplicationsSheet"]) # Create handy function to write to sheets set_worksheet_todf = partial(pygsheets.Worksheet.set_dataframe, start="A1", copy_head=True) set_worksheet_todf(perf_worksheet, perf_analysis_df) set_worksheet_todf(applications_worksheet, apps_df) print("Done!")
async def missingfiles(self, human_readable=None, **params): """Show files which are missing from blocks at a node. Parameters ---------- block block name (wildcards) (*) lfn logical file name (*) node node name (wildcards) se storage element. subscribed y or n. whether the block is subscribed to the node or not default is null (either) custodial y or n. filter for custodial responsibility, default is to return either group group name default is to return missing blocks for any group. (*) either block or lfn is required """ resjson = await self.jsonmethod("missingfiles", **params) out = [] if human_readable is not None and type(human_readable) is not bool: print("Wrong human_readable parameter type") df = pandas.json_normalize(out) return df elif human_readable is None or human_readable is False: for _block in resjson["phedex"]["block"]: for _file in _block["file"]: for _missing in _file["missing"]: out.append( { "block_name": _block["name"], "file_name": _file["name"], "checksum": _file["checksum"], "size": _file["bytes"], "created": _file["time_create"], "origin_node": _file["origin_node"], "missing_from": _missing["node_name"], "disk": _missing["se"], "custodial": _missing["custodial"], "subscribed": _missing["subscribed"], } ) df = pandas.json_normalize(out) return format_dates(df, ["created"]) elif human_readable is True: for _block in resjson["phedex"]["block"]: for _file in _block["file"]: for _missing in _file["missing"]: out.append( { "Block Name": _block["name"], "File Name": _file["name"], "checksum": _file["checksum"], "Size of file": _file["bytes"], "Time created": _file["time_create"], "Origin Node": _file["origin_node"], "Missing from": _missing["node_name"], "Disk": _missing["se"], "Custodial?": _missing["custodial"], "Subscribed?": _missing["subscribed"], } ) df = pandas.json_normalize(out) return format_dates(df, ["Time created"])
async def requestlist(self, human_readable=None, **params): """Serve as a simple request search and cache-able catalog of requests to save within a client, which may then use the request ID to obtain further details using TransferRequests or DeletionRequests. Parameters ---------- request * request id type request type, 'xfer' (default) or 'delete' approval approval state, 'approved', 'disapproved', 'mixed', or 'pending' requested_by * requestor's name node * name of the destination node (show requests in which this node is involved) decision decision at the node, 'approved', 'disapproved' or 'pending' group * user group create_since created since this time create_until created until this time decide_since decided since this time decide_until decided until this time dataset * dataset is part of request, or a block from this dataset block * block is part of request, or part of a dataset in request decided_by * name of person who approved the request * could be multiple and/or with wildcard ** when both 'block' and 'dataset' are present, they form a logical disjunction (ie. or) """ resjson = await self.jsonmethod("requestlist", **params) out = [] if human_readable is not None and type(human_readable) is not bool: df = pandas.json_normalize(out) raise Exception("Wrong human_readable parameter type") return df elif human_readable is None or human_readable is False: for _request in resjson["phedex"]["request"]: for _node in _request["node"]: out.append( { "request_id": _request["id"], "time_created": _request["time_create"], "requested_by": _request["requested_by"], "approval": _request["approval"], "node": _node["name"], "time_decided": _node["time_decided"], "decided_by": _node["decided_by"], } ) df = pandas.json_normalize(out) return format_dates(df, ["time_created", "time_decided"]) else: for _request in resjson["phedex"]["request"]: for _node in _request["node"]: out.append( { "Request ID": _request["id"], "Time Created": _request["time_create"], "Requested by": _request["requested_by"], "Approval": _request["approval"], "Node": _node["name"], "Time decided": _node["time_decided"], "Decided by": _node["decided_by"], } ) df = pandas.json_normalize(out) return format_dates(df, ["Time Created", "Time decided"])
def test_value_array_record_prefix(self):
    # GH 21536
    result = json_normalize({"A": [1, 2]}, "A", record_prefix="Prefix.")
    expected = DataFrame([[1], [2]], columns=["Prefix.0"])
    tm.assert_frame_equal(result, expected)
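# A minimal usage sketch (not part of the test suite) of how record_path, meta
# and record_prefix combine on nested records; the data below is hypothetical:
#
#   data = [{"state": "FL",
#            "counties": [{"name": "Dade", "pop": 12345},
#                         {"name": "Broward", "pop": 40000}]}]
#   flat = json_normalize(data, record_path="counties", meta=["state"],
#                         record_prefix="county.")
#   # -> columns: "county.name", "county.pop", "state"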
class StockTwits_BackUp:
    def open(self, filename):
        self.Text = gopen(filename, 'rt').readlines()
        self.TotalTweets = len(self.Text)


# %% prepare directory files
files_to_analyze = Files(folder_name=Local_Settings.Messages_Folder,
                         category=2, year=2009).FilesList
test = gopen(files_to_analyze[0], 'rt').readlines()
test = pd.read_json(files_to_analyze[0], lines=True)
TweetsDetails = pd.json_normalize(test['data'])

ReviewsSample = Reviews.join([RatingsDetail, ResponseDetails]).drop(
    ['RatingsDetail', 'ResponseDetails'], axis=1)
new_df = pd.concat([pd.DataFrame(json_normalize(x)) for x in df['json']],
                   ignore_index=True)

# %% open Loughran and McDonald's sentiment dictionary
lm_negative = Local_Settings.lm_dictionary.get('Negative')[0].tolist()
lm_positive = Local_Settings.lm_dictionary.get('Positive')[0].tolist()


def loughran_scores(text):
    twords = text.split()
    twords2 = [i.upper() for i in twords]
    len_twords2 = len(twords2)
    negative_found = len([i for i in twords2 if i in lm_negative])
async def get_active_exchange_markets(cls) -> pd.DataFrame: """ *required Returns all currently active BTC trading pairs from Eterbase, sorted by volume in descending order. """ async with aiohttp.ClientSession() as client: async with client.get(f"{constants.REST_URL}/markets") as products_response: products_response: aiohttp.ClientResponse = products_response if products_response.status != 200: raise IOError(f"Error fetching active Eterbase markets. HTTP status is {products_response.status}.") data = await products_response.json() for pair in data: pair["symbol"] = convert_from_exchange_trading_pair(pair["symbol"]) all_markets: pd.DataFrame = pd.DataFrame.from_records(data=data, index="id") all_markets.rename({"base": "baseAsset", "quote": "quoteAsset"}, axis="columns", inplace=True) all_markets = all_markets[(all_markets.state == 'Trading')] ids: List[str] = list(all_markets.index) volumes: List[float] = [] prices: List[float] = [] tickers = None async with client.get(f"{constants.REST_URL}/tickers") as tickers_response: tickers_response: aiohttp.ClientResponse = tickers_response if tickers_response.status == 200: data = await tickers_response.json() tickers: pd.DataFrame = pd.DataFrame.from_records(data=data, index="marketId") else: raise IOError(f"Error fetching tickers on Eterbase. " f"HTTP status is {tickers_response.status}.") for product_id in ids: volumes.append(float(tickers.loc[product_id].volume)) prices.append(float(tickers.loc[product_id].price)) all_markets["volume"] = volumes all_markets["price"] = prices cross_rates = None async with client.get(f"{constants.REST_URL}/tickers/cross-rates") as crossrates_response: crossrates_response: aiohttp.ClientResponse = crossrates_response if crossrates_response.status == 200: data = await crossrates_response.json() cross_rates: pd.DataFrame = pd.json_normalize(data, record_path ='rates', meta = ['base']) else: raise IOError(f"Error fetching cross-rates on Eterbase. " f"HTTP status is {crossrates_response.status}.") usd_volume: List[float] = [] cross_rates_ids: List[str] = list(cross_rates.base) for row in all_markets.itertuples(): quote_name: str = row.quoteAsset quote_volume: float = row.volume quote_price: float = row.price found = False for product_id in cross_rates_ids: if quote_name == product_id: rate: float = cross_rates.loc[(cross_rates['base'] == product_id) & (cross_rates['quote'].str.startswith("USDT"))].iat[0, 1] usd_volume.append(quote_volume * quote_price * rate) found = True break if found is False: usd_volume.append(NaN) cls.logger().error(f"Unable to convert volume to USD for market - {quote_name}.") all_markets["USDVolume"] = usd_volume return all_markets.sort_values(by = ["USDVolume"], ascending = False)
def test_deprecated_import(self):
    with tm.assert_produces_warning(FutureWarning):
        from pandas.io.json import json_normalize

        recs = [{"a": 1, "b": 2, "c": 3}, {"a": 4, "b": 5, "c": 6}]
        json_normalize(recs)
    else:
        # current object is already a Linestring
        split_lines.append(river['geometry'].iloc[i])

river_merged = geopandas.GeoSeries(MultiLineString(split_lines))
river_merged.to_file(river_file_merged, driver='GeoJSON')

#-------------------------------------------------------------------------------
# output a file containing total miles by GEOMETRYID
#-------------------------------------------------------------------------------
data = json.load(open(trails_file, 'r'))

# extract the GEOMETRYID and coordinates from the geoJSON
df = json_normalize(data=data['features'])[[
    'properties.GEOMETRYID', 'geometry.coordinates'
]]
df.columns = ['GEOMETRYID', 'coordinates']

# extract the points to rows
df = df.explode('coordinates')

# get the coordinates of the next point
df['next_GEOMETRYID'] = df['GEOMETRYID'].shift(periods=-1)
df['next_coordinates'] = df['coordinates'].shift(periods=-1)
df = df[df['GEOMETRYID'] == df['next_GEOMETRYID']]

# split the lat/lon into columns
df.reset_index(inplace=True)
df[['start_lon', 'start_lat']] = DataFrame(df['coordinates'].tolist())
df[['end_lon', 'end_lat']] = DataFrame(df['next_coordinates'].tolist())
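# A hypothetical sketch of the step implied by the heading above ("total miles
# by GEOMETRYID"): a haversine distance per point pair, then a groupby sum.
# The column names follow the frame built above; the helper itself is an
# assumption, not part of the original script.
import numpy as np


def haversine_miles(lon1, lat1, lon2, lat2):
    """Great-circle distance in miles between two lon/lat points (vectorized)."""
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])
    a = (np.sin((lat2 - lat1) / 2) ** 2
         + np.cos(lat1) * np.cos(lat2) * np.sin((lon2 - lon1) / 2) ** 2)
    return 2 * 3958.8 * np.arcsin(np.sqrt(a))

# df['miles'] = haversine_miles(df['start_lon'], df['start_lat'],
#                               df['end_lon'], df['end_lat'])
# total_miles = df.groupby('GEOMETRYID')['miles'].sum()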
# setup recurring data tables with no dynamic gameID requirements
import requests as rq
import pandas as pd

### conference table
response = rq.get("https://statsapi.web.nhl.com/api/v1/conferences")
conferences = pd.json_normalize(response.json()["conferences"]).set_index("id")
conferences.rename_axis("conferenceID", inplace=True)

### division table
response = rq.get("https://statsapi.web.nhl.com/api/v1/divisions")
divisions = pd.json_normalize(response.json()["divisions"]).set_index("id")
divisions.rename_axis("divisionID", inplace=True)

### team table
response = rq.get("https://statsapi.web.nhl.com/api/v1/teams")
teams = pd.json_normalize(response.json()["teams"]).set_index("id")
teams.rename_axis("teamID", inplace=True)

### team standings
response = rq.get("https://statsapi.web.nhl.com/api/v1/standings")
team_standings = pd.json_normalize(response.json()['records'],
                                   record_path=['teamRecords'],
                                   errors='ignore').set_index("team.id")
team_standings.rename_axis("teamID", inplace=True)

### last played game by teamID
response = rq.get(
    "https://statsapi.web.nhl.com/api/v1/teams?expand=team.schedule.previous")
last = pd.json_normalize(
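# A minimal, hypothetical sketch of joining the lookup tables above back onto
# the team table, assuming the normalized teams frame exposes "division.id" and
# "conference.id" columns from the nested objects in the API response:
#
#   teams_enriched = (teams
#                     .merge(divisions.add_prefix("division_"),
#                            left_on="division.id", right_index=True, how="left")
#                     .merge(conferences.add_prefix("conference_"),
#                            left_on="conference.id", right_index=True, how="left"))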
def df_authors(self, data): """Utility Returns inner json as a pandas dataframe, exposing authors + pubId. List of affiliations per each author are not broken down and are returned as JSON. So in essence you get one row per author. NOTE this method works only for publications searches -and it's clever enough to know if `authors` or `author_affiliations` (deprecated) field is used. Each publication.author_affiliations object has a nested list structure like this: ``` [[{'first_name': 'Laura', 'last_name': 'Pasin', 'orcid': '', 'current_organization_id': '', 'researcher_id': '', 'affiliations': [{'name': 'Department of Anesthesia and Intensive Care, Ospedale S. Antonio, Via Facciolati, 71, Padova, Italy'}]}, {'first_name': 'Sabrina', 'last_name': 'Boraso', 'orcid': '', 'current_organization_id': '', 'researcher_id': '', 'affiliations': [{'name': 'Department of Anesthesia and Intensive Care, Ospedale S. Antonio, Via Facciolati, 71, Padova, Italy'}]}, {'first_name': 'Ivo', 'last_name': 'Tiberio', 'orcid': '', 'current_organization_id': '', 'researcher_id': '', 'affiliations': [{'name': 'Department of Anesthesia and Intensive Care, Ospedale S. Antonio, Via Facciolati, 71, Padova, Italy'}]}]] ``` """ output = pd.DataFrame() if 'publications' in self.good_keys: if exists_key_in_dicts_list(data['publications'], "author_affiliations"): FIELD = "author_affiliations" elif exists_key_in_dicts_list(data['publications'], "authors"): FIELD = "authors" else: FIELD = "" if FIELD == "author_affiliations": # simplify deep nested dict structure for deprecated field for x in data['publications']: if 'author_affiliations' in x and x[ 'author_affiliations']: # if key exists and contents are not empty eg '[]' if type( x['author_affiliations'][0] ) == list: # then break down nested dict structure x['author_affiliations'] = x[ 'author_affiliations'][0] elif type( x['author_affiliations'] [0]) == dict: # = it's already been broken down pass else: # put in default empty element x['author_affiliations'] = [] elif FIELD == "authors": normalize_key("authors", data['publications'], []) if FIELD: output = json_normalize(data['publications'], record_path=[FIELD], meta=['id'], errors='ignore') output.rename(columns={"id": "pub_id"}, inplace=True) else: print( f"[Warning] Dataframe cannot be created as 'publications' were not found in data. Available: {self.good_keys}" ) return output
from sqlalchemy import create_engine
import json
from cook import Archiver
import requests
import tempfile
import os
import pandas as pd

if __name__ == "__main__":
    url = "https://www.nycgovparks.org/bigapps/DPR_CapitalProjectTracker_001.json"
    data = json.loads(requests.get(url).content)
    df = pd.DataFrame(data)
    df = df[["TrackerID", "FMSID", "Title", "TotalFunding", "Locations"]]
    df["Locations"] = df["Locations"].apply(lambda x: x.get("Location"))
    df2 = df.drop(columns=["Locations"]).join(
        df["Locations"].explode().to_frame())
    horiz_exploded = pd.json_normalize(df2["Locations"])
    horiz_exploded.index = df2.index
    df3 = pd.concat([df2, horiz_exploded], axis=1).drop(columns=["Locations"])
    df3 = df3.rename(
        columns={
            "TrackerID": "proj_id",
            "FMSID": "fmsid",
            "Title": "desc",
            "TotalFunding": "total_funding",
            "ParkID": "park_id",
            "Latitude": "lat",
            "Longitude": "lon"
        })
    df3 = df3[[
        "proj_id", "fmsid", "desc", "total_funding", "park_id", "lat", "lon"
    ]]
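# A generic, self-contained sketch of the pattern used above (explode a column
# holding lists of dicts, then json_normalize the pieces back onto the frame).
# The helper name and its arguments are illustrative only, not part of the
# original script.
import pandas as pd


def explode_and_flatten(frame: pd.DataFrame, column: str) -> pd.DataFrame:
    """Explode a list-of-dicts column and flatten each dict into new columns."""
    exploded = frame.drop(columns=[column]).join(frame[column].explode().to_frame())
    flat = pd.json_normalize(exploded[column])
    flat.index = exploded.index
    return pd.concat([exploded, flat], axis=1).drop(columns=[column])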
def df_concepts(self, data, key): """from a list of publications or grants including concepts, return a DF with one line per concept Enrich the dataframe with scores and other metrics. """ FIELD_NAME = "concepts" FIELD_NAME_SCORES = "concepts_scores" ROUNDING = 5 if not ('publications' in self.good_keys) and not ('grants' in self.good_keys): s = f"Dataframe can be created only with searches returning 'publications' or 'grants' . Available: {self.good_keys}" raise Exception(s) concepts = self.df_simple(data, key) if (FIELD_NAME not in concepts.columns) and (FIELD_NAME_SCORES not in concepts.columns): s = f"Dataframe requires raw concepts data, but no 'concepts' or 'concepts_scores' column was not found in: {concepts.columns.to_list()}" raise Exception(s) if not 'id' in concepts.columns: s = f"Dataframe requires an 'id' column for counting concepts, which was not found in: {concepts.columns.to_list()}" raise Exception(s) if FIELD_NAME_SCORES in concepts.columns: # use `concepts_scores` field preferably df = concepts.explode(FIELD_NAME_SCORES) df.dropna(subset=[FIELD_NAME_SCORES], inplace=True) # remove rows if there is no concept df.reset_index(inplace=True, drop=True) original_cols = [ x for x in df.columns.to_list() if x != FIELD_NAME_SCORES ] df = df.drop(FIELD_NAME_SCORES, 1).assign(**pd.json_normalize( df[FIELD_NAME_SCORES])) # unpack dict with new columns df = df[df.relevance != 0] # remove 0-relevance scores df['relevance'] = df['relevance'].round(ROUNDING) df.rename(columns={"relevance": "score"}, inplace=True) df['frequency'] = df.groupby('concept')['concept'].transform( 'count') df['concepts_count'] = df.groupby("id")['concept'].transform( 'size') else: # with traditional 'concepts', scores are simulated df = concepts.explode(FIELD_NAME) original_cols = [ x for x in df.columns.to_list() if x != FIELD_NAME ] df.dropna(subset=[FIELD_NAME], inplace=True) # remove rows if there is no concept df.rename(columns={FIELD_NAME: "concept"}, inplace=True) df['frequency'] = df.groupby('concept')['concept'].transform( 'count') df['concepts_count'] = df.groupby("id")['concept'].transform( 'size') ranks = df.groupby('id').cumcount() + 1 # scores = normalized rank from 0 to 1, where 1 is the highest rank df['score'] = ( (df['concepts_count'] + 1) - ranks) / df['concepts_count'] df['score'] = df['score'].round(ROUNDING) # finally df['score_avg'] = df.groupby('concept')['score'].transform( 'mean').round(ROUNDING) df.reset_index(drop=True, inplace=True) out_cols = original_cols + [ 'concepts_count', 'concept', 'score', 'frequency', 'score_avg' ] return df[out_cols]
def get_gif(access_token: str, min_lon: float, max_lat: float, max_lon: float, min_lat: float, ratio: float, colour: str, backgroundColour: str, alpha: float, activity_type: str, bg_img: str, duration: int): activities = requests.get('https://www.strava.com/api/v3/activities' + '?access_token=' + access_token + '&per_page=200' + '&page=' + str(1)) activities = activities.json() # convert activities to pandas dataframe df = json_normalize(activities) # filter df by type of activity if activity_type == 'Run': df = df[df['type'] == 'Run'] elif activity_type == 'Ride': df = df[df['type'] == 'Ride'] else: df = df[(df['type'] == 'Run') | (df['type'] == 'Ride')] # filter df by start coordinates using the bounding box df_bbox = df[(df['start_latitude'] < max_lat) & (df['start_latitude'] > min_lat) & (df["start_longitude"] < max_lon) & (df["start_longitude"] > min_lon)] df_bbox = df_bbox.sort_values(by=['start_date']) # create imagery based on bg_img if bg_img == 'sat': imagery = GoogleTiles(style='satellite') elif bg_img == 'osm': imagery = OSM() else: imagery = OSM() # create figure to plot routes on fig = plt.figure(figsize=(8, ratio * 8), frameon=False) ax = fig.add_subplot(1, 1, 1, projection=imagery.crs) fig.patch.set_visible(False) ax.set_extent([min_lon, max_lon, min_lat, max_lat]) ax.set_axis_off() # filepaths fp_out = 'image.gif' imgs = [] for i in range(len(df_bbox)): try: lat, lng = zip( *polyline.decode(df_bbox.iloc[i]['map.summary_polyline'])) except: print(i) plt.plot(lng, lat, transform=ccrs.Geodetic(), color=colour, alpha=alpha) imgs.append(fig2img(fig)) # create background image if bg_img == 'none': bg = Image.new(mode='RGBA', size=imgs[0].size, color=ImageColor.getrgb(backgroundColour)) else: fig = plt.figure(figsize=(8, ratio * 8), frameon=False) ax = fig.add_subplot(1, 1, 1, projection=imagery.crs) ax.set_extent([min_lon, max_lon, min_lat, max_lat]) fig.patch.set_visible(False) ax.set_axis_off() # set background imagery if one was sent ax.add_image(imagery, 15) # converting background to image bg = fig2img(fig) imgs = map(lambda img: Image.alpha_composite(bg, img), imgs) bg.save(fp=fp_out, format='GIF', append_images=imgs, save_all=True, duration=duration, loop=0) file = open('image.gif', 'rb') return {'gif': base64.b64encode(file.read())}
def download(self, symbol, start_date=None, end_date=None): if start_date is None: start_date = self._start_date if end_date is None: now = pd.Timestamp.now(self._calendar.tz) end_date = ( self._calendar.previous_close(now) .astimezone(self._calendar.tz) .normalize() ) full_code = self.get_full_code(symbol) url = "http://data.krx.co.kr/comm/bldAttendant/getJsonData.cmd" data = { "bld": self._bld, "isuCd": full_code, "isuCd2": "", "strtDd": start_date.strftime("%Y%m%d"), "endDd": end_date.strftime("%Y%m%d"), "share": "1", "money": "1", "csvxls_isNo": "false", } response = requests.post(url, data, headers=self._headers) df = pd.json_normalize(response.json()["output"]) if df.shape[0] == 0: return None column_names = { "TRD_DD": "Date", "ISU_CD": "Code", "ISU_NM": "Name", "MKT_NM": "Market", "SECUGRP_NM": "SecuGroup", "TDD_CLSPRC": "Close", "FLUC_TP_CD": "UpDown", "CMPPRVDD_PRC": "Change", "FLUC_RT": "ChangeRate", "TDD_OPNPRC": "Open", "TDD_HGPRC": "High", "TDD_LWPRC": "Low", "ACC_TRDVOL": "Volume", "ACC_TRDVAL": "Amount", "MKTCAP": "MarCap", "CMPPREVDD_PRC": "Change", "LIST_SHRS": "Shares", } df.rename(columns=column_names, inplace=True) int_columns = [ "Close", "UpDown", "Change", "ChangeRate", "Open", "High", "Low", "Volume", "Amount", "MarCap", "Shares", ] for col in int_columns: if col in df.columns: df[col] = pd.to_numeric(df[col].str.replace(",", ""), errors="coerce") df["Date"] = pd.to_datetime(df["Date"]) df.set_index("Date", inplace=True) return df
    script = schema['metadata']['github_url']

    if param == 'ALL_SCHEMA':
        table_name_git = '{}{}'.format(
            schema['metadata']['TablePrefix'],
            os.path.basename(schema['metadata']['target_S3URI']).lower())
    else:
        try:
            table_name_git = schema['metadata']['TableName']
        except:
            table_name_git = '{}{}'.format(
                schema['metadata']['TablePrefix'],
                os.path.basename(
                    schema['metadata']['target_S3URI']).lower())

    tb = pd.json_normalize(schema['schema']).to_markdown()
    toc = "{}{}".format(github_link, table_name_git)
    top_readme += '\n- [{0}]({1})'.format(table_name_git, toc)
    README += template.format(table_name_git, DatabaseName, target_S3URI,
                              partition, tb, script)

README = README.format(top_readme)

with open(os.path.join(str(Path(path).parent),
                       '00_data_catalog/README.md'), "w") as outfile:
    outfile.write(README)

### Create toc
### Update TOC in Github
parent_path = str(Path(path).parent)
for p in [
        parent_path,
        if elem == old:
            doubles.append(elem)
            old = None
            continue
        old = elem
    return doubles


# TODO: implement checks:
# no duplicate area ids
# no duplicate area ids
# no duplicate area names
# no duplicate zone ids

with open('zones.yaml', 'r') as f:
    zone_list = pd.json_normalize(yaml.safe_load(f)['zones'])
with open('areas.yaml', 'r') as f:
    area_list = pd.json_normalize(yaml.safe_load(f)['areas'])

dangling_zones = find_zones_without_valid_areas(zone_list, area_list)
if len(dangling_zones) > 0:
    print('ERROR: found zones without area')
    print(dangling_zones)

duplicate_areas = find_duplicate_area_ids(area_list)
if len(duplicate_areas) > 0:
    print('ERROR: found duplicate area ids')
    print(duplicate_areas)

# check_area_refs(area_dataMap['areas'], zone_dataMap['zones'])
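# A hypothetical sketch of what find_zones_without_valid_areas could look like,
# assuming the zone frame carries an "area" column that should match an "id"
# column in the area frame (the real column names may differ):
def find_zones_without_valid_areas(zones, areas):
    valid_area_ids = set(areas['id'])
    return zones[~zones['area'].isin(valid_area_ids)]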
xg_reg_start = xgb.Booster({'nthread': 4})
xg_reg_end = xgb.Booster({'nthread': 4})
xg_reg_start.load_model('Modeling/xgb_trip_starts_py.model')
xg_reg_end.load_model('Modeling/xgb_trip_ends_py.model')

# read in the data
old_data = pd.read_sql("SELECT * FROM last_12 WHERE is_pred = 0", con=conn)
old_data['datetime'] = old_data.datetime.dt.tz_localize('America/New_York')

# read in latest json
station_status = pd.read_json(
    "https://gbfs.citibikenyc.com/gbfs/en/station_status.json")
datetime = pd.to_datetime(station_status['last_updated'], unit='s')\
    .dt.tz_localize('UTC')\
    .dt.tz_convert('America/New_York')[0]
station_status = pd.json_normalize(station_status['data']['stations'])

# only retain relevant columns
data_to_append = station_status[[
    'station_id', 'num_bikes_available', 'num_docks_available'
]].drop_duplicates()
data_to_append['datetime'] = datetime
data_to_append['is_pred'] = 0

# combine data and delete observations > 24 hours old
new_data = old_data.append(data_to_append).drop_duplicates()
new_data['is_pred'] = 0
boolean = pd.to_datetime(new_data.datetime,
                         utc=True).dt.tz_convert('America/New_York') >= (
                             datetime - dt.timedelta(hours=24))
new_data = new_data.loc[boolean, :].reset_index(drop=True)
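# A hypothetical sketch of how the two boosters loaded above would typically be
# scored with xgboost (the real feature matrix used by this script is not shown
# in this snippet, so the column selection below is illustrative only):
#
#   features = new_data[['num_bikes_available', 'num_docks_available']]
#   dmat = xgb.DMatrix(features)
#   predicted_starts = xg_reg_start.predict(dmat)
#   predicted_ends = xg_reg_end.predict(dmat)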
def get_result_table(config: dict, results: list, total_epochs: int, highlight_best: bool, print_colorized: bool, remove_header: int) -> str: """ Log result table """ if not results: return '' tabulate.PRESERVE_WHITESPACE = True trials = json_normalize(results, max_level=1) trials['Best'] = '' if 'results_metrics.winsdrawslosses' not in trials.columns: # Ensure compatibility with older versions of hyperopt results trials['results_metrics.winsdrawslosses'] = 'N/A' trials = trials[['Best', 'current_epoch', 'results_metrics.trade_count', 'results_metrics.winsdrawslosses', 'results_metrics.avg_profit', 'results_metrics.total_profit', 'results_metrics.profit', 'results_metrics.duration', 'loss', 'is_initial_point', 'is_best']] trials.columns = ['Best', 'Epoch', 'Trades', 'W/D/L', 'Avg profit', 'Total profit', 'Profit', 'Avg duration', 'Objective', 'is_initial_point', 'is_best'] trials['is_profit'] = False trials.loc[trials['is_initial_point'], 'Best'] = '* ' trials.loc[trials['is_best'], 'Best'] = 'Best' trials.loc[trials['is_initial_point'] & trials['is_best'], 'Best'] = '* Best' trials.loc[trials['Total profit'] > 0, 'is_profit'] = True trials['Trades'] = trials['Trades'].astype(str) trials['Epoch'] = trials['Epoch'].apply( lambda x: '{}/{}'.format(str(x).rjust(len(str(total_epochs)), ' '), total_epochs) ) trials['Avg profit'] = trials['Avg profit'].apply( lambda x: '{:,.2f}%'.format(x).rjust(7, ' ') if not isna(x) else "--".rjust(7, ' ') ) trials['Avg duration'] = trials['Avg duration'].apply( lambda x: '{:,.1f} m'.format(x).rjust(7, ' ') if not isna(x) else "--".rjust(7, ' ') ) trials['Objective'] = trials['Objective'].apply( lambda x: '{:,.5f}'.format(x).rjust(8, ' ') if x != 100000 else "N/A".rjust(8, ' ') ) trials['Profit'] = trials.apply( lambda x: '{:,.8f} {} {}'.format( x['Total profit'], config['stake_currency'], '({:,.2f}%)'.format(x['Profit']).rjust(10, ' ') ).rjust(25+len(config['stake_currency'])) if x['Total profit'] != 0.0 else '--'.rjust(25+len(config['stake_currency'])), axis=1 ) trials = trials.drop(columns=['Total profit']) if print_colorized: for i in range(len(trials)): if trials.loc[i]['is_profit']: for j in range(len(trials.loc[i])-3): trials.iat[i, j] = "{}{}{}".format(Fore.GREEN, str(trials.loc[i][j]), Fore.RESET) if trials.loc[i]['is_best'] and highlight_best: for j in range(len(trials.loc[i])-3): trials.iat[i, j] = "{}{}{}".format(Style.BRIGHT, str(trials.loc[i][j]), Style.RESET_ALL) trials = trials.drop(columns=['is_initial_point', 'is_best', 'is_profit']) if remove_header > 0: table = tabulate.tabulate( trials.to_dict(orient='list'), tablefmt='orgtbl', headers='keys', stralign="right" ) table = table.split("\n", remove_header)[remove_header] elif remove_header < 0: table = tabulate.tabulate( trials.to_dict(orient='list'), tablefmt='psql', headers='keys', stralign="right" ) table = "\n".join(table.split("\n")[0:remove_header]) else: table = tabulate.tabulate( trials.to_dict(orient='list'), tablefmt='psql', headers='keys', stralign="right" ) return table
# Authenticate ourselves using tweepy so we can scrape the feed of CARES Bot
auth = tweepy.OAuthHandler(config.api_key, config.secret_key)
auth.set_access_token(config.access_token, config.access_token_secret)
api = tweepy.API(auth, wait_on_rate_limit=True)

cares_bot_tweets = []

# Iteratively scrape the tweets from CARES Bot
for tweet in tweepy.Cursor(api.user_timeline,
                           screen_name="ExtendCaresUI").items():
    cares_bot_tweets.append(tweet)

cares_bot = pd.Series()

# Since we only want the URL of the image, we'll normalize the JSON data and
# extract the media_url key, which has the image link as a URL
for tweet in cares_bot_tweets:
    try:
        cares_bot = cares_bot.append(pd.json_normalize(
            tweet._json, ["entities", "media"])["media_url"],
            ignore_index=True)
    # It looks like some tweets don't have a media_url (perhaps when Data For
    # Progress launched this bot, they had tweets with no image). Given this,
    # let's skip over these tweets
    except KeyError:
        continue

# Write out the image URLs to a CSV file so we have a static representation of
# the data that was pulled and so the scraper doesn't have to be rerun
cares_bot.to_csv(
    "C:/Users/Rober/DATS_6103/project_2/data/cares_bot_image_urls.csv")
def export_csv_file(config: dict, results: list, total_epochs: int, highlight_best: bool, csv_file: str) -> None: """ Log result to csv-file """ if not results: return # Verification for overwrite if Path(csv_file).is_file(): logger.error(f"CSV file already exists: {csv_file}") return try: io.open(csv_file, 'w+').close() except IOError: logger.error(f"Failed to create CSV file: {csv_file}") return trials = json_normalize(results, max_level=1) trials['Best'] = '' trials['Stake currency'] = config['stake_currency'] base_metrics = ['Best', 'current_epoch', 'results_metrics.trade_count', 'results_metrics.avg_profit', 'results_metrics.total_profit', 'Stake currency', 'results_metrics.profit', 'results_metrics.duration', 'loss', 'is_initial_point', 'is_best'] param_metrics = [("params_dict."+param) for param in results[0]['params_dict'].keys()] trials = trials[base_metrics + param_metrics] base_columns = ['Best', 'Epoch', 'Trades', 'Avg profit', 'Total profit', 'Stake currency', 'Profit', 'Avg duration', 'Objective', 'is_initial_point', 'is_best'] param_columns = list(results[0]['params_dict'].keys()) trials.columns = base_columns + param_columns trials['is_profit'] = False trials.loc[trials['is_initial_point'], 'Best'] = '*' trials.loc[trials['is_best'], 'Best'] = 'Best' trials.loc[trials['is_initial_point'] & trials['is_best'], 'Best'] = '* Best' trials.loc[trials['Total profit'] > 0, 'is_profit'] = True trials['Epoch'] = trials['Epoch'].astype(str) trials['Trades'] = trials['Trades'].astype(str) trials['Total profit'] = trials['Total profit'].apply( lambda x: '{:,.8f}'.format(x) if x != 0.0 else "" ) trials['Profit'] = trials['Profit'].apply( lambda x: '{:,.2f}'.format(x) if not isna(x) else "" ) trials['Avg profit'] = trials['Avg profit'].apply( lambda x: '{:,.2f}%'.format(x) if not isna(x) else "" ) trials['Avg duration'] = trials['Avg duration'].apply( lambda x: '{:,.1f} m'.format(x) if not isna(x) else "" ) trials['Objective'] = trials['Objective'].apply( lambda x: '{:,.5f}'.format(x) if x != 100000 else "" ) trials = trials.drop(columns=['is_initial_point', 'is_best', 'is_profit']) trials.to_csv(csv_file, index=False, header=True, mode='w', encoding='UTF-8') logger.info(f"CSV file created: {csv_file}")
def test_empty_array(self):
    result = json_normalize([])
    expected = DataFrame()
    tm.assert_frame_equal(result, expected)
    df[columnName] = df[columnName].apply(apply_transliteration)
    df[columnName] = df[columnName].apply(to_lowerCase)
    df[columnName] = df[columnName].apply(process_URLs)
    df[columnName] = df[columnName].apply(filter_alpha_numeric)
    df[columnName] = df[columnName].apply(remove_punctuations)
    df[columnName] = df[columnName].apply(remove_non_ascii)
    df[columnName] = df[columnName].apply(trim)
    df[columnName] = df[columnName].apply(strip_whiteSpaces)
    df = remove_empty(df, columnName)
    df = df.reset_index(drop=True)
    print("Processing Complete !!")
    return df

############################ END ###############################

######################## Main Function #########################
if __name__ == "__main__":
    # Read data from command line
    data = sys.argv[1]
    with open(data) as f:
        json_data = json.load(f)
    df = pd.json_normalize(json_data)
    df = df.reindex(columns=list(json_data[0].keys()))

    # Preprocess Dataset
    df = preprocess(df, 'inputText')
    df.to_json(data.split('.')[0] + "_preprocessed.json", orient='records')

###################### END ###############################
def getHistoricData(): start_date = timer() print(start_date) with open('venv/data/search_results_output.jsonl', 'r') as json_file: json_list = list(json_file) search_df = pd.DataFrame(columns=[ 'asin', 'title', 'url', 'rating', 'reviews', 'price_crape', 'search_url', 'crape_date' ]) df_iniciado = False for index, json_str in enumerate(json_list): print("iteracion " + str(index + 1) + " de " + str(len(json_list))) result = json.loads(json_str) print("New Element") asin = re.search("B0[\d\w]{8}", str(result['url'])) # print("asin -> " + asin.group()) # print("title -> " + str(result['title'])) # print("url -> " + str(result['url'])) # print("rating -> " + str(result['rating'])) # print("reviews -> " + str(result['reviews'])) # print("price -> " + str(result['price'])) # print("search_url -> " + str(result['search_url'])) #Eliminar los que contengan picasso en la url ya que son productos promocionados, que pueden no tener nada que ver if "picassoRedirect" not in str(result['url']): new_row = { 'asin': asin.group(), 'title': str(result['title']), 'url': str(result['url']), 'rating': str(result['rating']), 'reviews': str(result['reviews']), 'price_crape': str(result['price']), 'search_url': str(result['search_url']), 'crape_date': str(datetime.now().strftime("%m/%d/%Y")) } try: inter_date_start = timer() json_response = amazonPriceRequest(asin.group()) json_normalized = json_normalize( data=json_response.json(), record_path='price_history', meta=['asin', 'currency', 'price_type']) if df_iniciado == False: json_df = json_normalized df_iniciado = True else: json_df = json_df.append(json_normalized, ignore_index=True, sort=False) search_df = search_df.append(new_row, ignore_index=True) inter_date_end = timer() print(timedelta(seconds=inter_date_end - inter_date_start)) except: print("Response - KO") df_merge = pd.merge(json_df, search_df, on='asin') df_merge.to_csv(r'venv/data/OSFPD.csv', index=False, header=True) end_date = timer() print(timedelta(seconds=end_date - start_date)) pass
def _download_file_make(pl_simple): g = rdflib.ConjunctiveGraph() g.bind("skos", rdflib.namespace.SKOS) # make triples # Base rtype = rdflib.namespace.RDF.type # Type scon = rdflib.namespace.SKOS.Concept # Concept plabel = rdflib.namespace.SKOS.prefLabel # prefLabel alabel = rdflib.namespace.SKOS.altLabel # altLabel broader = rdflib.namespace.SKOS.broader # broader narrower = rdflib.namespace.SKOS.narrower # narrower # add List namel = [] # broader name_bt = [] # narrower name_nw = [] # JSON convert to pandas.DataFrame nm = pd.json_normalize(pl_simple) # JSON query Get Concept, prefLabel and narrower base namelpl = nm.query('term == preferred_label and uri != ""') # get uri and term namelx = namelpl.loc[:, ['term', 'uri']].values for name in namelx: # print('prefLabel:'+str(name[0])+' '+str(name[1])) nameb = [rdflib.URIRef(str(name[1])), rtype, scon] namel.append(nameb) nameb = [ rdflib.URIRef(str(name[1])), plabel, rdflib.Literal(str(name[0])) ] namel.append(nameb) # narrower _add_check_term(name_nw, name[0], name[0], name[1]) # query altLabel namelal = nm.query('term != preferred_label and uri != ""') # get uri and term namelx = namelal.loc[:, ['term', 'uri']].values for name in namelx: # print('altLabel:' + str(name[0])+' '+str(name[1])) nameb = [ rdflib.URIRef(str(name[1])), alabel, rdflib.Literal(str(name[0])) ] namel.append(nameb) # create broader links # query broader_term namelbt = nm.query('broader_term != "" and uri != ""') # get uri and broader_term namelx = namelbt.loc[:, ['broader_term', 'term', 'uri']].values for name in namelx: _add_check_term(name_bt, name[0], name[1], name[2]) for namebt in name_bt: # print('namebt:', str(namebt[0]), # str(namebt[1]), str(namebt[2])) # query prefLabel wkquery =\ 'term == preferred_label and term == "' + str(namebt[0]) + '"' # print(wkquery) namelpl = nm.query(wkquery) # get uri and term namelx = namelpl.loc[:, ['term', 'uri']].values for name in namelx: nameb = [ rdflib.URIRef(str(namebt[2])), broader, rdflib.URIRef(str(name[1])) ] namel.append(nameb) # print('add broader:'+str(name[0])+' '+str(name[1])) # print("--- printing narrower ---") # create narrower links for namenw in name_nw: # query prefLabel wkquery =\ 'term == preferred_label and uri != "" and broader_term == "' +\ str(namenw[0]) + '"' # print(wkquery) namelpl = nm.query(wkquery) # get uri and term namelx = namelpl.loc[:, ['term', 'uri']].values for name in namelx: nameb = [ rdflib.URIRef(str(namenw[2])), narrower, rdflib.URIRef(str(name[1])) ] namel.append(nameb) # print('add narrower:'+str(name[0])+' '+str(name[1])) # Add List to Graph for name in namel: g.add((name[0], name[1], name[2])) return g
async def filereplicas(self, human_readable=None, **params): """Serves the file replicas known to phedex. Parameters ---------- block block name, with '*' wildcards, can be multiple (*). required when no lfn is specified. Block names must follow the syntax /X/Y/Z#, i.e. have three /'s and a '#'. Anything else is rejected. dataset dataset name. Syntax: /X/Y/Z, all three /'s obligatory. Wildcads are allowed. node node name, can be multiple (*) se storage element name, can be multiple (*) update_since unix timestamp, only return replicas updated since this time create_since unix timestamp, only return replicas created since this time complete y or n. if y, return only file replicas from complete block replicas. if n only return file replicas from incomplete block replicas. default is to return either. dist_complete y or n. if y, return only file replicas from blocks where all file replicas are available at some node. if n, return only file replicas from blocks which have file replicas not available at any node. default is to return either. subscribed y or n, filter for subscription. default is to return either. custodial y or n. filter for custodial responsibility. default is to return either. group group name. default is to return replicas for any group. lfn logical file name """ if type(human_readable) is not bool and human_readable is not None: raise Exception("Wrong human_readable parameter type") resjson = await self.jsonmethod("filereplicas", **params) out = [] for _block in resjson["phedex"]["block"]: for _file in _block["file"]: for _replica in _file["replica"]: out.append( { "Block_name": _block["name"], "Files": _block["files"], "Block_size_(GB)": _block["bytes"] / 1000000000.0, "lfn": _file["name"], "Checksum": _file["checksum"], "File_created_on": _file["time_create"], "File_replica_at": _replica["node"], "File_subcribed": _replica["subscribed"], "Custodial": _replica["custodial"], "Group": _replica["group"], "File_in_node_since": _replica["time_create"], } ) df = pandas.json_normalize(out) format_dates(df, ["File_created_on", "File_in_node_since"]) if human_readable is True: mapping = { "Block_name": "Block Name", "Block_size_(GB)": "Block size (GB)", "File_created_on": "File Created On", "File_replica_at": "File Replica At", "File_subcribed": "File Subcribed", "File_in_node_since": "File In Node Since", } df2 = df.rename(columns=mapping) return df2 else: return df
def _download_file_ev_serialize(pl_simple, p_format): # format is csv or xlsx df_json = [] df_json = pd.json_normalize(pl_simple) # print("--- printing "+p_format+" ---") df_org = df_json.copy() # delete word "[","]" df_org['synonym_candidate'] =\ df_org['synonym_candidate'].astype("string") df_org['broader_term_candidate'] =\ df_org['broader_term_candidate'].astype("string") df_org['synonym_candidate'] =\ df_org['synonym_candidate'].str.replace('[', '') df_org['synonym_candidate'] =\ df_org['synonym_candidate'].str.replace(']', '') df_org['synonym_candidate'] =\ df_org['synonym_candidate'].str.replace('\'', '') df_org['broader_term_candidate'] =\ df_org['broader_term_candidate'].str.replace('[', '') df_org['broader_term_candidate'] =\ df_org['broader_term_candidate'].str.replace(']', '') df_org['broader_term_candidate'] =\ df_org['broader_term_candidate'].str.replace('\'', '') # delete columns id hidden df_org.drop(columns=['id', 'hidden'], inplace=True) # header change df_org = df_org.rename( columns={ 'term': '用語名', 'preferred_label': '代表語', 'uri': '代表語のURI', 'broader_term': '上位語', 'broader_term_candidate': '上位語候補', 'synonym_candidate': '同義語候補', 'part_of_speech': '品詞', 'position_x': 'x座標値', 'position_y': 'y座標値', 'color1': '色1', 'color2': '色2', 'confirm': '確定済み用語' }) if p_format == 'csv': with tempfile.TemporaryFile("w+") as f: # encoding='utf-8', index=False df_org.to_csv(f, encoding='utf-8', index=False) f.seek(0) response = make_response() response.data = f.read() response.headers['Content-Type'] = 'text/csv' response.headers['Content-Disposition'] =\ 'attachment; filename=test_sample.csv' return response elif p_format == 'xlsx': downloadFileName = 'temp_excel.xlsx' df_org.to_excel(downloadFileName, encoding='utf-8', index=False) response = make_response() response.data = open(downloadFileName, "rb").read() response.headers['Content-Disposition'] = 'attachment;' response.mimetype = XLSX_MIMETYPE os.remove(downloadFileName) return response
async def blocklatency(self, human_readable=None, **params): """Show authentication state and abilities Parameters ---------- ability authorization ability. If passed then the nodes (from TMDB) that the user is allowed to use "ability" for are returned. require_cert if passed then the call will die if the user is not authenticated by certificate require_passwd if passed then the call will die if the user is not authenticated by password """ resjson = await self.jsonmethod("blocklatency", **params) out = [] if human_readable is not None and type(human_readable) is not bool: print("Wrong human_readable parameter type") df = pandas.json_normalize(out) return df elif human_readable is None or human_readable is False: for _block in resjson["phedex"]["block"]: for _destination in _block["destination"]: for _latency in _destination["latency"]: out.append( { "Block": _block["name"], "Block_ID": _block["id"], "Dataset": _block["dataset"], "Size": _block["bytes"], "Time_create": _block["time_create"], "Number_of_files": _block["files"], "Time_update": _block["time_update"], "Destination": _destination["name"], "custodial": _latency["is_custodial"], "last_suspend": _latency["last_suspend"], "last_replica": _latency["last_replica"], "time_subscription": _latency["time_subscription"], "block_closed": _latency["block_close"], "latency": _latency["latency"], } ) df = pandas.json_normalize(out) return format_dates( df, [ "Time_update", "last_suspend", "last_replica", "time_subscription", "block_closed", "Time_create", ], ) elif human_readable is True: for _block in resjson["phedex"]["block"]: for _destination in _block["destination"]: for _latency in _destination["latency"]: out.append( { "Block": _block["name"], "Block ID": _block["id"], "Dataset": _block["dataset"], "Size": _block["bytes"], "Time Create": _block["time_create"], "Number of files": _block["files"], "Time Update": _block["time_update"], "Destination": _destination["name"], "custodial": _latency["is_custodial"], "Last Suspend": _latency["last_suspend"], "Last Replica": _latency["last_replica"], "Time Subscription": _latency["time_subscription"], "Block Closed": _latency["block_close"], "Latency": _latency["latency"], } ) df = pandas.json_normalize(out) return format_dates( df, [ "Time Update", "Last Suspend", "Last Replica", "Time Subscription", "Block Closed", "Time Create", ], )
def main(): global Verbose_Flag parser = optparse.OptionParser() parser.add_option('-v', '--verbose', dest="verbose", default=False, action="store_true", help="Print lots of output to stdout") parser.add_option("--config", dest="config_filename", help="read configuration from FILE", metavar="FILE") parser.add_option( '-C', '--containers', dest="containers", default=False, action="store_true", help="for the container enviroment in the virtual machine") options, remainder = parser.parse_args() Verbose_Flag = options.verbose if Verbose_Flag: print('ARGV :', sys.argv[1:]) print('VERBOSE :', options.verbose) print('REMAINING :', remainder) if options.config_filename: print("Configuration file : {}".format(options.config_filename)) initialize(options) my_courses = list_my_courses() print("len(my_courses) are {0}".format(len(my_courses))) # set up the output write writer = pd.ExcelWriter('users_in_my_courses.xlsx', engine='xlsxwriter') for course in my_courses: if course['name'].find('do not use') >= 0: print("course id={0} name={1} -- skipping".format( course['id'], course['name'])) continue # if not (course['id'] in [16039, 17234]): # for testing only look at these courses # continue if (course['id'] in [ 85, # Canvas at KTH 4996, # Canvas at KTH 2.0 - New structure 5733, # Grunder, resultathantering och attestering för kursledare och examinatorer. (sv/en) 8356, # GDPR@KTH 17839, # Miljöutbildning 18339 # Vårt uppdrag ]): # skip the courses over all KTH faculty and staff continue print("course id={0} name={1}".format(course['id'], course['name'])) users = users_in_course(course['id']) if Verbose_Flag: print("users are: {0}".format(users)) if (users): users_df = pd.json_normalize(users) # below are examples of some columns that might be dropped columns_to_drop = [ 'associated_user_id', 'course_integration_id', 'created_at', 'end_at', 'enrollment_state', 'grades.current_grade', 'grades.current_score', 'grades.final_grade', 'grades.final_score', 'grades.html_url', 'grades.unposted_current_grade', 'grades.unposted_current_score', 'grades.unposted_final_grade', 'grades.unposted_final_score', 'html_url', 'id', 'last_activity_at', 'last_attended_at', 'limit_privileges_to_course_section', 'role', 'role_id', 'root_account_id', 'section_integration_id', 'sis_account_id', 'sis_section_id', 'start_at', 'total_activity_time', 'type', 'updated_at', 'user.created_at', 'user.id', 'user.integration_id' ] # keep the following: # 'sis_course_id', # 'sis_user_id', # 'user.login_id', # 'user.name', # 'user.short_name' # 'user.sis_user_id' # 'user.sortable_name,' # 'user_id' users_df.drop(columns_to_drop, inplace=True, axis=1) # the following was inspired by the section "Using XlsxWriter with Pandas" on http://xlsxwriter.readthedocs.io/working_with_pandas.html course_sheet_name = "{0}".format(course['name']) if (len(course_sheet_name) > 30): course_sheet_name = course_sheet_name[0:29] course_sheet_name = course_sheet_name.replace(':', '-') users_df.to_excel(writer, sheet_name=course_sheet_name) # Close the Pandas Excel writer and output the Excel file. writer.save()
def create_movie_info(num_rows: int): ##Creation of movie_info csv all_movies = pd.read_json("mflix_movies.json").rename(columns={ "_id": "movie_id" }).iloc[0:num_rows, :] #Released is dropped due to incorrect values award_df = pd.json_normalize(all_movies['awards']).rename( columns={ "wins": "award_wins", "nominations": "award_nominations", "text": "award_text" }) imdb_df = pd.json_normalize(all_movies['imdb']).rename(columns={ "rating": "imdb_rating", "votes": "imdb_votes", "id": "imdb_id" }) tomatoes_df = all_movies["tomatoes"] ## fill na values with empty dicts tomatoes_df = tomatoes_df.fillna({i: {} for i in tomatoes_df.index}) tomatoes_df_flat = pd.json_normalize(tomatoes_df) tomatoes_df_final = tomatoes_df_flat.rename( columns={ "viewer.rating": "tomato_viewer_rating", "viewer.numReviews": "tomato_viewer_num_reviews", "viewer.meter": "tomato_viewer_meter", "lastUpdated.$date": "tomato_lastupdated", "fresh": "tomato_fresh", "rotten": "tomato_rotten", "critic.rating": "tomato_critic_rating", "critic.numReviews": "tomato_critic_num_reviews", "critic.meter": "tomato_critic_meter", "dvd.$date": "tomato_dvd_date", "website": "tomato_website", "production": "tomato_production", "consensus": "tomato_consensus" }).drop("dvd.$date.$numberLong", axis=1) #Released is dropped due to incorrect values movies_info = all_movies.drop([ "genres", "directors", "countries", "cast", "writers", "languages", "released", "awards", "imdb", "tomatoes" ], axis=1) movies_info = pd.merge(movies_info, award_df, right_index=True, left_index=True) movies_info = pd.merge(movies_info, imdb_df, right_index=True, left_index=True) movies_info = pd.merge(movies_info, tomatoes_df_final, right_index=True, left_index=True) #change id to correct key value from dict movies_info["movie_id"] = movies_info["movie_id"].apply( lambda x: x["$oid"]) return movies_info, all_movies
    anio_fin = lista_fechas_final[i].year
    mes_fin = str(lista_fechas_final[i].month).zfill(2)
    dia_fin = str(lista_fechas_final[i].day).zfill(2)
    print(anio_ini)
    print(mes_ini)
    print(dia_ini)

    address = "https://ws01.cenace.gob.mx:8082/SWPSC/SIM/" + sistema + "/" + proceso + "/" + str(
        anio_ini) + "/" + str(mes_ini) + "/" + str(
            dia_ini) + "/" + str(anio_fin) + "/" + str(
                mes_fin) + "/" + str(dia_fin) + "/json"
    print(address)

    info = requests.get(address).json()
    proceso = info["proceso"]
    sistema = info["sistema"]
    info = info["Resultados"]

    for i in range(len(info)):
        temporal = pd.json_normalize(info[i], 'Valores')
        temporal['clv_zona_reserva'] = "ZONA " + str(i + 1)
        print(temporal.head(10))
        temporal['proceso'] = proceso
        temporal['sistema'] = sistema
        appended_data_list.append(temporal)

appended_data = pd.concat(appended_data_list)
csv_file = 'cenace_' + str(
    datetime.now().strftime('%Y_%m_%d_%H_%M_%S')) + '.csv'
appended_data.to_csv(csv_file, index=False)