Example #1
    def extract_insult_tweets(self):

        try:
            insults_df = read_csv(self.loc.format('data/insults.csv'))
        except FileNotFoundError:
            self.logger.critical('Insults file not found. extract_insults.py must be run first')
            raise

        # get the schema of a Tweet
        tweets_df = json_normalize(self.api.get_status(insults_df.loc[0, 'tweet_id'])._json)
        tweets_df.drop(0, inplace=True)  # delete the data

        # build a DF of tweet data for each tweet in insults data
        # iterate in chunks so we can use twitter's GET statuses endpoint for bulk search

        n_insults = len(insults_df)
        chunksize = 100

        cursor = 0
        while cursor < n_insults:

            cursor_end_pos = min([cursor + chunksize - 1, n_insults - 1])
            self.logger.debug('Loading tweets {0}-{1}'.format(cursor, cursor_end_pos))

            tweet_ids = insults_df.loc[cursor: cursor_end_pos, 'tweet_id'].tolist()
            res = self.api.statuses_lookup(tweet_ids)
            for item in res:
                tweet = json_normalize(item._json)
                tweets_df = tweets_df.append(tweet)

            cursor += chunksize

        self.insult_tweets_df = tweets_df
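A minimal standalone sketch of the same chunked bulk-lookup pattern, assuming a tweepy-style api object whose statuses_lookup method wraps Twitter's bulk GET statuses/lookup endpoint (newer tweepy versions name it lookup_statuses); pd.concat replaces the deprecated DataFrame.append:

import pandas as pd
from pandas.io.json import json_normalize

def lookup_in_chunks(api, tweet_ids, chunksize=100):
    # Flatten each returned status and concatenate once at the end.
    frames = []
    for start in range(0, len(tweet_ids), chunksize):
        chunk = tweet_ids[start:start + chunksize]
        for status in api.statuses_lookup(chunk):
            frames.append(json_normalize(status._json))
    return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()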
Example #2
def decode_report(rpt_path):
    #read report from json into a dict
    with open(rpt_path, 'r') as f:
        read_rpt = json.loads(f.read())

    #parse the geojson
    def df_clean(uncleandf):
        cleaned_cols = [x.split('.')[-1] for x in uncleandf.columns]
        uncleandf.columns = cleaned_cols
        clean_df = uncleandf.rename(columns={'coordinates':'coords'}).drop(['type'], axis=1)
        clean_df = clean_df.set_index(['Name'])
        return clean_df

    #parse conduit data into a dataframe
    conds_df = json_normalize(read_rpt['conduits']['features'])
    conds_df = df_clean(conds_df)

    #parse node data into a dataframe
    nodes_df = json_normalize(read_rpt['nodes']['features'])
    nodes_df = df_clean(nodes_df)

    #parse parcel data into a dataframe
    pars_df = json_normalize(read_rpt['parcels']['features'])
    pars_df = df_clean(pars_df)

    rpt_dict = {'conduits': conds_df, 'nodes': nodes_df, 'parcels': pars_df}
    return rpt_dict
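A short usage sketch, assuming the report JSON holds 'conduits', 'nodes' and 'parcels' keys, each with a GeoJSON-style 'features' list whose properties include 'Name' (the path below is hypothetical):

rpt = decode_report('data/model_report.json')   # hypothetical path
conduits_df = rpt['conduits']                   # DataFrame indexed by Name
print(conduits_df.columns.tolist())
print(rpt['nodes'].head())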
Example #3
def main():

    logger = get_root_logger()
    get_header(logger, 'LOADING PROJECTIONS')

    client = APIClient()

    # grab dataframe shape from a trial run
    data = client.get_data('weekly-projections', 'json', 'QB')
    test_df = json_normalize(data['Projections'])

    # get DF structure from columns in test_df
    cols = test_df.columns
    df = DataFrame(columns=cols)

    # grab current week
    current_week = test_df.week.values[0]

    # loop through all weeks up to current week
    for wk in [str(x) for x in range(int(current_week))]:
        logger.info('Processing projections for week {0}'.format(int(wk) + 1))
        # loop through all positions
        for pos in ['QB', 'RB', 'WR', 'TE', 'K', 'DEF']:
            tmp_data = client.get_data('weekly-projections', 'json', pos, wk)
            tmp_df = json_normalize(tmp_data['Projections'])
            df = df.append(tmp_df)

    # import this df directly to PG DB
    conn = DBClient()
    conn.load(df, 'projections', schema='raw', if_exists='replace')
Example #4
def find_top_major_project_themes(n):

    str_json = json.load((open('data/world_bank_projects.json')))
    normalized_df = json_normalize(str_json, 'mjtheme_namecode')

    # remove duplicates
    deduped_df = normalized_df.drop_duplicates()

    # create a dictionary of theme code and name
    project_theme_dict = {}
    for index, row in deduped_df.iterrows():
        theme_name = row['name']
        if(len(theme_name) > 0):
            project_theme_dict[row['code']] = theme_name

    # fill in missing theme names in the raw records using the lookup dict
    for project in str_json:
        for theme in project['mjtheme_namecode']:
            if len(theme['name']) == 0:
                theme['name'] = project_theme_dict[theme['code']]

    # re-normalize the filled-in records and print the top n theme codes
    normalized_df = json_normalize(str_json, 'mjtheme_namecode')
    print('\n\n' + str(normalized_df.code.value_counts()[:n]))
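For reference, a self-contained sketch of what json_normalize(..., 'mjtheme_namecode') produces for one record of this shape (the sample values are invented):

from pandas.io.json import json_normalize

sample = [
    {'id': 'P0001',
     'mjtheme_namecode': [{'code': '8', 'name': 'Human development'},
                          {'code': '11', 'name': ''}]},
]
# One row per element of each 'mjtheme_namecode' list, with columns 'code' and 'name'.
print(json_normalize(sample, 'mjtheme_namecode'))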
Example #5
    def test_json_normalize_errors(self):
        # GH14583: If meta keys are not always present
        # a new option to set errors='ignore' has been implemented
        i = {
            "Trades": [{
                "general": {
                    "tradeid": 100,
                    "trade_version": 1,
                    "stocks": [{

                        "symbol": "AAPL",
                        "name": "Apple",
                        "price": "0"
                    }, {
                        "symbol": "GOOG",
                        "name": "Google",
                        "price": "0"
                    }
                    ]
                }
            }, {
                "general": {
                    "tradeid": 100,
                    "stocks": [{
                        "symbol": "AAPL",
                        "name": "Apple",
                        "price": "0"
                    }, {
                        "symbol": "GOOG",
                        "name": "Google",
                        "price": "0"
                    }
                    ]
                }
            }
            ]
        }
        j = json_normalize(data=i['Trades'],
                           record_path=[['general', 'stocks']],
                           meta=[['general', 'tradeid'],
                                 ['general', 'trade_version']],
                           errors='ignore')
        expected = {'general.trade_version': {0: 1.0, 1: 1.0, 2: '', 3: ''},
                    'general.tradeid': {0: 100, 1: 100, 2: 100, 3: 100},
                    'name': {0: 'Apple', 1: 'Google', 2: 'Apple', 3: 'Google'},
                    'price': {0: '0', 1: '0', 2: '0', 3: '0'},
                    'symbol': {0: 'AAPL', 1: 'GOOG', 2: 'AAPL', 3: 'GOOG'}}

        assert j.fillna('').to_dict() == expected

        msg = ("Try running with errors='ignore' as key 'trade_version'"
               " is not always present")
        with pytest.raises(KeyError, match=msg):
            json_normalize(
                data=i['Trades'],
                record_path=[['general', 'stocks']],
                meta=[['general', 'tradeid'],
                      ['general', 'trade_version']],
                errors='raise')
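A more compact sketch of the same behaviour on toy records (values invented): with errors='ignore' the missing meta key simply becomes NaN instead of raising a KeyError.

from pandas.io.json import json_normalize

records = [
    {'general': {'tradeid': 1, 'trade_version': 7,
                 'stocks': [{'symbol': 'AAPL'}]}},
    {'general': {'tradeid': 2,
                 'stocks': [{'symbol': 'GOOG'}]}},
]
flat = json_normalize(records,
                      record_path=['general', 'stocks'],
                      meta=[['general', 'tradeid'], ['general', 'trade_version']],
                      errors='ignore')
print(flat)   # 'general.trade_version' is NaN for the second record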
    def search(self, query, max_tweets=200, remove_rts=True, hard_remove=True):

        # Search API only allows 100 tweets per page
        max_pages = int(max_tweets) // 100
        if max_pages < 1:
            max_pages = 1

        if max_tweets < 100:
            count = int(max_tweets)
        else:
            count = 100

        # Prepare query
        if remove_rts:
            query += ' -filter:retweets'
        if hard_remove:
            query += ' -RT'  # eliminates anything with RT, which may not always be a retweet

        # encoded_query = urllib.quote_plus(query)

        page = 0
        url = 'https://api.twitter.com/1.1/search/tweets.json'
        for i in range(max_pages):
            if page == 0:
                params = {'q': query, 'result_type': 'recent', 'count': count, 'lang': 'en'}
            else:
                max_id = data[-1]['id'] - 1
                params = {'q': query, 'result_type': 'recent', 'count': count, 'lang': 'en', 'max_id': max_id}

            r = requests.get(url, auth=self.auth, params=params)
            data = simplejson.loads(r.text)['statuses']

            if len(data) == 0:
                if self.verbose: print('No more results found')
                break

            if page == 0:
                df = json_normalize(data)
            else:
                df = df.append(json_normalize(data))

            page += 1

        # Check that all columns are there, if not add empty ones
        for col in self.columns:
            if col not in df.columns:
                df[col] = pd.Series([np.nan] * len(df), index=df.index)

        if len(self.tweets) == 0:
            self.tweets = df[self.columns]
        else:
            self.tweets = self.merge(df[self.columns])

        # Filter by location
        if self.track_location:
            if self.verbose: print('Filtering by location')
            self.get_geo()

        return
    def _stats(self, data):
        if len(data['intervals']) > 0:
            output = json_normalize(data, 'intervals',
                                    ['system_id', 'total_devices']).set_index(['system_id', 'end_at'])
        else:
            output = json_normalize(data).set_index('system_id')
        return output

    def _monthly_production(self, data):
        if len(data['meter_readings']) > 0:
            output = json_normalize(data, 'meter_readings',
                                    ['start_date', 'system_id', 'end_date', 'production_wh'])
        else:
            output = json_normalize(data,
                                    meta=['start_date', 'system_id', 'end_date', 'production_wh'])
        return output.set_index(['system_id', 'start_date', 'end_date'])
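Both helpers expect Enlighten-style payloads keyed the way the code above indexes them; a sketch of the assumed shapes (all values, and the inner interval/reading fields, are invented):

stats_payload = {
    'system_id': 123,
    'total_devices': 10,
    'intervals': [{'end_at': 1577880000, 'powr': 500, 'enwh': 125}],
}
production_payload = {
    'system_id': 123,
    'start_date': '2020-01-01',
    'end_date': '2020-01-31',
    'production_wh': 40000,
    'meter_readings': [{'serial_number': 'SN1', 'value': 40000, 'read_at': 1580428800}],
}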
    def timeline(self, max_tweets=200, exclude_replies='true', include_rts='false'):
        """
            Load Twitter timeline for the specified user

            :param screen_name: Twitter screen name to process
            :param max_tweets: maximum number of tweets to get. (Default: 3200, the API maximum)
            :param exclude_replies: exclude replies? (Default: true)
            :param include_rts: include retweets? (Default: false)
            :return: pandas DataFrame of tweets
            """

        # API only allows up to 200 tweets per page

        max_pages = int(min(max_tweets, 3200) // 200)
        if max_pages < 1:
            max_pages = 1

        # Need to be strings not booleans
        if isinstance(exclude_replies, bool):
            exclude_replies = str(exclude_replies).lower()
        if isinstance(include_rts, bool):
            include_rts = str(include_rts).lower()

        if max_tweets < 200:
            count = int(max_tweets)
        else:
            count = 200

        page = 0
        url = 'https://api.twitter.com/1.1/statuses/user_timeline.json'
        for i in range(max_pages):
            if page == 0:
                params = {'screen_name': self.screen_name, 'count': count, 'lang': 'en',
                          'exclude_replies': exclude_replies, 'include_rts': include_rts}
            else:
                max_id = data[-1]['id'] - 1
                params = {'screen_name': self.screen_name, 'count': count, 'lang': 'en',
                          'exclude_replies': exclude_replies, 'include_rts': include_rts,
                          'max_id': max_id}

            r = requests.get(url, auth=self.auth, params=params)
            data = simplejson.loads(r.text)

            if page == 0:
                df = json_normalize(data)
            else:
                df = df.append(json_normalize(data), ignore_index=True)

            page += 1

        if len(self.tweets) == 0:
            self.tweets = df[self.columns]
        else:
            self.tweets = self.merge(df[self.columns])
Example #10
def get_dataset_details_for_classification_collection(context,classification,collection_name,api_key):
    collections = get_collections_for_classification(context,classification,api_key)
    collections = collections[collections.name == collection_name]
    urls = pandas.concat([json_normalize(utl.get(base_url + '/' + a)['json']) for a in collections['url']])
    #http://stackoverflow.com/questions/952914/making-a-flat-list-out-of-list-of-lists-in-python
    urls = [url['href'] for urllist in urls['ons.collectionDetail.urls.url'].values for url in urllist if url['@representation']=='json']
    details = pandas.concat([json_normalize(utl.get(base_url + '/' + url)['json']) for url in urls])
    # get the english csvs
    details['file_url'] = [doc['href']['$'] for doclist in details['ons.datasetDetail.documents.document'] for doc in doclist if (doc['@type'] == 'CSV') and (doc['href']['@xml.lang'] == 'en')]
    details['geography'] = [name['$'] for namelist in details['ons.datasetDetail.geographicalHierarchies.geographicalHierarchy.names.name'] for name in namelist if name['@xml.lang'] == 'en']
    details['collection_name'] = collection_name
    return details[['collection_name','geography','file_url']]
Example #11
    def test_json_normalize_errors(self, missing_metadata):
        # GH14583:
        # If meta keys are not always present a new option to set
        # errors='ignore' has been implemented

        msg = ("Try running with errors='ignore' as key 'name'"
               " is not always present")
        with pytest.raises(KeyError, match=msg):
            json_normalize(
                data=missing_metadata,
                record_path='addresses',
                meta='name',
                errors='raise')
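For context, a sketch of what the missing_metadata fixture looks like, reconstructed to be consistent with the expected values in Example #28 below ('name' is absent from the second record):

missing_metadata = [
    {'name': 'Alice',
     'addresses': [{'number': 9562, 'street': 'Morris St.',
                    'city': 'Massillon', 'state': 'OH', 'zip': 44646}]},
    {'addresses': [{'number': 8449, 'street': 'Spring St.',
                    'city': 'Elizabethton', 'state': 'TN', 'zip': 37643}]},
]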
Example #12
def df_from_json(data, **kwargs):
    """Attempt to produce row oriented data from hierarchical json/dict-like data."""

    if isinstance(data, str):
        with open(data) as data_file:
            data = json.load(data_file)

    if isinstance(data, list):
        return json_normalize(data, **kwargs)
    elif isinstance(data, dict):
        # normalize the first list value found in the dict
        for k, v in data.items():
            if isinstance(v, list):
                return json_normalize(v)
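A brief usage sketch on toy data (invented for illustration):

rows = df_from_json([{'a': 1, 'b': {'c': 2}}, {'a': 3, 'b': {'c': 4}}])
print(rows)      # columns: a, b.c

nested = df_from_json({'meta': 'x', 'items': [{'a': 1}, {'a': 2}]})
print(nested)    # normalizes the first list value found in the dict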
Example #13
    def test_meta_name_conflict(self):
        data = [{'foo': 'hello',
                 'bar': 'there',
                 'data': [{'foo': 'something', 'bar': 'else'},
                          {'foo': 'something2', 'bar': 'else2'}]}]

        with pytest.raises(ValueError):
            json_normalize(data, 'data', meta=['foo', 'bar'])

        result = json_normalize(data, 'data', meta=['foo', 'bar'],
                                meta_prefix='meta')

        for val in ['metafoo', 'metabar', 'foo', 'bar']:
            assert val in result
def flatten_dictionary(returned_dictionary,review,call,df,commentDb,line): 
    if call == "keywords":
        first_level=json_normalize(returned_dictionary[review],call,['rating','sail_date',"ship","line"])
        second_level=json_normalize(returned_dictionary[review][call])
        together=pd.merge(first_level, second_level, on='text', how='outer')
        df=pd.concat([df, together])
    else:
        if review in commentDb["Msc"]:
            rating=commentDb["Msc"][review]["rating"]
            for element in returned_dictionary[review][call]:
                second_level=json_normalize(element)
                second_level['review']=review
                second_level['rating']=rating                
                df=pd.concat([df,second_level])
    return df 
Example #15
def j2c_spc(json_file_path):
    print(json_file_path)
    json_file = open(json_file_path)
    '''
        json_data = re.sub(r'}{','}\,{',json_file.read())
        data = '[' + json_data + ']'
        json_data = json.loads(data)
        print json_normalize(json_data)
        '''
    df = pd.DataFrame()
    for line in json_file:
        jsonobj_list = line.split('}{')
      
        for jsonobj in jsonobj_list:
            if re.search(r'^\{',jsonobj):
                jsonobj = jsonobj + '}'
                jsonrsp = json.loads(jsonobj)['response']
                if len(jsonrsp) != 0:
                    df = df.append(json_normalize(jsonrsp))
        
            elif re.search(r'\}$',jsonobj):
                jsonobj = '{' + jsonobj
                #data.append(json.loads(jsonobj)['response'])
                jsonrsp = json.loads(jsonobj)['response']
                if len(jsonrsp) != 0:
                    
                    df = df.append(json_normalize(jsonrsp))

            else:
                jsonobj = '{' + jsonobj +'}'
                #data.append(json.loads(jsonobj)['response'])
                jsonrsp = json.loads(jsonobj)['response']
                if len(jsonrsp) != 0:
                    
                    df = df.append(json_normalize(jsonrsp))

    df.to_csv('./hos_rawdata/hos422/doclist.csv',
              encoding='utf8',
              mode='a+',
              index=False,
              columns=[
                  'hospitalName',
                  'departmentName',
                  'doctorName',
                  'title',
                  'sex',
                  'specialty'
              ])
Example #16
def propose_ad_thread(ids, runid):


    rs = []

    headers = [5, 15, 35]
    adtypes = ['skyscraper', 'square', 'banner']
    colors = ['green', 'blue', 'red', 'black', 'white']
    productids = range(10, 25)

    # TODO create dataframe instead of proposing it on the fly
    
    for i in ids:
        rs.append(proposepage(i=i,
                              runid=runid,
                              header=random.choice(headers),
                              adtype=random.choice(adtypes),
                              color=random.choice(colors),
                              productid=random.choice(productids),
                              price=float(str(np.around(np.random.uniform(50), 2)))).json())
        time.sleep(1)

        # TODO SAVE PRICE
        
    df = json_normalize(rs)
    df.columns = ['Error', 'Success']
    df.to_csv("rewards" + str(runid) + "_" + str(ids[0]) + ".csv", index=False)
def import_data():
    r = requests.get('http://www.citibikenyc.com/stations/json')
    df = json_normalize(r.json()['stationBeanList'])
    #take the string and parse it into a Python datetime object
    exec_time = parse(r.json()['executionTime'])
    exec_time = exec_time.strftime('%x-%X')
    return r, df, exec_time
Example #18
    def test_shallow_nested(self):
        data = [{'state': 'Florida',
                 'shortname': 'FL',
                 'info': {
                     'governor': 'Rick Scott'
                 },
                 'counties': [{'name': 'Dade', 'population': 12345},
                              {'name': 'Broward', 'population': 40000},
                              {'name': 'Palm Beach', 'population': 60000}]},
                {'state': 'Ohio',
                 'shortname': 'OH',
                 'info': {
                     'governor': 'John Kasich'
                 },
                 'counties': [{'name': 'Summit', 'population': 1234},
                              {'name': 'Cuyahoga', 'population': 1337}]}]

        result = json_normalize(data, 'counties',
                                ['state', 'shortname',
                                 ['info', 'governor']])
        ex_data = {'name': ['Dade', 'Broward', 'Palm Beach', 'Summit',
                            'Cuyahoga'],
                   'state': ['Florida'] * 3 + ['Ohio'] * 2,
                   'shortname': ['FL', 'FL', 'FL', 'OH', 'OH'],
                   'info.governor': ['Rick Scott'] * 3 + ['John Kasich'] * 2,
                   'population': [12345, 40000, 60000, 1234, 1337]}
        expected = DataFrame(ex_data, columns=result.columns)
        tm.assert_frame_equal(result, expected)
Example #19
def gettweets(folder,dir_category):
    global features_list
    global features_df
    global row_count

    filename = "%s/%s/tweets.dump" % (dir_category,folder)
    jfile = codecs.open(filename,'rb','utf-8')
    
    break_counter =0 
    
    for jdoc in jfile:
        print("User: "******" - Tweets : ",row_count)

        ##break counter is tweet count
        if break_counter==100:
            break

        jvar = json.loads(jdoc)
        
        flat = flatten_json(jvar)
        norm=json_normalize(flat)
        
        features_df = pd.concat([features_df, norm], ignore_index=True)
        break_counter+=1

        if "label" not in list(features_df):
            features_df['label'] = str("NA")
        
        features_df.loc[row_count,'label']=label
        features_df.fillna("NA",inplace=True)
        
        row_count+=1
def split_link_queries(df):
    """
    df: pandas.DataFrame (insights_creatives_links merged)
        saves dataframe to be returned as csv
    returns: pandas.DataFrame (df with split link queries updated)
    """
    import json

    # transform dataframe into JSON
    json_string = df.to_json(orient="records")  # dataframe -> string
    json_list = json.loads(json_string)  # string -> list

    # update split link queries
    for record in json_list:
        if record["link"]:
            url = record["link"]
            query = url[url.find("?") + 1 :]
            queries = query.split("&")
            # update each record with split link queries
            pairs = {}  # pairs of query field & value
            for q in queries:
                pair = q.split("=")
                pairs[pair[0]] = pair[1]
            record.update(pairs)

    return json_normalize(json_list)
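A small usage sketch (the column names and URL are invented; it assumes json_normalize is imported at module scope, as the function expects):

import pandas as pd

ads = pd.DataFrame({
    'ad_id': [1, 2],
    'link': ['https://example.com/?utm_source=fb&utm_campaign=spring', None],
})
expanded = split_link_queries(ads)
print(expanded.columns.tolist())   # includes utm_source and utm_campaign from the first link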
Example #21
def myvariant_post(hgvs_list):
    '''
    Query and parser for myvariant.info.
    Parses the raw Elasticsearch results into a pandas DataFrame.

    Parameters
    -------------
    hgvs_list: list, required

    Output
    -------------
    pandas df:
        normalized json of myvariant results
    '''

    if isinstance(hgvs_list, list):
        hgvs_list = ','.join(hgvs_list)

    assert isinstance(hgvs_list, str)

    con = mv.getvariants(hgvs_list, fields='dbnsfp.cadd.phred,dbnsfp.genename')

    mv_df = json_normalize(con)
    mv_df.index = mv_df['_id']

    return mv_df
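A usage sketch, assuming mv is a myvariant.MyVariantInfo client as the function expects (requires network access to myvariant.info; the HGVS IDs are just illustrative):

import myvariant
mv = myvariant.MyVariantInfo()

result_df = myvariant_post(['chr7:g.140453134T>C', 'chr1:g.69635G>C'])
print(result_df.columns.tolist())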
def instagram_scraper(query, n):

    url = '{0}/tags/{1}/media/recent?client_id={2}&count=30'.format(
        base_url, query, CLIENT_ID)
    urls = list()
    results = list()

    urls.append(str(url))

    for _ in range(n):
        x = get(url)
        urls.append(str(x))
        url = get(x)

    for url in urls:

        r = requests.get(url)
        j = r.json()

        if 'data' in j:
            try:
                data = j['data']
                df_instance = json_normalize(data)
                results.append(df_instance)
            except Exception as e:
                return 'Error: Could not find data.', str(e)
Example #23
def get_historical_prices(symbols, start_date=None, end_date=None):
    """
    Pulls historical prices for a given stock symbols and a given interval. Returns only about 500+ lines
    :param symbol: stock symbols list
    :param start_date: in 'yyyy-mm-dd'
    :param end_date: in 'yyyy-mm-dd'
    :return: modifies query result from json to pandas dataframe
    """

    if type(symbols) is str:
        symbol_string = symbols
    else:
        symbol_string = "','".join(symbols)

    end_date = dateutil.parser.parse(end_date) if end_date else date.today()
    start_date = dateutil.parser.parse(start_date) if start_date else end_date - dateutil.relativedelta.relativedelta(years = 1)

    print(str(start_date), type(start_date))
    print(str(end_date), type(end_date))

    yql_query = "SELECT * FROM {0} WHERE symbol IN ('{1}') AND startDate ='{2}' AND endDate ='{3}'".format(FINANCE_TABLES['history'], symbol_string, str(start_date), str(end_date))

    print(yql_query)
    query_result = execute_yql_query(yql_query)
    df = json_normalize(query_result['query']['results']['quote'])
    df.columns = [col_name.lower() for col_name in df.columns]
    return df
    def format_prices(self, prices, flag_calc_spread=True):
        """Format prices data as a DataFrame with hierarchical columns"""

        def cols(typ):
            return({
                'openPrice.%s' % typ: 'Open',
                'highPrice.%s' % typ: 'High',
                'lowPrice.%s' % typ: 'Low',
                'closePrice.%s' % typ: 'Close',
                'lastTradedVolume': 'Volume'                
            })

        df = json_normalize(prices)
        df = df.set_index('snapshotTime')
        df.index.name = 'DateTime'

        df_ask = df[['openPrice.ask', 'highPrice.ask', 'lowPrice.ask', 'closePrice.ask']]
        df_ask = df_ask.rename(columns=cols('ask'))

        df_bid = df[['openPrice.bid', 'highPrice.bid', 'lowPrice.bid', 'closePrice.bid']]
        df_bid = df_bid.rename(columns=cols('bid'))

        if flag_calc_spread:
            df_spread = df_ask - df_bid

        df_last = df[['openPrice.lastTraded', 'highPrice.lastTraded', 'lowPrice.lastTraded', 'closePrice.lastTraded', 'lastTradedVolume']]
        df_last = df_last.rename(columns=cols('lastTraded'))

        if not flag_calc_spread:
            df2 = pd.concat([df_bid, df_ask, df_last], axis=1, keys=['bid', 'ask', 'last'])
        else:
            df2 = pd.concat([df_bid, df_ask, df_spread, df_last], axis=1, keys=['bid', 'ask', 'spread', 'last'])
        return(df2)
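The method expects IG-style price snapshots that flatten into 'openPrice.bid', 'openPrice.ask', 'openPrice.lastTraded' and so on; a sketch of one such record and a call through a hypothetical session object ig (values invented):

prices = [{
    'snapshotTime': '2020/01/02 10:00:00',
    'openPrice':  {'bid': 99.5,  'ask': 100.5, 'lastTraded': 100.0},
    'highPrice':  {'bid': 101.0, 'ask': 102.0, 'lastTraded': 101.5},
    'lowPrice':   {'bid': 98.0,  'ask': 99.0,  'lastTraded': 98.5},
    'closePrice': {'bid': 100.0, 'ask': 101.0, 'lastTraded': 100.5},
    'lastTradedVolume': 1234,
}]
candles = ig.format_prices(prices)   # ig is a hypothetical instance of the class above
print(candles['spread'])             # Open/High/Low/Close ask-bid spreads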
Example #25
File: vote.py Project: dwipam/code
def handle_twitter():
	#Return data from Twitter
	ACCESS_TOKEN = '759067855123996673-SMh5suAmoGjFjLe9uGnT8kDjBAdygkJ'
	ACCESS_SECRET = 'mXd44Jg5QOkhKmO310ex4Zwabe6wEeApZnC2YEuKdHZVz'
	CONSUMER_KEY = 'pUIwbWWj9nqjQNRU4mioXHnCJ'
	CONSUMER_SECRET = 'ukObCLCVITbL1biri3jheZHsoVeq5iLVplKcsUa1EeczKB8d2G'
	#Read train data with sentiment positive and negative	
	oauth = OAuth(ACCESS_TOKEN, ACCESS_SECRET, CONSUMER_KEY, CONSUMER_SECRET)
	twitter_stream = TwitterStream(auth=oauth)
	twitter = Twitter(auth=oauth)
	x = []
	for i in range(0,1):
		#Read tweets with Hillay Clinton statuses
		iterator = twitter_stream.statuses.filter(track="@HillaryClinton,", language = "en")
		x.append(iterator)
	
	#iterator = twitter.search.tweets(q='HillaryClinton', lang='en', count=10000)
	collectobj = []
	#Create a Data Frame from Json object with coloumns as id, User_name, Tweet_test, Location, probability
	for iterator in x:
		for tweet in iterator:
			if 'user' in tweet.keys() and tweet['user']['location']:
				#Get probability of a tweet being positive, negative or neutral
				topic = getprobtop(tweet['text'])
				obj = {'id' : tweet['user']['id'], 'User_name' : tweet['user']['screen_name'], \
			'Text': tweet['text'],'location' : tweet['user']['location'], \
			'status' : topic}
				print("Returning from prob")
				collectobj.append(obj)	
	
	table = json_normalize(collectobj)
	return(table)
def get_nosource_files_info(block):
    '''
    Search replicas for a given block and return the files that have no replica at all ("[]" in the replica field).
    The returned value is a dictionary holding a pandas DataFrame with the metainfo of the block,
    the count of no-source files and the total number of files for the block.
    Only files created more than one week ago are reported.
    '''
    url = 'https://cmsweb.cern.ch/phedex/datasvc/json/prod/filereplicas'
    params = {"block": block}
    replicas_info = requests.get(url=url, params=params, verify=False).content
    replicas_json = json.loads(replicas_info)
    replicas_table = json_normalize(
        replicas_json['phedex']['block'][0]['file'])

    # Discards row entries of files with a creation date of one week or less
    replicas_table = replicas_table[replicas_table['time_create'].apply(
        check_datetime_Xweeks_older, nweeks=1) == True]

    num_files_in_block = len(replicas_table)
    no_source_files_table = replicas_table.loc[
        replicas_table.astype(str)['replica'] == "[]"]
    num_nosource_files_in_block = len(no_source_files_table)

    return {'df': no_source_files_table,
            'num_files_in_block': num_files_in_block,
            'num_nosource_files_in_block': num_nosource_files_in_block}
def mongo_to_dataframe(mongo_data):

    sanitized = json.loads(json_util.dumps(mongo_data))
    normalized = json_normalize(sanitized)
    df = pd.DataFrame(normalized)

    return df
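A usage sketch with pymongo (connection string, database and collection names are hypothetical; it assumes the surrounding module already imports json, bson.json_util, pandas and json_normalize as the function expects):

from pymongo import MongoClient

client = MongoClient('mongodb://localhost:27017')
cursor = client['mydb']['mycollection'].find()
df = mongo_to_dataframe(list(cursor))
print(df.shape)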
Example #28
    def test_missing_meta(self, missing_metadata):
        # GH25468
        # If metadata is nullable with errors set to ignore, the null values
        # should be numpy.nan values
        result = json_normalize(
            data=missing_metadata,
            record_path='addresses',
            meta='name',
            errors='ignore')
        ex_data = [
            ['Massillon', 9562, 'OH', 'Morris St.', 44646, 'Alice'],
            ['Elizabethton', 8449, 'TN', 'Spring St.', 37643, np.nan]
        ]
        columns = ['city', 'number', 'state', 'street', 'zip', 'name']
        expected = DataFrame(ex_data, columns=columns)
        tm.assert_frame_equal(result, expected)
def get_urls(url, n):

    # return the next_url from a page of results
    def get(url):
        return str(requests.get(url).json()['pagination']['next_url'])

    # list to hold urls, seeded with the initial url
    urls = list()
    urls.append(str(url))

    # follow the pagination chain n times
    for _ in range(n):
        x = get(url)
        urls.append(str(x))  # add next_url
        url = get(x)  # replace the current url with next_url for the next turn in the loop

    # list to hold data
    results = list()

    # populate results with data from each url
    for url in urls:
        results.append(json_normalize(requests.get(url).json()['data']))

    # build the final dataframe
    df = pd.DataFrame().append(results).reset_index().drop('index', axis=1)
    return df
Example #30
    def test_meta_name_conflict(self):
        data = [{'foo': 'hello',
                 'bar': 'there',
                 'data': [{'foo': 'something', 'bar': 'else'},
                          {'foo': 'something2', 'bar': 'else2'}]}]

        msg = (r"Conflicting metadata name (foo|bar),"
               " need distinguishing prefix")
        with pytest.raises(ValueError, match=msg):
            json_normalize(data, 'data', meta=['foo', 'bar'])

        result = json_normalize(data, 'data', meta=['foo', 'bar'],
                                meta_prefix='meta')

        for val in ['metafoo', 'metabar', 'foo', 'bar']:
            assert val in result
Example #31
match_id_required = 22912
home_team_required = "Tottenham Hotspur"
away_team_required = "Liverpool"

# Load in the data and match events
file_name = str(match_id_required) + '.json'

import json
with open('Statsbomb/data/events/' + file_name) as data_file:
    #print (mypath+'events/'+file)
    data = json.load(data_file)

#Get the nested structure into a dataframe
from pandas.io.json import json_normalize

df = json_normalize(data, sep="_").assign(match_id=file_name[:-5])

#A dataframe of shots
shots = df.loc[df['type_name'] == 'Shot'].set_index('id')

#Draw the pitch
from FCPython import createPitch
# Pitch dimensions are not defined in this snippet; a standard 120x80-yard pitch is assumed here
pitchLengthX = 120
pitchWidthY = 80
(fig, ax) = createPitch(pitchLengthX, pitchWidthY, 'yards', 'gray')

#Plot the shots
for i, shot in shots.iterrows():
    x = shot['location'][0]
    y = shot['location'][1]

    goal = shot['shot_outcome_name'] == 'Goal'
    team_name = shot['team_name']
#trade from-to
import json
import inquirer
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize
from binance.client import Client
from datetime import datetime

client = Client('API_KEY', 'API_SECRET')

trades = client.get_all_orders(symbol='BTCUSDT')
trades = json_normalize(trades)
trades['data'] = pd.to_datetime(trades['time'], unit='ms')
trades['updateTime'] = pd.to_datetime(trades['updateTime'], unit='ms')
trades = trades[['time','data','price','side','status']]
trades = trades[trades.status != 'CANCELED']
trades['price'] = trades['price'].astype(float)
trades['side'] = trades['side'].astype(str)
trades = trades.reset_index(drop=True)

column = pd.Series(index = range(0,len(trades)))

for i in range(len(trades)):
    if trades['side'][i] == 'SELL' and i > 0:
        column[i] = round((trades['price'][i] - trades['price'][i-1])*100/trades['price'][i-1], 2)
    else:
        column[i] = 0.0

trades['perc'] = column
Example #33
url = "https://covid-19-coronavirus-statistics.p.rapidapi.com/v1/stats"

querystring = {"country": "US"}

headers = {
    'x-rapidapi-host': "covid-19-coronavirus-statistics.p.rapidapi.com",
    'x-rapidapi-key': "2d81c24244mshffe53b231648b51p1450f7jsn88b6e0713f7e"
}

response = requests.get(url, headers=headers, params=querystring)

json_res = response.json()

df1 = json_normalize(json_res, ['data', 'covid19Stats'])

st.title('COVID19 Data (via API)')

st.subheader('Total Deaths:')
st.write(sum(df1.deaths))

st.subheader('Total Confirmed')
st.write(sum(df1.confirmed))

if st.checkbox('Show Data'):
    st.write(df1, height=1000, length=1000)

if st.checkbox('deaths'):
    c = alt.Chart(df1, width=1000,
                  height=1000).mark_bar(clip=True).encode(x='province',
Example #34
def TradeFlow(date_input):
    log_pnl_filename = "LogPnlRisk_" + date_input.strftime("%Y%m%d") + ".json"
    bt_pnl_filename = "ProfitRiskSeq_" + date_input.strftime(
        "%Y%m%d") + ".json"
    os.chdir("C:/Users/Yitong/AppData/Local/auto-option-mm/trades")
    if not path.exists(log_pnl_filename):
        return
    with open(log_pnl_filename) as json_file:
        cur_json = json.load(json_file)
        log_DF = json_normalize(cur_json)

    log_DF['timestamp'] = pd.to_datetime(log_DF['Pnl.Timestamp'])
    log_DF = log_DF.set_index('timestamp')

    if not path.exists(bt_pnl_filename):
        return
    with open(bt_pnl_filename) as json_file:
        cur_json = json.load(json_file)
        bt_DF = json_normalize(cur_json)

    bt_DF['timestamp'] = pd.to_datetime(bt_DF['Pnl.Timestamp'])
    bt_DF = bt_DF.set_index('timestamp')

    log_DF['Vega_norm'] = log_DF['Risk.Vega'] / log_DF['Profit.Volume'][-1]
    bt_DF['Vega_norm'] = bt_DF['Risk.Vega'] / bt_DF['Profit.Volume'][-1]

    log_DF['CD_norm'] = log_DF['Risk.CashDelta'] / log_DF['Profit.Volume'][-1]
    bt_DF['CD_norm'] = bt_DF['Risk.CashDelta'] / bt_DF['Profit.Volume'][-1]

    plt.close()
    os.chdir("C:/Users/Yitong/AppData/Local/auto-option-mm/trades")
    fig, ax1 = plt.subplots()
    color = 'tab:red'

    ax1.set_xlabel('time')
    ax1.set_ylabel('Vega/VolumeTot', color=color)
    ax1.plot(log_DF['Vega_norm'], color='deepskyblue', label='Log')
    ax1.plot(bt_DF['Vega_norm'], color='crimson', label='BackTest')

    ax1.set_title("VegaFlow " + str(date_input))
    ax1.legend(loc='lower right')
    ax1.tick_params(axis='y', labelcolor=color)

    ax2 = ax1.twinx()  # instantiate a second axes that shares the same x-axis
    color = 'tab:blue'
    ax2.set_ylabel('Vol',
                   color='black')  # we already handled the x-label with ax1
    ax2.plot(log_DF['Risk.AtmVol'], color='black', label='FitVol')
    # ax2.plot(TR_DF['Risk.AtmVol'], color='gold', label = 'TRVol')
    ax2.legend(loc='upper right')
    ax2.tick_params(axis='y', labelcolor='black')

    fig.tight_layout()  # otherwise the right y-label is slightly clipped
    plt.show()
    plt.savefig("VegaFlow_" + str(date_input).replace("-", "") + ".png")

    plt.close()
    os.chdir("C:/Users/Yitong/AppData/Local/auto-option-mm/trades")
    fig, ax1 = plt.subplots()
    color = 'tab:red'

    ax1.set_xlabel('time')
    ax1.set_ylabel('CashDelta/VolumeTot', color=color)
    ax1.plot(log_DF['CD_norm'], color='deepskyblue', label='Log')
    ax1.plot(bt_DF['CD_norm'], color='crimson', label='BackTest')

    ax1.set_title("CashDeltaFlow " + str(date_input))
    ax1.legend(loc='lower right')
    ax1.tick_params(axis='y', labelcolor=color)

    ax2 = ax1.twinx()  # instantiate a second axes that shares the same x-axis
    color = 'tab:blue'
    ax2.set_ylabel('Vol',
                   color='black')  # we already handled the x-label with ax1
    ax2.plot(log_DF['Risk.Spot'], color='black', label='SpotPrice')
    # ax2.plot(TR_DF['Risk.AtmVol'], color='gold', label = 'TRVol')
    ax2.legend(loc='upper right')
    ax2.tick_params(axis='y', labelcolor='black')

    fig.tight_layout()  # otherwise the right y-label is slightly clipped
    plt.show()
    plt.savefig("CashDeltaFlow_" + str(date_input).replace("-", "") + ".png")
    return
Example #35
        #pass
        #print(json.loads(response.text))
        response = session.post('https://direct.pinpoll.com/v2/vote',
                                headers=headers,
                                data=data)
        r = json.loads(response.text)
        r["timestamp"] = datetime.now()
        results.append(r)
        print(r["result"][-1]["votes"])
    except Exception as e:
        print(e)
        #print(response.text)
    t.sleep(2)  #Don't set this to zero, we dont want to DDOS the server.

data = json_normalize(results)
votes = data["result"].apply(
    lambda x: pd.DataFrame(x).transpose().rename(columns={
        0: "384232",
        1: "384233",
        2: "384234",
        3: "384235"
    }).drop("id"))
votes = pd.concat(votes.to_list())
data = pd.concat([data, votes.reset_index(drop=True)], axis=1)
data.set_index("timestamp")[['384232', '384233', '384234', '384235']].plot()
data.to_csv("data.csv")
data[data["timestamp"] < "2020-05-23"][[
    '384232', '384233', '384234', '384235'
]].diff().sum()
data[data["timestamp"] >= "2020-05-23"][[
Example #36
def load_json_bq(batch):
    #print(datetime.utcnow())
    #print(batch)
    json_data = []
    for f in batch.split(
            '|'):  #this is what would happen for each file in a batch!!!!
        success = 0
        try_count = 1
        while success == 0 and try_count <= 5:
            try:
                blob = BUCKET.get_blob(f)
                json_data.extend(json.loads(blob.download_as_string()))
                success = 1
            except Exception as e:
                if try_count == 5:  #on last try iteration exit function with empty list
                    db_logger.error('Batch Failed: {}'.format(batch))
                    f_logger.error('Batch Failed: {}'.format(batch),
                                   exc_info=True,
                                   stack_info=True)
                    return
                time.sleep(1 + try_count)
                try_count += 1
    #get batch timestamp:
    batch_dt = datetime.now()

    #Load to BQ:
    #customers table:
    table_name = 'customers'
    table_columns = [
        'accepts_marketing', 'admin_graphql_api_id', 'created_at', 'currency',
        'email', 'first_name', 'id', 'last_name', 'last_order_id',
        'last_order_name', 'multipass_identifier', 'note', 'orders_count',
        'phone', 'state', 'tags', 'tax_exempt', 'total_spent', 'updated_at',
        'verified_email'
    ]
    df = json_normalize(json_data)
    if not df.empty:
        #format column names:
        df.columns = [x.strip().replace('.', '_') for x in df.columns]
        #only selected columns:
        df_schema = pd.DataFrame(
            df, columns=table_columns)  #df.loc[:,table_columns]
        df_schema['ods_inserted_at'] = batch_dt
        del df
        #convert datatypes:
        df_schema['accepts_marketing'] = df_schema['accepts_marketing'].astype(
            'O')
        df_schema['admin_graphql_api_id'] = df_schema[
            'admin_graphql_api_id'].astype('O')
        df_schema['currency'] = df_schema['currency'].astype('O')
        df_schema['email'] = df_schema['email'].astype('O')
        df_schema['first_name'] = df_schema['first_name'].astype('O')
        df_schema['last_name'] = df_schema['last_name'].astype('O')
        df_schema['last_order_name'] = df_schema['last_order_name'].astype('O')
        df_schema['multipass_identifier'] = df_schema[
            'multipass_identifier'].astype('O')
        df_schema['note'] = df_schema['note'].astype('O')
        df_schema['phone'] = df_schema['phone'].astype('O')
        df_schema['state'] = df_schema['state'].astype('O')
        df_schema['tags'] = df_schema['tags'].astype('O')
        df_schema['tax_exempt'] = df_schema['tax_exempt'].astype('O')
        df_schema['verified_email'] = df_schema['verified_email'].astype('O')
        df_schema['id'] = df_schema['id'].astype('int')
        df_schema['created_at'] = pd.to_datetime(df_schema['created_at'])
        df_schema['updated_at'] = pd.to_datetime(df_schema['updated_at'])
        df_schema['orders_count'] = df_schema['orders_count'].fillna(0).astype(
            'int')
        df_schema['last_order_id'] = df_schema['last_order_id'].fillna(
            0).astype('int')
        df_schema['total_spent'] = df_schema['total_spent'].fillna(0).astype(
            'float64')

        success = 0
        try_count = 1
        while success == 0 and try_count <= 5:
            try:
                #upload data to table:
                df_schema.to_gbq('{}.{}'.format(DATASET_ID, table_name),
                                 PROJECT_ID,
                                 chunksize=None,
                                 if_exists='append',
                                 private_key=SERVICE_ACCOUNT_KEY_FILE)
                success = 1
            except Exception as e:
                if try_count == 5:  #on last try iteration exit function with empty list
                    db_logger.error('Table {} load failed: batch - {}'.format(
                        table_name, batch))
                    f_logger.error('Table {} load failed: batch - {}'.format(
                        table_name, batch),
                                   exc_info=True,
                                   stack_info=True)
                    return
                time.sleep(1 + try_count)
                try_count += 1

    #customer_address table:
    table_name = 'customer_address'
    table_columns = [
        'id', 'customer_id', 'customer_updated_at', 'first_name', 'last_name',
        'address1', 'address2', 'city', 'company', 'country', 'country_code',
        'country_name', 'province', 'province_code', 'zip', 'phone', 'name',
        'default'
    ]
    df = json_normalize(json_data,
                        'addresses', ['updated_at'],
                        meta_prefix='customer_')

    if not df.empty:
        #format column names:
        df.columns = [x.strip().replace('.', '_') for x in df.columns]
        #only selected columns:
        df_schema = pd.DataFrame(
            df, columns=table_columns)  #df.loc[:,table_columns]
        df_schema['ods_inserted_at'] = batch_dt
        del df
        #convert column datatypes:
        df_schema['first_name'] = df_schema['first_name'].astype('O')
        df_schema['last_name'] = df_schema['last_name'].astype('O')
        df_schema['address1'] = df_schema['address1'].astype('O')
        df_schema['address2'] = df_schema['address2'].astype('O')
        df_schema['city'] = df_schema['city'].astype('O')
        df_schema['company'] = df_schema['company'].astype('O')
        df_schema['country'] = df_schema['country'].astype('O')
        df_schema['country_code'] = df_schema['country_code'].astype('O')
        df_schema['country_name'] = df_schema['country_name'].astype('O')
        df_schema['province'] = df_schema['province'].astype('O')
        df_schema['province_code'] = df_schema['province_code'].astype('O')
        df_schema['zip'] = df_schema['zip'].astype('O')
        df_schema['phone'] = df_schema['phone'].astype('O')
        df_schema['name'] = df_schema['name'].astype('O')
        df_schema['id'] = df_schema['id'].astype('int')
        df_schema['customer_id'] = df_schema['customer_id'].astype('int')
        df_schema['customer_updated_at'] = pd.to_datetime(
            df_schema['customer_updated_at'])
        df_schema['default'] = df_schema['default'].astype('bool')

        success = 0
        try_count = 1
        while success == 0 and try_count <= 5:
            try:
                #upload data to table:
                df_schema.to_gbq('{}.{}'.format(DATASET_ID, table_name),
                                 PROJECT_ID,
                                 chunksize=None,
                                 if_exists='append',
                                 private_key=SERVICE_ACCOUNT_KEY_FILE)
                success = 1
            except Exception as e:
                if try_count == 5:  #on last try iteration exit function with empty list
                    db_logger.error('Table {} load failed: batch - {}'.format(
                        table_name, batch))
                    f_logger.error('Table {} load failed: batch - {}'.format(
                        table_name, batch),
                                   exc_info=True,
                                   stack_info=True)
                    return
                time.sleep(1 + try_count)
                try_count += 1
    #print('{} loaded'.format(table_name))
    #rename processed files:
    for f in batch.split(
            '|'):  #this is what would happen for each file in a batch!!!!
        new_name = '{}/{}'.format(PROCCESSED_PREFIX, f.split('/')[-1])
        success = 0
        try_count = 1
        while success == 0 and try_count <= 5:
            try:
                blob = BUCKET.get_blob(f)
                BUCKET.rename_blob(blob, new_name, client=CLIENT)
                success = 1
            except Exception as e:
                if try_count == 5:  #on last try iteration exit function with empty list
                    db_logger.error(
                        'Blob {} rename failed: batch - {}, '.format(f, batch))
                    f_logger.error(
                        'Blob {} rename failed: batch - {}, '.format(f, batch),
                        exc_info=True,
                        stack_info=True)
                    return
                time.sleep(1 + try_count)
                try_count += 1
def json_to_df(filename):
    df_enriched = json_normalize(pd.Series(open(filename).readlines()).apply(json.loads))
    return df_enriched
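A usage sketch: the function expects a newline-delimited JSON file, one object per line (the file written below exists only for illustration):

import json

with open('events.jsonl', 'w') as fh:   # hypothetical file
    fh.write(json.dumps({'user': {'id': 1}, 'value': 10}) + '\n')
    fh.write(json.dumps({'user': {'id': 2}, 'value': 20}) + '\n')

df = json_to_df('events.jsonl')
print(df.columns.tolist())   # expected to flatten to ['user.id', 'value']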
def update_database():
    print('Fetching and updating 1')
    apiResponse = get('https://api.covid19india.org/raw_data.json')
    # print("Processes", apiResponse.status_code)

    #print(apiResponse)
    if (apiResponse.status_code == 200):
        raw_data = apiResponse.json()
        raw_data = raw_data['raw_data']
        # JSON to dataframe
        data = json_normalize(raw_data)
        data = data.rename(
            columns={
                "patientnumber": "ID",
                "statepatientnumber": "Government id",
                "dateannounced": "Diagnosed date",
                "agebracket": "Age",
                "gender": "Gender",
                "detectedcity": "Detected city",
                "detecteddistrict": "Detected district",
                "detectedstate": "Detected state",
                "nationality": "Nationality",
                "currentstatus": "Current status",
                "statuschangedate": "Status change date",
                "_d180g": "Notes",
                "backupnotes": "Backup notes",
                "contractedfromwhichpatientsuspected":
                "Contracted from which Patient (Suspected)",
                "estimatedonsetdate": "Estimated on set date",
                "source1": "Source 1",
                "source2": "Source 2",
                "source3": "Source 3"
            })

        # changing nationality Indian to India
        for ind in data.index:
            if (data['Nationality'][ind] == "Indian"):
                data.loc[ind, 'Nationality'] = "India"

        # converting the string values to datetime object
        data['Diagnosed date'] = pd.to_datetime(data['Diagnosed date'],
                                                dayfirst=True)
        data['Status change date'] = pd.to_datetime(data['Status change date'],
                                                    dayfirst=True)

        # replacing all the missing values with unknown
        data.replace(to_replace="", value="unknown", inplace=True)
        # creating new columns depicting the current status of patient
        data['recovered'] = 0
        data['active'] = 0
        data['death'] = 0
        data['unknown'] = 0
        data['confirmed'] = 1

        for status in data.index:
            if (data['Current status'][status] == "Hospitalized"):
                data.loc[status, 'active'] = 1
            elif (data['Current status'][status] == "Recovered"):
                data.loc[status, 'recovered'] = 1
            elif (data['Current status'][status] == "Deceased"):
                data.loc[status, 'death'] = 1
            else:
                data.loc[status, 'unknown'] = 1

        data.to_csv(file_loc + './data/data.csv',
                    index=False,
                    date_format="%Y-%m-%d %H:%M:%S")
        #print( 'raw data complete' )

    else:
        print("Connection error")
Example #39
def create_issues_df(owner, repo, api):
    issues_list = issues_of_repo_github(owner, repo, api)
    return json_normalize(issues_list)
from urllib.request import urlopen
import json
from pandas.io.json import json_normalize
import pandas as pd, numpy as np
from bs4 import BeautifulSoup as bs

with open("epamglobal.txt", 'r') as f:
    links = f.readlines()

result = pd.DataFrame()
for i in range(len(links)):
    try:
        page = urlopen(links[i]).read()
        data = bs(page, 'html.parser')
        body = data.find('body')
        script = body.find('script')
        raw = script.text.strip().replace('window._sharedData =', '').replace(';', '')
        json_data = json.loads(raw)
        posts = json_data['entry_data']['PostPage'][0]['graphql']
        posts = json.dumps(posts)
        posts = json.loads(posts)
        x = pd.DataFrame.from_dict(json_normalize(posts), orient='columns')
        x.columns = x.columns.str.replace("shortcode_media.", "")
        result = result.append(x)

    except Exception:
        pass  # skip links that fail to download or parse

result = result.drop_duplicates(subset='shortcode')
result.index = range(len(result.index))
Example #41
    def test_more_deeply_nested(self):
        data = [{
            'country':
            'USA',
            'states': [{
                'name':
                'California',
                'cities': [{
                    'name': 'San Francisco',
                    'pop': 12345
                }, {
                    'name': 'Los Angeles',
                    'pop': 12346
                }]
            }, {
                'name':
                'Ohio',
                'cities': [{
                    'name': 'Columbus',
                    'pop': 1234
                }, {
                    'name': 'Cleveland',
                    'pop': 1236
                }]
            }]
        }, {
            'country':
            'Germany',
            'states': [{
                'name': 'Bayern',
                'cities': [{
                    'name': 'Munich',
                    'pop': 12347
                }]
            }, {
                'name':
                'Nordrhein-Westfalen',
                'cities': [{
                    'name': 'Duesseldorf',
                    'pop': 1238
                }, {
                    'name': 'Koeln',
                    'pop': 1239
                }]
            }]
        }]

        result = json_normalize(data, ['states', 'cities'],
                                meta=['country', ['states', 'name']])
        # meta_prefix={'states': 'state_'})

        ex_data = {
            'country': ['USA'] * 4 + ['Germany'] * 3,
            'states.name': [
                'California', 'California', 'Ohio', 'Ohio', 'Bayern',
                'Nordrhein-Westfalen', 'Nordrhein-Westfalen'
            ],
            'name': [
                'San Francisco', 'Los Angeles', 'Columbus', 'Cleveland',
                'Munich', 'Duesseldorf', 'Koeln'
            ],
            'pop': [12345, 12346, 1234, 1236, 12347, 1238, 1239]
        }

        expected = DataFrame(ex_data, columns=result.columns)
        tm.assert_frame_equal(result, expected)
Example #42
# In[12]:

len(df3.id.unique())
#df3.id.unique()[1:3]

# In[17]:

detalhe_deputados = []
for id in df3.id.unique():  #df3.id.unique()[1:3]:
    print(id)
    #Detalhe do deputado:
    request_detalhe = requests.get(
        'https://dadosabertos.camara.leg.br/api/v2/deputados/{id!s}'.format(
            id=id))
    json_detalhe = json.loads(request_detalhe.text)
    df_detalhado = json_normalize(json_detalhe['dados'])
    detalhe_deputados.append(df_detalhado)
#Solicitação de reembolso do deputado:
#vai ser aqui....

# In[20]:

dicionario_deputados_detalhe = concat(detalhe_deputados,
                                      ignore_index=False,
                                      sort=True)
dicionario_deputados_detalhe

# In[21]:

print(engine.table_names())
Example #43
    def __call__(self, query_date, grafana_dict, idb_dict, tag):
        GRAFANA_HOST = grafana_dict['GRAFANA_HOST']
        GRAFANA_REQUEST_ANNO_QUERY = '/api/annotations'
        GRAFANA_USERNAME = grafana_dict['GRAFANA_USERNAME']
        GRAFANA_PASSWORD = grafana_dict['GRAFANA_PASSWORD']
        GRAFANA_FROM = query_date['DATE_FROM'] + '000'
        GRAFANA_TO = query_date['DATE_TO'] + '000'
        GRAFANA_TAG1 = grafana_dict['GRAFANA_TAG1']
        GRAFANA_TAG2 = grafana_dict['GRAFANA_TAG2']
        #GRAFANA_PANEL_ID = '16'
        #GRAFANA_DASHBOARD_ID = '15'

        IDB_HOST = idb_dict['IDB_HOST']
        IDB_PORT = idb_dict['IDB_PORT']
        IDB_DBNAME = idb_dict['IDB_DBNAME']
        IDB_CHANNEL = tag
        IDB_USER = idb_dict['IDB_USER']
        IDB_PASSWORD = idb_dict['IDB_PASSWORD']
        KEYWORD = ''

        def read_influxdb_data(host='192.168.123.245',
                               port=8086,
                               dbname='c9377a95-82f3-4af3-ac14-40d14f6d2abe',
                               ChannelName='1Y520210100',
                               time_start='',
                               time_end='',
                               user='******',
                               password='******',
                               keyword=''):

            client = DataFrameClient(host, port, user, password, dbname)
            measurements = client.get_list_measurements()

            if keyword is None: keyword = ''

            if keyword == '':
                measurement = [
                    mea.get(u'name') for mea in measurements
                    if mea.get(u'name').find(ChannelName) >= 0
                ]
            else:
                measurement = [
                    mea.get(u'name') for mea in measurements
                    if mea.get(u'name').find(ChannelName) >= 0
                    and mea.get(u'name').find(keyword) >= 0
                ]

            if len(measurement) == 0:
                print('No data retrieved.')
                return None

            measurement = measurement[-1]
            time_end = 'now()' if time_end == '' else "'" + time_end + "'"
            time_start = 'now()' if time_start == '' else "'" + time_start + "'"
            querystr = 'select * from "{}" where time > {} and time < {}'.format(
                measurement, time_start, time_end)
            #print(querystr)

            df = client.query(querystr).get(measurement)
            client.close()

            if df is None:
                print('InfluxDB no data retrieved.')
                return None

            dff = df.groupby('id')
            columns = [name for name, group in dff]
            groups = [group['val'] for name, group in dff]

            #check datatime alginment: all([all(groups[i].index==groups[0].index) for i in range(1,len(groups))])
            result = pd.concat(groups, axis=1)
            result.columns = columns
            result.index = groups[0].index

            return measurement, result

        def encode_base64(username, password):
            str_user = username + ':' + password
            str_user_byte = str_user.encode('utf8')  # string to byte
            str_user_encode64 = base64.b64encode(
                str_user_byte)  # encode by base64
            str_user_string = str_user_encode64.decode(
                'utf8')  # byte to string
            str_auth = 'Basic ' + str(str_user_string)
            return str_auth

        ## Request Annotation list from Grafana ##
        headers = {
            "Accept": "application/json",
            "Content-Type": "application/json",
            "Authorization": encode_base64(GRAFANA_USERNAME, GRAFANA_PASSWORD)
        }

        url = GRAFANA_HOST + GRAFANA_REQUEST_ANNO_QUERY + '?' +\
                                                          '&tags=' + GRAFANA_TAG1 +\
                                                          '&tags=' + GRAFANA_TAG2 +\
                                                          '&from=' + GRAFANA_FROM +\
                                                          '&to=' + GRAFANA_TO

        print(url)
        print(headers)

        req = requests.get(url, headers=headers)
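        # Sketch: the same request could let requests build the query string itself, e.g.
        #   params = {'tags': [GRAFANA_TAG1, GRAFANA_TAG2], 'from': GRAFANA_FROM, 'to': GRAFANA_TO}
        #   req = requests.get(GRAFANA_HOST + GRAFANA_REQUEST_ANNO_QUERY, headers=headers, params=params)
        # which repeats the 'tags' key automatically instead of manual concatenation.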
        req_data_json = req.json()
        req_data_pd = json_normalize(req_data_json)

        #print (req_data_pd)

        # An empty response here means Grafana returned no annotation data for this query
        #if 'timestamp' not in req_data_pd.index:
        #    return 'no data retrieved from Grafana'

        #GMT+8
        annotation = req_data_pd[['regionId', 'tags', 'time', 'email']]
        annotation = annotation.sort_values(by=['regionId', 'time'])
        annotation['time'] = pd.to_datetime(annotation['time'], unit='ms')
        annotation.rename(index=str,
                          columns={'time': 'timestamp'},
                          inplace=True)

        # Keep only regionIds that appear more than once (each region needs both a start and an end annotation)
        anno_dup_list = annotation.set_index('regionId').index.get_duplicates()
        annotation = annotation.loc[annotation['regionId'].isin(anno_dup_list)]

        ## Request SCADA eigenvalue data from InfluxDB ##
        #scada_idb = pd.read_csv('test.csv')
        #scada_idb['Unnamed: 0'] = scada_idb['Unnamed: 0'].astype(str).str[:-6]
        #scada_idb.rename(index=str, columns={'Unnamed: 0': 'timestamp'}, inplace=True)
        #scada_idb = scada_idb.sort_values(by=['timestamp'])

        IDB_TIME_START = datetime.datetime.fromtimestamp(
            int(query_date['DATE_FROM'])).strftime('%Y-%m-%d %H:%M:%S')
        IDB_TIME_END = datetime.datetime.fromtimestamp(
            int(query_date['DATE_TO'])).strftime('%Y-%m-%d %H:%M:%S')

        # read_influxdb_data returns None when nothing is found, so guard before unpacking
        idb_result = read_influxdb_data(host=IDB_HOST,
                                        port=IDB_PORT,
                                        dbname=IDB_DBNAME,
                                        ChannelName=IDB_CHANNEL,
                                        time_start=IDB_TIME_START,
                                        time_end=IDB_TIME_END,
                                        user=IDB_USER,
                                        password=IDB_PASSWORD)
        if idb_result is None:
            return 'no data retrieved from SCADA'
        measurement, scada_idb = idb_result

        scada_idb['timestamp'] = scada_idb.index
        scada_idb['timestamp'] = scada_idb['timestamp'].astype(str).str[:-6]

        # An empty result means SCADA returned no data from InfluxDB
        if len(scada_idb['timestamp']) == 0:
            return 'no data retrieved from SCADA'

        ## Align SCADA and Grafana Dataframe ##
        label_df = pd.DataFrame()
        for regionID in annotation.regionId.unique():

            tags_list = annotation[annotation['regionId'] ==
                                   regionID]['tags'].iloc[0]
            mail = annotation[annotation['regionId'] ==
                              regionID]['email'].iloc[0]

            label_start_time = str(annotation[annotation['regionId'] ==
                                              regionID]['timestamp'].iloc[0])
            label_end_time = str(annotation[annotation['regionId'] == regionID]
                                 ['timestamp'].iloc[1])
            label_start_time = datetime.datetime.strptime(
                label_start_time, '%Y-%m-%d %H:%M:%S')
            label_end_time = datetime.datetime.strptime(
                label_end_time, '%Y-%m-%d %H:%M:%S')

            # Loop over scada_idb timestamps; rows that fall between the region's start and end get that region's tags and email set to 1
            for i in range(len(scada_idb['timestamp'])):
                datetime_object = datetime.datetime.strptime(
                    scada_idb['timestamp'][i], '%Y-%m-%d %H:%M:%S')
                #datetime_object = datetime_object + datetime.timedelta(hours=8) + datetime.timedelta(days=7)
                #label_df.at[i, tags]= 0
                label_df.at[i, 'timestamp'] = scada_idb['timestamp'][i]

                #if (datetime_object > label_start_time) and (datetime_object < label_end_time):
                #    for num, tags in enumerate(tags_list):
                #        #print (tags)
                #        label_df.at[i, tags] = 1
                #        label_df.at[i, mail] = 1
                #else:
                #    for num, tags in enumerate(tags_list):
                #        #print (tags)
                #        label_df.at[i, tags] = -1
                #        label_df.at[i, mail] = -1

                if (datetime_object > label_start_time) and (datetime_object <
                                                             label_end_time):
                    for num, tags in enumerate(tags_list):
                        #print (tags)
                        label_df.at[i, tags] = 1
                        label_df.at[i, mail] = 1

            #print (label_start_time, label_end_time, datetime_object)

        #label_df.drop('x', axis=1, inplace=True)
        output_df = pd.merge(scada_idb, label_df.fillna(-1), on=['timestamp'])
        output_df.drop([GRAFANA_TAG2], axis=1, inplace=True)

        return output_df
#Database and Container names - should be consistent across runs
database_name = '<database-name>'
container_name = '<container-name>'

#Create the Cosmos client; Connect to database and container of interest
client = cosmos_client.CosmosClient(url, key)
database = client.get_database_client(database_name)
container = database.get_container_client(container_name)

# Execute the query to pull all JSON items and all fields into a single DataFrame:
output = pd.DataFrame()
for item in container.query_items(
        query='SELECT * FROM c where startswith(c.id, "' + WF_ID +
        '") AND c.state = "COMPLETE"',
        enable_cross_partition_query=True):
    flat = json_normalize(fj.flatten_json(item))
    output = output.append(flat, ignore_index=True)
    # print(output)

# Create new DataFrame with only fields of interest
of_interest = output[[
    "id",
    "state",
    "name",
    "description",
    "resources_cpu_cores",
    "resources_preemptible",
    "resources_ram_gb",
    "resources_disk_gb",
    # Headers for WGS runs:
    'logs_0_logs_0_stdout',
Example #45
0
    def test_json_normalize_errors(self):
        # GH14583: If meta keys are not always present
        # a new option to set errors='ignore' has been implemented
        i = {
            "Trades": [{
                "general": {
                    "tradeid":
                    100,
                    "trade_version":
                    1,
                    "stocks": [{
                        "symbol": "AAPL",
                        "name": "Apple",
                        "price": "0"
                    }, {
                        "symbol": "GOOG",
                        "name": "Google",
                        "price": "0"
                    }]
                }
            }, {
                "general": {
                    "tradeid":
                    100,
                    "stocks": [{
                        "symbol": "AAPL",
                        "name": "Apple",
                        "price": "0"
                    }, {
                        "symbol": "GOOG",
                        "name": "Google",
                        "price": "0"
                    }]
                }
            }]
        }
        j = json_normalize(data=i['Trades'],
                           record_path=[['general', 'stocks']],
                           meta=[['general', 'tradeid'],
                                 ['general', 'trade_version']],
                           errors='ignore')
        expected = {
            'general.trade_version': {
                0: 1.0,
                1: 1.0,
                2: '',
                3: ''
            },
            'general.tradeid': {
                0: 100,
                1: 100,
                2: 100,
                3: 100
            },
            'name': {
                0: 'Apple',
                1: 'Google',
                2: 'Apple',
                3: 'Google'
            },
            'price': {
                0: '0',
                1: '0',
                2: '0',
                3: '0'
            },
            'symbol': {
                0: 'AAPL',
                1: 'GOOG',
                2: 'AAPL',
                3: 'GOOG'
            }
        }

        self.assertEqual(j.fillna('').to_dict(), expected)

        self.assertRaises(KeyError,
                          json_normalize,
                          data=i['Trades'],
                          record_path=[['general', 'stocks']],
                          meta=[['general', 'tradeid'],
                                ['general', 'trade_version']],
                          errors='raise')
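# A minimal standalone sketch of the behaviour exercised above (assuming an older
# pandas where json_normalize still lives in pandas.io.json; newer versions expose
# it as pandas.json_normalize): when the 'trade_version' meta key is missing from a
# record, errors='ignore' fills NaN instead of raising the KeyError shown in the test.
from pandas.io.json import json_normalize

records = [
    {"general": {"tradeid": 100, "trade_version": 1,
                 "stocks": [{"symbol": "AAPL"}, {"symbol": "GOOG"}]}},
    {"general": {"tradeid": 101,
                 "stocks": [{"symbol": "AAPL"}, {"symbol": "GOOG"}]}},
]
flat = json_normalize(records,
                      record_path=[["general", "stocks"]],
                      meta=[["general", "tradeid"], ["general", "trade_version"]],
                      errors="ignore")
print(flat[["symbol", "general.tradeid", "general.trade_version"]])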
Example #46
0
#     pandas_Bin_result = json_normalize(json_Bin_result)
#     return pandas_Bin_result

url = "https://api.binance.com"
res = requests.get(url + "/api/v3/ticker/price", params={'symbol': "BTCUSDT"})
Bin_result = res.json()
#print(type(Bin_result))
#print(Bin_result[0]['price'])
#print(Bin_result[0]['symbol'])
#print("--------------------여기는 Bin_result의 값입니다--------------------")
#print(type(Bin_result['price']))
# Bin_result['price'] is a string, so convert it to float
binance_price = float(Bin_result['price'])
Bin_result['price'] = binance_price
str_Bin_result = json.dumps(Bin_result)
json_Bin_result = json.loads(str_Bin_result)
pandas_Bin_result = json_normalize(json_Bin_result)

# 1. Ways to add a column
# 1-1. Method 1
# city = ['32424']
# pandas_Bin_result['city'] = city
# 1-2. Method 2
# df = df.assign(city = ['Lahore'])
#print(df)

#210406
# Data order: (symbol, price)
# Variable renaming done
# Data loading done
# Still needs to be wrapped in a function (a sketch follows below)
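# Following the note above about wrapping this in a function, a minimal sketch of
# that refactor (same Binance endpoint; the function name and default symbol are
# illustrative only):
import requests
from pandas.io.json import json_normalize


def get_binance_ticker(symbol="BTCUSDT"):
    """Fetch the current ticker price for `symbol` as a one-row DataFrame."""
    res = requests.get("https://api.binance.com/api/v3/ticker/price",
                       params={"symbol": symbol})
    result = res.json()                       # e.g. {"symbol": "BTCUSDT", "price": "57000.01"}
    result["price"] = float(result["price"])  # the API returns the price as a string
    return json_normalize(result)


# df = get_binance_ticker("BTCUSDT")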
Example #47
0
if os.path.exists(arg1):

    l = os.listdir(arg1)
    for each_file in l:
        if each_file.endswith(".json"):
            print("Iteration")
            file_list.append(each_file)

else:
    print("Not found")
    sys.exit()

head = []
with open("22combine.json", "w") as outfile:
    for f in file_list:
        with open(os.path.join(arg1, f), 'rb') as infile:
            file_data = json.load(infile)
            head += file_data
    json.dump(head, outfile)


with open("22combine.json", 'r') as file1:

    data1 = json.load(file1)
    data2 = json_normalize(data1)

    data2.to_csv("22result.csv", mode="a", index=False)

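# An equivalent, more compact variant of the merge-and-flatten step above, sketched
# under the assumption that `arg1` is the directory scanned earlier and that every
# .json file in it holds a list of records; the output filename is illustrative.
import glob

records = []
for path in glob.glob(os.path.join(arg1, "*.json")):
    with open(path, "rb") as infile:
        records += json.load(infile)

json_normalize(records).to_csv("22result_alt.csv", index=False)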
Example #48
0
               'project_name']].groupby('countryname').agg('count')

# In[107]:

dataQ1Sorted = dataQ1.sort_values('project_name',
                                  ascending=False,
                                  inplace=False)

# In[109]:

dataQ1Sorted.head(10)

# In[204]:

#dataQ2  = data[['_id','mjtheme_namecode']]
json_normalize(data, 'mjtheme_namecode')

# In[197]:

# In[187]:

dataQ2['mjtheme_namecode'].array

# In[181]:

flatten(dataQ2)

# In[154]:

print(dataQ2['mjtheme_namecode'].head())
def get_variants_by(filter_by, search_term, dataset, mode, timeout=None):

    query_for_transcripts = """
    {
        transcript(transcript_id: "%s", reference_genome: %s) {
            transcript_id,
            transcript_version,        
            gene {
            gene_id,
            symbol,
            start,
            stop,
            strand,
            chrom,
            hgnc_id,
            gene_name,
            full_gene_name,
            omim_id
            }
            variants(dataset: %s) {
            pos
            rsid
            ref
            alt
            consequence
            genome {
            genome_af:af
            genome_ac:ac
            genome_an:an
            genome_ac_hemi:ac_hemi
            genome_ac_hom:ac_hom
            }
            exome {
            exome_af:af
            exome_ac:ac
            exome_an:an
            exome_ac_hemi:ac_hemi
            exome_ac_hom:ac_hom
            }
            flags
            lof
            consequence_in_canonical_transcript
            gene_symbol
            hgvsc
            lof_filter
            lof_flags
            hgvsc
            hgvsp
            reference_genome
            variant_id: variantId
            }
            gtex_tissue_expression{
            adipose_subcutaneous,
            adipose_visceral_omentum,
            adrenal_gland,
            artery_aorta,
            artery_coronary,
            artery_tibial,
            bladder,
            brain_amygdala,
            brain_anterior_cingulate_cortex_ba24,
            brain_caudate_basal_ganglia,
            brain_cerebellar_hemisphere,
            brain_cerebellum,
            brain_cortex,
            brain_frontal_cortex_ba9,
            brain_hippocampus,
            brain_hypothalamus,
            brain_nucleus_accumbens_basal_ganglia,
            brain_putamen_basal_ganglia,
            brain_spinal_cord_cervical_c_1,
            brain_substantia_nigra,
            breast_mammary_tissue,
            cells_ebv_transformed_lymphocytes,
            cells_transformed_fibroblasts,
            cervix_ectocervix,
            cervix_endocervix,
            colon_sigmoid,
            colon_transverse,
            esophagus_gastroesophageal_junction,
            esophagus_mucosa,
            esophagus_muscularis,
            fallopian_tube,
            heart_atrial_appendage,
            heart_left_ventricle,
            kidney_cortex,
            liver,
            lung,
            minor_salivary_gland,
            muscle_skeletal,
            nerve_tibial,
            ovary,
            pancreas,
            pituitary,
            prostate,
            skin_not_sun_exposed_suprapubic,
            skin_sun_exposed_lower_leg,
            small_intestine_terminal_ileum,
            spleen,
            stomach,
            testis,
            thyroid,
            uterus,
            vagina,
            whole_blood
            }
            clinvar_variants{
                variant_id,
                clinvar_variation_id,
                reference_genome,
                chrom,
                pos,
                ref,
                alt,
                clinical_significance,
                gold_stars,
                major_consequence,
                review_status
            }
            coverage(dataset: %s){
              genome{
                pos,
                mean,
                median,
                over_1,
                over_5,
                over_10,
                over_15,
                over_20,
                over_25,
                over_30,
                over_50,
                over_100
              }

              exome{
                pos,
                mean,
                median,
                over_1,
                over_5,
                over_10,
                over_15,
                over_20,
                over_25,
                over_30,
                over_50,
                over_100
              }
            }
            gnomad_constraint{
            exp_lof,
            exp_mis,
            exp_syn,
            obs_lof,
            obs_mis,
            obs_syn,
            oe_lof,
            oe_lof_lower,
            oe_lof_upper,
            oe_mis,
            oe_mis_lower,
            oe_mis_upper,
            oe_syn,
            oe_syn_lower,
            oe_syn_upper,
            lof_z,
            mis_z,
            syn_z,
            pLI,
            flags
            }
            exac_constraint{
            exp_syn,
            exp_mis,
            exp_lof,
            obs_syn,
            obs_mis,
            obs_lof,
            mu_syn,
            mu_mis,
            mu_lof,
            syn_z,
            mis_z,
            lof_z,
            pLI
            }
        }
    }
    """

    query_for_variants = """
    {
        variant(%s: "%s", dataset: %s) {
        variantId
        reference_genome
        chrom
        pos
        ref
        alt
        colocatedVariants
        multiNucleotideVariants {
        combined_variant_id
        changes_amino_acids
        n_individuals
        other_constituent_snvs
        }
        exome {
        ac
        an
        ac_hemi
        ac_hom
        faf95 {
            popmax
            popmax_population
        }
        filters
        populations {
            id
            ac
            an
            ac_hemi
            ac_hom
        }
        age_distribution {
            het {
            bin_edges
            bin_freq
            n_smaller
            n_larger
            }
            hom {
            bin_edges
            bin_freq
            n_smaller
            n_larger
            }
        }
        qualityMetrics {
            alleleBalance {
            alt {
                bin_edges
                bin_freq
                n_smaller
                n_larger
            }
            }
            genotypeDepth {
            all {
                bin_edges
                bin_freq
                n_smaller
                n_larger
            }
            alt {
                bin_edges
                bin_freq
                n_smaller
                n_larger
            }
            }
            genotypeQuality {
            all {
                bin_edges
                bin_freq
                n_smaller
                n_larger
            }
            alt {
                bin_edges
                bin_freq
                n_smaller
                n_larger
            }
            }
        }
        }
        genome {
        ac
        an
        ac_hemi
        ac_hom
        faf95 {
            popmax
            popmax_population
        }
        filters
        populations {
            id
            ac
            an
            ac_hemi
            ac_hom
        }
        age_distribution {
            het {
            bin_edges
            bin_freq
            n_smaller
            n_larger
            }
            hom {
            bin_edges
            bin_freq
            n_smaller
            n_larger
            }
        }
        qualityMetrics {
            alleleBalance {
            alt {
                bin_edges
                bin_freq
                n_smaller
                n_larger
            }
            }
            genotypeDepth {
            all {
                bin_edges
                bin_freq
                n_smaller
                n_larger
            }
            alt {
                bin_edges
                bin_freq
                n_smaller
                n_larger
            }
            }
            genotypeQuality {
            all {
                bin_edges
                bin_freq
                n_smaller
                n_larger
            }
            alt {
                bin_edges
                bin_freq
                n_smaller
                n_larger
            }
            }
        }
        }
        flags
        rsid
        sortedTranscriptConsequences {
        canonical
        gene_id
        gene_version
        gene_symbol
        hgvs
        hgvsc
        hgvsp
        lof
        lof_flags
        lof_filter
        major_consequence
        polyphen_prediction
        sift_prediction
        transcript_id
        transcript_version
        }
        }
    
    }
    """

    query_for_genes = """
    {
        gene(%s: "%s", reference_genome: %s) {
                gene_id
            symbol
            start
            stop
            strand
            chrom
            hgnc_id
            gene_name
                symbol
            full_gene_name
                reference_genome
            omim_id
                canonical_transcript_id
            
            structural_variants(dataset: %s){
            ac,
            ac_hom,
            an,
            af,
            reference_genome,
            chrom,
            chrom2,
            end,
            end2,
            consequence,
            filters,
            length,
            pos,
            pos2,
            type,
            variant_id
            }
            
            variants(dataset: %s) {
            pos
            rsid
            ref
            alt
            consequence
            genome {
            genome_af:af
            genome_ac:ac
            genome_an:an
            genome_ac_hemi:ac_hemi
            genome_ac_hom:ac_hom
            }
            exome {
            exome_af:af
            exome_ac:ac
            exome_an:an
            exome_ac_hemi:ac_hemi
            exome_ac_hom:ac_hom
            }
            flags
            lof
            consequence_in_canonical_transcript
            gene_symbol
            hgvsc
            lof_filter
            lof_flags
            hgvsc
            hgvsp
            reference_genome
            variant_id: variantId
            }
                
            mane_select_transcript{
            ensembl_id
            ensembl_version
            refseq_id
            refseq_version
            }
            
            transcripts{
            reference_genome
            gene_id
            transcript_id
            strand
            start
            stop
            chrom
            }
            
            exac_regional_missense_constraint_regions {
            start
            stop
            obs_mis
            exp_mis
            obs_exp
            chisq_diff_null
            }
            
            clinvar_variants {
            variant_id
            clinvar_variation_id
            reference_genome
            chrom
            pos
            ref
            alt
            clinical_significance
            gold_stars
            major_consequence
            review_status
            }
            
            coverage(dataset: %s) {
                exome {
                pos
                mean
                median
                over_1
                over_5
                over_10
                over_15
                over_20
                over_25
                over_30
                over_50
                over_100
                }
                genome {
                pos
                mean
                median
                over_1
                over_5
                over_10
                over_15
                over_20
                over_25
                over_30
                over_50
                over_100
                }
            }
            
            
            gnomad_constraint {
            exp_lof
            exp_mis
            exp_syn
            obs_lof
            obs_mis
            obs_syn
            oe_lof
            oe_lof_lower
            oe_lof_upper
            oe_mis
            oe_mis_lower
            oe_mis_upper
            oe_syn
            oe_syn_lower
            oe_syn_upper
            lof_z
            mis_z
            syn_z
            pLI
            flags
            }
            
            exac_constraint {
            exp_syn
            exp_mis
            exp_lof
            obs_syn
            obs_mis
            obs_lof
            mu_syn
            mu_mis
            mu_lof
            syn_z
            mis_z
            lof_z
            pLI
            }
        }
    }
    """

    if filter_by == "transcript_id":
        query = query_for_transcripts % (search_term.upper(), reference_genome,
                                         dataset, dataset)

    elif filter_by == "rs_id":
        query = query_for_variants % ("rsid", search_term.lower(), dataset)

    elif filter_by == "gene_id":
        query = query_for_genes % ("gene_id", search_term.upper(),
                                   reference_genome, sv_dataset, dataset,
                                   dataset)

    elif filter_by == "gene_name":
        query = query_for_genes % ("gene_name", search_term.upper(),
                                   reference_genome, sv_dataset, dataset,
                                   dataset)

    else:
        print("Unknown `filter_by` type!")

    # Get response
    global response
    response = requests.post(end_point, data={'query': query}, timeout=timeout)

    # Parse response
    if response.status_code == 200:

        st.markdown("---")
        st.subheader(
            "Outputs for `{}` are being prepared.".format(search_term))
        st.markdown("\n")

        if filter_by == "transcript_id":
            if not os.path.exists('outputs/' + search_term + "/"):
                os.mkdir('outputs/' + search_term + "/")
            else:
                shutil.rmtree('outputs/' + search_term + "/")
                os.mkdir('outputs/' + search_term + "/")
            json_keys = list(response.json()["data"]["transcript"].keys())
            for json_key in json_keys:
                if response.json(
                )["data"]["transcript"][json_key] is not None and type(
                        response.json()["data"]["transcript"]
                    [json_key]) not in [str, int]:
                    data = json_normalize(
                        response.json()["data"]["transcript"][json_key])
                    data.columns = data.columns.map(lambda x: x.split(".")[-1])
                    data.to_csv("outputs/" + search_term + "/" + json_key +
                                ".tsv",
                                sep="\t",
                                index=False)
                    if (len(data) > 0) and (mode == "single"):
                        st.markdown("\n **Table for: `" + json_key + "`**")
                        st.dataframe(data)

        elif filter_by == "rs_id":
            if not os.path.exists('outputs/' + search_term + "/"):
                os.mkdir('outputs/' + search_term + "/")
            else:
                shutil.rmtree('outputs/' + search_term + "/")
                os.mkdir('outputs/' + search_term + "/")
            json_keys = list(response.json()["data"]["variant"].keys())

            general_info = "```"
            for json_key in json_keys:
                # print(json_key, type(response.json()["data"]["variant"][json_key]))

                # Basic info in `variant` part
                if response.json(
                )["data"]["variant"][json_key] is not None and type(
                        response.json()["data"]["variant"][json_key]) in [
                            str, int
                        ]:
                    with open(
                            "outputs/" + search_term + "/" + search_term +
                            ".txt", "a") as f:
                        f.write(
                            "\n" + json_key + ":" +
                            str(response.json()["data"]["variant"][json_key]))
                        general_info += "\n" + json_key + ":" + str(
                            response.json()["data"]["variant"][json_key])
                # Other parts besides `genome` and `exome`
                if response.json(
                )["data"]["variant"][json_key] is not None and type(
                        response.json()["data"]["variant"][json_key]) not in [
                            str, int
                        ] and json_key not in ["genome", "exome"]:
                    data = json_normalize(
                        response.json()["data"]["variant"][json_key])
                    data.columns = data.columns.map(lambda x: x.split(".")[-1])
                    data.to_csv("outputs/" + search_term + "/" + json_key +
                                ".tsv",
                                sep="\t",
                                index=False)
                    if (len(data) > 0) and (mode == "single"):
                        st.markdown("\n **Table for: `" + json_key + "`**")
                        st.dataframe(data)

                # Deep parsing for nested things in `genome` and `exome`
                if json_key in ["genome", "exome"]:
                    for sub_json_key in list(response.json()["data"]["variant"]
                                             [json_key].keys()):
                        # print(json_key, sub_json_key, type(response.json()["data"]["variant"][json_key][sub_json_key]))

                        if response.json()["data"]["variant"][json_key][
                                sub_json_key] is not None and type(
                                    response.json()["data"]["variant"]
                                    [json_key][sub_json_key]) in [str, int]:
                            with open(
                                    "outputs/" + search_term + "/" +
                                    search_term + ".txt", "a") as f:
                                f.write("\n" + json_key + "_" + sub_json_key +
                                        ":" +
                                        str(response.json()["data"]["variant"]
                                            [json_key][sub_json_key]))
                                general_info += "\n" + json_key + "_" + sub_json_key + ":" + str(
                                    response.json()["data"]["variant"]
                                    [json_key][sub_json_key])

                        if response.json()["data"]["variant"][json_key][
                                sub_json_key] is not None and type(
                                    response.json()["data"]["variant"]
                                    [json_key][sub_json_key]) not in [
                                        str, int
                                    ]:
                            data = json_normalize(
                                response.json()["data"]["variant"][json_key]
                                [sub_json_key])
                            data.columns = data.columns.map(
                                lambda x: x.split(".")[-1])
                            data.to_csv("outputs/" + search_term + "/" +
                                        json_key + "_" + sub_json_key + ".tsv",
                                        sep="\t",
                                        index=False)
                            if (len(data) > 0) and (mode == "single"):
                                st.markdown("\n **Table for: `" +
                                            sub_json_key + "`**")
                                st.dataframe(data)

            general_info += "```"
            if mode == "single":
                st.markdown("--- \n **General Info for your query**")
                st.info(general_info)

        elif filter_by == "gene_id":
            if not os.path.exists('outputs/' + search_term + "/"):
                os.mkdir('outputs/' + search_term + "/")
            else:
                shutil.rmtree('outputs/' + search_term + "/")
                os.mkdir('outputs/' + search_term + "/")

            json_keys = list(response.json()["data"]["gene"].keys())
            general_info = "```"
            for json_key in json_keys:
                # print(json_key, type(response.json()["data"]["gene"][json_key]), response.json()["data"]["gene"][json_key] is None, type(response.json()["data"]["gene"][json_key]) not in [str, int])
                if response.json(
                )["data"]["gene"][json_key] is not None and type(
                        response.json()["data"]["gene"][json_key]) in [
                            str, int
                        ]:
                    with open(
                            "outputs/" + search_term + "/" + search_term +
                            ".txt", "a") as f:
                        f.write("\n" + json_key + ":" +
                                str(response.json()["data"]["gene"][json_key]))
                        general_info += "\n" + json_key + ":" + str(
                            response.json()["data"]["gene"][json_key])

                if response.json(
                )["data"]["gene"][json_key] is not None and type(
                        response.json()["data"]["gene"][json_key]) not in [
                            str, int
                        ]:
                    data = json_normalize(
                        response.json()["data"]["gene"][json_key])
                    data.columns = data.columns.map(lambda x: x.split(".")[-1])
                    data.to_csv("outputs/" + search_term + "/" + json_key +
                                ".tsv",
                                sep="\t",
                                index=False)
                    if (len(data) > 0) and (mode == "single"):
                        st.markdown("\n **Table for: `" + json_key + "`**")
                        st.dataframe(data)

            general_info += "```"
            if mode == "single":
                st.markdown("--- \n **General Info for your query**")
                st.info(general_info)

        elif filter_by == "gene_name":
            if not os.path.exists('outputs/' + search_term + "/"):
                os.mkdir('outputs/' + search_term + "/")
            else:
                shutil.rmtree('outputs/' + search_term + "/")
                os.mkdir('outputs/' + search_term + "/")

            json_keys = list(response.json()["data"]["gene"].keys())
            general_info = "```"
            for json_key in json_keys:
                # print(json_key, type(response.json()["data"]["gene"][json_key]), response.json()["data"]["gene"][json_key] is None, type(response.json()["data"]["gene"][json_key]) not in [str, int])
                if response.json(
                )["data"]["gene"][json_key] is not None and type(
                        response.json()["data"]["gene"][json_key]) in [
                            str, int
                        ]:
                    with open(
                            "outputs/" + search_term + "/" + search_term +
                            ".txt", "a") as f:
                        f.write("\n" + json_key + ": " +
                                str(response.json()["data"]["gene"][json_key]))
                    general_info += (
                        "\n" + json_key + ": " +
                        str(response.json()["data"]["gene"][json_key]))

                if response.json(
                )["data"]["gene"][json_key] is not None and type(
                        response.json()["data"]["gene"][json_key]) not in [
                            str, int
                        ]:
                    data = json_normalize(
                        response.json()["data"]["gene"][json_key])
                    data.columns = data.columns.map(lambda x: x.split(".")[-1])
                    data.to_csv("outputs/" + search_term + "/" + json_key +
                                ".tsv",
                                sep="\t",
                                index=False)
                    if (len(data) > 0) and (mode == "single"):
                        st.markdown("\n **Table for: `" + json_key + "`**")
                        st.dataframe(data)

            general_info += "```"
            if mode == "single":
                st.markdown("--- \n **General Info for your query**")
                st.info(general_info)

    return response
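# Usage sketch for the function above. The module-level globals it reads
# (end_point, reference_genome, sv_dataset) are assumed to be configured roughly as
# below for the public gnomAD GraphQL API; treat the exact endpoint and dataset
# identifiers as assumptions, not verified values. An existing outputs/ directory
# is also assumed, since the function writes its .tsv files there.
end_point = "https://gnomad.broadinstitute.org/api"
reference_genome = "GRCh37"
sv_dataset = "gnomad_sv_r2_1"

# Query by gene symbol and write the per-section .tsv files under outputs/PCSK9/
resp = get_variants_by(filter_by="gene_name",
                       search_term="PCSK9",
                       dataset="gnomad_r2_1",
                       mode="single",
                       timeout=60)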
Example #50
0
File: models.py Project: soooh/cpi
 def to_dataframe(self):
     """
     Returns the list as a pandas DataFrame.
     """
     dict_list = [obj.__dict__() for obj in self]
     return json_normalize(dict_list, sep="_")
VERSION = '20180604'
LIMIT = 30

# In[12]:

radius = 700
LIMIT = 100
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, CLIENT_SECRET, VERSION, latitude_n1, longitude_n1, radius,
    LIMIT)
results = requests.get(url).json()

# In[13]:

venues = results['response']['groups'][0]['items']
nearby_venues = json_normalize(venues)
nearby_venues.columns

# In[14]:


def get_category_type(row):
    try:
        categories_list = row['categories']
    except KeyError:
        categories_list = row['venue.categories']

    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']
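# In these Foursquare notebooks the helper above is usually applied row-wise; a
# sketch of that step for the `nearby_venues` frame built earlier (the column names
# are the ones the explore endpoint typically returns and should be treated as
# assumptions):
filtered_columns = ['venue.name', 'venue.categories',
                    'venue.location.lat', 'venue.location.lng']
nearby_venues = nearby_venues.loc[:, filtered_columns]
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)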
Example #52
0
darksky_api = keys["darksky_api"][0]

# location
google = "https://maps.googleapis.com/maps/api/geocode/json?address=Cape Town&key=" + google_api
resp_loc = requests.get(google)
lat = json.loads(resp_loc.content)["results"][0]["geometry"]["location"]["lat"]
lon = json.loads(resp_loc.content)["results"][0]["geometry"]["location"]["lng"]

# weather
dates = [str(int((dt.datetime.now(pytz.utc) - dt.timedelta(days=x)).timestamp())) for x in range(365)]
weather_df = pd.DataFrame()
i = 0
for d in dates:
    print(str(i) + ": " + str(d))
    darksky = "https://api.darksky.net/forecast/" + darksky_api + "/" + str(lat) + "," + str(lon) + "," + d + "?exclude=hourly,alerts,flags"
    weather_df = weather_df.append(json_normalize(json.loads(requests.get(darksky).content)["currently"]))
    i += 1

# weather_df.to_pickle("/Users/phil/vscode/weather_lambda/model_training/data/weather_df_raw.pkl")
weather_df = pd.read_pickle("/Users/phil/vscode/weather_lambda/model_training/data/weather_df_raw.pkl")

# datetime and order
weather_df = weather_df.reset_index()
weather_df["datetime"] = [dt.datetime.fromtimestamp(weather_df["time"][i]) for i in range(0, len(weather_df))]
weather_df = weather_df.sort_values(by=["datetime"])

# take a look
weather_df.dtypes
weather_df["precipIntensity"]
weather_df["precipProbability"]
weather_df["precipType"].unique()
Example #53
0
# In[7]:


# Send the GET Request and examine the results
results = requests.get(url).json()
#results


# In[9]:


# assign relevant part of JSON to venues
venues = results['response']['venues']

# tranform venues into a dataframe
dataframe = json_normalize(venues)
dataframe.head()


# 
# Clean the university DataFrame

# In[10]:


# keep only columns that include venue name, and anything that is associated with location
clean_columns = ['name', 'categories'] + [col for col in dataframe.columns if col.startswith('location.')]+ ['id']
clean_dataframe = dataframe.loc[:,clean_columns]

# function that extracts the category of the venue
def get_category_type(row):
Example #54
0
File: models.py Project: soooh/cpi
 def to_dataframe(self):
     """
     Returns this series and all its indexes as a pandas DataFrame.
     """
     dict_list = [obj.__dict__() for obj in self.indexes]
     return json_normalize(dict_list, sep="_")
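# A toy sketch of the pattern both to_dataframe() methods rely on: each object
# exposes a callable __dict__() that returns a plain dict, and sep="_" makes
# json_normalize flatten any nesting into underscore-joined column names. The class
# and values below are illustrative only and not part of the cpi project.
from pandas.io.json import json_normalize


class ToyIndex(object):
    def __init__(self, year, value, area):
        self.year = year
        self.value = value
        self.area = area

    def __dict__(self):  # called explicitly as obj.__dict__() above
        return {"year": self.year, "value": self.value,
                "area": {"name": self.area}}


rows = [ToyIndex(2020, 258.8, "U.S."), ToyIndex(2021, 271.0, "U.S.")]
df = json_normalize([obj.__dict__() for obj in rows], sep="_")
# df.columns include 'year', 'value', 'area_name'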
Example #55
0
def mine_data(file_name=None,
              first_match_id=first_match,
              last_match_id=last_match,
              stop_at=None,
              timeout=15,
              save_every=1000):
    """ Mine data using the official Opendota API. Keep requests at a decent rate (3/s).
    For every request, a JSON containing 100 games is returned. The games are downloaded
    in descending order of the match IDs.

    Args:
        file_name: the name of the file where the dataframe will be stored
        first_match_id: lowest match ID to look at; currently set at the start of 7.06e
        last_match_id: highest match ID to look at; currently set at the end of 7.06e
        stop_at: when the dataframe contains stop_at games, the mining stops
        timeout: in case Opendota does not respond, wait timeout seconds before retrying
        save_every: save the dataframe every save_every entries

    Returns:
        dataframe with the mined games
    """
    global OPENDOTA_URL
    global REQUEST_TIMEOUT
    global COLUMNS
    global logger

    results_dataframe = pd.DataFrame()
    current_chunk = 1
    current_match_id = last_match_id
    games_remaining = stop_at

    while current_match_id > first_match_id:
        try:
            current_link = OPENDOTA_URL + str(current_match_id)
            logger.info("Mining chunk starting at match ID %d", current_match_id)
            response = urllib2.urlopen(current_link, timeout=timeout)
        except (urllib2.URLError, ssl.SSLError) as error:
            logger.error("Failed to make a request starting at match ID %d", current_match_id)
            logger.info("Waiting %d seconds before retrying", timeout)
            time.sleep(timeout)
            current_match_id -= 1
            continue

        try:
            response_json = json.load(response)
            last_match_id = response_json[-1]['match_id']
        except (ValueError, KeyError) as error:
            logger.error("Corrupt JSON starting at match ID %d, skipping it", current_match_id)
            current_match_id -= 1
            continue

        current_match_id = last_match_id

        if games_remaining:
            games_remaining -= len(response_json)

        current_dataframe = json_normalize(response_json)

        if len(current_dataframe) == 0:
            logger.info("Found an empty dataframe, skipping 10 games")
            current_match_id -= 10
            continue

        results_dataframe = results_dataframe.append(current_dataframe, ignore_index=True)

        if len(results_dataframe) > current_chunk * save_every:
            current_chunk += 1

            if file_name:
                pd.DataFrame(results_dataframe, columns=COLUMNS).to_csv(file_name, index=False)
                logger.info("Saving to csv. Total of games mined: %d", len(results_dataframe))

                if stop_at:
                    if len(results_dataframe) > stop_at:
                        return results_dataframe

        if stop_at:
            if len(results_dataframe) > stop_at:
                break

        time.sleep(REQUEST_TIMEOUT)

    return results_dataframe
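# Usage sketch, assuming the module-level constants the function relies on
# (OPENDOTA_URL, REQUEST_TIMEOUT, COLUMNS, logger, first_match, last_match) are
# already configured elsewhere in the script; the csv name is illustrative.
games_df = mine_data(file_name="mined_games.csv",
                     stop_at=5000,
                     timeout=15,
                     save_every=1000)
logger.info("Finished mining: %d games collected", len(games_df))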
Example #56
0
import json

from configparser import ConfigParser

from bson import json_util
from pandas.io.json import json_normalize
from pymongo import MongoClient

config_parser = ConfigParser()
config_parser.read('../config/reader-config.ini', encoding='utf-8')

mongo_client = MongoClient(config_parser.get('mongo', 'host'), int(config_parser.get('mongo', 'port')))
tweets = mongo_client[config_parser.get('mongo', 'db')][config_parser.get('mongo', 'collection')]
data = tweets.find({})

# load MongoDB data as JSON data and flatten using json_normalize
sanitized = json.loads(json_util.dumps(data))

# replace new line with space
for i, j in enumerate(sanitized):
    j['text'] = '"' + j['text'].strip().replace("\n", " ") + '"'

normalized = json_normalize(sanitized)
normalized.to_csv(
    path_or_buf="../../../data-set/data-set.csv",
    columns=[column.strip() for column in config_parser.get('csv', 'columns').split(',')],
    encoding="utf-8",
    index_label="instance_id"
)

for i, j in enumerate(sanitized):
    print(i, j['id'], j['user']['screen_name'], j['truncated'], j['text'])
Example #57
0
def create_pull_requests_comments_df(owner, repo, api):
    pull_requests_comments_list = pull_requests_comments_of_repo_github(
        owner, repo, api)
    return json_normalize(pull_requests_comments_list)
def getdata(url):
	res = requests.get(url)
	data = res.json()

	df_api = json_normalize(data)
	return df_api
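# Usage sketch for the helper above; the URL is a placeholder for any endpoint that
# returns JSON suitable for json_normalize.
df_api = getdata("https://api.example.com/v1/items")
print(df_api.head())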
Example #59
0
    def define_obejct(self):
        # Keep only the given keys from a dict
        def extract_items(dict_data, keys):
            new_dict = {}
            for i in dict_data.keys():
                if i in keys:
                    new_dict[i] = dict_data[i]
            return new_dict

        defined_obejct_df = pd.DataFrame()
        try:
            with open(self.path, encoding='UTF8') as f:
                # 1. load .Json
                data = json.load(f)
                # 2. Extract data (What will we use?)
                keys = ['count', 'edges']
                j1 = extract_items(data, keys)

                # 3. Normalize Json format
                j2 = json_normalize(j1['edges'])
                # 4. Drop columns that aren't needed, then rename the remaining columns
                original = set(j2.columns)
                fixed = {
                    'node.display_url', 'node.edge_liked_by.count',
                    'node.edge_media_to_caption.edges',
                    'node.edge_media_to_comment.count', 'node.id',
                    'node.owner.id', 'node.taken_at_timestamp'
                }
                j2.drop(list(original - fixed), axis=1, inplace=True)

                # Clear sets that are no longer needed
                original.clear()
                fixed.clear()

                # rename columns
                j2.columns = [
                    'contents_url', 'like_count', 'post', 'comment_count',
                    'post_id', 'user_id', 'timeStamp'
                ]

                # 5. Extract hashtags from the post text (using a regex)
                tag_list = []
                for row in j2['post']:
                    if not bool(row):
                        tag_list.append("null+nan+none")
                    else:
                        re_row = row[0]['node']['text']
                        if re_row.find("#") > -1:
                            p = re.compile('#([^#\s]+)')
                            tag_from_row = p.findall(re_row)
                            tag_from_row = [x.strip('#') for x in tag_from_row]
                            row = ' '.join(tag_from_row)
                            tag_list.append(row)
                        else:
                            tag_list.append(None)
                # 6. Create a new column and insert the extracted tag list
                j2['extracted_tags'] = tag_list
                defined_obejct_df = defined_obejct_df.append(j2,
                                                             ignore_index=True)
            print("행 {}, 열 {}로 구성된 데이터 프레임입니다.".format(
                defined_obejct_df.shape[0], defined_obejct_df.shape[1]))

        except Exception:
            print("There is no data to normalize! Please check.")
            print("Path: {}".format(self.path))

        finally:
            return defined_obejct_df
parser = argparse.ArgumentParser(description='csg conversion tool.')

parser.add_argument('-i', '--input', help='Filename to read in', required=True)

parser.add_argument('-x',
                    '--x_axis',
                    help='Material to plot of x axis',
                    required=True)

args = parser.parse_args()

filename = args.input
with open(os.path.join(filename)) as f:
    results = json.load(f)

results_df = json_normalize(data=results)
df_filtered_by_mat = results_df

# df_filtered_by_mat = results_df[(results_df['fw_material'] == 'eurofer') & (results_df['armour_material'] == 'tungsten')]

x = list(df_filtered_by_mat['fw_thickness'])
y = list(df_filtered_by_mat['armour_thickness'])
z = list(df_filtered_by_mat['leakage_neutron_current.value'])
z_e = list(df_filtered_by_mat['leakage_neutron_current.std_dev'])
labels = [str(i) + '+-' + str(j) for i, j in zip(z, z_e)]

if len(x) < 40:

    coords = list(zip(x, y))

    GP = GpRegressor(coords, z, y_err=z_e)