def gonzales(data, k):
    # transform the data numpy array to data frame using the id as index
    points_list = DataFrame(data[:, 1:], index=data[:, 0])
    # adding two columns in the points data frame for saving the centers and distance
    points_list["distance"] = np.nan
    points_list["center"] = np.nan
    distance_column_index = points_list.columns.get_loc("distance")
    # choosing a random point as the first center
    # center0 = points_list.sample(n=1, random_state=randint(0, 100), axis=0)
    center0 = points_list.head(1)
    centers_list = DataFrame(center0.drop(['distance', 'center'], axis=1))
    centers_list['color'] = 'r'
    colors = "bgcmykw"
    # ===========================================================================
    # print(centers_list)
    # print("==============Initialization finished===========")
    # ===========================================================================
    # looping k-1 times to have k centers
    for k_cycle in range(1, k + 1):
        # variables to save the next center to be chosen based on the maximum
        # distance a point makes within its cluster
        max_distance = 0
        next_cluster = np.nan
        # loop on all the points to assign them to their closest center
        for indexp, p in points_list.iterrows():
            # variables to save the closest center
            min_cluster_distance = math.inf
            closest_cluster = None
            for indexc, center in centers_list.iterrows():
                dis = spatial.distance.euclidean(center.as_matrix(columns=[0, 1]),
                                                 p.as_matrix(columns=[0, 1]))
                if dis < min_cluster_distance:
                    min_cluster_distance = dis
                    closest_cluster = indexc
            p["distance"] = min_cluster_distance
            p["center"] = closest_cluster
            if min_cluster_distance > max_distance:
                max_distance = min_cluster_distance
                next_cluster = indexp
        centers_list = centers_list.append(points_list.ix[[next_cluster], :distance_column_index])
        centers_list.set_value(next_cluster, 'color', colors[k_cycle])
        # =======================================================================
        # print(centers_list)
        # print("==============Cycle finished===========")
        # =======================================================================
    centers_list.drop(centers_list.tail(1).index, inplace=True)
    centers_list.drop(['color'], axis=1, inplace=True)
    # ===========================================================================
    # centers_list.plot(kind='scatter', x=0, y=1, c='r')
    # points_list.plot(kind='scatter', x=0, y=1, c='center', s=points_list['center'] * 2)
    # plt.show()
    # ===========================================================================
    # print(points_list)
    return centers_list.as_matrix(columns=[0, 1])
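# A minimal usage sketch for gonzales() above (not from the original source).
# Assumptions: numpy as np, DataFrame, math and scipy.spatial are already
# imported in this module, and `data` is an array whose first column is a
# point id and whose remaining two columns are x/y coordinates. Note the
# function relies on legacy pandas APIs (as_matrix, .ix, set_value, append),
# so it needs an older pandas release to actually run.
import numpy as np

if __name__ == '__main__':
    rng = np.random.RandomState(0)
    coords = rng.rand(20, 2) * 10
    ids = np.arange(20).reshape(-1, 1)
    sample = np.hstack([ids, coords])      # one row per point: [id, x, y]
    centers = gonzales(sample, k=3)        # k cluster centers, 2 columns (x, y)
    print(centers)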
def test_setitem_cache_updating(self):
    # GH 5424
    cont = ['one', 'two', 'three', 'four', 'five', 'six', 'seven']

    for do_ref in [False, True]:
        df = DataFrame({'a': cont,
                        "b": cont[3:] + cont[:3],
                        'c': np.arange(7)})

        # ref the cache
        if do_ref:
            df.loc[0, "c"]

        # set it
        df.loc[7, 'c'] = 1

        assert df.loc[0, 'c'] == 0.0
        assert df.loc[7, 'c'] == 1.0

    # GH 7084
    # not updating cache on series setting with slices
    expected = DataFrame({'A': [600, 600, 600]},
                         index=date_range('5/7/2014', '5/9/2014'))
    out = DataFrame({'A': [0, 0, 0]},
                    index=date_range('5/7/2014', '5/9/2014'))
    df = DataFrame({'C': ['A', 'A', 'A'], 'D': [100, 200, 300]})

    # loop through df to update out
    six = Timestamp('5/7/2014')
    eix = Timestamp('5/9/2014')
    for ix, row in df.iterrows():
        out.loc[six:eix, row['C']] = out.loc[six:eix, row['C']] + row['D']

    tm.assert_frame_equal(out, expected)
    tm.assert_series_equal(out['A'], expected['A'])

    # try via a chain indexing
    # this actually works
    out = DataFrame({'A': [0, 0, 0]},
                    index=date_range('5/7/2014', '5/9/2014'))
    for ix, row in df.iterrows():
        v = out[row['C']][six:eix] + row['D']
        out[row['C']][six:eix] = v
    tm.assert_frame_equal(out, expected)
    tm.assert_series_equal(out['A'], expected['A'])

    out = DataFrame({'A': [0, 0, 0]},
                    index=date_range('5/7/2014', '5/9/2014'))
    for ix, row in df.iterrows():
        out.loc[six:eix, row['C']] += row['D']
    tm.assert_frame_equal(out, expected)
    tm.assert_series_equal(out['A'], expected['A'])
def compute_tf_idf_queries(self):
    # Find total number of documents
    results = self.cursor.execute(
        'SELECT seq FROM sqlite_sequence WHERE name=\'{}\''.format('documents'))
    tmp = results.fetchone()
    total_doc = tmp[0]

    results = self.cursor.execute('SELECT did, total_word, path FROM documents')
    tmp = results.fetchall()
    documents_df = DataFrame(tmp, columns=['did', 'total_word', 'path'])
    documents_df['tf_idf'] = 0.0

    no_docterm = {}
    for query in self.queries:
        no_docterm[query] = 0

    for index, row in documents_df.iterrows():
        path = row['path']
        with codecs.open(path, 'rt') as f:
            text = f.read()
        for query in self.queries:
            if query in text.decode('utf-8').lower():
                no_docterm[query] += 1

    for query in self.queries:
        for index, row in documents_df.iterrows():
            total_word = row['total_word']
            path = row['path']
            with codecs.open(path, 'rt') as f:
                text = f.read()
            tf_idf = self._compute_tf_idf_queries(text, total_word, total_doc,
                                                  no_docterm[query])
            cur_tf_idf = documents_df.get_value(index, 'tf_idf')
            documents_df.set_value(index, 'tf_idf', cur_tf_idf + tf_idf)

    results = self.cursor.execute('SELECT did, type, entity FROM entities')
    tmp = results.fetchall()
    df = DataFrame(tmp, columns=['did', 'e_type', 'entity'])
    df['tf_idf'] = 0.0
    for index, row in df.iterrows():
        did = row['did']
        tf_idf = documents_df[documents_df['did'] == did]['tf_idf'].values[0]
        df.set_value(index, 'tf_idf', tf_idf)

    del df['did']
    df = df.groupby(['e_type', 'entity']).sum().reset_index()
    return df
def create_seated(settings: dict, passengers: pd.DataFrame) -> pd.DataFrame:
    """
    :param settings:
        Configuration settings for the current trial
    :param passengers:
        The passengers data frame for the trial
    """
    passenger_index = []
    seat_names = []
    seated_time = []

    for index, passenger in passengers.iterrows():
        passenger_index.append(index)
        seat_names.append(
            '{}{}'.format(passenger['aisle'], passenger['letter'])
        )
        seated_time.append(None)

    return pd.DataFrame({
        'passenger': passenger_index,
        'seat': seat_names,
        'time': seated_time
    })
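# A small usage sketch for create_seated() (illustrative only; the column
# names 'aisle' and 'letter' are taken from the function body above, and
# `settings` is unused by the function so an empty dict suffices).
import pandas as pd

example_passengers = pd.DataFrame({
    'aisle': [1, 1, 2],
    'letter': ['A', 'B', 'A'],
})
seated = create_seated(settings={}, passengers=example_passengers)
# one row per passenger with columns: passenger, seat ('1A', '1B', '2A'), time (None)
print(seated)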
def receiver_locations(locs: pandas.DataFrame):
    if not isinstance(locs, pandas.DataFrame):
        return

    if cartopy is not None:
        ax = figure().gca(projection=cartopy.crs.PlateCarree())
        ax.add_feature(cpf.LAND)
        ax.add_feature(cpf.OCEAN)
        ax.add_feature(cpf.COASTLINE)
        ax.add_feature(cpf.BORDERS, linestyle=':')
    else:
        ax = figure().gca()

    for name, loc in locs.iterrows():
        if 15 <= loc.interval < 30:
            c = 'g'
        elif 5 <= loc.interval < 15:
            c = 'orange'
        elif loc.interval < 5:
            c = 'r'
        else:  # large or undefined interval
            c = 'b'

        if np.isfinite(loc.interval):
            ax.scatter(loc.lon, loc.lat, s=1000 * 1 / loc.interval, c=c, label=name)
        else:
            ax.scatter(loc.lon, loc.lat, c=c, label=name)
def predict(self, prediction_data):
    df = DataFrame(prediction_data)
    ret = []
    for row in df.iterrows():
        index, data = row
        ret += [self.agg(data.tolist())]
    return ret
def test_pivot_index_with_nan(self):
    # GH 3588
    nan = np.nan
    df = DataFrame({'a': ['R1', 'R2', nan, 'R4'],
                    'b': ['C1', 'C2', 'C3', 'C4'],
                    'c': [10, 15, 17, 20]})
    result = df.pivot('a', 'b', 'c')
    expected = DataFrame([[nan, nan, 17, nan], [10, nan, nan, nan],
                          [nan, 15, nan, nan], [nan, nan, nan, 20]],
                         index=Index([nan, 'R1', 'R2', 'R4'], name='a'),
                         columns=Index(['C1', 'C2', 'C3', 'C4'], name='b'))
    tm.assert_frame_equal(result, expected)
    tm.assert_frame_equal(df.pivot('b', 'a', 'c'), expected.T)

    # GH9491
    df = DataFrame({'a': pd.date_range('2014-02-01', periods=6, freq='D'),
                    'c': 100 + np.arange(6)})
    df['b'] = df['a'] - pd.Timestamp('2014-02-02')
    df.loc[1, 'a'] = df.loc[3, 'a'] = nan
    df.loc[1, 'b'] = df.loc[4, 'b'] = nan

    pv = df.pivot('a', 'b', 'c')
    self.assertEqual(pv.notnull().values.sum(), len(df))

    for _, row in df.iterrows():
        self.assertEqual(pv.loc[row['a'], row['b']], row['c'])

    tm.assert_frame_equal(df.pivot('b', 'a', 'c'), pv.T)
def sum_of_parts(self):
    '''
    For more info on this see:
    https://github.com/unicef/rhizome/blob/master/docs/spec.rst#aggregation-and-calculation
    '''
    ## get the indicator_ids we need to make the calculation ##
    initial_calc_df = self.build_calc_df(['PART_TO_BE_SUMMED'])

    ## handle recursive calculations ( see spec.rst link above ) ##
    calc_df = self.build_recursive_sum_calc_df(initial_calc_df)

    self_join_calc_df = calc_df.merge(calc_df,
                                      left_on='indicator_component_id',
                                      right_on='calc_indicator_id', how='left')

    ## get the datapoints for the above indicator_ids ##
    dp_df = self.build_dp_df(calc_df['indicator_component_id'])

    ## now join the above dataframe on itself to set up the calculation ##
    dp_df_with_calc = self.join_dp_to_calc(calc_df, dp_df)

    ## take the sum of all of the component indicators ##
    grouped_df = DataFrame(dp_df_with_calc.merge(dp_df_with_calc)
                           .groupby(['location_id', 'calc_indicator_id', 'campaign_id'])
                           ['value'].sum())

    for ix, row_data in grouped_df.iterrows():
        self.dwc_tuple_dict[ix] = row_data.value
def create_unified_column(data_frame: pd.DataFrame) -> pd.Series:
    unified = [
        '-'.join(to_strings(row.to_dict().values()))
        for _, row in data_frame.iterrows()
    ]
    return pd.Series(unified)
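# Usage sketch for create_unified_column(). The `to_strings` helper is not
# part of the snippet above; a plausible stand-in that simply stringifies
# every value is assumed here.
import pandas as pd

def to_strings(values):
    # hypothetical helper: render every cell as a string
    return [str(v) for v in values]

frame = pd.DataFrame({'a': [1, 2], 'b': ['x', 'y']})
print(create_unified_column(frame))   # -> Series(['1-x', '2-y'])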
def resolve(dataset, m):
    t = dataset.y
    phis = DataFrame()
    for i in range(0, m + 1):
        p = dataset.x ** i
        p.name = "x**%d" % i
        phis = pd.concat([phis, p], axis=1)

    for index, line in phis.iterrows():
        phi = DataFrame(line)
        if index == 0:
            phiphi = np.dot(phi, phi.T)
        else:
            phiphi += np.dot(phi, phi.T)
    s_inv = alpha * DataFrame(np.identity(m + 1)) + beta * phiphi
    s = np.linalg.inv(s_inv)

    # Mean m(x)
    def mean_fun(x0):
        phi_x0 = DataFrame([x0 ** i for i in range(0, m + 1)])
        for index, line in phis.iterrows():
            if index == 0:
                tmp = t[index] * line
            else:
                tmp += t[index] * line
        return (beta * np.dot(np.dot(phi_x0.T, s), DataFrame(tmp))).flatten()

    # Standard deviation s(x)
    def deviation_fun(x0):
        phi_x0 = DataFrame([x0 ** i for i in range(0, m + 1)])
        deviation = np.sqrt(1.0 / beta + np.dot(np.dot(phi_x0.T, s), phi_x0))
        return deviation.diagonal()

    return mean_fun, deviation_fun
def test_age(df: DataFrame):
    sub = 0
    for index, row in df.iterrows():
        name = row['Name']
        age = row['Age']
        if not math.isnan(age):
            if age <= 8:
                res = 'kid'
            elif age <= 30:
                res = 'young'
            elif age <= 45:
                res = 'middle'
            else:
                res = 'old'
        else:
            if match_name(name, r".*Master\..*"):
                res = 'kid'
            elif match_name(name, r".*Miss\..*"):
                res = 'young'
            elif match_name(name, r".*Mr(s)?\..*"):
                res = 'middle'
            else:
                res = 'young'
        df.loc[sub, 'Age'] = res
        sub += 1
    return df
class Iteration(object):

    def setup(self):
        N = 1000
        self.df = DataFrame(np.random.randn(N * 10, N))
        self.df2 = DataFrame(np.random.randn(N * 50, 10))
        self.df3 = DataFrame(np.random.randn(N, 5 * N),
                             columns=['C' + str(c) for c in range(N * 5)])

    def time_iteritems(self):
        # (monitor no-copying behaviour)
        if hasattr(self.df, '_item_cache'):
            self.df._item_cache.clear()
        for name, col in self.df.iteritems():
            pass

    def time_iteritems_cached(self):
        for name, col in self.df.iteritems():
            pass

    def time_iteritems_indexing(self):
        for col in self.df3:
            self.df3[col]

    def time_itertuples(self):
        for row in self.df2.itertuples():
            pass

    def time_iterrows(self):
        for row in self.df.iterrows():
            pass
def predict(self, prediction_data):
    preds = DataFrame(prediction_data)
    ret = []
    for row in preds.iterrows():
        index, data = row
        ret.append(mean(data))
    return ret
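# Standalone sketch of what the mean-based predict() above computes per row
# (illustrative only; assumes `mean` is statistics.mean and that
# prediction_data is a dict of equally long prediction columns).
from statistics import mean
import pandas as pd

prediction_data = {'model_a': [0.2, 0.8], 'model_b': [0.4, 0.6]}
preds = pd.DataFrame(prediction_data)
row_means = [mean(data.tolist()) for _, data in preds.iterrows()]
print(row_means)   # one averaged prediction per row, here roughly [0.3, 0.7]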
def make_lines(tracks: pd.DataFrame, transformation: dict) -> list:
    def make_line(start: pd.Series, end: pd.Series) -> str:
        return create_tag('line', {
            'stroke': 'rgba(0, 0, 0, 0.2)',
            'stroke-width': '2',
            'stroke-dasharray': '5,5',
            'x1': transform_x(start['x'], transformation),
            'y1': transform_y(start['y'], transformation),
            'x2': transform_x(end['x'], transformation),
            'y2': transform_y(end['y'], transformation)
        })

    previous = pd.Series(dict(
        x=transformation['x_min'],
        y=tracks.iloc[0]['y']
    ))

    lines = []
    for index, row in tracks.iterrows():
        lines.append(make_line(previous, row))
        previous = row

    if previous['x'] < transformation['x_max']:
        lines.append(make_line(previous, pd.Series(dict(
            x=transformation['x_max'],
            y=previous['y']
        ))))

    return lines
def test_pivot_index_with_nan(self): # GH 3588 nan = np.nan df = DataFrame({"a": ["R1", "R2", nan, "R4"], "b": ["C1", "C2", "C3", "C4"], "c": [10, 15, 17, 20]}) result = df.pivot("a", "b", "c") expected = DataFrame( [[nan, nan, 17, nan], [10, nan, nan, nan], [nan, 15, nan, nan], [nan, nan, nan, 20]], index=Index([nan, "R1", "R2", "R4"], name="a"), columns=Index(["C1", "C2", "C3", "C4"], name="b"), ) tm.assert_frame_equal(result, expected) tm.assert_frame_equal(df.pivot("b", "a", "c"), expected.T) # GH9491 df = DataFrame({"a": pd.date_range("2014-02-01", periods=6, freq="D"), "c": 100 + np.arange(6)}) df["b"] = df["a"] - pd.Timestamp("2014-02-02") df.loc[1, "a"] = df.loc[3, "a"] = nan df.loc[1, "b"] = df.loc[4, "b"] = nan pv = df.pivot("a", "b", "c") self.assertEqual(pv.notnull().values.sum(), len(df)) for _, row in df.iterrows(): self.assertEqual(pv.loc[row["a"], row["b"]], row["c"]) tm.assert_frame_equal(df.pivot("b", "a", "c"), pv.T)
def calculate(settings: dict, progress: pd.DataFrame):
    """
    :param settings:
    :param progress:
    :return:
    """
    passenger_count = settings['passenger_count']
    waiting = []
    previous_row = None

    for elapsed_time, row in progress.iterrows():
        waiting.append(0)
        for passenger_index in range(passenger_count):
            if previous_row is None:
                continue
            position = row[str(passenger_index)]
            last_position = previous_row[str(passenger_index)]
            if position == last_position:
                waiting[-1] += 1
        previous_row = row
        waiting[-1] = 100.0 * waiting[-1] / passenger_count
def convert2flightplan(df: pd.DataFrame):
    """
    Convert dataframe to Flight Gear Flight Plan.
    """
    # Feet above sea level
    df['fasl'] = df['masl'] * 3.28084
    # Knots are nm (1852 m) per hour.
    df['knots'] = (df['dm'] / 1852) / (df['dt_sec'] / 60 / 60)
    rv = HEADER
    prev_state = STOP
    # Take off at Vr (stop ignoring altitude).
    df['state'] = df.apply(lambda x: flight_state(x['knots']), axis=1)
    for index, row in df.iterrows():
        state = row['state']
        if state in (TAXI, RUNWAY, AERO,):
            # Only output when really moving.
            rv += WPT.format(**row,
                             ground='true' if state in (STOP, TAXI, RUNWAY,) else 'false')
        if state == TAXI and prev_state == STOP:
            # Landed new flight.
            rv += FOOTER + HEADER + "<!-- -lat={lat} -lon={lon} -->".format(**row)
        prev_state = state
    rv += FOOTER
    return rv
def parallel_cumulative_blame(self, branch='master', limit=None, skip=None, num_datapoints=None, committer=True, workers=1, ignore_globs=None, include_globs=None): """ Returns the blame at every revision of interest. Index is a datetime, column per committer, with number of lines blamed to each committer at each timestamp as data. :param branch: (optional, default 'master') the branch to work in :param limit: (optional, default None), the maximum number of revisions to return, None for no limit :param skip: (optional, default None), the number of revisions to skip. Ex: skip=2 returns every other revision, None for no skipping. :param num_datapoints: (optional, default=None) if limit and skip are none, and this isn't, then num_datapoints evenly spaced revs will be used :param committer: (optional, defualt=True) true if committer should be reported, false if author :param ignore_globs: (optional, default=None) a list of globs to ignore, default none excludes nothing :param include_globs: (optinal, default=None) a list of globs to include, default of None includes everything. :param workers: (optional, default=1) integer, the number of workers to use in the threadpool, -1 for one per core. :return: DataFrame """ if not _has_joblib: raise ImportError('''Must have joblib installed to use parallel_cumulative_blame(), please use cumulative_blame() instead.''') revs = self.revs(branch=branch, limit=limit, skip=skip, num_datapoints=num_datapoints) if self.verbose: print('Beginning processing for cumulative blame:') revisions = json.loads(revs.to_json(orient='index')) revisions = [revisions[key] for key in revisions] ds = Parallel(n_jobs=workers, backend='threading', verbose=5)( delayed(_parallel_cumulative_blame_func) (self, x, committer, ignore_globs, include_globs) for x in revisions ) revs = DataFrame(ds) del revs['rev'] revs['date'] = to_datetime(revs['date'].map(datetime.datetime.fromtimestamp)) revs.set_index(keys=['date'], drop=True, inplace=True) revs = revs.fillna(0.0) # drop 0 cols for col in revs.columns.values: if col != 'col': if revs[col].sum() == 0: del revs[col] # drop 0 rows keep_idx = [] committers = [x for x in revs.columns.values if x != 'date'] for idx, row in revs.iterrows(): if sum([row[x] for x in committers]) > 0: keep_idx.append(idx) revs = revs.ix[keep_idx] revs.sort_index(ascending=False, inplace=True) return revs
def write_dialer(filepath: str, dialer: pd.DataFrame):
    """
    Write to fixed width dialer format -
    expect each column to be a left justified data frame with no need for character padding
    line endings are carriage returns in windows - FIX??
    """
    with open(filepath, 'w') as f:
        for i, row in dialer.iterrows():
            f.write(''.join(row.tolist()) + "\n")
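# Usage sketch for write_dialer(): every column is expected to already hold
# fixed-width, left-justified strings, so the row values concatenate cleanly.
# The file name is illustrative only.
import pandas as pd

dialer = pd.DataFrame({
    'phone': ['5551234567'.ljust(12), '5559876543'.ljust(12)],
    'name':  ['ALICE'.ljust(10), 'BOB'.ljust(10)],
})
write_dialer('dialer.txt', dialer)   # writes one fixed-width line per row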
def set_cabin(df: DataFrame):
    for index, row in df.iterrows():
        # is nan
        if isinstance(row['Cabin'], float):
            df.loc[index, ['Cabin']] = 'X'
        else:
            df.loc[index, ['Cabin']] = row['Cabin'][0]
    df['Cabin'] = df['Cabin'].astype('object')
    return df
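# Usage sketch for set_cabin(): NaN cabins (floats) become 'X', otherwise
# only the leading deck letter is kept. Input data is illustrative only.
import numpy as np
import pandas as pd

titanic_like = pd.DataFrame({'Cabin': ['C85', np.nan, 'E46']})
print(set_cabin(titanic_like)['Cabin'].tolist())   # ['C', 'X', 'E']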
def fillna_dict(cls, prop):
    """
    Use trade history then fill empty with value row above
    """
    df = DataFrame(prop)
    df = df.replace(['', 'DEBIT', 'CREDIT'], numpy.nan)
    df = df.fillna(method='ffill')
    return [r.to_dict() for k, r in df.iterrows()]
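# Standalone sketch of the forward-fill behaviour in fillna_dict() above
# (the owning class is not shown in the snippet, so the body is inlined here;
# the sample records are illustrative only).
import numpy
import pandas as pd

prop = [{'symbol': 'AAPL', 'qty': '100'}, {'symbol': '', 'qty': 'DEBIT'}]
df = pd.DataFrame(prop).replace(['', 'DEBIT', 'CREDIT'], numpy.nan)
df = df.fillna(method='ffill')
print([r.to_dict() for _, r in df.iterrows()])
# second record inherits both values from the row above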
def save_to_file(self, fn): gg = DataFrame(self.power_series_apps_table) try: del gg['diff1'] del gg['diff2'] except Exception: print('') gg['Loc Events'] = self.loc.events_apps_1min['Apps'] apps = self.loc.metadata.get_channels() sd = {} #Initialize series with 0s for app in apps: sd[app] = Series(0, index=gg.index) #Count location events for each appliance for index, row in gg.iterrows(): try: if len(row['Loc Events']) > 0: for app in apps: n = row['Loc Events'].count(app) sd[app][index] = n except Exception: continue if self.loc.name == 'REDD': sd[(3,4)] = sd[3] sd[(10,20)] = sd[10] del sd[3] del sd[4] del sd[10] del sd[20] #Change column names and append them to gral table locevents = DataFrame(sd) locevents.columns = [(str(col) + ' locEv') for col in locevents] for locEv in locevents: gg[locEv] = locevents[locEv] #Get power values of each appliance and resample for 1min act = DataFrame(self.loc.appliances_consuming_times) act = act.resample('1Min') if self.loc.name == 'REDD': del act[3] del act[10] act.columns = [(3,4), 5,6,7,8,9,11,12,13,14,15,16,17,18,19,(10,20)] act.columns = [(str(col) + ' conEv') for col in act] for app in act: gg[app] = act[app] gg.columns = [str(col) for col in gg] gg = gg[sorted(gg.columns)] gg.to_csv(fn) return
def make_circles(tracks: pd.DataFrame, transformation: dict) -> list:
    def make_circle(track: pd.Series):
        return create_tag('circle', {
            'r': 16,
            'cx': transform_x(track['x'], transformation),
            'cy': transform_y(track['y'], transformation),
            'style': 'fill:{}'.format(get_color(track, tracks))
        })

    return [make_circle(row) for index, row in tracks.iterrows()]
def test_iterrows_corner(self):
    # gh-12222
    df = DataFrame(
        {'a': [datetime.datetime(2015, 1, 1)], 'b': [None], 'c': [None],
         'd': [''], 'e': [[]], 'f': [set()], 'g': [{}]})
    expected = Series(
        [datetime.datetime(2015, 1, 1), None, None, '', [], set(), {}],
        index=list('abcdefg'), name=0, dtype='object')
    _, result = next(df.iterrows())
    tm.assert_series_equal(result, expected)
def word_freq(file_name, suffix='_wordfreq', sep='\t', threshold=.5): print "start word_freq" # start = datetime.datetime.now() # print start reviews = pd.read_csv(file_name, error_bad_lines=False, sep=sep) cb = reviews['stopword_body'] rate = reviews['Rating'] # label all words with the rating cb_temp = [] for i, c in enumerate(cb): cb_temp.append([(w, rate[i]) for w in ast.literal_eval(c)]) reviews['stopword_body'] = cb_temp # calculate_time(start) # get the corpus of all reviews, lists of all words with label '''--------------------------------------------------------''' cop_wl = [] for b in cb_temp: # change the unicode data to the raw string # cop_wl += [(unicodedata.normalize('NFKD', w[0]).encode('utf-8','replace'), w[1]) for w in b if type(w[0])==unicode] cop_wl += b '''--------------------------------------------------------''' # calculate_time(start) # word frequency of the corpus with label wfq = nltk.FreqDist(cop_wl) # calculate_time(start) # get the word list of all reviews without label cop = [w[0] for w in cop_wl] cop = set(cop) cop_len = len(cop) # calculate_time(start) # get freq of all words in one list wfq_l = [] for w in cop: for i in range(1, 6): wfq_l.append(wfq[(w, i)]) # calculate_time(start) # reshape the list to a matrix wfq_mx = DataFrame(np.array(wfq_l).reshape((cop_len,5)), index=pd.Index(cop), columns=pd.Index([1,2,3,4,5])) # calculate_time(start) # calculate the prob of each rating w_s = [] w_sum = [] for i, r in wfq_mx.iterrows(): word_sum = wfq_mx.ix[i].sum() # wfq_mx.ix[i] = wfq_mx.ix[i]/word_sum w_s.append(word_useful_score(list(wfq_mx.ix[i]), word_sum)) w_sum.append(word_sum) wfq_mx['score'] = w_s wfq_mx['sum'] = w_sum wfq_mx = wfq_mx.sort(columns='sum').ix[-int(len(w_s) * threshold):,:] print wfq_mx wfq_mx.to_csv(file_name.split('.')[0] + suffix + '.' + file_name.split('.')[1], sep='\t')
def record_match_data(min_seq): matches = db.match.find({"match_seq_num": { '$gt': min_seq } }) # import pdb; pdb.set_trace() for match in matches: if match["human_players"] == 10 and match["duration"] > 1200: data_frame = DataFrame(match["players"]) radiant_heroes = data_frame[data_frame['player_slot']<128]['hero_id'].tolist() dire_heroes = data_frame[data_frame['player_slot']>=128]['hero_id'].tolist() for index, row in data_frame.iterrows(): if (row["player_slot"] < 128): radiant_heroes.remove(row["hero_id"]) teammate = radiant_heroes opponent = dire_heroes is_win = bool(match["radiant_win"]) else: dire_heroes.remove(row["hero_id"]) teammate = dire_heroes opponent = radiant_heroes is_win = not bool(match["radiant_win"]) record_json = json.loads(row.to_json()) record_json['win'] = is_win record_json['match_id'] = match['match_id'] record_json['match_seq'] = match['match_seq_num'] record_json['teammate'] = teammate record_json['opponent'] = opponent item = [] for x in range(0,6): if record_json["item_{}".format(x)]>0: if "item_{}" in record_json: item.append(record_json["item_{}".format(x)]) del record_json["item_{}".format(x)] if "item_{}_name" in record_json: del record_json["item_{}_name".format(x)] record_json['item'] = item count = statics_db.match_record.find({'$and':[{'hero_id':row['hero_id']},{'match_id':match['match_id']}]}).count() if count == 0: statics_db.match_record.insert_one(record_json) max_solved_seq_num = max(statics_db.max_solved_seq_num.find({"value_name":"max_solved_seq_num"})[0]["value"],match["match_seq_num"]) statics_db.max_solved_seq_num.update_one( {"value_name":"max_solved_seq_num"}, { "$set": { "value":max_solved_seq_num }, "$currentDate": {"lastModified": True} } ) logging.info("match handle:"+str(max_solved_seq_num))
def request_player_info_1(self, response):
    r_json = json.loads(response.body_as_unicode())
    result_set = r_json[u'resultSets'][0]
    df = DataFrame(data=result_set[u'rowSet'],
                   columns=result_set[u'headers']).set_index('PERSON_ID')

    for id_, data in df.iterrows():
        p = PlayerItem()
        p['nba_player_id'] = id_
        p['nba_player_code'] = data['PLAYERCODE']
        p['is_active'] = bool(data['ROSTERSTATUS'])

        yield FormRequest(
            url='http://stats.nba.com/stats/commonplayerinfo/',
            method='GET',
            formdata={'PlayerID': str(id_)},
            meta=dict(player=p),
            callback=self.request_player_info_2
        )
def train(self, training_data):
    preds = DataFrame(training_data['prediction'])
    preds['actual'] = training_data['actual']
    pred_cols = len(training_data['prediction'].keys())
    results = DataFrame()
    for row in preds.iterrows():
        index, data = row
        results = results.append(
            data[range(pred_cols)] == data['actual']
        )
    for k in preds[range(pred_cols)]:
        self.weights[k] = 1 / variance(1 - results[k])
    # If we have infinite weights make them 2x the sum of the others
    if any(x == inf for x in self.weights):
        tot_weight = sum(
            [x for x in self.weights.values() if x != inf]
        )
        for wk in self.weights:
            if self.weights[wk] == inf:
                self.weights[wk] = 2 * tot_weight
def test_append_list_of_series_dicts(self):
    df = DataFrame(np.random.randn(5, 4),
                   columns=['foo', 'bar', 'baz', 'qux'])

    dicts = [x.to_dict() for idx, x in df.iterrows()]

    result = df.append(dicts, ignore_index=True)
    expected = df.append(df, ignore_index=True)
    assert_frame_equal(result, expected)

    # different columns
    dicts = [{'foo': 1, 'bar': 2, 'baz': 3, 'peekaboo': 4},
             {'foo': 5, 'bar': 6, 'baz': 7, 'peekaboo': 8}]
    result = df.append(dicts, ignore_index=True, sort=True)
    expected = df.append(DataFrame(dicts), ignore_index=True, sort=True)
    assert_frame_equal(result, expected)
def test_mixed_index_at_iat_loc_iloc_dataframe(self):
    # GH 19860
    df = DataFrame([[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]],
                   columns=['a', 'b', 'c', 1, 2])
    for rowIdx, row in df.iterrows():
        for el, item in row.iteritems():
            assert df.at[rowIdx, el] == df.loc[rowIdx, el] == item

    for row in range(2):
        for i in range(5):
            assert df.iat[row, i] == df.iloc[row, i] == row * 5 + i

    with pytest.raises(KeyError):
        df.at[0, 3]
    with pytest.raises(KeyError):
        df.loc[0, 3]
def load_wkt_layer_from_dataframe(self, df: DataFrame, wkt_column_name: str, color: Union[str, List[str]] = '#0078d7', fill_color: Union[str, List[str]] = '#0048a7', fill_alpha: float = 0.2, group_name=None, change_bounds_on_click=False, pop_up: bool = True): """ loads additional layer to the map Args: df (Pandas dataframe,default=False) dataframe with at least one geographic column wkt_column_name (str,default=False) the name of the geometry column to show the geometries should be in the format of wkt string if you are using oracle, select the sdo_geometry with the oracle function sdo_util.to_wktgeometry color (str,default=True) the color to use when drawing the geoms on the map examples - blue,white,#0078d7, #9999d9 fill_color(str,default=True) the color to fill when its complex geometry as polygon fill_alpha(float) the opacity of the fill color, between 0 and 1 group_name: (str,default=None) will take geometries and create a group in the LayerControl, gives a specific name for the group. If None then does'nt group the geometries. change_bounds_on_click: whether a mouse-click on change the map bounds to the object pop_up: whether a mouse-click on the object will pop-up """ part_func = lambda x, color_index: {'color': color[color_index] if type(color) is not str else color, 'fillColor': fill_color[color_index] if type( fill_color) is not str else fill_color, 'fillOpacity': fill_alpha} object_to_add = self.map if group_name is not None: object_to_add = FeatureGroup(name=group_name) for index, row in df.iterrows(): geom_dict = wkt.loads(row[wkt_column_name]) shp_geom = loads(row[wkt_column_name]) row_formatted = "" for index_c, column in enumerate(row.index.values): if column == wkt_column_name: pass else: row_formatted += "<b>{}</b>: {} <br/>".format(column, row[column]) row_formatted += "<b>{}</b>: {} <br/>".format("GEOM CENTROID", shp_geom.centroid) feature = (folium.GeoJson if change_bounds_on_click else NoClickGeoJson)( geom_dict, style_function=partial(part_func, color_index=index)) if pop_up: popup = folium.Popup(row_formatted.replace("'", "\"")) popup.add_to(feature) feature.add_to(object_to_add) if group_name is not None: object_to_add.add_to(self.map)
class DifferentialFVA(StrainDesignMethod): r"""Differential flux variability analysis. Compares flux ranges of a reference model to a set of models that have been parameterized to lie on a grid of evenly spaced points in the n-dimensional production envelope (n being the number of reaction bounds to be varied). :: production ^ |---------. * reference_model | . . . . .\ . design_space_model | . . . . . \ | . . . . . .\ | . . . . . . \ o--------------*- > growth Overexpression, downregulation, knockout, flux-reversal and other strain engineering targets can be inferred from the resulting comparison. Parameters ---------- design_space_model : cobra.Model A model whose flux ranges will be scanned. objective : str or Reaction or Metabolite A reaction whose flux or a metabolite whose production should be maximized. variables : iterable, optional A iterable of n reactions (or IDs) to be scanned (defaults to current objective in design_space_model). reference_model : cobra.Model, optional A model whose flux ranges represent the reference state and all calculated flux ranges will be compared to. Defaults to design_space_model constrained to its maximum objective value. exclude : iterable An iterable of reactions (or IDs) to be excluded in the analysis (exchange reactions will not be analyzed automatically). normalize_ranges_by : str or Reaction, optional A reaction ID that specifies a flux by whom all calculated flux ranges will be normalized by. points : int, optional Number of points to lay on the surface of the n-dimensional production envelope (defaults to 10). Examples -------- >>> from cameo import models >>> from cameo.strain_design.deterministic import DifferentialFVA >>> model = models.bigg.e_coli_core >>> reference_model = model.copy() >>> reference_model.reactions.Biomass_Ecoli_core_w_GAM.lower_bound = reference_model.optimize().objective_value >>> diffFVA = DifferentialFVA(design_space_model=model, reference_model=reference_model, objective=model.reactions.EX_succ_e, variables=[model.reactions.Biomass_Ecoli_core_w_GAM], normalize_ranges_by=model.reactions.Biomass_Ecoli_core_w_GAM, points=10) >>> result = diffFVA.run(surface_only=True) >>> result.plot() """ def __init__(self, design_space_model, objective, variables=None, reference_model=None, exclude=(), normalize_ranges_by=None, points=10): super(DifferentialFVA, self).__init__() self.design_space_model = design_space_model self.design_space_nullspace = nullspace( create_stoichiometric_array(self.design_space_model)) if reference_model is None: self.reference_model = self.design_space_model.copy() fix_objective_as_constraint(self.reference_model) self.reference_nullspace = self.design_space_nullspace else: self.reference_model = reference_model self.reference_nullspace = nullspace( create_stoichiometric_array(self.reference_model)) if isinstance(objective, Reaction): self.objective = objective.id elif isinstance(objective, Metabolite): try: self.reference_model.add_boundary(objective, type='demand') except ValueError: pass try: self.objective = self.design_space_model.add_boundary( objective, type='demand').id except ValueError: self.objective = self.design_space_model.reactions.get_by_id( "DM_" + objective.id).id elif isinstance(objective, six.string_types): self.objective = objective else: raise ValueError( 'You need to provide an objective as a Reaction, Metabolite or a reaction id' ) if variables is None: # try to establish the current objective reaction obj_var_ids = [ variable.name for variable in 
self.design_space_model.objective.expression.free_symbols ] obj_var_ids = [re.sub('_reverse.*', '', id) for id in obj_var_ids] if len(set(obj_var_ids)) != 1: raise ValueError( "The current objective in design_space_model is not a single reaction objective. " "DifferentialFVA does not support composite objectives.") else: self.variables = [ self.design_space_model.reactions.get_by_id( obj_var_ids[0]).id ] else: self.variables = list() for variable in variables: if isinstance(variable, Reaction): self.variables.append(variable.id) else: self.variables.append(variable) self.exclude = list() for elem in exclude: if isinstance(elem, Reaction): self.exclude.append(elem.id) else: self.exclude.append(elem) design_space_blocked_reactions = find_blocked_reactions_nullspace( self.design_space_model, self.design_space_nullspace) self.exclude += [ reaction.id for reaction in design_space_blocked_reactions ] reference_blocked_reactions = find_blocked_reactions_nullspace( self.reference_model, self.reference_nullspace) self.exclude += [ reaction.id for reaction in reference_blocked_reactions ] self.exclude += [ reaction.id for reaction in self.design_space_model.exchanges ] self.exclude += [ reaction.id for reaction in self.reference_model.exchanges ] self.exclude += [ reaction.id for reaction in self.design_space_model.reactions if _BIOMASS_RE_.match(reaction.id) ] self.exclude = set(self.exclude) self.points = points self.envelope = None self.grid = None self.reference_flux_ranges = None self.reference_flux_dist = None if isinstance(normalize_ranges_by, Reaction): self.normalize_ranges_by = normalize_ranges_by.id else: self.normalize_ranges_by = normalize_ranges_by @staticmethod def _interval_overlap(interval1, interval2): return min(interval1[1] - interval2[0], interval2[1] - interval1[0]) @classmethod def _interval_gap(cls, interval1, interval2): overlap = cls._interval_overlap(interval1, interval2) if overlap >= 0: return 0 else: if abs(interval1[1]) > abs(interval2[1]): return overlap else: return -1 * overlap def _init_search_grid(self, surface_only=False, improvements_only=True): """Initialize the grid of points to be scanned within the production envelope.""" self.envelope = phenotypic_phase_plane(self.design_space_model, self.variables, objective=self.objective, points=self.points) intervals = self.envelope[[ 'objective_lower_bound', 'objective_upper_bound' ]].copy() intervals['objective_lower_bound'] = float_floor( intervals.objective_lower_bound, ndecimals) intervals['objective_upper_bound'] = float_ceil( intervals.objective_upper_bound, ndecimals) max_distance = 0. 
max_interval = None for i, (lb, ub) in intervals.iterrows(): distance = abs(ub - lb) if distance > max_distance: max_distance = distance max_interval = (lb, ub) step_size = (max_interval[1] - max_interval[0]) / (self.points - 1) grid = list() minimal_reference_production = self.reference_flux_ranges[ 'lower_bound'][self.objective] for i, row in self.envelope.iterrows(): variables = row[self.variables] lb = row.objective_lower_bound if improvements_only: lb = max(lb, minimal_reference_production) + step_size ub = row.objective_upper_bound if not surface_only: coordinate = lb while coordinate < ub: grid.append(list(variables.values) + [coordinate]) coordinate += step_size if improvements_only and ub <= minimal_reference_production: continue else: grid.append(list(variables.values) + [ub]) columns = self.variables + [self.objective] self.grid = DataFrame(grid, columns=columns) def run(self, surface_only=True, improvements_only=True, progress=True, view=None): """Run the differential flux variability analysis. Parameters ---------- surface_only : bool, optional If only the surface of the n-dimensional production envelope should be scanned (defaults to True). improvements_only : bool, optional If only grid points should should be scanned that constitute and improvement in production over the reference state (defaults to True). progress : bool, optional If a progress bar should be shown. view : SequentialView or MultiprocessingView or ipython.cluster.DirectView, optional A parallelization view (defaults to SequentialView). Returns ------- pandas.Panel A pandas Panel containing a results DataFrame for every grid point scanned. """ with TimeMachine() as tm: # Make sure that the design_space_model is initialized to its original state later for variable in self.variables: reaction = self.design_space_model.reactions.get_by_id( variable) tm(do=int, undo=partial(setattr, reaction, 'lower_bound', reaction.lower_bound)) tm(do=int, undo=partial(setattr, reaction, 'upper_bound', reaction.upper_bound)) target_reaction = self.design_space_model.reactions.get_by_id( self.objective) tm(do=int, undo=partial(setattr, target_reaction, 'lower_bound', target_reaction.lower_bound)) tm(do=int, undo=partial(setattr, target_reaction, 'upper_bound', target_reaction.upper_bound)) if view is None: view = config.default_view else: view = view included_reactions = [ reaction.id for reaction in self.reference_model.reactions if reaction.id not in self.exclude ] + self.variables + [self.objective] self.reference_flux_dist = pfba(self.reference_model, fraction_of_optimum=0.99) self.reference_flux_ranges = flux_variability_analysis( self.reference_model, reactions=included_reactions, view=view, remove_cycles=False, fraction_of_optimum=0.75).data_frame self._init_search_grid(surface_only=surface_only, improvements_only=improvements_only) func_obj = _DifferentialFvaEvaluator(self.design_space_model, self.variables, self.objective, included_reactions) if progress: progress = ProgressBar(len(self.grid)) results = list( progress(view.imap(func_obj, self.grid.iterrows()))) else: results = list(view.map(func_obj, self.grid.iterrows())) solutions = dict((tuple(point.iteritems()), fva_result) for (point, fva_result) in results) reference_intervals = self.reference_flux_ranges[[ 'lower_bound', 'upper_bound' ]].values for sol in six.itervalues(solutions): intervals = sol[['lower_bound', 'upper_bound']].values gaps = [ self._interval_gap(interval1, interval2) for interval1, interval2 in my_zip(reference_intervals, intervals) ] sol['gaps'] 
= gaps if self.normalize_ranges_by is not None: normalizer = sol.lower_bound[self.normalize_ranges_by] if normalizer > non_zero_flux_threshold: normalized_intervals = sol[['lower_bound', 'upper_bound' ]].values / normalizer sol['normalized_gaps'] = [ self._interval_gap(interval1, interval2) for interval1, interval2 in my_zip( reference_intervals, normalized_intervals) ] else: sol['normalized_gaps'] = [numpy.nan] * len(sol.lower_bound) else: sol['normalized_gaps'] = gaps ref_upper_bound = self.reference_flux_ranges.upper_bound.apply( lambda v: 0 if abs(v) < non_zero_flux_threshold else v) ref_lower_bound = self.reference_flux_ranges.lower_bound.apply( lambda v: 0 if abs(v) < non_zero_flux_threshold else v) collection = list() for key, df in six.iteritems(solutions): df['biomass'] = key[0][1] df['production'] = key[1][1] df['KO'] = False df['flux_reversal'] = False df['suddenly_essential'] = False df['free_flux'] = False df.loc[(df.lower_bound == 0) & (df.upper_bound == 0) & (ref_upper_bound != 0) & (ref_lower_bound != 0), 'KO'] = True df.loc[((ref_upper_bound < 0) & (df.lower_bound > 0) | ((ref_lower_bound > 0) & (df.upper_bound < 0))), 'flux_reversal'] = True df.loc[((df.lower_bound <= 0) & (df.lower_bound > 0)) | ((ref_lower_bound >= 0) & (df.upper_bound <= 0)), 'suddenly_essential'] = True is_reversible = numpy.asarray([ self.design_space_model.reactions.get_by_id(i).reversibility for i in df.index ], dtype=bool) not_reversible = numpy.logical_not(is_reversible) df.loc[((df.lower_bound == -1000) & (df.upper_bound == 1000) & is_reversible) | ((df.lower_bound == 0) & (df.upper_bound == 1000) & not_reversible) | ((df.lower_bound == -1000) & (df.upper_bound == 0) & not_reversible), 'free_flux'] = True df['reaction'] = df.index df['excluded'] = df['reaction'].isin(self.exclude) collection.append(df) # multi_index = [(key[0][1], key[1][1]) for key in solutions] # solutions_multi_index = pandas.concat(list(solutions.values()), # axis=0, keys=multi_index)# # solutions_multi_index.index.set_names(['biomass', 'production', # 'reaction'], inplace=True) total = pandas.concat(collection, ignore_index=True, copy=False) total.sort_values(['biomass', 'production', 'reaction'], inplace=True) total.index = total['reaction'] return DifferentialFVAResult(total, self.envelope, self.reference_flux_ranges, self.reference_flux_dist)
def calcualteAverageSimilarity(similarityDataFrame: DataFrame) -> Series:
    averageSimilarityList = [None] * len(similarityDataFrame)
    for index, row in similarityDataFrame.iterrows():
        averageSimilarityList[index] = row.mean()
    return Series(averageSimilarityList)
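# Usage sketch for calcualteAverageSimilarity(): one mean per row of a
# similarity matrix, returned as a Series. Note the function indexes the
# result list by the row label, so it assumes a default 0..n-1 integer index,
# as in this illustrative example.
import pandas as pd

sim = pd.DataFrame([[1.0, 0.2, 0.4],
                    [0.2, 1.0, 0.6],
                    [0.4, 0.6, 1.0]])
print(calcualteAverageSimilarity(sim))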
def process_movie_keywords(session, data: pd.DataFrame): """ Attaches genre keywords to movie records """ movie_title_index = {} movie_index = src.controller.movie.MovieLookupIndex(logger).query() genre_index = src.controller.fields.GenreIndexLookup(logger).query() keyword_index = src.controller.fields.PlotKeywordIndexLookup( logger).query() actor_index = src.controller.person.PersonIndexLookup(logger).query() print('Updating movie mappings') for i, record in data.iterrows(): record_no = i + 1 if record_no % 500 == 0: print('\tProcessing record #%s' % record_no) # Get searchable title+year of movie record movie_title = record['movie_title'].strip() movie_title_l = movie_title.lower() movie_year = record['title_year'] if pd.isna(movie_title_l): continue if pd.isna(movie_year): movie_year = '' else: movie_year = str(movie_year) movie_record_index = (movie_title_l, movie_year) if movie_record_index in movie_title_index: continue else: # Mark movie by record number in dataframe movie_title_index[movie_title_l] = record_no # Get movies, keywords, and genre ids movie_pk = movie_index[movie_record_index] if pd.isna(record['genres']): genre_names = [] else: genre_names = [ name.strip().lower() for name in record['genres'].split('|') ] genre_pks = [genre_index[name] for name in genre_names] if pd.isna(record['plot_keywords']): keyword_names = [] else: keyword_names = [ name.strip().lower() for name in record['plot_keywords'].split('|') ] keyword_pks = [keyword_index[name] for name in keyword_names] # Get actor ids actor_names = [] for actor_field in ('actor_1_name', 'actor_2_name', 'actor_3_name'): if not pd.isna(record[actor_field]): actor_names.append(record[actor_field].strip().lower()) actor_pks = [actor_index[name] for name in actor_names] # Add genres and keywords to movie src.controller.movie.AttachMovieGenre(logger=logger, session=session, commit_enabled=False).execute( movie_pk=movie_pk, genre_pks=genre_pks) src.controller.movie.AttachMoviePlotKeywords( logger=logger, session=session, commit_enabled=False).execute(movie_pk=movie_pk, keyword_pks=keyword_pks) # Add actors to movie src.controller.movie.AttachMovieActors(logger=logger, session=session, commit_enabled=False).execute( movie_pk=movie_pk, actor_pks=actor_pks) session.commit()
class Iteration: # mem_itertuples_* benchmarks are slow timeout = 120 def setup(self): N = 1000 self.df = DataFrame(np.random.randn(N * 10, N)) self.df2 = DataFrame(np.random.randn(N * 50, 10)) self.df3 = DataFrame(np.random.randn(N, 5 * N), columns=["C" + str(c) for c in range(N * 5)]) self.df4 = DataFrame(np.random.randn(N * 1000, 10)) def time_items(self): # (monitor no-copying behaviour) if hasattr(self.df, "_item_cache"): self.df._item_cache.clear() for name, col in self.df.items(): pass def time_items_cached(self): for name, col in self.df.items(): pass def time_iteritems_indexing(self): for col in self.df3: self.df3[col] def time_itertuples_start(self): self.df4.itertuples() def time_itertuples_read_first(self): next(self.df4.itertuples()) def time_itertuples(self): for row in self.df4.itertuples(): pass def time_itertuples_to_list(self): list(self.df4.itertuples()) def mem_itertuples_start(self): return self.df4.itertuples() def peakmem_itertuples_start(self): self.df4.itertuples() def mem_itertuples_read_first(self): return next(self.df4.itertuples()) def peakmem_itertuples(self): for row in self.df4.itertuples(): pass def mem_itertuples_to_list(self): return list(self.df4.itertuples()) def peakmem_itertuples_to_list(self): list(self.df4.itertuples()) def time_itertuples_raw_start(self): self.df4.itertuples(index=False, name=None) def time_itertuples_raw_read_first(self): next(self.df4.itertuples(index=False, name=None)) def time_itertuples_raw_tuples(self): for row in self.df4.itertuples(index=False, name=None): pass def time_itertuples_raw_tuples_to_list(self): list(self.df4.itertuples(index=False, name=None)) def mem_itertuples_raw_start(self): return self.df4.itertuples(index=False, name=None) def peakmem_itertuples_raw_start(self): self.df4.itertuples(index=False, name=None) def peakmem_itertuples_raw_read_first(self): next(self.df4.itertuples(index=False, name=None)) def peakmem_itertuples_raw(self): for row in self.df4.itertuples(index=False, name=None): pass def mem_itertuples_raw_to_list(self): return list(self.df4.itertuples(index=False, name=None)) def peakmem_itertuples_raw_to_list(self): list(self.df4.itertuples(index=False, name=None)) def time_iterrows(self): for row in self.df.iterrows(): pass
def present(duration=120, eeg=None, save_fn=None): n_trials = 2010 iti = 0.4 soa = 0.3 jitter = 0.2 record_duration = np.float32(duration) markernames = [1, 2] # Setup trial list image_type = np.random.binomial(1, 0.5, n_trials) trials = DataFrame( dict(image_type=image_type, timestamp=np.zeros(n_trials))) def load_image(fn): return visual.ImageStim(win=mywin, image=fn) # Setup graphics mywin = visual.Window([1600, 900], monitor='testMonitor', units="deg", fullscr=True) targets = list(map(load_image, glob(os.path.join(CAT_DOG, 'target-*.jpg')))) nontargets = list( map(load_image, glob(os.path.join(CAT_DOG, 'nontarget-*.jpg')))) stim = [nontargets, targets] # start the EEG stream, will delay 5 seconds to let signal settle if eeg: if save_fn is None: # If no save_fn passed, generate a new unnamed save file save_fn = generate_save_fn(eeg.device_name, 'visual_p300', 'unnamed') print( f'No path for a save file was passed to the experiment. Saving data to {save_fn}' ) eeg.start(save_fn, duration=record_duration) # Show instructions show_instructions(duration=duration) # Iterate through the events start = time() for ii, trial in trials.iterrows(): # Inter trial interval core.wait(iti + np.random.rand() * jitter) # Select and display image label = trials['image_type'].iloc[ii] image = choice(targets if label == 1 else nontargets) image.draw() # Push sample if eeg: timestamp = time() if eeg.backend == 'muselsl': marker = [markernames[label]] else: marker = markernames[label] eeg.push_sample(marker=marker, timestamp=timestamp) mywin.flip() # offset core.wait(soa) mywin.flip() if len(event.getKeys()) > 0 or (time() - start) > record_duration: break event.clearEvents() # Cleanup if eeg: eeg.stop() mywin.close()
def present(duration=365, eeg=None, save_fn=None, iti=0., soa=1.0, jitter=0., n_trials=180, cf1=1000, amf1=40): # Create markers stream outlet info = StreamInfo("Markers", "Markers", 1, 0, "int32", "myuidw43536") outlet = StreamOutlet(info) markernames = [1] start = time() # Set up trial parameters record_duration = np.float32(duration) # Set up trial list stim_freq = np.zeros((n_trials, ), dtype=int) trials = DataFrame(dict(stim_freq=stim_freq, timestamp=np.zeros(n_trials))) # Setup graphics mywin = visual.Window([1920, 1080], monitor="testMonitor", units="deg", fullscr=True) fixation = visual.GratingStim(win=mywin, size=0.2, pos=[0, 0], sf=0, rgb=[1, 0, 0]) fixation.setAutoDraw(True) # Generate stimuli am1 = generate_am_waveform(cf1, amf1, secs=soa, sample_rate=44100) aud1 = sound.Sound(am1) aud1.setVolume(0.8) auds = [aud1] mywin.flip() # Show the instructions screen show_instructions(10) # start the EEG stream= if eeg: eeg.start(save_fn, duration=record_duration) for ii, trial in trials.iterrows(): # Intertrial interval core.wait(iti + np.random.rand() * jitter) # Select stimulus frequency ind = trials["stim_freq"].iloc[ii] auds[ind].stop() auds[ind].play() # Push sample if eeg: timestamp = time() if eeg.backend == "muselsl": marker = [markernames[ind]] marker = list(map(int, marker)) else: marker = markernames[ind] eeg.push_sample(marker=marker, timestamp=timestamp) # offset core.wait(soa) mywin.flip() if len(event.getKeys()) > 0: break if (time() - start) > record_duration: break event.clearEvents() # Cleanup if eeg: eeg.stop() mywin.close()
def get_analytical_parameter_table( hierarchical_candidate_ids: list, parameter_type: str, condition_id_to_index: Dict[str, int], measurement_df: pd.DataFrame, observable_ids, condition_map, no_preeq_condition_idx: int) -> List[Tuple[int, int, int]]: """Generate (scalingIdx, conditionIdx, observableIdx) table for all occurrences of the given parameter names. Parameters: hierarchical_candidate_ids: Ids of optimization parameters for hierarchical optimization. This table depends on ordering of this list. parameter_type: 'observable' or 'noise' Returns: list of (scalingIdx, conditionIdx, observableIdx) tuples """ # need list, not ndarray condition_map_list = [list(x) for x in condition_map] if parameter_type == 'observable': def _get_overrides(): return split_parameter_replacement_list(row.observableParameters) elif parameter_type == 'noise': def _get_overrides(): return split_parameter_replacement_list(row.noiseParameters) else: raise ValueError("parameter_type must be 'noise' or " f"'observable', but got {parameter_type}") use = [] for _, row in measurement_df.iterrows(): overrides = _get_overrides() sim_cond_idx = \ condition_id_to_index[row.simulationConditionId] preeq_cond_idx = no_preeq_condition_idx if not isnan(row.preequilibrationConditionId): preeq_cond_idx = condition_id_to_index[ row.preequilibrationConditionId] for s in overrides: # print(s, parametersForHierarchical) try: candidate_idx = hierarchical_candidate_ids.index(s) except ValueError: continue # current parameter not in list condition_idx = condition_map_list.index( [preeq_cond_idx, sim_cond_idx]) observable_idx = observable_ids.index(row.observableId) tup = (candidate_idx, condition_idx, observable_idx) # Don't add a new line for each timepoint # We don't allow separate parameters for individual time-points # (Can be implemented via different observables) if tup not in use: use.append(tup) if not len(use): raise AssertionError("Candidates were: " f"{hierarchical_candidate_ids} but nothing " "usable found") return use
def df_to_hover_text(df: pd.DataFrame):
    return [row_to_hover_text(row) for _, row in df.iterrows()]
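# Usage sketch for df_to_hover_text(). `row_to_hover_text` is not part of the
# snippet above; a plausible stand-in that joins column/value pairs is assumed.
import pandas as pd

def row_to_hover_text(row: pd.Series) -> str:
    # hypothetical helper: "<column>: <value>" pairs joined with HTML line breaks
    return '<br>'.join('{}: {}'.format(k, v) for k, v in row.items())

points = pd.DataFrame({'city': ['Oslo', 'Bergen'], 'pop': [700000, 280000]})
print(df_to_hover_text(points))   # one hover string per row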
def test_unstack_nan_index(self): # GH7466 def cast(val): val_str = "" if val != val else val return f"{val_str:1}" def verify(df): mk_list = lambda a: list(a) if isinstance(a, tuple) else [a] rows, cols = df.notna().values.nonzero() for i, j in zip(rows, cols): left = sorted(df.iloc[i, j].split(".")) right = mk_list(df.index[i]) + mk_list(df.columns[j]) right = sorted(map(cast, right)) assert left == right df = DataFrame( { "jim": ["a", "b", np.nan, "d"], "joe": ["w", "x", "y", "z"], "jolie": ["a.w", "b.x", " .y", "d.z"], } ) left = df.set_index(["jim", "joe"]).unstack()["jolie"] right = df.set_index(["joe", "jim"]).unstack()["jolie"].T tm.assert_frame_equal(left, right) for idx in itertools.permutations(df.columns[:2]): mi = df.set_index(list(idx)) for lev in range(2): udf = mi.unstack(level=lev) assert udf.notna().values.sum() == len(df) verify(udf["jolie"]) df = DataFrame( { "1st": ["d"] * 3 + [np.nan] * 5 + ["a"] * 2 + ["c"] * 3 + ["e"] * 2 + ["b"] * 5, "2nd": ["y"] * 2 + ["w"] * 3 + [np.nan] * 3 + ["z"] * 4 + [np.nan] * 3 + ["x"] * 3 + [np.nan] * 2, "3rd": [ 67, 39, 53, 72, 57, 80, 31, 18, 11, 30, 59, 50, 62, 59, 76, 52, 14, 53, 60, 51, ], } ) df["4th"], df["5th"] = ( df.apply(lambda r: ".".join(map(cast, r)), axis=1), df.apply(lambda r: ".".join(map(cast, r.iloc[::-1])), axis=1), ) for idx in itertools.permutations(["1st", "2nd", "3rd"]): mi = df.set_index(list(idx)) for lev in range(3): udf = mi.unstack(level=lev) assert udf.notna().values.sum() == 2 * len(df) for col in ["4th", "5th"]: verify(udf[col]) # GH7403 df = pd.DataFrame({"A": list("aaaabbbb"), "B": range(8), "C": range(8)}) df.iloc[3, 1] = np.NaN left = df.set_index(["A", "B"]).unstack(0) vals = [ [3, 0, 1, 2, np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan, 4, 5, 6, 7], ] vals = list(map(list, zip(*vals))) idx = Index([np.nan, 0, 1, 2, 4, 5, 6, 7], name="B") cols = MultiIndex( levels=[["C"], ["a", "b"]], codes=[[0, 0], [0, 1]], names=[None, "A"] ) right = DataFrame(vals, columns=cols, index=idx) tm.assert_frame_equal(left, right) df = DataFrame({"A": list("aaaabbbb"), "B": list(range(4)) * 2, "C": range(8)}) df.iloc[2, 1] = np.NaN left = df.set_index(["A", "B"]).unstack(0) vals = [[2, np.nan], [0, 4], [1, 5], [np.nan, 6], [3, 7]] cols = MultiIndex( levels=[["C"], ["a", "b"]], codes=[[0, 0], [0, 1]], names=[None, "A"] ) idx = Index([np.nan, 0, 1, 2, 3], name="B") right = DataFrame(vals, columns=cols, index=idx) tm.assert_frame_equal(left, right) df = pd.DataFrame( {"A": list("aaaabbbb"), "B": list(range(4)) * 2, "C": range(8)} ) df.iloc[3, 1] = np.NaN left = df.set_index(["A", "B"]).unstack(0) vals = [[3, np.nan], [0, 4], [1, 5], [2, 6], [np.nan, 7]] cols = MultiIndex( levels=[["C"], ["a", "b"]], codes=[[0, 0], [0, 1]], names=[None, "A"] ) idx = Index([np.nan, 0, 1, 2, 3], name="B") right = DataFrame(vals, columns=cols, index=idx) tm.assert_frame_equal(left, right) # GH7401 df = pd.DataFrame( { "A": list("aaaaabbbbb"), "B": (date_range("2012-01-01", periods=5).tolist() * 2), "C": np.arange(10), } ) df.iloc[3, 1] = np.NaN left = df.set_index(["A", "B"]).unstack() vals = np.array([[3, 0, 1, 2, np.nan, 4], [np.nan, 5, 6, 7, 8, 9]]) idx = Index(["a", "b"], name="A") cols = MultiIndex( levels=[["C"], date_range("2012-01-01", periods=5)], codes=[[0, 0, 0, 0, 0, 0], [-1, 0, 1, 2, 3, 4]], names=[None, "B"], ) right = DataFrame(vals, columns=cols, index=idx) tm.assert_frame_equal(left, right) # GH4862 vals = [ ["Hg", np.nan, np.nan, 680585148], ["U", 0.0, np.nan, 680585148], ["Pb", 7.07e-06, np.nan, 680585148], 
["Sn", 2.3614e-05, 0.0133, 680607017], ["Ag", 0.0, 0.0133, 680607017], ["Hg", -0.00015, 0.0133, 680607017], ] df = DataFrame( vals, columns=["agent", "change", "dosage", "s_id"], index=[17263, 17264, 17265, 17266, 17267, 17268], ) left = df.copy().set_index(["s_id", "dosage", "agent"]).unstack() vals = [ [np.nan, np.nan, 7.07e-06, np.nan, 0.0], [0.0, -0.00015, np.nan, 2.3614e-05, np.nan], ] idx = MultiIndex( levels=[[680585148, 680607017], [0.0133]], codes=[[0, 1], [-1, 0]], names=["s_id", "dosage"], ) cols = MultiIndex( levels=[["change"], ["Ag", "Hg", "Pb", "Sn", "U"]], codes=[[0, 0, 0, 0, 0], [0, 1, 2, 3, 4]], names=[None, "agent"], ) right = DataFrame(vals, columns=cols, index=idx) tm.assert_frame_equal(left, right) left = df.loc[17264:].copy().set_index(["s_id", "dosage", "agent"]) tm.assert_frame_equal(left.unstack(), right) # GH9497 - multiple unstack with nulls df = DataFrame( { "1st": [1, 2, 1, 2, 1, 2], "2nd": pd.date_range("2014-02-01", periods=6, freq="D"), "jim": 100 + np.arange(6), "joe": (np.random.randn(6) * 10).round(2), } ) df["3rd"] = df["2nd"] - pd.Timestamp("2014-02-02") df.loc[1, "2nd"] = df.loc[3, "2nd"] = np.nan df.loc[1, "3rd"] = df.loc[4, "3rd"] = np.nan left = df.set_index(["1st", "2nd", "3rd"]).unstack(["2nd", "3rd"]) assert left.notna().values.sum() == 2 * len(df) for col in ["jim", "joe"]: for _, r in df.iterrows(): key = r["1st"], (col, r["2nd"], r["3rd"]) assert r[col] == left.loc[key]
def caltrack(df: pd.DataFrame, cali: (list, str) = None, bit: (list, str) = None, lims: (list) = None, cal_lim: list = [5, 20], dtick: bool = False, fill: bool = False, fontsize: int = 8, grid_numbers: list = [11, 51], steps: list = None, correlation: pd.DataFrame = None, ax=None, cal_kw: dict = {}, corr_kw: dict = {}, bit_kw: dict = {}, depth_ref: str = 'md', cal_colormap: str = 'winter', bit_colormap: str = 'bone'): """caltrack [summary] Parameters ---------- df : pd.DataFrame [description] cali : [type], optional [description], by default None bit : [type], optional [description], by default None lims : [type], optional [description], by default None cal_lim : list, optional [description], by default [5,20] dtick : bool, optional [description], by default False fill : bool, optional [description], by default False fontsize : int, optional [description], by default 8 grid_numbers : list, optional [description], by default [11,51] steps : list, optional [description], by default None correlation : pd.DataFrame, optional [description], by default None ax : [type], optional [description], by default None cal_kw : dict, optional [description], by default {} corr_kw : dict, optional [description], by default {} bit_kw : dict, optional [description], by default {} depth_ref : str, optional [description], by default 'md' cal_colormap : str, optional [description], by default 'winter' bit_colormap : str, optional [description], by default 'bone' """ assert isinstance(df, pd.DataFrame) assert depth_ref in ['md', 'tvd', 'tvdss' ], "depth_ref can only be one of ['md','tvd','tvdss']" cal = ax or plt.gca() def_cal_kw = {'color': 'black', 'linestyle': '-', 'linewidth': 1} for (k, v) in def_cal_kw.items(): if k not in cal_kw: cal_kw[k] = v def_bit_kw = {'color': 'darkred', 'linestyle': '--', 'linewidth': 2} for (k, v) in def_bit_kw.items(): if k not in bit_kw: bit_kw[k] = v def_corr_kw = {'color': 'red', 'linestyle': '--', 'linewidth': 2} for (k, v) in def_corr_kw.items(): if k not in corr_kw: corr_kw[k] = v #Set lims if lims == None: #Depth Limits lims = [df.index.min(), df.index.max()] cal.set_ylim([lims[1], lims[0]]) #Set the vertical grid spacing if steps is None: mayor_grid = np.linspace(lims[0], lims[1], grid_numbers[0]) minor_grid = np.linspace(lims[0], lims[1], grid_numbers[1]) else: mayor_grid = np.arange(lims[0], lims[1], steps[0]) minor_grid = np.arange(lims[0], lims[1], steps[1]) depth = df.index if depth_ref == 'md' else df[depth_ref] if cali is not None: if isinstance(cali, str): cal.plot(df[cali], depth, **cal_kw) #Plotting elif isinstance(cali, list): cmap = mpl.cm.get_cmap(cal_colormap, len(cal)) for i, c in enumerate(cal): cal_kw['color'] = cmap(i) cal.plot(df[c], depth, **cal_kw) if bit is not None: cal.plot(df[bit], depth, **bit_kw) cal.set_xlim(cal_lim) cal.set_xlabel("Caliper [in]") cal.set_xticks(np.linspace(cal_lim[0], cal_lim[1], 4)) cal.set_xticklabels( np.round(np.linspace(cal_lim[0], cal_lim[1], 4), decimals=1)) cal.xaxis.tick_top() cal.xaxis.set_label_position("top") cal.tick_params("both", labelsize=fontsize) cal.set_yticks(mayor_grid) cal.set_yticks(minor_grid, minor=True) if dtick == True: cal.set_yticklabels(mayor_grid) else: cal.set_yticklabels([]) if fill == True: cal.fill_betweenx(depth, df[cali], df[bit], where=(df[cali] > df[bit]), color="orange") cal.fill_betweenx(depth, df[cali], df[bit], where=(df[cali] < df[bit]), color="gray") #Add Correlation Line if correlation is not None: cor_ann = corr_kw.pop('ann', False) for i in correlation.iterrows(): 
cal.hlines(i[1]['depth'], 0, 1, **corr_kw) if cor_ann: try: cal.annotate(f"{i[1]['depth']} - {i[1]['comment']} ", xy=(16 - 3, i[1]['depth'] - 1), xycoords='data', horizontalalignment='right', bbox={ 'boxstyle': 'roundtooth', 'fc': '0.8' }) except: cal.annotate(f"{i[1]['depth']}", xy=(16 - 3, i[1]['depth'] - 1), xycoords='data', horizontalalignment='right', bbox={ 'boxstyle': 'roundtooth', 'fc': '0.8' })
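A minimal usage sketch for the caliper track above, assuming the module already imports matplotlib.pyplot as plt; the log DataFrame, column names, and values below are made up for illustration:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

depth = np.arange(1000.0, 1100.0, 0.5)
logs = pd.DataFrame(
    {"cali": 8.5 + np.random.rand(len(depth)),   # caliper reading [in]
     "bit": np.full(len(depth), 8.5)},           # bit size [in]
    index=depth,                                  # measured depth as index
)
fig, ax = plt.subplots(figsize=(2, 8))
caltrack(logs, cali="cali", bit="bit", cal_lim=[6, 12], fill=True, ax=ax)
plt.show()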
def get_ef_dict(self, ef_data: pd.DataFrame):
    ef_data[["Slope", "Load"]] = ef_data[["Slope", "Load"]].fillna(0.0)
    return {(row["VehicleName"], row["Pollutant"], row["Slope"], row["Load"]): row.to_dict()
            for _, row in ef_data.iterrows()}
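A small sketch of the lookup structure this returns, built from a made-up emission-factor frame (the extra EF column is illustrative):

import pandas as pd

ef_data = pd.DataFrame({
    "VehicleName": ["PC petrol", "PC petrol"],
    "Pollutant": ["CO", "NOx"],
    "Slope": [0.0, None],
    "Load": [None, 0.5],
    "EF": [1.2, 0.4],
})
ef_data[["Slope", "Load"]] = ef_data[["Slope", "Load"]].fillna(0.0)
ef_dict = {(r["VehicleName"], r["Pollutant"], r["Slope"], r["Load"]): r.to_dict()
           for _, r in ef_data.iterrows()}
# look up one (vehicle, pollutant, slope, load) combination
print(ef_dict[("PC petrol", "CO", 0.0, 0.0)]["EF"])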
def train_generator(months, overall, label_month, predict=False): datas = pd.read_csv("../references.csv", dtype='object') datas = datas.fillna(0) indexs = datas['U_I_overall_qty_' + overall].as_matrix().tolist() vps = [] for i in indexs: if i == 0: continue tmp = i.split('-') # tmp[0]是vipno,tmp[1]是pluno vps.append([tmp[0], tmp[1]]) vps = np.array(vps) feature_names = feature_name_generator(months, overall, predict) train_datas = DataFrame(np.zeros(shape=(len(vps), len(feature_names))), columns=feature_names, dtype='float') # tmp = DataFrame(vps, columns=['vipno', 'pluno'], dtype='object') # print(tmp) train_datas = pd.concat( [train_datas, DataFrame(vps, columns=['vipno', 'pluno'])], axis=1) # 不同的阵容,存储的格式不一样,所以分开处理 start = datetime.datetime.now() train_datas.set_index(['vipno', 'pluno'], inplace=True, drop=False) for f in feature_names[:8]: ds = datas[f].as_matrix().tolist() # count = 0 for row in ds: if row == 0: continue tmp = row.split('-') # print(count) # count += 1 train_datas.loc[(tmp[0], tmp[1]), f] = float(tmp[2]) print(datetime.datetime.now() - start) print("***************") start = datetime.datetime.now() train_datas.set_index(['pluno'], inplace=True, drop=False) for f in feature_names[8:12]: # count = 0 ds = datas[f].as_matrix().tolist() for row in ds: if row == 0: continue tmp = row.split('-') # print(count) # count += 1 train_datas.loc[tmp[0], f] = float(tmp[1]) print(datetime.datetime.now() - start) print("***************") start = datetime.datetime.now() for f in feature_names[24:28]: ds = datas[f].as_matrix().tolist() # count = 0 for row in ds: if row == 0: continue tmp = row.split('-') # print(count) # count += 1 train_datas.loc[tmp[0], f] = float(tmp[1]) print(datetime.datetime.now() - start) print("***************") train_datas.set_index(['vipno'], inplace=True, drop=False) # print(train_datas.index) start = datetime.datetime.now() for f in feature_names[12:24]: ds = datas[f].as_matrix().tolist() # count = 0 for row in ds: if row == 0: continue tmp = row.split('-') # print(count) # count += 1 # print(tmp[1]) train_datas.loc[tmp[0], f] = float(tmp[1]) print(datetime.datetime.now() - start) print("***************") # months = ['02', '03', '04'] start = datetime.datetime.now() train_datas.set_index(['vipno', 'pluno'], inplace=True, drop=False) for index, row in train_datas.iterrows(): tmp = [] for m in months: tmp.append(row['U_I_month_count_' + m]) tmp.sort() train_datas.loc[(row['vipno'], row['pluno']), feature_names[28]] = np.array(tmp).mean() train_datas.loc[(row['vipno'], row['pluno']), feature_names[29]] = np.array(tmp).std() train_datas.loc[(row['vipno'], row['pluno']), feature_names[30]] = np.array(tmp).max() train_datas.loc[(row['vipno'], row['pluno']), feature_names[31]] = tmp[1] tmp = [] for m in months: tmp.append(row['I_U_month_penetration_' + m]) tmp.sort() train_datas.loc[(row['vipno'], row['pluno']), feature_names[32]] = np.array(tmp).mean() train_datas.loc[(row['vipno'], row['pluno']), feature_names[33]] = np.array(tmp).std() train_datas.loc[(row['vipno'], row['pluno']), feature_names[34]] = np.array(tmp).max() train_datas.loc[(row['vipno'], row['pluno']), feature_names[35]] = tmp[1] tmp = [] for m in months: tmp.append(row['U_I_month_diversity_' + m]) tmp.sort() train_datas.loc[(row['vipno'], row['pluno']), feature_names[36]] = np.array(tmp).mean() train_datas.loc[(row['vipno'], row['pluno']), feature_names[37]] = np.array(tmp).std() train_datas.loc[(row['vipno'], row['pluno']), feature_names[38]] = np.array(tmp).max() train_datas.loc[(row['vipno'], 
row['pluno']), feature_names[39]] = tmp[1] print(datetime.datetime.now() - start) print("***************") if not predict: start = datetime.datetime.now() labels = datas['U_I_month_qty_' + label_month].as_matrix().tolist() indexs = train_datas.index for label in labels: # 0代表空值 if label != 0: label = label.split('-') if (label[0], label[1]) in indexs: train_datas.loc[(label[0], label[1]), 'label'] = float(label[2]) print(datetime.datetime.now() - start) print("***************") return train_datas
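train_generator leans on Series.as_matrix(), which was deprecated in pandas 0.23 and removed in 1.0; on current pandas the drop-in replacement is to_numpy(). A tiny sketch of the substitution on a toy column:

import pandas as pd

s = pd.Series(["10-20-3.0", 0, "11-21-4.0"])
# old: values = s.as_matrix().tolist()
values = s.to_numpy().tolist()   # same list of cell values on modern pandas
print(values)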
image_type = np.random.binomial(1, 0.5, n_trials)
trials = DataFrame(dict(image_type=image_type, timestamp=np.zeros(n_trials)))

# Setup graphics
def load_image(filename):
    return visual.ImageStim(win=mywin, image=filename)

mywin = visual.Window([1920, 1080], monitor='testMonitor', units='deg', fullscr=True)
# Materialize the lazy map() iterators into lists so choice() can index them
# and the stimuli can be reused across trials (a bare map object would fail here).
faces = list(map(load_image, glob('stimulus_presentation/stim/face_house/faces/*_3.jpg')))
houses = list(map(load_image, glob('stimulus_presentation/stim/face_house/houses/*.3.jpg')))

for ii, trial in trials.iterrows():
    # Intertrial interval
    core.wait(iti + np.random.rand() * jitter)

    # Select and display image
    label = trials['image_type'].iloc[ii]
    image = choice(faces if label == 1 else houses)
    image.draw()

    # Send marker
    timestamp = local_clock()
    outlet.push_sample([markernames[label]], timestamp)
    mywin.flip()

    # offset
    core.wait(soa)
def plot_view_stock(df: pd.DataFrame, symbol: str, interval: str): """ Plot the loaded stock dataframe Parameters ---------- df: Dataframe Dataframe of prices and volumnes symbol: str Symbol of ticker interval: str Stock data resolution for plotting purposes """ df.sort_index(ascending=True, inplace=True) bar_colors = [ "r" if x[1].Open < x[1].Close else "g" for x in df.iterrows() ] try: fig, ax = plt.subplots( 2, 1, gridspec_kw={"height_ratios": [3, 1]}, figsize=plot_autoscale(), dpi=cfgPlot.PLOT_DPI, ) except Exception as e: print(e) print( "Encountered an error trying to open a chart window. Check your X server configuration." ) return # In order to make nice Volume plot, make the bar width = interval if interval == "1440min": bar_width = timedelta(days=1) title_string = "Daily" else: bar_width = timedelta(minutes=int(interval.split("m")[0])) title_string = f"{int(interval.split('m')[0])} min" ax[0].yaxis.tick_right() if "Adj Close" in df.columns: ax[0].plot(df.index, df["Adj Close"], c=cfgPlot.VIEW_COLOR) else: ax[0].plot(df.index, df["Close"], c=cfgPlot.VIEW_COLOR) ax[0].set_xlim(df.index[0], df.index[-1]) ax[0].set_xticks([]) ax[0].yaxis.set_label_position("right") ax[0].set_ylabel("Share Price ($)") ax[0].grid(axis="y", color="gainsboro", linestyle="-", linewidth=0.5) ax[0].spines["top"].set_visible(False) ax[0].spines["left"].set_visible(False) ax[1].bar(df.index, df.Volume / 1_000_000, color=bar_colors, alpha=0.8, width=bar_width) ax[1].set_xlim(df.index[0], df.index[-1]) ax[1].yaxis.tick_right() ax[1].yaxis.set_label_position("right") ax[1].set_ylabel("Volume (1M)") ax[1].grid(axis="y", color="gainsboro", linestyle="-", linewidth=0.5) ax[1].spines["top"].set_visible(False) ax[1].spines["left"].set_visible(False) ax[1].set_xlabel("Time") fig.suptitle( symbol + " " + title_string, size=20, x=0.15, y=0.95, fontfamily="serif", fontstyle="italic", ) if gtff.USE_ION: plt.ion() fig.tight_layout(pad=2) plt.setp(ax[1].get_xticklabels(), rotation=20, horizontalalignment="right") plt.show() print("")
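The per-row comprehension that picks the volume-bar colors can be expressed without iterrows(); a minimal equivalent sketch on a toy frame, keeping the same rule ("r" when the bar opened below its close):

import numpy as np
import pandas as pd

df = pd.DataFrame({"Open": [10.0, 10.5, 10.2], "Close": [10.4, 10.1, 10.6]})
bar_colors = np.where(df["Open"] < df["Close"], "r", "g").tolist()
print(bar_colors)  # ['r', 'g', 'r']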
def load_terms(df: DataFrame, term_creation_mode: str = 'ignore') -> int: """Creates Term objects from an input Pandas DataFrame and adds the newly created Terms to the database. Args: df (pandas.DataFrame): Input data. Contains columns: * 'Term Locale' * 'Term Category' * 'Term' * 'Case Sensitive'. term_creation_mode (str): A logical flag for handling duplicate Term conflicts. Can be 'add', 'ignore', or 'replace'. Defaults to 'ignore'. Only relevant if a Term already exists in the database. * 'add': adds the new Terms while keeping the existing Terms. * 'ignore': skips adding new Terms. * 'replace': deletes the old Term(s) and then adds the new Term. Returns: int: The number of Terms added to the database. Raises: ValueError: invalid `term_creation_mode` argument. """ @unique class _TermCreationMode(Enum): ADD = 'add' IGNORE = 'ignore' REPLACE = 'replace' @classmethod def get_modes(self, *args) -> list: def _ga(mode, *args) -> tuple: return tuple([getattr(mode, arg) for arg in args]) if len(args) == 1: return [getattr(mode, *args) for mode in self] elif len(args) == 0: args = ('name', 'value') return [_ga(mode, args) for mode in self] def _prepare_term( dataframe_row: Series, term_creation_mode: _TermCreationMode) -> Union[Term, None]: """Instantiates and returns a new Term objects based on input data. Args: dataframe_row (pandas.Series): Input data for Term creation. term_creation_mode (_TermCreationMode): A logical flag; determines return behavior. Returns: new_db_term (Term): A new Term object from the input row. None: if term_creation_mode is IGNORE. """ term = dataframe_row['Term'].strip() qs_terms_of_this_term = Term.objects.filter(term=term) # instantiate a new Term new_db_term = Term(term=term, source=dataframe_row['Term Category'], definition_url=dataframe_row['Term Locale']) # handle term creation modes if qs_terms_of_this_term.exists(): if term_creation_mode == _TermCreationMode.REPLACE: qs_terms_of_this_term.delete() elif term_creation_mode == _TermCreationMode.IGNORE: return None return new_db_term try: # set term creation mode term_creation_mode = _TermCreationMode(term_creation_mode) except ValueError: # raise custom argument error raise ValueError( f"Argument `term_creation_mode` must be one of: {_TermCreationMode.get_modes('values')}" ) # preprocess DataFrame df.drop_duplicates(inplace=True) df.loc[df['Case Sensitive'] == False, 'Term'] = df.loc[df['Case Sensitive'] == False, 'Term'].str.lower() df = df.drop_duplicates(subset='Term').dropna(subset=['Term']) # create new Terms. Filter: if _prepare_term() returns None, it is not added to this list new_db_terms = list( filter(None, (_prepare_term(row, term_creation_mode) for _, row in df.iterrows()))) # cache "global" term stems step - should be cached here via model manager Term.objects.bulk_create(new_db_terms) return len(new_db_terms)
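A compact, self-contained sketch of the mode-validation idea used above, listing the valid values straight from the Enum instead of the get_modes helper:

from enum import Enum, unique

@unique
class TermCreationMode(Enum):
    ADD = 'add'
    IGNORE = 'ignore'
    REPLACE = 'replace'

def validate_mode(term_creation_mode: str) -> TermCreationMode:
    try:
        return TermCreationMode(term_creation_mode)
    except ValueError:
        valid = [m.value for m in TermCreationMode]
        raise ValueError(f"Argument `term_creation_mode` must be one of: {valid}")

print(validate_mode('replace'))  # TermCreationMode.REPLACE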
def process_movie_records(session, data: pd.DataFrame): """ Adds list of movie records to database from data input """ # Category lookup fiels movie_color_lookup = src.controller.fields.MovieColorIndexLookup( logger).query() country_lookup = src.controller.fields.CountryIndexLookup(logger).query() language_lookup = src.controller.fields.LanguageIndexLookup(logger).query() rating_lookup = src.controller.fields.ContentRatingIndexLookup( logger).query() person_lookup = src.controller.person.PersonIndexLookup(logger).query() # Lookup of movie record number by title+year movie_title_index = {} print('Updating movie records') for i, record in data.iterrows(): record_no = i + 1 if record_no % 500 == 0: print('\tProcessing record #%s' % record_no) # Get searchable title+year of movie record movie_title = record['movie_title'].strip() movie_title_l = movie_title.lower() movie_year = record['title_year'] if pd.isna(movie_title_l): logger.warning('Movie with no title on record #%s' % record_no) continue if pd.isna(movie_year): movie_year = '' else: movie_year = str(movie_year) movie_record_index = (movie_title_l, movie_year) if movie_record_index in movie_title_index: logger.warning( 'Duplicate movie "%s" (#%s, #%s)' % (movie_title, movie_title_index[movie_title_l], record_no)) continue else: # Mark movie by record number in dataframe movie_title_index[movie_title_l] = record_no # Get category ids by name movie_color_pk = lookup_category_id(record['color'], movie_color_lookup) country_pk = lookup_category_id(record['country'], country_lookup) language_pk = lookup_category_id(record['language'], language_lookup) rating_pk = lookup_category_id(record['content_rating'], rating_lookup) director_pk = lookup_category_id(record['director_name'], person_lookup) # Get movie's imdb id imdb_link = record['movie_imdb_link'] if imdb_link: search_results = IMDB_URL_ID_RE.search(imdb_link) if search_results is None: imdb_id = None else: imdb_id = search_results.group(1) else: imdb_id = None # Get movie's numerical stats aspect_ratio = src.utils.nan_to_none(record['aspect_ratio']) budget = src.utils.nan_to_none(record['budget']) cast_likes = src.utils.nan_to_none(record['cast_total_facebook_likes']) duration = src.utils.nan_to_none(record['duration']) facenum = src.utils.nan_to_none(record['facenumber_in_poster']) gross = src.utils.nan_to_none(record['gross']) imdb_score = src.utils.nan_to_none(record['imdb_score']) facebook_likes = src.utils.nan_to_none(record['movie_facebook_likes']) num_critic = src.utils.nan_to_none(record['num_critic_for_reviews']) num_user = src.utils.nan_to_none(record['num_user_for_reviews']) num_voted = src.utils.nan_to_none(record['num_voted_users']) # Add movie record. Manaully take over session and comitting src.controller.movie.AddMovie(logger=logger, session=session, commit_enabled=False).execute( movie_title=movie_title, title_year=movie_year, content_rating_pk=rating_pk, color_pk=movie_color_pk, country_pk=country_pk, director_pk=director_pk, language_pk=language_pk, aspect_ratio=aspect_ratio, budget=budget, cast_facebook_likes=cast_likes, duration=duration, facenum=facenum, gross=gross, imdb_id=imdb_id, imdb_score=imdb_score, movie_facebook_likes=facebook_likes, num_critic_for_reviews=num_critic, num_user_for_reviews=num_user, num_voted_users=num_voted) session.commit()
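The IMDb-id extraction relies on IMDB_URL_ID_RE, which is defined elsewhere in the project; a minimal sketch of what such a pattern could look like (the regex below is an assumption, not the project's actual definition):

import re

# hypothetical pattern: capture the ttNNNNNNN token from an IMDb title URL
IMDB_URL_ID_RE = re.compile(r"imdb\.com/title/(tt\d+)")

link = "http://www.imdb.com/title/tt0499549/?ref_=fn_tt_tt_1"
match = IMDB_URL_ID_RE.search(link)
imdb_id = match.group(1) if match else None
print(imdb_id)  # tt0499549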
def df_to_rows(df: pd.DataFrame):
    return [
        ui.table_row(str(row['ID']), [str(row[name]) for name in column_names])
        for i, row in df.iterrows()
    ]
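A usage sketch, assuming the ui module here is H2O Wave's and that column_names is a module-level list matching the table's columns (both are assumptions; the data is made up):

import pandas as pd
from h2o_wave import ui  # assumption: ui refers to H2O Wave

column_names = ['ID', 'name', 'score']   # assumed module-level list
df = pd.DataFrame({'ID': [1, 2], 'name': ['a', 'b'], 'score': [0.5, 0.7]})
rows = df_to_rows(df)                    # one ui.table_row per DataFrame row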
def present(duration=120, eeg=None, save_fn=None): n_trials = 2010 iti = 0.5 soa = 3.0 jitter = 0.2 record_duration = np.float32(duration) markernames = [1, 2] # Setup trial list stim_freq = np.random.binomial(1, 0.5, n_trials) trials = DataFrame(dict(stim_freq=stim_freq, timestamp=np.zeros(n_trials))) # Set up graphics mywin = visual.Window([1600, 900], monitor='testMonitor', units="deg", fullscr=True) grating = visual.GratingStim(win=mywin, mask='circle', size=80, sf=0.2) grating_neg = visual.GratingStim(win=mywin, mask='circle', size=80, sf=0.2, phase=0.5) fixation = visual.GratingStim(win=mywin, size=0.2, pos=[0, 0], sf=0.2, color=[1, 0, 0], autoDraw=True) # Generate the possible ssvep frequencies based on monitor refresh rate def get_possible_ssvep_freqs(frame_rate, stim_type='single'): """Get possible SSVEP stimulation frequencies. Utility function that returns the possible SSVEP stimulation frequencies and on/off pattern based on screen refresh rate. Args: frame_rate (float): screen frame rate, in Hz Keyword Args: stim_type (str): type of stimulation 'single'-> single graphic stimulus (the displayed object appears and disappears in the background.) 'reversal' -> pattern reversal stimulus (the displayed object appears and is replaced by its opposite.) Returns: (dict): keys are stimulation frequencies (in Hz), and values are lists of tuples, where each tuple is the number of (on, off) periods of one stimulation cycle For more info on stimulation patterns, see Section 2 of: Danhua Zhu, Jordi Bieger, Gary Garcia Molina, and Ronald M. Aarts, "A Survey of Stimulation Methods Used in SSVEP-Based BCIs," Computational Intelligence and Neuroscience, vol. 2010, 12 pages, 2010. """ max_period_nb = int(frame_rate / 6) periods = np.arange(max_period_nb) + 1 if stim_type == 'single': freqs = dict() for p1 in periods: for p2 in periods: f = frame_rate / (p1 + p2) try: freqs[f].append((p1, p2)) except: freqs[f] = [(p1, p2)] elif stim_type == 'reversal': freqs = {frame_rate / p: [(p, p)] for p in periods[::-1]} return freqs def init_flicker_stim(frame_rate, cycle, soa): """Initialize flickering stimulus. Get parameters for a flickering stimulus, based on the screen refresh rate and the desired stimulation cycle. Args: frame_rate (float): screen frame rate, in Hz cycle (tuple or int): if tuple (on, off), represents the number of 'on' periods and 'off' periods in one flickering cycle. This supposes a "single graphic" stimulus, where the displayed object appears and disappears in the background. If int, represents the number of total periods in one cycle. This supposes a "pattern reversal" stimulus, where the displayed object appears and is replaced by its opposite. 
soa (float): stimulus duration, in s Returns: (dict): dictionary with keys 'cycle' -> tuple of (on, off) periods in a cycle 'freq' -> stimulus frequency 'n_cycles' -> number of cycles in one stimulus trial """ if isinstance(cycle, tuple): stim_freq = frame_rate / sum(cycle) n_cycles = int(soa * stim_freq) else: stim_freq = frame_rate / cycle cycle = (cycle, cycle) n_cycles = int(soa * stim_freq) / 2 return {'cycle': cycle, 'freq': stim_freq, 'n_cycles': n_cycles} # Set up stimuli frame_rate = np.round(mywin.getActualFrameRate()) # Frame rate, in Hz freqs = get_possible_ssvep_freqs(frame_rate, stim_type='reversal') stim_patterns = [ init_flicker_stim(frame_rate, 2, soa), init_flicker_stim(frame_rate, 3, soa) ] print(('Flickering frequencies (Hz): {}\n'.format( [stim_patterns[0]['freq'], stim_patterns[1]['freq']]))) # start the EEG stream, will delay 5 seconds to let signal settle if eeg: if save_fn is None: # If no save_fn passed, generate a new unnamed save file save_fn = generate_save_fn(eeg.device_name, 'visual_ssvep', 'unnamed') print( f'No path for a save file was passed to the experiment. Saving data to {save_fn}' ) eeg.start(save_fn, duration=record_duration) # Iterate through trials start = time() for ii, trial in trials.iterrows(): # Intertrial interval core.wait(iti + np.random.rand() * jitter) # Select stimulus frequency ind = trials['stim_freq'].iloc[ii] # Push sample if eeg: timestamp = time() if eeg.backend == 'muselsl': marker = [markernames[ind]] else: marker = markernames[ind] eeg.push_sample(marker=marker, timestamp=timestamp) # Present flickering stim for _ in range(int(stim_patterns[ind]['n_cycles'])): grating.setAutoDraw(True) for _ in range(int(stim_patterns[ind]['cycle'][0])): mywin.flip() grating.setAutoDraw(False) grating_neg.setAutoDraw(True) for _ in range(stim_patterns[ind]['cycle'][1]): mywin.flip() grating_neg.setAutoDraw(False) # offset mywin.flip() if len(event.getKeys()) > 0 or (time() - start) > record_duration: break event.clearEvents() # Cleanup if eeg: eeg.stop() mywin.close()
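A standalone sketch of the frequency arithmetic described in the docstrings above: for a pattern-reversal stimulus the achievable frequencies are frame_rate / p for integer period counts p (a 60 Hz refresh rate is assumed here for illustration):

import numpy as np

frame_rate = 60.0                                    # assumed refresh rate
periods = np.arange(1, int(frame_rate / 6) + 1)      # 1 .. frame_rate/6 frames
reversal_freqs = {frame_rate / p: [(p, p)] for p in periods[::-1]}
print(sorted(reversal_freqs))                        # 6.0 ... 30.0, 60.0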
def gen_portfolio(model, system, group, tframe, startcap=100000, posby='close'): r"""Create a portfolio from a trades frame. Parameters ---------- model : alphapy.Model The model with specifications. system : str Name of the system. group : alphapy.Group The group of instruments in the portfolio. tframe : pandas.DataFrame The input trade list from running the system. startcap : float Starting capital. posby : str The position sizing column in the price dataframe. Returns ------- p : alphapy.Portfolio The generated portfolio. Raises ------ MemoryError Could not allocate Portfolio. Notes ----- This function also generates the files required for analysis by the *pyfolio* package: * Returns File * Positions File * Transactions File """ logger.info("Creating Portfolio for System %s", system) # Unpack the model data. directory = model.specs['directory'] extension = model.specs['extension'] separator = model.specs['separator'] # Create the portfolio. gname = group.name gspace = group.space gmembers = group.members ff = 1.0 / len(gmembers) p = Portfolio(gname, system, gspace, startcap=startcap, posby=posby, restricted=False, fixedfrac=ff) if not p: raise MemoryError("Could not allocate Portfolio") # Build pyfolio data from the trades frame. start = tframe.index[0] end = tframe.index[-1] trange = np.unique( tframe.index.map(lambda x: x.date().strftime('%Y-%m-%d'))).tolist() drange = date_range(start, end).map(lambda x: x.date().strftime('%Y-%m-%d')) # Initialize return, position, and transaction data. rs = [] pcols = list(gmembers) pcols.extend(['cash']) pf = DataFrame(index=drange, columns=pcols).fillna(0.0) ts = [] # Iterate through the date range, updating the portfolio. for d in drange: # process today's trades if d in trange: trades = tframe.ix[d] if isinstance(trades, Series): trades = DataFrame(trades).transpose() for t in trades.iterrows(): tdate = t[0] row = t[1] tsize = exec_trade(p, row['name'], row['order'], row['quantity'], row['price'], tdate) if tsize != 0: ts.append((d, [tsize, row['price'], row['name']])) else: logger.info("Trade could not be executed for %s", row['name']) # iterate through current positions positions = p.positions pfrow = pf.ix[d] for key in positions: pos = positions[key] if pos.quantity > 0: value = pos.value else: value = -pos.value pfrow[pos.name] = value pfrow['cash'] = p.cash # update the portfolio returns p = valuate_portfolio(p, d) rs.append((d, [p.netreturn])) # Create systems directory path system_dir = SSEP.join([directory, 'systems']) # Create and record the returns frame for this system. logger.info("Recording Returns Frame") rspace = Space(system, 'returns', gspace.fractal) rf = DataFrame.from_items(rs, orient='index', columns=['return']) rfname = frame_name(gname, rspace) write_frame(rf, system_dir, rfname, extension, separator, index=True, index_label='date') del rspace # Record the positions frame for this system. logger.info("Recording Positions Frame") pspace = Space(system, 'positions', gspace.fractal) pfname = frame_name(gname, pspace) write_frame(pf, system_dir, pfname, extension, separator, index=True, index_label='date') del pspace # Create and record the transactions frame for this system. logger.info("Recording Transactions Frame") tspace = Space(system, 'transactions', gspace.fractal) tf = DataFrame.from_items(ts, orient='index', columns=['amount', 'price', 'symbol']) tfname = frame_name(gname, tspace) write_frame(tf, system_dir, tfname, extension, separator, index=True, index_label='date') del tspace # Return the portfolio. return p
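gen_portfolio uses DataFrame.ix and DataFrame.from_items, both removed in pandas 1.0; a minimal sketch of the modern equivalents for the two pieces used here:

import pandas as pd

# label-based row selection: tframe.ix[d]  ->  tframe.loc[d]
# building a frame from (key, values) pairs:
rs = [('2020-01-02', [0.010]), ('2020-01-03', [-0.002])]
rf = pd.DataFrame.from_dict(dict(rs), orient='index', columns=['return'])
print(rf)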
def pandas_to_eland( pd_df: pd.DataFrame, es_client: Union[str, List[str], Tuple[str, ...], Elasticsearch], es_dest_index: str, es_if_exists: str = "fail", es_refresh: bool = False, es_dropna: bool = False, es_type_overrides: Optional[Mapping[str, str]] = None, chunksize: Optional[int] = None, use_pandas_index_for_es_ids: bool = True, ) -> DataFrame: """ Append a pandas DataFrame to an Elasticsearch index. Mainly used in testing. Modifies the elasticsearch destination index Parameters ---------- es_client: Elasticsearch client argument(s) - elasticsearch-py parameters or - elasticsearch-py instance es_dest_index: str Name of Elasticsearch index to be appended to es_if_exists : {'fail', 'replace', 'append'}, default 'fail' How to behave if the index already exists. - fail: Raise a ValueError. - replace: Delete the index before inserting new values. - append: Insert new values to the existing index. Create if does not exist. es_refresh: bool, default 'False' Refresh es_dest_index after bulk index es_dropna: bool, default 'False' * True: Remove missing values (see pandas.Series.dropna) * False: Include missing values - may cause bulk to fail es_type_overrides: dict, default None Dict of field_name: es_data_type that overrides default es data types chunksize: int, default None Number of pandas.DataFrame rows to read before bulk index into Elasticsearch use_pandas_index_for_es_ids: bool, default 'True' * True: pandas.DataFrame.index fields will be used to populate Elasticsearch '_id' fields. * False: Ignore pandas.DataFrame.index when indexing into Elasticsearch Returns ------- eland.Dataframe eland.DataFrame referencing data in destination_index Examples -------- >>> pd_df = pd.DataFrame(data={'A': 3.141, ... 'B': 1, ... 'C': 'foo', ... 'D': pd.Timestamp('20190102'), ... 'E': [1.0, 2.0, 3.0], ... 'F': False, ... 'G': [1, 2, 3], ... 'H': 'Long text - to be indexed as es type text'}, ... index=['0', '1', '2']) >>> type(pd_df) <class 'pandas.core.frame.DataFrame'> >>> pd_df A B ... G H 0 3.141 1 ... 1 Long text - to be indexed as es type text 1 3.141 1 ... 2 Long text - to be indexed as es type text 2 3.141 1 ... 3 Long text - to be indexed as es type text <BLANKLINE> [3 rows x 8 columns] >>> pd_df.dtypes A float64 B int64 C object D datetime64[ns] E float64 F bool G int64 H object dtype: object Convert `pandas.DataFrame` to `eland.DataFrame` - this creates an Elasticsearch index called `pandas_to_eland`. Overwrite existing Elasticsearch index if it exists `if_exists="replace"`, and sync index so it is readable on return `refresh=True` >>> ed_df = ed.pandas_to_eland(pd_df, ... 'localhost', ... 'pandas_to_eland', ... es_if_exists="replace", ... es_refresh=True, ... es_type_overrides={'H':'text'}) # index field 'H' as text not keyword >>> type(ed_df) <class 'eland.dataframe.DataFrame'> >>> ed_df A B ... G H 0 3.141 1 ... 1 Long text - to be indexed as es type text 1 3.141 1 ... 2 Long text - to be indexed as es type text 2 3.141 1 ... 
3 Long text - to be indexed as es type text <BLANKLINE> [3 rows x 8 columns] >>> ed_df.dtypes A float64 B int64 C object D datetime64[ns] E float64 F bool G int64 H object dtype: object See Also -------- eland.read_es: Create an eland.Dataframe from an Elasticsearch index eland.eland_to_pandas: Create a pandas.Dataframe from eland.DataFrame """ if chunksize is None: chunksize = DEFAULT_CHUNK_SIZE mapping = FieldMappings._generate_es_mappings(pd_df, es_type_overrides) es_client = ensure_es_client(es_client) # If table exists, check if_exists parameter if es_client.indices.exists(index=es_dest_index): if es_if_exists == "fail": raise ValueError( f"Could not create the index [{es_dest_index}] because it " f"already exists. " f"Change the if_exists parameter to " f"'append' or 'replace' data.") elif es_if_exists == "replace": es_client.indices.delete(index=es_dest_index) es_client.indices.create(index=es_dest_index, body=mapping) # elif if_exists == "append": # TODO validate mapping are compatible else: es_client.indices.create(index=es_dest_index, body=mapping) # Now add data actions = [] n = 0 for row in pd_df.iterrows(): if es_dropna: values = row[1].dropna().to_dict() else: values = row[1].to_dict() if use_pandas_index_for_es_ids: # Use index as _id id = row[0] # Use integer as id field for repeatable results action = { "_index": es_dest_index, "_source": values, "_id": str(id) } else: action = {"_index": es_dest_index, "_source": values} actions.append(action) n = n + 1 if n % chunksize == 0: bulk(client=es_client, actions=actions, refresh=es_refresh) actions = [] bulk(client=es_client, actions=actions, refresh=es_refresh) return DataFrame(es_client, es_dest_index)
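The indexing loop above flushes a bulk request every chunksize rows and once more at the end; a stripped-down sketch of that chunked-flush pattern, independent of Elasticsearch:

def flush_in_chunks(rows, chunksize, send):
    """Call send(batch) every `chunksize` rows, then flush the remainder."""
    batch = []
    for n, row in enumerate(rows, start=1):
        batch.append(row)
        if n % chunksize == 0:
            send(batch)
            batch = []
    if batch:                      # final partial chunk
        send(batch)

flush_in_chunks(range(10), 4, send=print)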
def plot_hit_grid( hit_table: pd.DataFrame, results_table: pd.DataFrame, sort_by: Union[str, List[str]] = 'n_hits', x_label: str = 'dot_name', cmap: str = 'inferno', ): """ Plot the hits Parameters: ------------ hit_table : pd.DataFrame the hit table output from calc_hit_counts(). Note that `calc_hit_counts()` returns an list of hit tables, so index to the hit table you wish to plot. results_table : pd.DataFrame the results table output from process_dir() from which to get the strip info sort_by : list[str] a list of hit_table columns to sort the dots (x axis) by. ['n_hits', 'dot_name'] sorts by number of hits and then the dot name x_label : str the results table column to use for the name of the dot on the x_label (e.g., dot_name for the name of the dot) cmap : str Name of the colormap to be used for the number of hits. The default value is 'inferno' See the matplotlib documation for details: https://matplotlib.org/3.1.1/tutorials/colors/colormaps.html """ hit_table.sort_values(by=sort_by, axis=0, inplace=True, ascending=False) unique_strip_ids = results_table.strip_id.unique() strip_ids = {strip_id: i for i, strip_id in enumerate(unique_strip_ids)} names = [] strip_id_indices = [] counts = [] for i, row in hit_table.iterrows(): name = row['dot_name'] count = row['n_hits'] dot_table = results_table.loc[results_table['dot_name'] == name] for j, dot_row in dot_table.iterrows(): if dot_row['pos_hit']: #names.append(dot_row[x_label]) names.append(dot_row['dot_name']) strip_id_indices.append(dot_row['strip_id']) counts.append(count) f, ax = plt.subplots(figsize=(50, 2)) sc = ax.scatter(names, strip_id_indices, c=counts, cmap=cmap) ax.tick_params(axis='x', labelsize=5, rotation=90) ax.set_xlabel('spot name') ax.set_ylabel('strip id') plt.draw() norm = Normalize(vmin=1, vmax=hit_table['n_hits'].max()) # for t in ax.get_xticklabels(): # tick_name = t.get_text() # dot_name = results_table.loc[results_table[x_label] == tick_name].dot_name.values[0] # n_hits = hit_table.loc[hit_table['dot_name'] == dot_name].n_hits.values[0] # c = sc.cmap(norm(n_hits)) # t.set_color(c) label_names = [] for t in ax.get_xticklabels(): dot_name = t.get_text() label_names.append(dot_name) n_hits = hit_table.loc[hit_table['dot_name'] == dot_name].n_hits.values[0] c = sc.cmap(norm(n_hits)) t.set_color(c) new_labels = [ results_table.loc[results_table['dot_name'] == n][x_label].values[0] for n in label_names ] ax.set_xticklabels(new_labels) plt.draw() return f, ax
def specshow(y: np.ndarray, sr: int, y_processed=None, tp: pd.DataFrame = None, fp: pd.DataFrame = None): plot_spectrogram = st.checkbox("Spectrogram plot") if plot_spectrogram: st.sidebar.markdown("#### Spectrogram plot settings") start_second = st.sidebar.number_input("start second", min_value=0, max_value=len(y) // sr, value=0, step=1, key="specshow_start") end_second = st.sidebar.number_input("end second", min_value=0, max_value=len(y) // sr, value=len(y) // sr, step=1, key="specshow_end") start_index = start_second * sr if end_second == len(y) // sr: end_index = len(y) else: end_index = end_second * sr y_plot = y[start_index:end_index] if y_processed is not None: y_plot_processed = y_processed[start_index:end_index] st.sidebar.markdown("##### (Mel)spectrogram parameters") mel = st.sidebar.checkbox("Mel scale", value=True) n_fft = st.sidebar.number_input("n_fft", min_value=64, max_value=8192, value=1024, step=64) hop_length = st.sidebar.number_input("hop_length", min_value=1, max_value=2048, value=320, step=10) if mel: n_mels = st.sidebar.number_input("n_mels", min_value=1, max_value=512, value=64, step=16) fmin = st.sidebar.number_input("fmin", min_value=1, max_value=8192, value=20, step=100) fmax = st.sidebar.number_input("fmax", min_value=4000, max_value=44100, value=14000, step=100) log = st.sidebar.checkbox("apply log", value=True) if mel: melspec_params = { "n_fft": n_fft, "hop_length": hop_length, "n_mels": n_mels, "fmin": fmin, "fmax": fmax, "sr": sr } else: spec_params = {"n_fft": n_fft, "hop_length": hop_length} if st.button("Show melspectrogram"): with st.spinner("Calculating melspectrogram"): if mel: spec = melspectrogram(y_plot, melspec_params, log) else: spec = spectrogram(y_plot, spec_params, log) if y_processed is not None: if mel: spec_processed = melspectrogram( y_plot_processed, melspec_params, log) else: spec_processed = spectrogram(y_plot_processed, spec_params, log) height, width = spec.shape st.write(f"{height} x {width} matrix") if y_processed is not None: with st.spinner("Plotting"): fig = plt.figure(figsize=(12, 8)) ax1 = fig.add_subplot(2, 1, 1) if mel: display.specshow(spec, sr=sr, hop_length=hop_length, x_axis="time", y_axis="mel", fmin=fmin, fmax=fmax, ax=ax1) else: display.specshow(spec, sr=sr, hop_length=hop_length, x_axis="time", y_axis="linear", ax=ax1) ax2 = fig.add_subplot(2, 1, 2) if mel: display.specshow(spec_processed, sr=sr, hop_length=hop_length, x_axis="time", y_axis="mel", fmin=fmin, fmax=fmax, ax=ax2) else: display.specshow(spec_processed, sr=sr, hop_length=hop_length, x_axis="time", y_axis="linear", ax=ax2) else: with st.spinner("Plotting"): fig = plt.figure(figsize=(12, 4)) ax = plt.axes() if mel: display.specshow(spec, sr=sr, hop_length=hop_length, x_axis="time", y_axis="mel", fmin=fmin, fmax=fmax) plt.colorbar() else: display.specshow(spec, sr=sr, hop_length=hop_length, x_axis="time", y_axis="linear") plt.colorbar() if tp is not None and len(tp) > 0: for _, row in tp.iterrows(): rect = patches.Rectangle( (row["t_min"], row["f_min"]), row["t_max"] - row["t_min"], row["f_max"] - row["f_min"], linewidth=1, edgecolor="g", facecolor="g", alpha=0.5, label="tp") ax.add_patch(rect) if fp is not None and len(fp) > 0: for _, row in fp.iterrows(): rect = patches.Rectangle( (row["t_min"], row["f_min"]), row["t_max"] - row["t_min"], row["f_max"] - row["f_min"], linewidth=1, edgecolor="r", facecolor="r", alpha=0.5, label="fp") ax.add_patch(rect) st.pyplot(fig)
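specshow delegates to melspectrogram()/spectrogram() helpers defined elsewhere; a minimal sketch of what the mel variant could look like with librosa (the name and signature mirror the call above, but the body is an assumption):

import numpy as np
import librosa

def melspectrogram(y: np.ndarray, params: dict, log: bool = True) -> np.ndarray:
    # params carries n_fft, hop_length, n_mels, fmin, fmax, sr as in the caller
    spec = librosa.feature.melspectrogram(y=y, **params)
    if log:
        spec = librosa.power_to_db(spec)
    return spec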
def _(data: pd.DataFrame, obs_col: str, ioc_type_col: Optional[str] = None):
    for _, row in data.iterrows():
        if ioc_type_col is None:
            yield row[obs_col], TIProvider.resolve_ioc_type(row[obs_col])
        else:
            yield row[obs_col], row[ioc_type_col]
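A usage sketch for the DataFrame overload above, assuming it is registered (for example via functools.singledispatch) to yield (observable, ioc_type) pairs; the column names and values below are illustrative:

import pandas as pd

obs_df = pd.DataFrame({
    "Observable": ["8.8.8.8", "example.com"],
    "IoCType": ["ipv4", "dns"],
})
# with an explicit type column the pairs come straight from the frame
for obs, ioc_type in _(obs_df, obs_col="Observable", ioc_type_col="IoCType"):
    print(obs, ioc_type)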
def create_df(dataframe: pd.DataFrame) -> pd.DataFrame: # get lengths of signals for each sample lengths = [] width = dataframe.shape[1] for row in dataframe.index.tolist(): temp_width = width for item in dataframe.loc[row][::-1]: if not pd.isna(item) and isinstance(item, float): temp_width -= 1 break temp_width -= 1 lengths.append(temp_width) """ README For the following features we measured: [mean, median, 5 % percentile, 95 % percentile, standard deviation] R-peak location were retrieved by nk.ecg_peaks Q-peak and S-location were retrieved by nk.ecg_delineate ?_ampl_* ?-Peak amplitude ?_nr_peaks number of ?-Peaks ?_diff_* Interval between ?-Peaks QRS_diff_* QRS duration len_* length of signal Qual_* quality of signal measured with nk.ecg_quality sign_* signal Also the output from nk.hrv_time which contains different measurements for the heart rate variation (HRV*) was added Additionally one 'typical' heartbeat was greated (all length 180): MN_* mean signal MD_* median signal P5_* 5 % percentile signal P95_* 95 % percentile signal SD_* standard deviation of signal """ names = ['R_ampl_mean', 'R_ampl_median', 'R_ampl_perc5', 'R_ampl_perc95', 'R_ampl_sd', 'R_nr_peaks', 'len_mean', 'len_median', 'len_perc5', 'len_perc95', 'len_sd', 'sign_mean', 'sign_median', 'sign_perc5', 'sign_perc95', 'sign_sd', 'Qual_mean', 'Qual_median', 'Qual_perc5', 'Qual_perc95', 'Qual_sd', 'Q_ampl_mean', 'Q_ampl_median', 'Q_ampl_perc5', 'Q_ampl_perc95', 'Q_ampl_sd', 'Q_nr_peaks', 'Q_diff_mean', 'Q_diff_median', 'Q_diff_perc5', 'Q_diff_perc95', 'Q_diff_sd', 'S_ampl_mean', 'S_ampl_median', 'S_ampl_perc5', 'S_ampl_perc95', 'S_ampl_sd', 'S_nr_peaks', 'S_diff_mean', 'S_diff_median', 'S_diff_perc5', 'S_diff_perc95', 'S_diff_sd', 'P_ampl_mean', 'P_ampl_median', 'P_ampl_perc5', 'P_ampl_perc95', 'P_ampl_sd', 'P_nr_peaks', 'T_ampl_mean', 'T_ampl_median', 'T_ampl_perc5', 'T_ampl_perc95', 'T_ampl_sd', 'T_nr_peaks', 'QRS_diff_mean', 'QRS_diff_median', 'QRS_diff_perc5', 'QRS_diff_perc95', 'QRS_diff_sd', 'PR_diff_mean', 'PR_diff_median', 'PR_diff_perc5', 'PR_diff_perc95', 'PR_diff_sd', 'RT_diff_mean', 'RT_diff_median', 'RT_diff_perc5', 'RT_diff_perc95', 'RT_diff_sd', 'HRV_RMSSD', 'HRV_MeanNN', 'HRV_SDNN', 'HRV_SDSD', 'HRV_CVNN', 'HRV_CVSD', 'HRV_MedianNN', 'HRV_MadNN', 'HRV_MCVNN', 'HRV_IQRNN', 'HRV_pNN50', 'HRV_pNN20', 'HRV_TINN', 'HRV_HTI', 'HRV_ULF','HRV_VLF','HRV_LF','HRV_HF','HRV_VHF','HRV_LFHF','HRV_LFn','HRV_HFn', 'HRV_LnHF', 'HRV_SD1','HRV_SD2', 'HRV_SD1SD2','HRV_S','HRV_CSI','HRV_CVI','HRV_CSI_Modified', 'HRV_PIP', 'HRV_IALS','HRV_PSS','HRV_PAS','HRV_GI','HRV_SI','HRV_AI','HRV_PI','HRV_C1d','HRV_C1a','HRV_SD1d', 'HRV_SD1a','HRV_C2d','HRV_C2a','HRV_SD2d','HRV_SD2a','HRV_Cd','HRV_Ca','HRV_SDNNd','HRV_SDNNa','HRV_ApEn', 'HRV_SampEn','J_LF','J_HF','J_L/H'] template_len = 180 mean_names = ['MN_' + str(index) for index in range(template_len)] median_names = ['MD_' + str(index) for index in range(template_len)] perc5_names = ['P5_' + str(index) for index in range(template_len)] perc95_names = ['P95_' + str(index) for index in range(template_len)] sd_names = ['SD_' + str(index) for index in range(template_len)] wavelet = 'db3' wl_len = int(np.floor((template_len + pywt.Wavelet(wavelet).dec_len - 1) / 2)) wl_mean_names = ['WLMN_' + str(index) for index in range(2*wl_len)] wl_median_names = ['WLMD_' + str(index) for index in range(2*wl_len)] wl_perc5_names = ['WLP5_' + str(index) for index in range(2*wl_len)] wl_perc95_names = ['WLP95_' + str(index) for index in range(2*wl_len)] wl_sd_names = ['WLSD_' + str(index) for index in 
range(2*wl_len)] typical_signal_names = mean_names + median_names + perc5_names + perc95_names + sd_names + wl_mean_names + \ wl_median_names + wl_perc5_names + wl_perc95_names + wl_sd_names names += typical_signal_names data = np.empty([dataframe.shape[0], len(names)]) iteration = 0 for row_index, row in dataframe.iterrows(): print(row_index) # Retrieve ECG data ecg_signal = row[:lengths[iteration] + 1] ecg_signal = nk.ecg_clean(ecg_signal, sampling_rate=SAMPLING_RATE) # Find R-peaks peaks, info = nk.ecg_peaks(ecg_signal, sampling_rate=SAMPLING_RATE) # R amplitude R_amplitudes = ecg_signal[info['ECG_R_Peaks']] # Check if the signal is flipped # Check if we have enough peaks to retrieve more information if len(R_amplitudes) > 4: _, waves_peak = nk.ecg_delineate(ecg_signal, info, sampling_rate=300, show=False) # Q amplitude # remove nan values Q_amplitudes = [ecg_signal[peak_index] if str(peak_index) != 'nan' else - np.infty for peak_index in waves_peak['ECG_Q_Peaks']] if np.sum([1 if np.abs(rpeak) > np.abs(Q_amplitudes[index]) else -1 for index, rpeak in enumerate(R_amplitudes)]) < 0: print("flip", row_index) ecg_signal = -ecg_signal peaks, info = nk.ecg_peaks(ecg_signal, sampling_rate=300) # R amplitude R_amplitudes = ecg_signal[info['ECG_R_Peaks']] if len(R_amplitudes) > 4: _, waves_peak = nk.ecg_delineate(ecg_signal, info, sampling_rate=300, show=False) data_temp = [] if len(R_amplitudes) > 0: data_temp = [np.mean(R_amplitudes), np.median(R_amplitudes), np.percentile(R_amplitudes, q=5), np.percentile(R_amplitudes, q=95), np.std(R_amplitudes), len(R_amplitudes)] else: empty = np.empty([6]) empty[:] = np.NaN data_temp += empty.tolist() # length of signal data_new = [np.mean(lengths[iteration] / SAMPLING_RATE), np.median(lengths[iteration] / SAMPLING_RATE), np.percentile(lengths[iteration] / SAMPLING_RATE, q=5), np.percentile(lengths[iteration] / SAMPLING_RATE, q=95), np.std(lengths[iteration] / SAMPLING_RATE)] data_temp += data_new # signal data_new = [np.mean(ecg_signal), np.median(ecg_signal), np.percentile(ecg_signal, q=5), np.percentile(ecg_signal, q=95), np.std(ecg_signal)] data_temp += data_new # Check if we have enough peaks to retrieve more information if len(R_amplitudes) > 4: quality = nk.ecg_quality(ecg_signal, sampling_rate=SAMPLING_RATE) data_new = [np.mean(quality), np.median(quality), np.percentile(quality, q=5), np.percentile(quality, q=95), np.std(quality)] data_temp += data_new # Delineate the ECG signal # “ECG_P_Peaks”, “ECG_Q_Peaks”, “ECG_S_Peaks”, “ECG_T_Peaks”, “ECG_P_Onsets”, “ECG_T_Offsets” # _, waves_peak = nk.ecg_delineate(ecg_signal, info, sampling_rate=SAMPLING_RATE, show=False) # Q amplitude # remove nan values Q_peaks = [peak for peak in waves_peak['ECG_Q_Peaks'] if str(peak) != 'nan'] if len(Q_peaks) > 0: Q_amplitudes = ecg_signal[Q_peaks] data_new = [np.mean(Q_amplitudes), np.median(Q_amplitudes), np.percentile(Q_amplitudes, q=5), np.percentile(Q_amplitudes, q=95), np.std(Q_amplitudes), len(Q_amplitudes)] data_temp += data_new else: empty = np.empty([6]) empty[:] = np.NaN empty[5] = 0 data_temp += empty.tolist() # more than 1 Q-Peak => can build interval[s] if len(Q_peaks) > 1: Q_peaks_diff = [(Q_peaks[index + 1] - Q_peaks[index]) / SAMPLING_RATE for index, item in enumerate(Q_peaks[:len(Q_peaks) - 1])] # QQ interval data_new = [np.mean(Q_peaks_diff), np.median(Q_peaks_diff), np.percentile(Q_peaks_diff, q=5), np.percentile(Q_peaks_diff, q=95), np.std(Q_peaks_diff)] data_temp += data_new # 0 or 1 Q-peak = no interval => return nan else: empty = np.empty([5]) 
empty[:] = np.NaN data_temp += empty.tolist() # S amplitude # remove nan values S_peaks = [peak for peak in waves_peak['ECG_S_Peaks'] if str(peak) != 'nan'] if len(S_peaks) > 0: S_amplitudes = ecg_signal[S_peaks] data_new = [np.mean(S_amplitudes), np.median(S_amplitudes), np.percentile(S_amplitudes, q=5), np.percentile(S_amplitudes, q=95), np.std(S_amplitudes), len(S_amplitudes)] data_temp += data_new else: empty = np.empty([6]) empty[:] = np.NaN empty[5] = 0 data_temp += empty.tolist() # more than one S-peak if len(S_peaks) > 1: S_peaks_diff = [(S_peaks[index + 1] - S_peaks[index]) / SAMPLING_RATE for index, item in enumerate(S_peaks[:len(S_peaks) - 1])] # SS interval data_new = [np.mean(S_peaks_diff), np.median(S_peaks_diff), np.percentile(S_peaks_diff, q=5), np.percentile(S_peaks_diff, q=95), np.std(S_peaks_diff)] data_temp += data_new # 0 or 1 S-peak = no interval => return nan else: empty = np.empty([5]) empty[:] = np.NaN data_temp += empty.tolist() P_peaks = [peak for peak in waves_peak['ECG_P_Peaks'] if str(peak) != 'nan'] if len(P_peaks) > 0: P_amplitudes = ecg_signal[P_peaks] data_new = [np.mean(P_amplitudes), np.median(P_amplitudes), np.percentile(P_amplitudes, q=5), np.percentile(P_amplitudes, q=95), np.std(P_amplitudes), len(P_amplitudes)] data_temp += data_new else: empty = np.empty([6]) empty[:] = np.NaN empty[5] = 0 data_temp += empty.tolist() T_peaks = [peak for peak in waves_peak['ECG_T_Peaks'] if str(peak) != 'nan'] if len(T_peaks) > 0: T_peaks = ecg_signal[T_peaks] data_new = [np.mean(T_peaks), np.median(T_peaks), np.percentile(T_peaks, q=5), np.percentile(T_peaks, q=95), np.std(T_peaks), len(T_peaks)] data_temp += data_new else: empty = np.empty([6]) empty[:] = np.NaN empty[5] = 0 data_temp += empty.tolist() # QRS interval QRS_peaks_diff = [] # compute difference between Q and S peak for index in range(len(waves_peak['ECG_Q_Peaks'])): if not (np.isnan(waves_peak['ECG_Q_Peaks'][index]) or np.isnan(waves_peak['ECG_S_Peaks'][index])): QRS_peaks_diff.append( (waves_peak['ECG_S_Peaks'][index] - waves_peak['ECG_Q_Peaks'][index]) / SAMPLING_RATE) if len(QRS_peaks_diff) > 0: data_new = [np.mean(QRS_peaks_diff), np.median(QRS_peaks_diff), np.percentile(QRS_peaks_diff, q=5), np.percentile(QRS_peaks_diff, q=95), np.std(QRS_peaks_diff)] data_temp += data_new else: empty = np.empty([5]) empty[:] = np.NaN data_temp += empty.tolist() # PR interval PR_peaks_diff = [] # compute difference between P and R peak for index in range(len(waves_peak['ECG_P_Peaks'])): if not np.isnan(waves_peak['ECG_P_Peaks'][index]): PR_peaks_diff.append( (info['ECG_R_Peaks'][index] - waves_peak['ECG_P_Peaks'][index]) / SAMPLING_RATE) if len(PR_peaks_diff) > 0: data_new = [np.mean(PR_peaks_diff), np.median(PR_peaks_diff), np.percentile(PR_peaks_diff, q=5), np.percentile(PR_peaks_diff, q=95), np.std(PR_peaks_diff)] data_temp += data_new else: empty = np.empty([5]) empty[:] = np.NaN data_temp += empty.tolist() # RT interval RT_peaks_diff = [] # compute difference between P and R peak for index in range(len(waves_peak['ECG_T_Peaks'])): if not np.isnan(waves_peak['ECG_T_Peaks'][index]): RT_peaks_diff.append( (waves_peak['ECG_T_Peaks'][index] - info['ECG_R_Peaks'][index]) / SAMPLING_RATE) if len(RT_peaks_diff) > 0: data_new = [np.mean(RT_peaks_diff), np.median(PR_peaks_diff), np.percentile(RT_peaks_diff, q=5), np.percentile(RT_peaks_diff, q=95), np.std(RT_peaks_diff)] data_temp += data_new else: empty = np.empty([5]) empty[:] = np.NaN data_temp += empty.tolist() # Extract clean EDA and SCR features # explanation of 
features: # https://neurokit2.readthedocs.io/en/latest/functions.html?highlight=hrv%20time#neurokit2.hrv.hrv_time hrv_time = nk.hrv(peaks, sampling_rate=SAMPLING_RATE, show=False) data_new = hrv_time.values.tolist()[0] data_temp += data_new # Jannik # http://www.paulvangent.com/2016/03/21/analyzing-a-discrete-heart-rate-signal-using-python-part-2/ rpeaks = info['ECG_R_Peaks'] r_interval = [rpeaks[index+1]-rpeaks[index] for index in range(len(rpeaks)-1)] RR_x_new = np.linspace(rpeaks[0],rpeaks[-2],rpeaks[-2]) f = interp1d(rpeaks[:-1], r_interval, kind='cubic') n = lengths[iteration] + 1 # Length of the signal frq = np.fft.fftfreq(n, d=(1 / SAMPLING_RATE)) # divide the bins into frequency categories frq = frq[range(int(n/2))] # Get single side of the frequency range Y = np.fft.fft(f(RR_x_new))/n # Calculate FFT try: Y = Y[range(int(n / 2))] lf = np.trapz(abs(Y[(frq >= 0.04) & (frq <= 0.15)])) hf = np.trapz(abs(Y[(frq >= 0.16) & (frq <= 0.5)])) # Do the same for 0.16-0.5Hz (HF) data_new = [lf, hf, lf / hf] data_temp += data_new except IndexError as err: print(err) data_temp += [None, None, None] # if we don't have enough R peaks return vector of nan's else: empty = np.empty([len(names) - 16 - len(typical_signal_names)]) empty[:] = np.NaN data_temp += empty.tolist() # Create a 'typical' heartbeat # Scaler = StandardScaler() # ecg_signal = Scaler.fit_transform(X=ecg_signal.reshape(-1, 1)).reshape(1, -1)[0].tolist() out = ecg.ecg(signal=ecg_signal, sampling_rate=SAMPLING_RATE, show=False) mean = np.mean(out['templates'], axis=0) median = np.median(out['templates'], axis=0) perc5 = np.percentile(out['templates'].astype(np.float64), axis=0, q=5) perc95 = np.percentile(out['templates'].astype(np.float64), axis=0, q=95) std = np.std(out['templates'].astype(np.float64), axis=0) data_new = np.concatenate((mean, median, perc5, perc95, std)).tolist() data_temp += data_new (wl_mean_cA, wl_mean_cD) = pywt.dwt(np.mean(out['templates'], axis=0), 'db3', 'periodic') (wl_median_cA, wl_median_cD) = pywt.dwt(np.median(out['templates'], axis=0), 'db3', 'periodic') (wl_perc5_cA, wl_perc5_cD) = pywt.dwt(np.percentile(out['templates'].astype(np.float64), axis=0, q=5), 'db3', 'periodic') (wl_perc95_cA, wl_perc95_cD) = pywt.dwt(np.percentile(out['templates'].astype(np.float64), axis=0, q=95), 'db3', 'periodic') (wl_sd_cA, wl_sd_cD) = pywt.dwt(np.std(out['templates'].astype(np.float64), axis=0), 'db3', 'periodic') data_new = np.concatenate((wl_mean_cA, wl_mean_cD, wl_median_cA, wl_median_cD, wl_perc5_cA, wl_perc5_cD, wl_perc95_cA, wl_perc95_cD, wl_sd_cA, wl_sd_cD)).tolist() data_temp += data_new data[iteration] = data_temp iteration += 1 features = pd.DataFrame(data, columns=names) return features
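The final block integrates the FFT magnitude over fixed bands to get LF and HF power; a small standalone sketch of that step on a synthetic signal (band edges taken from the code above, the signal itself is made up):

import numpy as np

SAMPLING_RATE = 300
n = 3000
t = np.arange(n) / SAMPLING_RATE
signal = 0.8 + 0.05 * np.sin(2 * np.pi * 0.10 * t) + 0.02 * np.sin(2 * np.pi * 0.30 * t)

frq = np.fft.fftfreq(n, d=1 / SAMPLING_RATE)[:n // 2]   # single-sided frequencies
Y = (np.fft.fft(signal) / n)[:n // 2]                    # single-sided spectrum
lf = np.trapz(abs(Y[(frq >= 0.04) & (frq <= 0.15)]))     # low-frequency band
hf = np.trapz(abs(Y[(frq >= 0.16) & (frq <= 0.5)]))      # high-frequency band
print(lf, hf, lf / hf)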
def transform(self, X, **transform_params):
    data = DataFrame(X[['completed', 'completed_post']])
    result = []
    for index, row in data.iterrows():
        result.append(self.GetClass(row))
    return DataFrame(result)
def postprocess_predictions(df: pd.DataFrame, opt: argparse.Namespace) -> pd.DataFrame: ''' input: + df: input pandas dataframe. + opt: configuration. output: postprocessed pandas dataframe. ''' post_predictions = [] if opt.has_label: list_of_important_tags = [] if opt.use_multiprocessing: import multiprocessing as mp # Apply a patch for the multiprocessing module import multiprocessing.pool as mpp from magneto.utils import istarmap mpp.Pool.istarmap = istarmap all_rows = [row for idx, row in df.iterrows()] inputs = list( zip(all_rows, [copy.deepcopy(opt) for _ in range(len(df))])) with mp.Pool(opt.num_workers) as pool: for result in tqdm(pool.istarmap(postprocess_prediction, inputs), total=len(inputs)): if opt.has_label: important_tags, post_prediction = result list_of_important_tags.append(important_tags) else: post_prediction = result post_predictions.append(post_prediction) else: for idx, row in tqdm(list(df.iterrows())): if opt.has_label: important_tags, post_prediction = postprocess_prediction( row, opt) list_of_important_tags.append(important_tags) else: post_prediction = postprocess_prediction(row, opt) post_predictions.append(post_prediction) list_of_pred_tags = [] list_of_probs = [] for post_prediction in post_predictions: post_prediction = list(zip(*post_prediction)) if len(post_prediction) >= 2: # TODO we will take care of masks later. pred_tags, probs = post_prediction[0], post_prediction[1] list_of_pred_tags.append('\n'.join(pred_tags)) probs = np.round(probs, decimals=3) probs = np.array(probs, dtype=str) list_of_probs.append('\n'.join(probs)) else: list_of_pred_tags.append('') list_of_probs.append('') df['pred_tags'] = list_of_pred_tags df['probs'] = list_of_probs if opt.has_label: list_of_important_tags = list( map(lambda x: '\n'.join(x), list_of_important_tags)) df['important_tags'] = list_of_important_tags return df
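istarmap is a community patch that bolts a lazy, tqdm-friendly starmap onto multiprocessing.Pool; without the patch the same fan-out can be done with the stock starmap (losing only the per-item progress bar). A minimal sketch with a stand-in worker:

import multiprocessing as mp

def postprocess_one(row, opt):
    # stand-in for postprocess_prediction(row, opt)
    return len(str(row)), opt

if __name__ == "__main__":
    rows = ["a", "bb", "ccc"]
    inputs = [(r, {"has_label": False}) for r in rows]
    with mp.Pool(2) as pool:
        # starmap unpacks each (row, opt) tuple into the worker's arguments
        results = pool.starmap(postprocess_one, inputs)
    print(results)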
'Server=borismsdn.database.windows.net;' 'Database=DemoData;' 'uid=readbot;pwd=xxxxxxx') cursor = conn.cursor() #How many years of data should be loaded for the new airport(initial load) years_load_for_new_loc = 1 #loading a dataframe that contains all needed airports, start and end dates for loading deltas (or initial load in case of new airports) cur = cursor.execute( 'select l.airportcode, isnull(max_date, DateAdd(yy, -?, GetDate())) as max_date, DateAdd(dd, -1, GetDate()) as end_date from dbo.Locations l left join (select airportcode, max(date_utc) as max_date from dbo.Weather group by airportcode) w on l.airportcode=w.airportcode', (years_load_for_new_loc, )) df = DataFrame(cur.fetchall()) conn.close() print(df) #Parsing data for each airport for index, row in df.iterrows(): try: city = row[0] print("Parsing data for " + city) start_date = row[1] print("Start date: " + str(start_date)) end_date = row[2] print("End date: " + str(end_date)) df = [] #Parsing each link based on date for single_date in daterange(start_date, end_date): datec = single_date.strftime("%Y/%m/%d") print(datec) url = "https://www.wunderground.com/history/airport/" + city + "/" + datec + "/DailyHistory.html?req_city=&req_statename=&MR=1&format=1" #some empty values were displayed as -9999, remove them. also last column had <br /> in the end of each row, didn't find other way to remove it cur_df = pd.read_table(url, delimiter=',',
df.drop('index', axis=1, inplace=True) return df linalg = np.linalg np.random.seed(8) numOfRows = 20000 numOfSensors = 200 numOfClusters = 20 start_sensor = 48 x = np.random.normal(size=numOfSensors) y = np.random.normal(size=numOfSensors) map = DataFrame(dict(longitude=x, latitude=y, index=range(0, numOfSensors))) distance_list = [] map.drop('index', inplace=True, axis=1) for index, row in map.iterrows(): distances = pd.DataFrame({}, columns=['sensor', 'distance']) curr_lat = row['latitude'] curr_long = row['longitude'] for index1, row1 in map.iterrows(): row_toAdd = pd.Series( { 'sensor': index1, 'distance': distance(row['latitude'], row['longitude'], row1['latitude'], row1['longitude']) }, name=index1) distances = distances.append(row_toAdd) distances = distances.sort_values(by=['distance'])
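The nested iterrows() loops above build the pairwise distances one cell at a time; assuming distance() is plain Euclidean distance on the coordinates (the helper itself is not shown), scipy's cdist computes the full matrix in one call:

import numpy as np
import pandas as pd
from scipy.spatial.distance import cdist

np.random.seed(8)
coords = pd.DataFrame({
    "longitude": np.random.normal(size=200),
    "latitude": np.random.normal(size=200),
})
# full 200 x 200 pairwise distance matrix instead of nested iterrows()
dist_matrix = cdist(coords[["latitude", "longitude"]], coords[["latitude", "longitude"]])
# sensors nearest to sensor 48, closest first
nearest = np.argsort(dist_matrix[48])
print(nearest[:5])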
def present(record_duration=120, stim_types=None, itis=None, additional_labels={}, secs=0.07, volume=0.8, eeg=None, save_fn=None): markernames = [1, 2] record_duration = np.float32(record_duration) ## Initialize stimuli #aud1 = sound.Sound('C', octave=5, sampleRate=44100, secs=secs) aud1 = sound.Sound(440, secs=secs) #, octave=5, sampleRate=44100, secs=secs) aud1.setVolume(volume) #aud2 = sound.Sound('D', octave=6, sampleRate=44100, secs=secs) aud2 = sound.Sound(528, secs=secs) aud2.setVolume(volume) auds = [aud1, aud2] # Setup trial list trials = DataFrame(dict(sound_ind=stim_types, iti=itis)) for col_name, col_vec in additional_labels.items(): trials[col_name] = col_vec # Setup graphics mywin = visual.Window([1920, 1080], monitor='testMonitor', units='deg', fullscr=True) fixation = visual.GratingStim(win=mywin, size=0.2, pos=[0, 0], sf=0, rgb=[1, 0, 0]) fixation.setAutoDraw(True) mywin.flip() iteratorthing = 0 # start the EEG stream, will delay 5 seconds to let signal settle if eeg: if save_fn is None: # If no save_fn passed, generate a new unnamed save file save_fn = generate_save_fn(eeg.device_name, 'auditoryaMMN', 'unnamed') print( f'No path for a save file was passed to the experiment. Saving data to {save_fn}' ) eeg.start(save_fn, duration=record_duration) show_instructions(10) # Start EEG Stream, wait for signal to settle, and then pull timestamp for start point start = time() # Iterate through the events for ii, trial in trials.iterrows(): iteratorthing = iteratorthing + 1 # Inter trial interval core.wait(trial['iti']) # Select and display image ind = int(trial['sound_ind']) auds[ind].stop() auds[ind].play() # Push sample if eeg: timestamp = time() if eeg.backend == 'muselsl': marker = [additional_labels['labels'][iteratorthing - 1]] marker = list(map(int, marker)) else: marker = additional_labels['labels'][iteratorthing - 1] eeg.push_sample(marker=marker, timestamp=timestamp) mywin.flip() mywin.flip() if len(event.getKeys()) > 0: break if (time() - start) > record_duration: break event.clearEvents() if iteratorthing == 1798: sleep(10) # Cleanup if eeg: eeg.stop() mywin.close()