def clean_file(filename):
    print("{0}/{1}: {2}".format(files.index(filename) + 1, len(files), filename))
    with open(filename, 'r') as f:
        lines = f.readlines()
    all_data = []
    for line in lines:
        data = json.loads(line)
        data_clean = {}
        data_clean['c'] = data['countrycode']
        data_clean['w'] = data['word']
        data_clean['s_id'] = int(data['key_id'])
        data_clean['r'] = data['recognized']
        data_clean['d'] = []
        for drawing in data['drawing']:
            drawing = np.array(drawing)
            drawing[:, 1:] = drawing[:, 1:] - drawing[:, :-1]
            data_clean['d'].append(drawing.tolist())
        all_data.append(data_clean)
    df = DataFrame(all_data)
    new_filename = (os.path.split(filename)[1].split('.')[0] + '.p')
    new_filename = new_filename.replace(' ', '_').lower()
    df.to_pickle(os.path.join(DATA_DIR, 'clean', new_filename))
def load(data_frame: DataFrame, file_path: str) -> None:
    """Load (save) the data to the file system.

    :param data_frame: DataFrame to write.
    :param file_path: File to write to.
    """
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    data_frame.to_pickle(file_path, compression='gzip')
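# A minimal round-trip sketch for the gzip-pickling `load` above. Note that
# pandas only infers gzip from a .gz suffix, so when the path lacks one (as
# here), the reader must pass compression='gzip' explicitly. The path below
# is hypothetical.
import pandas as pd

frame = pd.DataFrame({"a": [1, 2, 3]})
load(frame, "out/frame.pkl")
restored = pd.read_pickle("out/frame.pkl", compression='gzip')
assert restored.equals(frame)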
def save_weather_df(weather_df: pd.DataFrame, output_proc_file: str = None):
    if output_proc_file is None:
        output_proc_file = 'weather_data.pkl'
    path_to_output = os.path.join(PROC_DATA_DIR, output_proc_file)
    logging.info(
        "saving weather dataframe as pickle in {f}".format(f=output_proc_file))
    weather_df.to_pickle(path_to_output)
def _write_df(df: pd.DataFrame, fpath: str, **kwargs):
    print("Writing dataframe '{}'".format(fpath))
    ext = os.path.splitext(fpath)[-1]
    if ext == ".h5":
        info = create_file_info(fpath)
        store = pd.HDFStore(fpath)
        try:
            store.put(HDF_NAMESPACE, df, format='table',
                      data_columns=list(df.columns))
        except IndexError:
            print(f"WARNING: Somehow this file could not be saved in a clean way. Trying dirty way.")
            store.put(HDF_NAMESPACE, df, format='table')
        try:
            store.get_storer(HDF_NAMESPACE).attrs.metadata = info
        except KeyError:
            print(f"WARNING: You might write empty data to {fpath}.")
        store.close()
        return
    if ext == ".csv":
        df.to_csv(fpath, **kwargs)
        return
    if ext == ".pickle":
        df.to_pickle(fpath, **kwargs)
        return
    if ext == "":
        raise Exception("No file extension was provided.")
    raise Exception("No writer for: " + ext)
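# Hedged companion sketch for _write_df: how the metadata attached in the
# .h5 branch could be read back via the storer's attrs. HDF_NAMESPACE is the
# same assumed constant used above; the reader function name is hypothetical.
import pandas as pd

def _read_df_with_metadata(fpath):
    with pd.HDFStore(fpath, mode='r') as store:
        df = store.get(HDF_NAMESPACE)
        info = store.get_storer(HDF_NAMESPACE).attrs.metadata
    return df, info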
def test_to_csv_with_dst_transitions_with_pickle(self):
    # GH11619
    idx = date_range("2015-01-01", "2015-12-31", freq="H", tz="Europe/Paris")
    idx = idx._with_freq(None)  # freq does not round-trip
    idx._data._freq = None  # otherwise there is trouble on unpickle
    df = DataFrame({"values": 1, "idx": idx}, index=idx)
    with tm.ensure_clean("csv_date_format_with_dst") as path:
        df.to_csv(path, index=True)
        result = read_csv(path, index_col=0)
        result.index = to_datetime(result.index, utc=True).tz_convert("Europe/Paris")
        result["idx"] = to_datetime(
            result["idx"], utc=True).astype("datetime64[ns, Europe/Paris]")
        tm.assert_frame_equal(result, df)

    # assert working
    df.astype(str)

    with tm.ensure_clean("csv_date_format_with_dst") as path:
        df.to_pickle(path)
        result = pd.read_pickle(path)
        tm.assert_frame_equal(result, df)
def upload_dataframe(self, df: pd.DataFrame, blob_path: str):
    temp_path = 'df.pkl'
    try:
        df.to_pickle(temp_path)
        self.upload_blob(blob_path, temp_path)
    finally:
        os.remove(temp_path)
def __to_pkl(data: pd.DataFrame, path_to_pkl_base: str,
             path_to_pkl_augmented: str, pid: str):
    data.to_pickle(os.path.join(path_to_pkl_base, pid + ".pkl"))
    for i, chunk in enumerate(np.array_split(data, 4)):
        chunk.to_pickle(
            os.path.join(path_to_pkl_augmented, pid + "-" + str(i) + ".pkl"))
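# Quick sketch of the chunking used above: np.array_split divides a frame
# into near-equal pieces (a 10-row frame splits as 3/3/2/2 for 4 chunks),
# so every row lands in exactly one augmented pickle.
import numpy as np
import pandas as pd

frame = pd.DataFrame({"x": range(10)})
print([len(chunk) for chunk in np.array_split(frame, 4)])  # [3, 3, 2, 2]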
def calculate_routeplan(graph, point_from, point_to, path_weight_func='avg_cost'):
    # Find all paths from all nodes to all nodes and save the result in a
    # summary table (a table of paths by direction).
    pairs = [pair for pair in product(point_from, point_to)]
    pairs_matrix = []
    for pair in pairs:
        try:
            # TODO: think about finding all shortest paths and choosing the optimal one
            # TODO: think about finding alternative paths limited by capacity on edges
            path = dijkstra_path(graph, pair[0], pair[1], weight=path_weight_func)
            row = []
            row.append(pair)
            row.append(path)
            row.append(path_amount(graph, path, 'time'))
            row.append(path_amount(graph, path, 'dist'))
            #row.append(path_amount(graph, path, 'cost'))
            pairs_matrix.append(row)
        except Exception as e:
            print(e)
    route_plan = DataFrame(pairs_matrix,
                           columns=['pair', 'path', 'path_time', 'path_dist'])
    route_plan['edges'] = route_plan['path'].apply(edges_from_path)
    route_plan[['from', 'to']] = route_plan.pair.apply(lambda row: Series(row))
    logging.info('Route path found. Start saving...')
    # directions plan
    date = datetime.today().strftime('%Y-%m-%d_%H:%M')
    path = '../result/'
    file_name = path + 'route_table_' + date
    route_plan.to_pickle(file_name)
    return route_plan
def generate_metafile(self, metafile_path):
    DATABASE_DIR = self.dataset_dir
    IMAGE_META = 'Data/AllImages_release.mat'
    MOS_META = 'Data/AllMOS_release.mat'
    STD_META = 'Data/AllStdDev_release.mat'
    img_names = loadmat(join(DATABASE_DIR, IMAGE_META))['AllImages_release']
    img_names = list(map(lambda_0, img_names))
    img_types = list(map(lambda_1, img_names))
    img_pathes = [
        join('Images', item) if item[0] != 't'
        else join('Images', 'trainingImages', item)
        for item in img_names
    ]
    img_dummy_refs = [
        'dummy_' + ''.join(item.split('.')[:-1]) for item in img_names
    ]
    mos = loadmat(join(DATABASE_DIR, MOS_META))['AllMOS_release'].squeeze().tolist()
    std = loadmat(join(DATABASE_DIR, STD_META))['AllStdDev_release'].squeeze().tolist()
    dataframe = DataFrame()
    dataframe['DIS_PATH'] = img_pathes
    dataframe['REF_PATH'] = img_dummy_refs
    dataframe['REF'] = img_dummy_refs
    dataframe['INDEX'] = mos
    dataframe['TYPE'] = img_types
    dataframe['STD'] = std
    dataframe.to_pickle(metafile_path)
def test_pickle_options(fsspectest):
    df = DataFrame({"a": [0]})
    df.to_pickle("testmem://afile", storage_options={"test": "pickle_write"})
    assert fsspectest.test[0] == "pickle_write"
    out = read_pickle("testmem://afile", storage_options={"test": "pickle_read"})
    assert fsspectest.test[0] == "pickle_read"
    tm.assert_frame_equal(df, out)
def nice_charact(results_table1, **kwargs):
    """Returns a nicely formatted DataFrame with process characteristics
    calculated from experimental data."""
    miu_high = estiamte_miu_high(results_table1['h'].values,
                                 results_table1['X'].values)
    Yxs = estiamte_Yxs(results_table1['X'].values, results_table1['S'].values)
    Yps = estiamte_Yxs(results_table1['P'].values, results_table1['S'].values)
    data_ch = DataFrame({'Parameters': ['miu_high', 'Yxs', 'Yps']})
    data_ch[kwargs['expno']] = [miu_high, Yxs, Yps]
    if 'path' in kwargs:
        # print("Saving requested...")
        if os.path.isfile(kwargs['path'] + '.pickle'):
            # print("File exists...")
            all_data_ch = pd.read_pickle(kwargs['path'] + '.pickle')
            all_data_ch[kwargs['expno']] = 0
            all_data_ch.drop(kwargs['expno'], axis=1, inplace=True)
            all_data_ch = all_data_ch.merge(data_ch, on='Parameters')
            all_data_ch.to_pickle(kwargs['path'] + '.pickle')
            all_data_ch.to_html(kwargs['path'] + '.html')
            # print("File saved...")
            return all_data_ch
        else:
            print("File for characteristics does not exist... creating a new one.")
            data_ch.to_pickle(kwargs['path'] + '.pickle')
            data_ch.to_html(kwargs['path'] + '.html')
            # print("New file saved...")
    # print("Saving none...")
    return data_ch
def run(self):
    import pickle
    from pandas import DataFrame

    self.output().makedirs()
    with self.input().open('r') as f:
        rosters = pickle.load(f)

    cleaned_rosters = []
    for team_id, roster in rosters.items():
        goalie_ids = [player['person']['id'] for player in roster
                      if player['position']['code'] == 'G']
        cleaned_rosters.extend([(team_id, self.season, goalie_id)
                                for goalie_id in goalie_ids])

    cleaned_rosters = DataFrame(cleaned_rosters)
    cleaned_rosters.columns = ['team_id', 'season', 'goalie_id']
    with self.output().temporary_path() as temp_output_path:
        cleaned_rosters.to_pickle(temp_output_path, compression=None)
def process_matebook_data(directory, paramlist, storage_location):
    vidname = parse_screen_filename(directory)
    for filename in find_files(directory, 'track.tsv'):
        vidpath, flyID = parse_filename(filename)
        tag = vidname + "_" + flyID
        if not os.path.exists(storage_location + '/' + tag + '_arena.pickle'):
            fi = pd.read_table(filename, sep='\t', header=[0, 1], skiprows=[2, 3])
            tempdf = DataFrame(index=fi.index)
            if fi['Unnamed: 8_level_0', 'isMissegmented'].mean() >= 0.2:
                print "arena dropped for poor quality: ", tag
                continue
            elif fi['Unnamed: 8_level_0', 'isMissegmented'].mean() == 0.0:
                print "arena dropped because quality = 1: ", tag
                continue
            elif len(set(fi['Unnamed: 3_level_0', 'courtship'])) <= 1:
                print "arena dropped because courtship = nan: ", tag
                continue
            else:
                for j in paramlist:
                    tempdf[j[1]] = fi[j[0], j[1]]
                    if 'movedAbs_u' in j:
                        tempdf[j[1]] = tempdf[j[1]] * FPS
                tempdf['Time'] = tempdf.index / FPS
                time_ID = vidpath.split('_', 1)[-1].split('.', 1)[0]
                tempdf = merge_jvision_data(tempdf.reset_index(), time_ID)
                tempdf.to_pickle(storage_location + '/' + tag + '_arena.pickle')
                print ".....", tag, " processed to pickling."
    return
def save_ticker_data(self,
                     ticker: str,
                     data_source: str,  # ie - IB, MW ...
                     data: pd.DataFrame,
                     data_type: str):
    """
    @ ticker - ticker name
    @ data_source - (str) represents where the data came from,
                    ie - IB = Interactive Brokers, MW = MarketWatch ...
    @ data - the data itself
    @ data_type - (str) the suffix of the file - pkl, csv ...
    """
    # if there is no dir for the ticker, create one
    tickerdir_path = "./" + TICKERS_DIR_NAME + "/" + ticker
    file_name = ticker + FILENAME_PREFIXES[data_source]
    df = data
    if (ticker not in os.listdir(os.getcwd() + "/" + TICKERS_DIR_NAME)):
        os.mkdir(tickerdir_path)
    file_path = tickerdir_path + "/" + file_name + "." + data_type
    if (data_type == "pkl"):
        data.to_pickle(file_path)
    elif (data_type == "csv"):
        data.to_csv(file_path)
    print("saved {}.{} at: {}".format(file_name, data_type, tickerdir_path))
def run(self):
    import pickle
    from pandas import DataFrame

    self.output().makedirs()
    with self.input()[0].open('r') as f:
        songs = pickle.load(f)
    songs = [song['track'] for song in songs]
    song_data = [(song['id'], song['name'], song['artists'][0]['id'],
                  'US' in song['available_markets'], song['duration_ms'],
                  song['explicit'], song['uri'], song['preview_url'])
                 for song in songs]
    song_data_df = DataFrame(song_data, columns=[
        'id', 'name', 'main_artist', 'available_in_us',
        'duration_ms', 'explicit', 'uri', 'preview_url'
    ])
    with self.output().temporary_path() as temp_path:
        song_data_df.to_pickle(temp_path, compression=None)
def store(self, name: str, data: pd.DataFrame):
    """Adds named dataframe to collection and stores its contents on disk."""
    if name in self._table_ids:
        raise TableExists(f'Table {name} already present in the DFC.')
    with self._create_file(name) as fd:
        data.to_pickle(fd)
    self._table_ids[name] = self._instance_id
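# Note on store() above: to_pickle accepts an open binary file object as
# well as a path (documented in recent pandas), which is what lets it write
# into whatever handle _create_file yields. A minimal sketch with a plain
# local file standing in for that handle:
import pandas as pd

df = pd.DataFrame({"a": [1]})
with open("table.pkl", "wb") as fd:
    df.to_pickle(fd)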
def write(df: pd.DataFrame, path: str, **kwargs) -> None:
    """Write a DataFrame to file, dispatching on the file's extension.

    Args:
        df (DataFrame): DataFrame to write to disk.
        path (str): Path to write the file to. Supported file suffixes are:

            - csv
            - pkl (pickle)
            - hdf (HDF5)
            - dta (Stata)

        **kwargs: Arbitrary keyword arguments to pass to the ``pandas``
            write method.

    Returns:
        None:
    """
    file_type = path.split('.')[-1]
    if file_type == 'csv':
        df.to_csv(path, **kwargs)
    elif file_type in PICKLE_EXT:
        df.to_pickle(path, **kwargs)
    elif file_type in HDF5_EXT:
        mode = kwargs.pop('mode', 'w')
        df.to_hdf(path, 'df', mode=mode, **kwargs)
    elif file_type == 'dta':
        df.to_stata(path, **kwargs)
    else:
        err_str = 'File type {} is not yet supported.'
        raise NotImplementedError(err_str.format(file_type))
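# Hedged usage sketch for write(): the same frame lands in different formats
# purely by suffix. PICKLE_EXT and HDF5_EXT are assumed to hold the usual
# suffixes ('pkl', 'h5', ...); the paths are hypothetical.
import pandas as pd

frame = pd.DataFrame({"x": [1, 2], "y": [3.0, 4.0]})
write(frame, "out/frame.csv", index=False)
write(frame, "out/frame.pkl")
write(frame, "out/frame.dta")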
def gather_data(filelist):
    datadf = DataFrame()
    intvals = np.array([0, 200, 2000, 20000])  # 6310
    for x in filelist:
        FLY_ID = x.split('/')[-1].split('_fly.')[0]
        EXP_ID, DATE, TIME = FLY_ID.split('_', 4)[0:3]
        fx = pd.read_pickle(x)
        fx = fx[fx.columns]
        try:
            number_of_bouts, bout_duration, first_TS, last_TS = \
                utilities.detect_stim_bouts(fx, 'Laser2_state')
        except:
            number_of_bouts = 1
        stim_duration = find_nearest(intvals, fx['stim_duration'][0])
        PC_wing = fx[
            (fx.index >= pd.to_datetime(THRESH_ON * NANOSECONDS_PER_SECOND)) &
            (fx.index <= pd.to_datetime(THRESH_OFF * NANOSECONDS_PER_SECOND))
        ]['maxWingAngle']
        WEI = float(PC_wing[PC_wing >= 0.524].count()) / float(PC_wing.count())
        if WEI < WEI_THRESHOLD:
            print FLY_ID, " excluded from analysis, with wing extension index: ", WEI, "."
            continue
        fx['group'] = str(number_of_bouts) + 'x_' + str(stim_duration) + 'ms'
        print str(number_of_bouts) + 'x_' + str(stim_duration) + 'ms'
        fx['FlyID'] = FLY_ID
        datadf = pd.concat([datadf, fx])
    datadf.to_csv(JAABA + HANDLE + '_rawdata_' + binsize + '.csv', sep=',')
    datadf.to_pickle(JAABA + 'JAR/' + HANDLE + '_rawdata_' + binsize + '.pickle')
def store_data(df: pd.DataFrame, station: str):
    out_file = Path().absolute() / "downloads" / f"{station}_hourly_discharge.p"
    if not out_file.parent.is_dir():
        out_file.parent.mkdir(parents=True)
    #arr = xarray.Dataset.from_dataframe(df)
    #arr.to_netcdf(out_file)
    df.to_pickle(out_file, compression='gzip')
def process_data(filename, paramlist):
    fi = pd.read_table(filename, sep='\t', header=[0, 1], skiprows=[2, 3])
    tempdf = DataFrame(index=fi.index)
    vidname, flyID = parse_filename(filename)
    tag = vidname + "_" + flyID
    if fi['Unnamed: 8_level_0', 'isMissegmented'].mean() >= 0.2:
        print "arena dropped for poor quality: ", tag
        return
    elif fi['Unnamed: 8_level_0', 'isMissegmented'].mean() == 0.0:
        print "arena dropped because quality = 1: ", tag
        return
    elif len(set(fi['Unnamed: 3_level_0', 'courtship'])) <= 1:
        print "arena dropped because courtship = nan: ", tag
        return
    else:
        for j in paramlist:
            tempdf[j[1]] = fi[j[0], j[1]]
            if 'movedAbs_u' in j:
                tempdf[j[1]] = tempdf[j[1]] * FPS
            if 'copulating' not in j:
                pass  #tempdf[j[1]][fi['0', 'copulating'] == 1] = np.nan
        tempdf['Time'] = tempdf.index / FPS
        tempdf.to_pickle(JAR + tag + '_tempdf.pickle')
        print ".....", tag, "processed to pickling."
    return
def save_experiment_scorings(output_path: Path, method_id: str, scorings: pd.DataFrame):
    scorings_path = output_path / "scorings"
    # making sure that the folder is there
    scorings_path.mkdir(parents=True, exist_ok=True)
    scorings_filepath = scorings_path / (method_id + ".pickled")
    scorings.to_pickle(scorings_filepath)
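# Hedged usage sketch for save_experiment_scorings; the output directory and
# method id are hypothetical.
from pathlib import Path
import pandas as pd

scores = pd.DataFrame({"fold": [0, 1], "auc": [0.81, 0.79]})
save_experiment_scorings(Path("results"), "random_forest", scores)
# -> results/scorings/random_forest.pickled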
def create_df(db='parking.min.db', save_as='parking.df.pickle'):
    conn = sqlite3.connect(db)
    rows = conn.execute('''select updated, park_id, free_places
                           from parking_min''').fetchall()
    ids = list(set([t[1] for t in rows]))
    data = {}
    for x in ids:
        dates = [np.datetime64(r[0], 's') for r in rows if r[1] == x]  # updated
        y = [r[2] for r in rows if r[1] == x]  # free_places (target)
        data[x] = Series(y, index=dates)
    # convert data to DataFrame
    df = DataFrame(data)
    # get the names
    nr = conn.execute('''SELECT DISTINCT name FROM parking
                         ORDER BY park_id''').fetchall()
    # replace non-ascii chars
    names = [unicodedata.normalize('NFKD', x[0]).encode('ascii', 'ignore')
             for x in nr]
    # remove dots
    names = [x.replace(u'.', '') for x in names]
    # assign to columns
    df.columns = names
    # drop rows where all values are NaN
    df = df[pd.notnull(df).any(axis=1)]
    # save
    if save_as is not None:
        df.to_pickle(save_as)
    return df
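# Minimal sketch of the Series-per-column assembly used in create_df above:
# DataFrame aligns each Series on its DatetimeIndex and fills gaps with NaN.
# The parking-lot names are hypothetical.
import pandas as pd

s1 = pd.Series([5, 3], index=pd.to_datetime(["2024-01-01", "2024-01-02"]))
s2 = pd.Series([7], index=pd.to_datetime(["2024-01-02"]))
print(pd.DataFrame({"lot_a": s1, "lot_b": s2}))  # lot_b is NaN on Jan 1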
def run(self):
    import pickle
    from pandas import DataFrame

    self.output().makedirs()
    with self.input().open('r') as f:
        teams = pickle.load(f)

    cleaned_teams = []
    for team in teams:
        # skip teams that aren't active anymore
        if not team['active']:
            continue
        team_id = team['id']
        team_name = team['name']
        team_shortname = team['abbreviation']
        cleaned_teams.append((team_id, team_name, team_shortname, ))

    cleaned_teams = DataFrame(cleaned_teams)
    cleaned_teams.columns = ['team_id', 'team_name', 'team_shortname']
    with self.output().temporary_path() as temp_output_path:
        cleaned_teams.to_pickle(temp_output_path, compression=None)
def save_pickle(df: pd.DataFrame, out: Path) -> None:
    m = df.loc[0, "method"]
    p = Path(out / "raw" / re.sub(" ", "_", m[:-3]))
    if not p.is_dir():
        p.mkdir(parents=True)
    f = p / f"{re.sub(' ', '_', m)}.pkl"
    df.to_pickle(f)
    print(f"Saved results to file at: {str(f)}")
def exportQuesAcceptedAns(self):
    file = "../../data/Ques-AcceptedAnswers"
    data = DataFrame([self.QuesId, self.UserId, self.AcceptedAnswerId],
                     index=['Question', 'Questioner', 'AnswerId'])
    data = data.T
    data['Accepted'] = 1
    data.to_pickle(file + '.data')
def calculate_nodes_cost(graph, nodes_load):
    # Calculate node cost timeseries. Each cell of the DataFrame contains a
    # timeseries (1-hour sampling) with loads; we need to calculate the costs
    # for processing these volumes.
    logging.info('Get graph edges for getting time...')
    graph_edges = DataFrame(graph.edges(data=True), columns=['from', 'to', 'info'])
    graph_edges['edge'] = Series(zip(graph_edges['from'], graph_edges['to']))
    # get_value returns inf for inf edges
    graph_edges['time'] = graph_edges['info'].apply(lambda row: get_value(row, 'time')).replace(inf, nan)
    graph_edges['dist'] = graph_edges['info'].apply(lambda row: get_value(row, 'dist')).replace(inf, nan)
    graph_edges['type'] = graph_edges['info'].apply(lambda row: get_value(row, 'type')).replace(inf, nan)
    # DONE: changed 'cost' to 'avg_cost'; the previous key used incorrect data
    graph_edges['avg_cost'] = graph_edges['info'].apply(lambda row: get_value(row, 'avg_cost')).replace(inf, nan)
    # convert time in seconds to hours
    graph_edges['time'] = graph_edges['time'].apply(lambda row: ceil(row / 3600))
    graph_edges = graph_edges[['edge', 'info', 'time', 'dist', 'type', 'avg_cost']]

    logging.info('Start calculating costs...')
    result = []
    for edge in nodes_load['edge'].unique():
        # get dataframe with volumes
        total_cost = nodes_load[nodes_load['edge'] == edge]['total_volumes'].iloc[0].resample('D').sum()
        dist = graph_edges[graph_edges['edge'] == edge]['dist'].iloc[0]
        time = graph_edges[graph_edges['edge'] == edge]['time'].iloc[0]
        edge_type = graph_edges[graph_edges['edge'] == edge]['type'].iloc[0]
        if edge_type == 'auto':
            # FIXME: calculation needs initialization of avg_cost in the first iteration
            total_cost['cost'] = total_cost['sum_mass_kg'] * graph_edges[graph_edges['edge'] == edge]['avg_cost'].iloc[0]
            total_cost[['total_cost', 'avg_loads', 'min_capacity', 'amount', 'types_list']] = \
                total_cost.apply(lambda row:
                                 Series(minimal_auto_cost_func(dist, time, row['sum_mass_kg'])), axis=1)
        if edge_type == 'avia':
            total_cost['cost'] = total_cost['sum_mass_kg'] * graph_edges[graph_edges['edge'] == edge]['avg_cost'].iloc[0]
            total_cost[['total_cost', 'avg_loads', 'min_capacity', 'amount', 'types_list']] = \
                total_cost.apply(lambda row:
                                 Series(avia_cost(time, row['sum_mass_kg'])), axis=1)
        if edge_type == 'sort_center':
            total_cost['cost'] = total_cost['sum_mass_kg'] * graph_edges[graph_edges['edge'] == edge]['avg_cost'].iloc[0]
            total_cost[['total_cost', 'avg_loads', 'min_capacity', 'amount', 'types_list']] = \
                total_cost.apply(lambda row:
                                 Series(sort_center_cost(time, row['sum_mass_kg'])), axis=1)
        result.append((edge, total_cost))

    nodes_cost = DataFrame(result, columns=['edge', 'total_cost'])
    logging.info('Edge cost calculated. Start saving...')
    date = datetime.today().strftime('%Y-%m-%d_%H:%M')
    path = '../result/'
    file_name = path + 'nodes_cost_' + date
    nodes_cost.to_pickle(file_name)
    logging.info('Saving graph edges...')
    file_name = path + 'graph_edges_' + date
    graph_edges.to_pickle(file_name)
    return nodes_cost
def get_all_artist_lyrics(artist_url):
    df = DataFrame(np.empty(0, dtype=[('artist_url', object),
                                      ('album_url', object),
                                      ('song_url', object),
                                      ('lyrics', object)]))
    album_urls = get_album_urls(artist_url)
    print "found album urls " + ", ".join(album_urls)
    row_num = 0
    for album_url in album_urls:
        try:
            song_urls = get_song_urls(album_url)
        except:
            print "failed to get songs for " + album_url
            continue
        else:  # was `finally`, which also printed "got songs" on failure
            print "got songs for " + album_url
        for song_url in song_urls:
            try:
                lyrics = extract_lyrics(song_url)
            except:
                print "failed to get lyrics for " + song_url
                continue  # without this, `lyrics` may be undefined below
            else:  # was `finally`
                print "got lyrics for " + song_url
            df.loc[row_num] = [artist_url, album_url, song_url, lyrics]
            row_num += 1
    # todo save line by line
    df.to_pickle(artist_url.split("/")[-1] + '.pkl')
    return df
def append_and_save(OG: pd.DataFrame, new: pd.DataFrame, path=PICKLE_PATH):
    # with open('replacements.csv', 'r') as replacements_file:
    #     l = [x.strip().split(',') for x in replacements_file.readlines()]
    #     replacements = {line[0]: line[1] for line in l}
    #
    # # Drop the columns we really don't care about
    # new.drop(BLACKLIST, axis=1, errors='ignore', inplace=True)
    # for from_name, to_name in replacements.items():
    #     if from_name in new.columns and to_name in new.columns:
    #         new[to_name].where(new[to_name].notnull(), new[from_name], inplace=True)
    #         new.drop(columns=[from_name], axis=1, inplace=True)
    #     elif from_name in new.columns:
    #         new.rename(columns={from_name: to_name}, inplace=True)
    #         # new.drop(columns=[from_name], axis=1)
    # new = new.applymap(lambda x: x if type(x) is not str else x.lower().strip().replace(', ', ','))
    # print('Saving, but this wont work for different websites')
    if len(OG) > 0:
        OG = OG.append(new, ignore_index=True, sort=True)
        # OG.drop(BLACKLIST, axis=1, errors='ignore', inplace=True)
        # OG = OG.apply(clean_column)
        OG.to_pickle(path)
        return OG
    else:
        # new = new.apply(clean_column)
        new.to_pickle(path)
        return new
def writeLog(
        self, sender_id: int, log_df: pd.DataFrame,
        filename: Optional[Union[str, PathLike, Path]] = None) -> None:
    # Called by any agent, usually at the very end of the simulation just before
    # kernel shutdown, to write to disk any log dataframe it has been accumulating
    # during simulation. The format can be decided by the agent, although changes
    # will require a special tool to read and parse the logs. The Kernel places
    # the log in a unique directory per run, with one filename per agent, also
    # decided by the Kernel using agent type, id, etc.

    # If there are too many agents, placing all these files in a directory might
    # be unfortunate. Also if there are too many agents, or if the logs are too
    # large, memory could become an issue. In this case, we might have to take
    # a speed hit to write logs incrementally.

    # If filename is not None, it will be used as the filename. Otherwise,
    # the Kernel will construct a filename based on the name of the Agent
    # requesting log archival.

    if self.skip_log:
        return
    path = joinpath("..", "log", self.log_dir)
    file = f"{filename or self.agents[sender_id].name.replace(' ', '')}.bz2"
    makedirs(path, exist_ok=True)
    log_df.to_pickle(joinpath(path, file), compression='bz2')
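# Hedged read-back sketch for writeLog: read_pickle infers bz2 compression
# from the .bz2 suffix, so no compression argument is needed. The log path
# and agent name are hypothetical.
import pandas as pd

log = pd.read_pickle("../log/run_01/ExchangeAgent0.bz2")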
def make_test_dataset(pkl_filepath):
    """Creates a synthetic classification dataset to use for testing.

    Dataset is a pandas DataFrame written to pickle at the given path.
    """
    # Create a synthetic classification dataset
    X, y = make_classification(
        n_samples=100,
        # 5 features, 1 will be pure noise
        n_features=5,
        n_informative=3,
        n_redundant=1,
        n_repeated=0,
        # Assign 10% of labels at random to add noise
        flip_y=0.1,
        shuffle=False,
        random_state=543,
    )
    df = DataFrame(X, columns=["a", "b", "c", "d", "e"])

    # Convert one of the informative columns to categorical (string):
    # Round values to integer and map integers to letters
    to_categ = df["c"].astype("int")
    to_categ_uniq = sorted(to_categ.unique())
    categ = to_categ.map(
        dict(zip(to_categ_uniq, list(ascii_uppercase[:len(to_categ_uniq)]))))
    df["c"] = categ

    # Append the label column
    df["label"] = y
    df.to_pickle(pkl_filepath)
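# Hedged usage sketch for make_test_dataset: write the synthetic frame and
# read it back to confirm column "c" became letter-coded. The path is
# hypothetical.
import pandas as pd

make_test_dataset("test_dataset.pkl")
df = pd.read_pickle("test_dataset.pkl")
print(sorted(df["c"].unique()))  # e.g. ['A', 'B', 'C', ...]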
def export_dataframe(df: pd.DataFrame, workdir: str, name: str):
    logger.info(f"{name}:\n{df}")
    df.to_latex(os.path.join(workdir, f"{name}.tex"))
    with open(os.path.join(workdir, f"{name}.txt"), "w") as f:
        df.to_string(f)
    with open(os.path.join(workdir, f"{name}.csv"), "w") as f:
        df.to_csv(f)
    df.to_pickle(os.path.join(workdir, f"{name}.pickle"))
def write_frame(frame: pd.DataFrame, base_path: PathLike, name: str) -> None:
    frame.to_pickle(os.path.join(base_path, name + '.pkl'))
    with open(os.path.join(base_path, name + '.csv'), 'w+') as handle:
        frame.to_csv(handle)
def add_to_cache(self, parameter_id: str, dt_range: DateTimeRange, df: pds.DataFrame):
    fname = self.data_folder + '/' + str(uuid.uuid4())
    if df is not None:
        df.to_pickle(fname)
        self.cache.add_entry(parameter_id, CacheEntry(dt_range, fname))
    else:
        self.cache.add_entry(parameter_id, CacheEntry(dt_range, None))
def gather_data(filelist):
    datadf = DataFrame()
    for x in filelist:
        FLY_ID, FMF_TIME, GROUP = parse_fmftime(x)
        fx = pd.read_pickle(x)
        # .copy() avoids SettingWithCopyWarning when adding columns below
        rel = fx[['Laser_state', 'maxWingAngle', 'Length', 'Width']].copy()
        rel['group'] = GROUP
        rel['FlyID'] = FLY_ID
        datadf = pd.concat([datadf, rel])
    datadf.to_csv(JAABA + 'rawdata_' + binsize + '.csv', sep=',')
    datadf.to_pickle(JAABA + 'JAR/rawdata_' + binsize + '.pickle')
def preeditimage(input_file, output_dir, params):
    """
    Segment the specified grayscale images, and save the binary image to file.
    First, clean the image by removing the background and filtering it, then
    find the edges and threshold it to convert it to a binary image. Extract
    and verify the data from this image.

    args:
        input_file (file): input directory of raw data
        output_dir (path): output directory to save file
        params (dict): input parameters
    """
    # Do not overwrite existing output
    output_file = os.path.join(output_dir, os.path.basename(input_file))
    if os.path.isfile(output_file):
        img = imread(output_file)
    else:
        # Segment the grayscale image and save to file
        img = segment.main(imread(input_file), params['segment'])
        imsave(output_file, img)
    print ' - segment: ' + time.asctime()

    # Do not overwrite existing output
    output_file2 = os.path.splitext(output_file)[0] + '.pickle'
    if os.path.isfile(output_file2):
        return

    # Extract properties from the labeled image and save as a DataFrame
    data = extract.preedit(img, params['extract'])
    columns = ('Area', 'BoundingBox', 'Centroid', 'EdgeSpline', 'FourierFit',
               'Length', 'MidSpline', 'Perimeter', 'StalkedPole', 'SwarmerPole')
    f = read.getframenum(input_file, params['segment']['pattern'])
    if data:
        # Make MultiIndex with frame and label info
        j = [f] * len(data)
        k = [v['Label'] for v in data]
    else:
        # Create empty DataFrame
        data = [dict.fromkeys(columns, np.nan)]
        j = [f]
        k = [-1]
    index = MultiIndex.from_arrays((j, k), names=('Frame', 'Label'))
    df = DataFrame(data, columns=columns, index=index)
    verify.preedit(df, params['verify'])
    df.to_pickle(output_file2)
    print ' - extract: ' + time.asctime()
def pagetest2():
    import numpy as np
    import pandas as pd
    from pandas import DataFrame
    from io import BytesIO

    df = DataFrame(np.random.rand(6, 4),
                   index=["One", "Two", "Three", "Four", "Five", "Six"],
                   columns=pd.Index(["A", "B", "C", "D"], name="Genus"))
    buf = BytesIO()  # pickle output is binary, so StringIO would fail here
    df.to_pickle(buf)
    response = make_response(buf.getvalue())
    # a pickle payload is binary data, not an image
    response.headers['Content-Type'] = 'application/octet-stream'
    return response
def compile_data(files):
    print 'compiling...'
    rawfile = DataFrame({'Time': []})
    dflist = []
    vidlist = []
    flyIDlist = []
    for x in files:
        tempdf = pd.read_pickle(x)
        dflist.append(tempdf)
        vidname, flyID = parse_tempdf_name(x)
        vidlist.append(vidname)
        flyIDlist.append(flyID)
    rawfile = pd.concat(dflist, keys=zip(vidlist, flyIDlist),
                        names=['Video', 'Arena'])
    rawfile.to_csv(OUTPUT + 'rawfile.csv', sep=',')
    rawfile.to_pickle(JAR + 'rawfile.pickle')
    return rawfile
class Pickle(BaseIO):

    def setup(self):
        self.fname = '__test__.pkl'
        N = 100000
        C = 5
        self.df = DataFrame(np.random.randn(N, C),
                            columns=['float{}'.format(i) for i in range(C)],
                            index=date_range('20000101', periods=N, freq='H'))
        self.df['object'] = tm.makeStringIndex(N)
        self.df.to_pickle(self.fname)

    def time_read_pickle(self):
        read_pickle(self.fname)

    def time_write_pickle(self):
        self.df.to_pickle(self.fname)
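# The class above follows the asv (airspeed velocity) benchmark convention,
# where setup() prepares state and each time_* method is timed repeatedly.
# A rough standalone equivalent with timeit, assuming the pickle written by
# setup() already exists on disk:
import timeit

print(timeit.timeit("read_pickle('__test__.pkl')",
                    setup="from pandas import read_pickle", number=10))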
def compile_data(pickle_jar):
    print 'compiling...'
    rawfile = DataFrame({'Time': []})
    dflist = []
    vidlist = []
    flyIDlist = []
    for x in glob.glob(pickle_jar + '/*arena.pickle'):
        tempdf = pd.read_pickle(x)
        dflist.append(tempdf)
        vidname, flyID = parse_tempdf_name(x)
        vidlist.append(vidname)
        flyIDlist.append(flyID)
    rawfile = pd.concat(dflist, keys=flyIDlist, names=['Arena'])
    rawfile = rawfile.reset_index()
    #rawfile.to_csv(OUTPUT + 'rawfile.csv', sep=',')
    rawfile.to_pickle(pickle_jar + '/' + vidname + '_compiled.pickle')
    return rawfile
def gather_data(filelist):
    datadf = DataFrame()
    for x in filelist:
        print x
        FLY_ID = x.split('/')[-1].split('_fly.')[0]
        EXP_ID, DATE, TIME = FLY_ID.split('_', 4)[0:3]
        fx = pd.read_pickle(x)
        fx = fx[fx.columns]
        PC_wing = fx[
            (fx.index >= pd.to_datetime(THRESH_ON * NANOSECONDS_PER_SECOND)) &
            (fx.index <= pd.to_datetime(THRESH_OFF * NANOSECONDS_PER_SECOND))
        ]['maxWingAngle']
        WEI = float(PC_wing[PC_wing >= 0.524].count()) / float(PC_wing.count())
        if WEI < WEI_THRESHOLD:
            print FLY_ID, " excluded from analysis, with wing extension index: ", WEI, "."
            continue
        fx['group'] = EXP_ID
        fx['FlyID'] = FLY_ID
        datadf = pd.concat([datadf, fx])
    datadf.to_csv(JAABA + HANDLE + '_rawdata_' + binsize + '.csv', sep=',')
    datadf.to_pickle(JAABA + 'JAR/' + HANDLE + '_rawdata_' + binsize + '.pickle')
def test_to_csv_with_dst_transitions(self):
    with ensure_clean('csv_date_format_with_dst') as path:
        # make sure we are not failing on transitions
        times = pd.date_range("2013-10-26 23:00", "2013-10-27 01:00",
                              tz="Europe/London", freq="H",
                              ambiguous='infer')

        for i in [times, times + pd.Timedelta('10s')]:
            time_range = np.array(range(len(i)), dtype='int64')
            df = DataFrame({'A': time_range}, index=i)
            df.to_csv(path, index=True)

            # we have to reconvert the index as we
            # don't parse the tz's
            result = read_csv(path, index_col=0)
            result.index = to_datetime(result.index).tz_localize(
                'UTC').tz_convert('Europe/London')
            assert_frame_equal(result, df)

    # GH11619
    idx = pd.date_range('2015-01-01', '2015-12-31', freq='H', tz='Europe/Paris')
    df = DataFrame({'values': 1, 'idx': idx}, index=idx)
    with ensure_clean('csv_date_format_with_dst') as path:
        df.to_csv(path, index=True)
        result = read_csv(path, index_col=0)
        result.index = to_datetime(result.index).tz_localize(
            'UTC').tz_convert('Europe/Paris')
        result['idx'] = to_datetime(result['idx']).astype(
            'datetime64[ns, Europe/Paris]')
        assert_frame_equal(result, df)

    # assert working
    df.astype(str)

    with ensure_clean('csv_date_format_with_dst') as path:
        df.to_pickle(path)
        result = pd.read_pickle(path)
        assert_frame_equal(result, df)
# save the intensity plot:
E_out = f_E_out()
E2_out = f_E2_out()
ax.imshow(E2_out[300:400, 300:400], vmin=0, vmax=1, **plot_args)
ax.set_title('Intensity')
ax2.imshow(E_out[0][300:400, 300:400], vmin=-1, vmax=1, **plot_args)
ax2.set_title('Re(E)')
fig_name = os.path.join(plotdir, '{n:06d}.png'.format(n=n))
plt.savefig(fig_name)

if n % update_frequency == 0:
    # also renormalise the update rate:
    phi_rate_avg = np.mean(np.abs(f_phi_updates()))
    # can go up by 20% at the most
    l_rate = np.min([update_rate_target / phi_rate_avg, 1.2 * l_rate])
    updates = ((slmOpt.phi, slmOpt.phi - l_rate * slmOpt.phi_rate),
               (slmOpt.phi_rate, momentum * slmOpt.phi_rate + (1. - momentum) * grad))
    update = theano.function([], cost, updates=updates, on_unused_input='warn')

print 'Finished gradient descent, saving summary.'

# create and save the dataframe with the learning curves:
df = DataFrame({'Cost_SE': l_cost_SE,
                'Cost_QE': l_cost_QE,
                'Mean_update': l_mean_update,
                'Max_update': l_max_update})
df.to_pickle(os.path.join(outputdir, 'summary.pkl'))
sys.exit()
def authorization(request):
    client = Client()
    code = request.GET['code']
    access_token = client.exchange_code_for_token(client_id=MY_STRAVA_CLIENT_ID,
                                                  client_secret=MY_STRAVA_CLIENT_SECRET,
                                                  code=code)
    # making a global variable to be used across views. don't know how this will work in practice
    client = Client(access_token=access_token)
    athlete = client.get_athlete()  # Get current athlete details

    global athleteId
    athleteId = athlete.id

    # if athlete doesn't exist, add them
    if len(Athlete.objects.filter(athleteId=athleteId)) == 0:
        ath = Athlete.objects.create(name=str(athlete.firstname + ' ' + athlete.lastname),
                                     athleteId=athleteId,
                                     profilePic=athlete.profile,
                                     city=athlete.city,
                                     country=athlete.country,
                                     sex=athlete.sex,
                                     premium=athlete.premium,
                                     created_at=athlete.created_at,
                                     updated_at=athlete.updated_at,
                                     followers=athlete.follower_count,
                                     friends=athlete.friend_count,
                                     email=athlete.email,
                                     weight=athlete.weight,
                                     meas_pref=athlete.measurement_preference,
                                     runsSummary=DataFrame({}).to_json(orient='records'),
                                     fitLines=DataFrame({}).to_json(orient='records'),
                                     masterList=DataFrame({}).to_json(orient='records'))
        ath.profilePic.name = "rudyzPic"
        ath.save(update_fields=['profilePic'])
    # if athlete already exists, draw their file
    elif len(Athlete.objects.filter(athleteId=athleteId)) == 1:
        ath = Athlete.objects.get(athleteId=athleteId)

    ############################################
    ##### compiling new runs, updating summary

    # athlete's existing runs summary
    existingSummary = DataFrame(pd.read_json(ath.runsSummary))
    existingFitlines = DataFrame(pd.read_json(ath.fitLines))
    masterList = DataFrame(pd.read_json(ath.masterList))

    activities = list(client.get_activities())

    # activity IDs of runs already in the system
    try:
        ids = existingSummary.activityId
    except AttributeError:
        ids = []

    for i in range(len(activities)):
    #for i in range(30, 37):
        # Ignoring activities already in the system
        if (len(ids) == 0) or (float(activities[i].id) not in list(ids)):
            try:
                # compiling df for raw json-ization
                activityId = activities[i].id
                run = client.get_activity_streams(activityId,
                                                  types=['time', 'latlng', 'distance',
                                                         'heartrate', 'altitude', 'cadence'])
                latlng = run['latlng'].data
                time = run['time'].data
                distance = run['distance'].data
                heartrate = run['heartrate'].data
                altitude = run['altitude'].data
                cadence = run['cadence'].data
                date = activities[i].start_date_local
                activity = activityId
                dfi = thresher.assemble(date, activityId, heartrate, distance,
                                        time, altitude, latlng, cadence)

                # basic cleanup, only removing totally unreasonable values
                dfi = thresher.basicClean(dfi)

                # if we ever want to try our hand at improving strava's speed data
                # (ie by predicting speed when GPS blanks), intervene here:
                #dfi = thresher.addDistDeltas(dfi)

                try:
                    fitline = thresher.getFitlineLws(dfi)  # this adds speed-shifted columns
                except:
                    fitline = pd.DataFrame({})

                try:
                    mafScore = fitline[fitline.hr == 140.0].avgSpeed.iloc[0]
                    print "MAF "
                    print mafScore
                except:
                    mafScore = np.nan

                fitline_json = fitline.to_json(orient='records')

                # getting summary info for run (as one-entry dict)
                runSummary = thresher.getSingleSummaryDf(dfi)

                # adding mafScore to summary
                runSummary['mafScore'] = mafScore
                print runSummary

                # adding predicted hr and speed values
                #dfi = thresher.getPred(dfi)

                # saving entry to database
                Activity.objects.create(act_id=activityId,
                                        name=str(activities[i].name),
                                        description=activities[i].description,
                                        act_type=activities[i].type,
                                        date=activities[i].start_date_local,
                                        timezone=activities[i].timezone,
                                        df=dfi.to_json(orient='records'),
                                        avgHr=runSummary['avgHr'],
                                        hrVar=runSummary['variation'],
                                        realMiles=runSummary['realMiles'],
                                        recovery=runSummary['recovery'],
                                        easy=runSummary['easy'],
                                        stamina=runSummary['stamina'],
                                        impulse=runSummary['impulse'],
                                        totalTime=runSummary['totalTime'],
                                        totalDist=runSummary['totalDist'],
                                        climb=runSummary['climb'],
                                        fitline=fitline_json,
                                        mafScore=mafScore,
                                        athlete=ath)

                # updating runs summary
                existingSummary = existingSummary.append(runSummary, ignore_index=True)
                existingFitlines = existingFitlines.append(fitline, ignore_index=True)
                masterList = masterList.append(dfi, ignore_index=True)
            except:
                continue

    # saving updated runs summary to athlete profile
    ath.runsSummary = existingSummary.to_json(orient='records')
    ath.save(update_fields=['runsSummary'])
    existingSummary.to_pickle("runsSummary.txt")

    # saving updated fit lines to athlete profile
    ath.fitLines = existingFitlines.to_json(orient='records')
    ath.save(update_fields=['fitLines'])

    ath.masterList = masterList.to_json(orient='records')
    ath.save(update_fields=['masterList'])

    # testing...
    existingSummary = pd.read_json(ath.runsSummary)
    #print(existingSummary)
    existingFitlines = pd.read_json(ath.fitLines)
    #print(existingFitlines)

    global path
    path = os.path.dirname(__file__)

    # updating dataframe, pickling for use in other views
    #global df
    #df = thresher.masterAssemble(client)
    masterDf = pd.read_json(ath.masterList)
    #print(masterDf)
    masterDf.to_pickle(str(path) + "/" + str(athlete.id) + "masterDf.txt")

    return render(request, 'stravaChimp/authorization.html',
                  {'code': code, 'access_token': access_token, 'athleteId': athleteId})
initValue = 15
softmax = True
skip = True
category = 'Softmax or epsilon-greedy2'

game = pg.PredatorGame((0, 0), (5, 5), (11, 11))

if not skip:
    results = dict()
    results['epsilon-greedy'], avgRMS = getResults(samples, episodes, discount,
                                                   epsilon, alpha, initValue, False)
    results['softmax'], avgRMS = getResults(samples, episodes, discount,
                                            epsilon, alpha, initValue, True)
    results['episode'] = range(0, episodes)
    dataF = DataFrame(results)
    dataF.to_pickle('data/' + category + str(softmax))
else:
    dataF = pd.read_pickle('data/' + category + str(softmax))

category = 'Softmax or epsilon-greedy'
episodeData = pd.melt(dataF, id_vars=['episode'], var_name=category)

p = ggplot(episodeData, aes('episode', 'value', color=category)) +\
    geom_line() +\
    theme_bw() + theme() + ylab("Steps") + xlab("Episodes") + ylim(0, 60)
print p

category = 'Softmax or epsilon-greedy2'
ggsave(p, "plots/" + category + str(softmax) + ".png")
ggsave(p, "plots/" + category + str(softmax) + ".pdf")
def run_bl_analysis(pickles_folder=0):
    import matplotlib.pyplot as plt
    from os import listdir
    from os.path import join
    from pandas import read_pickle, DataFrame
    from article2_time_resolved_routines import find_nearest

    if not pickles_folder:
        pickles_folder = '/home/carlos/Documents/PhD/Articles/' + \
            'Article_3/Scripts/time_resolved/averaged_data'

    case_pickles = [f for f in listdir(pickles_folder) if f.endswith(".p")
                    if not 'Slit' in f and 'alpha0' in f and 'phi0' in f
                    and not "mean_flow_rotated" in f]

    bl_df = DataFrame()

    fig, ax = plt.subplots(1, 1)

    for cp in case_pickles:
        case_bl_df = DataFrame()
        df = read_pickle(join(pickles_folder, cp))
        df = df.sort_values(by=['x', 'y'])

        if 'loc00' in cp and not 'STE' in cp:
            x_bl_loc = 38
        elif 'loc05' in cp:
            x_bl_loc = 18
        elif 'loc10' in cp or 'STE' in cp:
            x_bl_loc = -2
        available_x_loc = find_nearest(x_bl_loc, df.x.values)

        trailing_edge, phi, alpha, U, z = decript_case_name(cp)
        case_name = "{0}_a{1}_p{2}_U20_z{3:02.0f}_tr".format(
            trailing_edge, alpha, phi, float(z) * 20)

        print " Running {0}".format(case_name)

        # First get the edge velocity, because it needs to be cleaned up a bit
        ue_df = DataFrame()
        for x in df.x.unique():
            local_x_df = df[(df.x == x) & (df.y >= 0)]
            ue_df = ue_df.append(
                {'U_e': get_edge_velocity(local_x_df), 'x': x},
                ignore_index=True)
        ######################################################################
        ue_df = clean_data(ue_df, 'U_e', window=10, threshold=1.0)

        for x, U_e_x in zip(ue_df.x.values, ue_df.U_e.values):
            local_x_df = df[(df.x == x) & (df.y >= 0) & (df.y < 20)]

            if x == available_x_loc:
                ax.plot(local_x_df.u / U_e_x, local_x_df.y,
                        label=cp.replace("_", " "))

            U_e_loc, delta_99, delta_displacement, delta_momentum = \
                get_boundary_layer_values(local_x_df, U_e_x)

            data = {'case': case_name,
                    'Ue': U_e_x,
                    'delta_99': delta_99,
                    'delta_displacement': delta_displacement,
                    'delta_momentum': delta_momentum,
                    'x': x,
                    'trailing_edge': trailing_edge,
                    'phi': phi,
                    'alpha': alpha,
                    'z': z}

            case_bl_df = case_bl_df.append(DataFrame(data, index=[0]),
                                           ignore_index=True)

        if 'delta_99' in case_bl_df.columns:
            case_bl_df = clean_data(case_bl_df, 'delta_99',
                                    window=10, threshold=1.0)

        bl_df = bl_df.append(case_bl_df, ignore_index=True)

    bl_df.to_pickle("BLData_staged.p")
    plt.legend(loc='best')
    plt.xlim(0, 1)
    plt.savefig("InterestingBLs.png")
def test_detect_chained_assignment(self):
    pd.set_option('chained_assignment', 'raise')

    # work with the chain
    expected = DataFrame([[-5, 1], [-6, 3]], columns=list('AB'))
    df = DataFrame(np.arange(4).reshape(2, 2),
                   columns=list('AB'), dtype='int64')
    assert df.is_copy is None

    df['A'][0] = -5
    df['A'][1] = -6
    tm.assert_frame_equal(df, expected)

    # test with the chaining
    df = DataFrame({'A': Series(range(2), dtype='int64'),
                    'B': np.array(np.arange(2, 4), dtype=np.float64)})
    assert df.is_copy is None

    with pytest.raises(com.SettingWithCopyError):
        df['A'][0] = -5

    with pytest.raises(com.SettingWithCopyError):
        df['A'][1] = np.nan

    assert df['A'].is_copy is None

    # Using a copy (the chain), fails
    df = DataFrame({'A': Series(range(2), dtype='int64'),
                    'B': np.array(np.arange(2, 4), dtype=np.float64)})

    with pytest.raises(com.SettingWithCopyError):
        df.loc[0]['A'] = -5

    # Doc example
    df = DataFrame({'a': ['one', 'one', 'two', 'three',
                          'two', 'one', 'six'],
                    'c': Series(range(7), dtype='int64')})
    assert df.is_copy is None

    with pytest.raises(com.SettingWithCopyError):
        indexer = df.a.str.startswith('o')
        df[indexer]['c'] = 42

    expected = DataFrame({'A': [111, 'bbb', 'ccc'], 'B': [1, 2, 3]})
    df = DataFrame({'A': ['aaa', 'bbb', 'ccc'], 'B': [1, 2, 3]})

    with pytest.raises(com.SettingWithCopyError):
        df['A'][0] = 111

    with pytest.raises(com.SettingWithCopyError):
        df.loc[0]['A'] = 111

    df.loc[0, 'A'] = 111
    tm.assert_frame_equal(df, expected)

    # gh-5475: Make sure that is_copy is picked up reconstruction
    df = DataFrame({"A": [1, 2]})
    assert df.is_copy is None

    with tm.ensure_clean('__tmp__pickle') as path:
        df.to_pickle(path)
        df2 = pd.read_pickle(path)
        df2["B"] = df2["A"]
        df2["B"] = df2["A"]

    # gh-5597: a spurious raise as we are setting the entire column here
    from string import ascii_letters as letters

    def random_text(nobs=100):
        df = []
        for i in range(nobs):
            idx = np.random.randint(len(letters), size=2)
            idx.sort()
            df.append([letters[idx[0]:idx[1]]])
        return DataFrame(df, columns=['letters'])

    df = random_text(100000)

    # Always a copy
    x = df.iloc[[0, 1, 2]]
    assert x.is_copy is not None

    x = df.iloc[[0, 1, 2, 4]]
    assert x.is_copy is not None

    # Explicitly copy
    indexer = df.letters.apply(lambda x: len(x) > 10)
    df = df.loc[indexer].copy()
    assert df.is_copy is None
    df['letters'] = df['letters'].apply(str.lower)

    # Implicitly take
    df = random_text(100000)
    indexer = df.letters.apply(lambda x: len(x) > 10)
    df = df.loc[indexer]
    assert df.is_copy is not None
    df['letters'] = df['letters'].apply(str.lower)

    # Implicitly take 2
    df = random_text(100000)
    indexer = df.letters.apply(lambda x: len(x) > 10)
    df = df.loc[indexer]
    assert df.is_copy is not None
    df.loc[:, 'letters'] = df['letters'].apply(str.lower)

    # Should be ok even though it's a copy!
    assert df.is_copy is None

    df['letters'] = df['letters'].apply(str.lower)
    assert df.is_copy is None

    df = random_text(100000)
    indexer = df.letters.apply(lambda x: len(x) > 10)
    df.loc[indexer, 'letters'] = (
        df.loc[indexer, 'letters'].apply(str.lower))

    # an identical take, so no copy
    df = DataFrame({'a': [1]}).dropna()
    assert df.is_copy is None
    df['a'] += 1

    # Inplace ops, originally from:
    # http://stackoverflow.com/questions/20508968/series-fillna-in-a-multiindex-dataframe-does-not-fill-is-this-a-bug
    a = [12, 23]
    b = [123, None]
    c = [1234, 2345]
    d = [12345, 23456]
    tuples = [('eyes', 'left'), ('eyes', 'right'),
              ('ears', 'left'), ('ears', 'right')]
    events = {('eyes', 'left'): a, ('eyes', 'right'): b,
              ('ears', 'left'): c, ('ears', 'right'): d}
    multiind = MultiIndex.from_tuples(tuples, names=['part', 'side'])
    zed = DataFrame(events, index=['a', 'b'], columns=multiind)

    with pytest.raises(com.SettingWithCopyError):
        zed['eyes']['right'].fillna(value=555, inplace=True)

    df = DataFrame(np.random.randn(10, 4))
    s = df.iloc[:, 0].sort_values()

    tm.assert_series_equal(s, df.iloc[:, 0].sort_values())
    tm.assert_series_equal(s, df[0].sort_values())

    # see gh-6025: false positives
    df = DataFrame({'column1': ['a', 'a', 'a'], 'column2': [4, 8, 9]})
    str(df)

    df['column1'] = df['column1'] + 'b'
    str(df)

    df = df[df['column2'] != 8]
    str(df)

    df['column1'] = df['column1'] + 'c'
    str(df)

    # from SO:
    # http://stackoverflow.com/questions/24054495/potential-bug-setting-value-for-undefined-column-using-iloc
    df = DataFrame(np.arange(0, 9), columns=['count'])
    df['group'] = 'b'

    with pytest.raises(com.SettingWithCopyError):
        df.iloc[0:5]['group'] = 'a'

    # Mixed type setting but same dtype & changing dtype
    df = DataFrame(dict(A=date_range('20130101', periods=5),
                        B=np.random.randn(5),
                        C=np.arange(5, dtype='int64'),
                        D=list('abcde')))

    with pytest.raises(com.SettingWithCopyError):
        df.loc[2]['D'] = 'foo'

    with pytest.raises(com.SettingWithCopyError):
        df.loc[2]['C'] = 'foo'

    with pytest.raises(com.SettingWithCopyError):
        df['C'][2] = 'foo'
elif nPreds == 2:
    game.predCoords = game.initPredCoords = [(0, 0), (10, 10)]
elif nPreds == 3:
    game.predCoords = game.initPredCoords = [(0, 0), (10, 10), (0, 10)]
elif nPreds == 4:
    game.predCoords = game.initPredCoords = [(0, 0), (10, 10), (0, 10), (10, 0)]
    results[nPreds], avgRMS, randomReturnValues[nPreds] = getResults(
        samples, episodes, discount, epsilon, alpha, initValue, softmax)
    winRatioDict[nPreds] = randomReturnValues[nPreds]['winratio']
else:
    sys.exit()

results['episode'] = range(1, episodes + 1)
winRatioDict['episode'] = range(1, episodes + 1)
dataF_steps = DataFrame(results)
dataF_steps.to_pickle('data/Q_steps' + str(samples) + str(episodes) + category + str(softmax))
dataF_winratio = DataFrame(winRatioDict)
dataF_winratio.to_pickle('data/Q_winratio' + str(samples) + str(episodes) + category + str(softmax))
else:
    dataF_steps = pd.read_pickle('data/Q_steps' + str(samples) + str(episodes) + category + str(softmax))
    dataF_winratio = pd.read_pickle('data/Q_winratio' + str(samples) + str(episodes) + category + str(softmax))

if graphtype == 'steps':
    dataToPlot = dataF_steps
    ylabel = 'Steps'
elif graphtype == 'winratio':
    dataToPlot = dataF_winratio
    ylabel = 'Win Ratio'

if smoothing:
initValue = 15
theta = 0.00001
softmax = False
skip = False

game = pg.PredatorGame((0, 0), (5, 5), (11, 11))

if not skip:
    results = dict()
    for initValue in [0, 1, 10, 15]:
        results[initValue] = getResults(samples, episodes, discount,
                                        epsilon, alpha, initValue, softmax)
        print initValue
    results['episode'] = range(0, episodes)
    dataF = DataFrame(results)
    dataF.to_pickle('data/initValues' + str(episodes))
else:
    dataF = pd.read_pickle('data/initValues' + str(episodes))

episodeData = pd.melt(dataF, id_vars=['episode'], var_name='initValue')

# plt.ioff()
#x = qplot(range(0,4), [0.68834, 0.76024, 0.82407, 0.82113], geom = ["point", "line"])
#print x
# print qplot([0,1], [0.68834, 0.76024])
p = ggplot(episodeData, aes('episode', 'value', color='initValue')) +\
    geom_line() +\
    theme_bw() + theme() + ylab("Steps") + xlab("Episodes") + ylim(0, 60)
print p
ggsave(p, "plots/initValues" + str(episodes) + ".png")
ggsave(p, "plots/initValues" + str(episodes) + ".pdf")
class WingDetector(object):
    def __init__(self, zoomFMF_filepath, bag_filepath, dTarget, arena_centre,
                 RETRACK, tempdir=None):
        self.fmf_file = zoomFMF_filepath
        self.fmf = FMF.FlyMovie(self.fmf_file)
        self.bag_fn = bag_filepath
        self.bagdf = self.get_data_from_bag(self.bag_fn)
        self.bagdf = self.compute_body_axes(self.bagdf)
        self.positions = self.get_positions_from_bag(self.bag_fn)
        self.positions.loc[self.positions['Px'] == 1000000, 'Px'] = np.nan
        self.positions.loc[self.positions['Py'] == 1000000, 'Py'] = np.nan
        self.dTarget = dTarget
        (self.arena_centre) = arena_centre
        if tempdir is not None:
            self.saveImage = True
            if tempdir[-1] == '/':
                pass
            else:
                tempdir = tempdir + '/'
            self._tempdir = tempdir
        else:
            self.saveImage = False
        self.DEBUGGING_DIR = self.fmf_file.rsplit('/', 1)[0] + '/tracking_cache'
        if not os.path.exists(self.DEBUGGING_DIR) == True:
            os.makedirs(self.DEBUGGING_DIR)
        self.DEBUGGING_DIR = self.DEBUGGING_DIR + '/'
        self.error_count = 0
        self.ERROR_REPORTING = False
        self.retrack = RETRACK
        self.font = cv2.FONT_HERSHEY_SIMPLEX
        self.previous_head_extended = None
        self.flipped = 0
        self.adjust_tracking_parameters = ((0, 0, 0), (0, 0, 0), (0, 0, 0))
        self.total_errors = 0
        self.wingData = DataFrame({'BodyAxis': [], 'leftAngle': [], 'leftWingLength': [],
                                   'Length': [], 'rightAngle': [], 'rightWingLength': [],
                                   'target_angle_TTM': [], 'target_distance_TTM': [],
                                   'Timestamp': [], 'Width': []}, dtype=np.float64)
        self.tracking_info = DataFrame({'a_wingAngle_left': [], 'a_wingArea_left': [],
                                        'b_wingAngle_right': [], 'b_wingArea_right': [],
                                        'c_head_location_x': [], 'c_head_location_y': [],
                                        'd_bodyAxis': [], 'e_centroid_x': [], 'e_centroid_y': [],
                                        'f_dTarget_TTM': [], 'g_approachAngle_TTM': []},
                                       dtype=np.float64)
        self.wingMetrics = DataFrame({'leftArea': [], 'leftLength': [], 'leftTheta': [],
                                      'rightArea': [], 'rightLength': [], 'rightTheta': []},
                                     dtype=np.float64)

    def execute(self):
        total_frames = self.fmf.get_n_frames()
        if not self.ERROR_REPORTING:
            progress = self.get_progress_bar("TRACKED", total_frames)
        else:
            pass
        if (os.path.exists(self.DEBUGGING_DIR + 'wingdata_cache.pickle')) and not (self.retrack):
            self.wingData = pd.read_pickle(self.DEBUGGING_DIR + 'wingdata_cache.pickle')
            self.wingData.columns = ['BodyAxis', 'leftAngle', 'leftWingLength', 'Length',
                                     'rightAngle', 'rightWingLength', 'target_angle_TTM',
                                     'target_distance_TTM', 'Timestamp', 'Width']
            startframe = self.wingData.index[-1]
            print self.fmf_file.split('/')[-1], ': beginning from cache at: ', startframe
        else:
            startframe = 0
        for frame_number in range(startframe, total_frames, 1):
            if self.ERROR_REPORTING:
                progress = self.get_progress_bar("ERROR_RATE", 2 * frame_number + 1)
                progress.update(self.total_errors + 1)
            else:
                progress.update(frame_number)
            self.ERROR_DETECTED = False
            self.error_count = 0
            self.adjust_tracking_parameters = ((0, 0, 0), (0, 0, 0), (0, 0, 0))
            try:
                # MAKE FIRST OPTION TRUE TO SAVE TRACKING MOVIES.
                self.detectWings(self.saveImage, False, frame_number)
            except:
                continue
        print self.fmf_file.split('/')[-1], 100.0 * self.total_errors / total_frames, '% error rate'
        return

    def make_movie(self, imagepath, filename, mp4fps):
        # write x264 mp4
        tmpmov = "%s/movie.y4m" % imagepath
        sh.mplayer("mf://%s/*.png" % imagepath,
                   "-mf", "fps=%d" % mp4fps,
                   "-vo", "yuv4mpeg:file=%s" % tmpmov,
                   "-ao", "null",
                   "-nosound", "-noframedrop", "-benchmark", "-nolirc")
        sh.x264("--output=%s" % filename, "%s" % tmpmov)
        try:
            os.unlink(tmpmov)
            shutil.rmtree(self._tempdir)
        except OSError:
            pass

    def get_progress_bar(self, name, maxval):
        widgets = ["%s: " % name, progressbar.Percentage(),
                   progressbar.Bar(), progressbar.ETA()]
        pbar = progressbar.ProgressBar(widgets=widgets, maxval=maxval).start()
        return pbar

    def get_wingAngle(self, frame_number):
        t, L, R = self.detectWings(frame_number)
        return t, L, R

    def devignette(self, frame):
        if int(self.fmf_file.rsplit('_')[-2]) >= 151006:
            V_coeff = [0.608421, 0.000660594, 0.00071838,
                       -6.83654e-07, 2.29008e-07, -6.11814e-07,
                       -8.79999e-11, -1.63231e-10, -2.10072e-11, -2.10298e-10]
        else:
            V_coeff = [5.198890393267561e-01, 1.217460251226269e-03, 1.189236244172212e-03,
                       -1.476571361684494e-06, -6.157281314884152e-07, -1.611555274365404e-06,
                       2.521929214022170e-10, 4.392272775279915e-10, 2.268726532499034e-10,
                       4.244172315090120e-10]
        mask = np.ones([len(frame[0]), len(frame)])
        xx, yy = np.meshgrid(np.arange(0, len(frame[0]), 1),
                             np.arange(0, len(frame), 1))
        V_fit = (mask * V_coeff[0] + xx * V_coeff[1] + yy * V_coeff[2]
                 + xx**2 * V_coeff[3] + xx * yy * V_coeff[4] + yy**2 * V_coeff[5]
                 + xx**3 * V_coeff[6] + xx**2 * yy * V_coeff[7]
                 + xx * yy**2 * V_coeff[8] + yy**3 * V_coeff[9])
        devign = (frame / V_fit).astype(np.uint8)
        return devign

    def get_data_from_bag(self, bagfile):
        bag = rosbag.Bag(bagfile)
        head_x = []
        head_y = []
        body_x = []
        body_y = []
        times = []
        for topic, msg, t in bag.read_messages('/flymad/laser_head_delta'):
            head_x.append(msg.head_x)
            head_y.append(msg.head_y)
            body_x.append(msg.body_x)
            body_y.append(msg.body_y)
            times.append((t.secs + t.nsecs * 1e-9))
        newdf = pd.DataFrame({'Timestamp': times,
                              'Hx': np.around(head_x),
                              'Hy': np.around(head_y),
                              'Bx': np.around(body_x),
                              'By': np.around(body_y)})
        # failed detection msgs are filled with value 1e6.
        newdf = newdf[newdf.Hx < 1000000]
        newdf = utilities.convert_timestamps(newdf)
        return newdf

    def get_positions_from_bag(self, bagfile):
        bag = rosbag.Bag(bagfile)
        px = []
        py = []
        times = []
        for topic, msg, t in bag.read_messages('/flymad/raw_2d_positions'):
            try:
                px.append(msg.points[0].x)
                py.append(msg.points[0].y)
            except:
                px.append(1000000)
                py.append(1000000)
            times.append((t.secs + t.nsecs * 1e-9))
        newdf = pd.DataFrame({'Timestamp': times,
                              'Px': np.around(px),
                              'Py': np.around(py)})
        newdf = utilities.convert_timestamps(newdf)
        return newdf

    def compute_body_axes(self, newdf):
        # calculate 'norm', the distance between body and head points:
        newdf['norm'] = np.sqrt((newdf.Hx - newdf.Bx)**2 + (newdf.Hy - newdf.By)**2)
        newdf['slope'] = (newdf.Hy - newdf.By) / (newdf.Hx - newdf.Bx)
        newdf['perp'] = -1 * (newdf.Hx - newdf.Bx) / (newdf.Hy - newdf.By)
        newdf['yint'] = newdf.Hy - (newdf.slope * newdf.Hx)
        newdf['perpInt'] = newdf.Hy - (newdf.perp * newdf.Hx)
        return newdf

    def detectWings(self, saveImage, debugging=False, framenumber=0):  # , bodyThresh, wingThresh
        frame, timestamp = self.fmf.get_frame(framenumber)
        timestamp_FMT = pd.to_datetime(timestamp, unit='s', utc=True).tz_convert('US/Eastern')
        timestring = "%.2f" % (pd.to_datetime(timestamp) - pd.to_datetime(0)).total_seconds()

        # COMPUTER VISION:
        frame = self.devignette(frame)
        im = cv2.cvtColor(frame, cv2.COLOR_GRAY2BGR)  # must be uint8 array
        imgray = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY)
        kernel = np.ones((5, 5), np.uint8)
        Px = self.positions.Px.asof(timestamp_FMT)  # SILLY HACK FOR 'MISMATCHING' INDICES. STUPID PANDAS.
        Py = self.positions.Py.asof(timestamp_FMT)
        if Px == np.nan or Py == np.nan:
            if self.saveImage == True:
                imcopy = im.copy()
                cv2.imwrite(self._tempdir + '_tmp%05d.png' % (framenumber), imcopy)
            self.wingData.loc[framenumber] = [np.nan, np.nan, np.nan, np.nan,
                                              np.nan, np.nan, timestamp, np.nan]
            return np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, timestamp, np.nan
        distance = self.get_distance_between_coords((Px, Py), self.arena_centre)
        targ_dist = self.dTarget.asof(timestamp_FMT)

        # FLY FEATURES DERIVED FROM BAG FILE:
        try:
            centroid, head = self.get_centroid_and_head(timestamp_FMT)
            backPoint = tuple(sum(y) / len(y) for y in zip(centroid, head))
            headLine = self.compute_perpendicular_from_points(head, centroid)
            axisLine = self.compute_axis_from_points(head, centroid)
            bodyThresh, wingThresh, ellThresh = self.get_tracking_thresholds(
                timestamp_FMT, distance, targ_dist)
            BagData = True
        except:
            centroid, head = (0, 0), (0, 0)
            bodyThresh, wingThresh, ellThresh = self.get_tracking_thresholds(
                timestamp_FMT, distance, targ_dist)
            BagData = False

        # FIT ELLIPSE TO BODY:
        ret2, body = cv2.threshold(imgray, ellThresh[0], 255, cv2.THRESH_BINARY)
        #ellipseFitter = cv2.dilate(body, kernel, iterations=ellThresh[1])
        ellipseFitter = cv2.erode(body, kernel, iterations=ellThresh[2])
        contourImage = ellipseFitter.copy()
        bodyCont, hierarchy1 = cv2.findContours(contourImage, cv2.RETR_CCOMP,
                                                cv2.CHAIN_APPROX_SIMPLE)
        bodyEllipse = None
        bodyContour = None
        if BagData:
            for cnt in bodyCont:
                if cv2.contourArea(cnt) <= 900000:
                    if cv2.contourArea(cnt) >= 7000:
                        ellipse = cv2.fitEllipse(cnt)
                        if self.pointInEllipse(centroid[0], centroid[1],
                                               ellipse[0][0], ellipse[0][1],
                                               ellipse[1][0], ellipse[1][1], ellipse[2]):
                            bodyEllipse = ellipse
                            bodyContour = cnt
                            slope = self.convert_ellipseAngle_to_slope(bodyEllipse[2])
                            yint = -1.0 * slope * bodyEllipse[0][0] + bodyEllipse[0][1]
                            xint = (-1.0 * yint / slope)
                            axisLine = slope, yint, xint
                            head = self.pointOfIntersection(headLine[0], headLine[1],
                                                            axisLine[0], axisLine[1])
        if bodyEllipse == None:
            for cnt in bodyCont:
                if cv2.contourArea(cnt) <= 900000:
                    if cv2.contourArea(cnt) >= 7000:
                        ellipse = cv2.fitEllipse(cnt)
                        bodyEllipse = ellipse
                        bodyContour = cnt
        if bodyEllipse == None:
            #print "ERROR: cannot detect body ellipse in frame: ", framenumber
            imcopy = im.copy()
            cv2.putText(imcopy, "ERROR", (480, 530), self.font, 1, (255, 255, 255), 3)
            try:
                self.wingData.loc[framenumber] = self.wingData.loc[framenumber - 1]  #[np.nan, np.nan, np.nan, np.nan, np.nan, np.nan]
            except:
                self.wingData.loc[framenumber] = [np.nan, np.nan, np.nan, np.nan, np.nan,
                                                  np.nan, np.nan, np.nan, timestamp, np.nan]
            if self.saveImage == True:
                cv2.imwrite(self._tempdir + '_tmp%05d.png' % (framenumber), imcopy)
            return timestamp, np.nan, np.nan, np.nan, np.nan, np.nan
        (f1, f2) = self.fociOfEllipse(bodyEllipse[0][0], bodyEllipse[0][1],
                                      bodyEllipse[1][0], bodyEllipse[1][1], bodyEllipse[2])
        head = self.get_nearest(head, [f1, f2])
        tail = self.get_furthest(head, [f1, f2])
        centroid = (bodyEllipse[0][0], bodyEllipse[0][1])
        backPoint = tuple(sum(y) / len(y) for y in zip(centroid, head))
        backPoint = tuple(sum(y) / len(y) for y in zip(centroid, backPoint))
        slope = self.convert_ellipseAngle_to_slope(bodyEllipse[2])
        yint = -1.0 * slope * bodyEllipse[0][0] + bodyEllipse[0][1]
        xint = (-1.0 * yint / slope)
        axisLine = slope, yint, xint
        centroid = bodyEllipse[0]
        headLine = self.compute_perpendicular_from_points(head, centroid)
        midline = self.compute_perpendicular_from_points(centroid, head)
        tailLine = self.compute_perpendicular_from_points(tail, centroid)

        # FLIP BODY AXIS BASED ON PREVIOUS FRAMES ############################
        try:
            if not self.previous_head_extended == None:
                #print framenumber, ': ', self.wingData.ix[framenumber-1].BodyAxis, bodyEllipse[2], np.cos(np.radians(self.wingData.ix[framenumber-1].BodyAxis - bodyEllipse[2])), '\t', self.flipped
                if not self.check_laterality(self.previous_head_extended,
                                             self.extend_vector(centroid, head),
                                             midline[0], midline[1], midline[2]):
                    #debugging = True
                    if self.flipped == 100:
                        for x in range(-101, 1):
                            self.previous_head_extended = None
                            self.detectWings(True, True, framenumber + x)
                        self.flipped = 0
                        return
                    head, tail = tail, head
                    headLine, tailLine = tailLine, headLine
                    backPoint = tuple(sum(y) / len(y) for y in zip(centroid, head))
                    backPoint = tuple(sum(y) / len(y) for y in zip(centroid, backPoint))
                    self.flipped += 1
                else:
                    self.flipped = 0
        except:
            pass
            #print framenumber, ": Unable to assess body orientation."
        self.previous_head_extended = self.extend_vector(centroid, head)
        body_length = self.get_distance_between_coords(head, tail)
        abd_length = self.get_distance_between_coords(backPoint, tail)
        body_angle = self.angle_from_vertical(tail, head)
        if body_length >= 425:
            imcopy = im.copy()
            cv2.putText(imcopy, "ERROR", (480, 530), self.font, 1, (255, 255, 255), 3)
            if self.saveImage == True:
                cv2.imwrite(self._tempdir + '_tmp%05d.png' % (framenumber), imcopy)
            self.wingData.loc[framenumber] = [np.nan, np.nan, np.nan, np.nan, np.nan,
                                              np.nan, np.nan, np.nan, timestamp, np.nan]
            return np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, timestamp, np.nan
        WIDTH = bodyEllipse[1][0]

        ########################### TARGET TOUCH #############################
        if (bodyContour != None) & (targ_dist <= 5.0):
            imcopy = imgray.copy()
            bodymask = imcopy / imcopy - 1.0  # zeros, with dimensions of image.
            cv2.fillPoly(bodymask, [bodyContour], (255 - imcopy.max()))
            bodymask = cv2.dilate(bodymask, kernel, iterations=4)
            imcopy = (imcopy + bodymask).astype(np.uint8)
            ret, trunk = cv2.threshold(imcopy, 80, 90, cv2.THRESH_TRUNC)
            targetContour, target_distance_TTM, approach_angle_TTM = self.get_targets(
                trunk, head, centroid, body_angle)
        else:
            targetContour, target_distance_TTM, approach_angle_TTM = [], np.nan, np.nan

        ############################# DEFINE WINGS ###########################
        wingTips, wholeWings, wingArea = [], [], []
        wingTips, wholeWings, wingArea = self.get_candidate_wings(
            imgray, kernel, headLine, centroid, backPoint, body_length, abd_length,
            axisLine, wingTips, wholeWings, wingArea, timestamp_FMT, distance, targ_dist)
        polynomial = np.poly1d([-15392.02683546, 29209.68050119, 3237.47165583])
        wingSets = pd.DataFrame({'Tips': wingTips, 'Shape': wholeWings, 'Area': wingArea,
                                 'Theta': np.empty(len(wingTips)).fill(0)})
        wingSets['Theta'] = np.nan
        wingSets['Side'] = np.nan
        wingSets['Length'] = np.nan
        #wingSets.to_pickle('/groups/dickson/home/bathd/Desktop/wingsets.pickle')
        for x in np.arange(len(wingSets)):
            wingSets.loc[x, 'Theta'] = self.compute_angle_given_three_points(
                backPoint, wingSets.loc[x, 'Tips'], centroid)
            wingSets.loc[x, 'Length'] = self.get_distance_between_coords(
                backPoint, wingSets.loc[x, 'Tips'])
        wingSets.loc[wingSets['Theta'] >= np.pi, 'Theta'] -= 2.0 * np.pi
        wingSets.loc[wingSets['Theta'] <= -1.0 * np.pi, 'Theta'] += 2.0 * np.pi
        wingSets.loc[wingSets['Theta'] < 0.0, 'Side'] = 'Right'
        wingSets.loc[wingSets['Theta'] >= 0.0, 'Side'] = 'Left'
        wingSets.loc[wingSets['Side'] == 'Right', 'Theta'] *= -1.0
        #wingSets['polydif'] = (wingSets['Theta'] - polynomial(wingSets['Area'])) / (wingSets['Area']/20000.0)
        wingSets = wingSets[wingSets['Area'] > ((polynomial(wingSets['Theta']) / 1.5)
                                                - 2000.0 * wingSets['Theta'] - 5000.0)]
        wingSets = wingSets[wingSets['Area'] < ((polynomial(wingSets['Theta']) * 1.3)
                                                + 2000.0 * wingSets['Theta'] + 3000.0)]
        wingSets = wingSets[(wingSets['Length'] >= 250) & (wingSets['Length'] < 375)]
        wingSets = wingSets[(wingSets['Theta'] <= ((np.pi) / 1.75))]
        #wingSets = wingSets[(wingSets['polydif'] >= -0.8) & (wingSets['polydif'] <= 1.0)]
        wingSets = wingSets[(wingSets['Area'] >= 1000) & (wingSets['Area'] <= 30000)]
        try:
            #leftWing = wingSets.ix[wingSets[wingSets['Side']=='Left']['polydif'].abs().idxmin()]
            leftWing = wingSets.ix[wingSets[wingSets['Side'] == 'Left']['Area'].abs().idxmax()]
        except:
            leftWing = wingSets[0:0]
            leftWing.ix[0] = np.nan
            leftWing.set_value(0, 'Tips', tuple(tail))
            leftWing.set_value(0, 'Shape', [[0, 0]])
            leftWing = leftWing.ix[0]
            self.total_errors += 1
        try:
            #rightWing = wingSets.ix[wingSets[wingSets['Side']=='Right']['polydif'].abs().idxmin()]
            rightWing = wingSets.ix[wingSets[wingSets['Side'] == 'Right']['Area'].abs().idxmax()]
        except:
            rightWing = wingSets[0:0]
            rightWing.ix[0] = np.nan
            rightWing.set_value(0, 'Tips', tuple(tail))
            rightWing.set_value(0, 'Shape', [[0, 0]])
            rightWing = rightWing.ix[0]
            self.total_errors += 1
        if saveImage == True:
            imcopy = im.copy()
            try:
                cv2.drawContours(imcopy, [leftWing.Shape], 0, (255, 0, 0), 1)
            except:
                pass
            try:
                cv2.drawContours(imcopy, [rightWing.Shape], 0, (0, 255, 255), 1)
            except:
                pass
            try:
                cv2.drawContours(imcopy, [targetContour], 0, (255, 128, 128), 6)
            except:
                pass
            cv2.line(imcopy, (int(head[0]), int(head[1])),
                     (int(tail[0]), int(tail[1])), (255, 255, 255), 1)
            cv2.line(imcopy, (int(backPoint[0]), int(backPoint[1])),
                     (int(leftWing.Tips[0]), int(leftWing.Tips[1])), (20, 20, 255), 2)
            cv2.line(imcopy, (int(backPoint[0]), int(backPoint[1])),
(int(rightWing.Tips[0]),int(rightWing.Tips[1])), (20,255,20),2) cv2.circle(imcopy, (int(head[0]),int(head[1])), 3, (255,255,255), -1) cv2.circle(imcopy, (int(backPoint[0]),int(backPoint[1])), 5, (255,255,255), -1) #cv2.circle(imcopy, (int(centroid[0]),int(centroid[1])), 3, (255,0,255), -1) cv2.putText(imcopy, str(np.around(np.degrees(leftWing.Theta), 2))+ 'deg', (10,25), self.font, 1, (20,20,255), 3) cv2.putText(imcopy, str(np.around(rightWing.Area, 2)), (450, 65), self.font, 1, (20,255,20), 3) cv2.putText(imcopy, str(np.around(leftWing.Area, 2)), (10,65), self.font, 1, (20,20,255), 3) cv2.putText(imcopy, str(np.around(np.degrees(rightWing.Theta), 2))+ 'deg', (450, 25), self.font, 1, (20,255,20), 3) cv2.putText(imcopy, str(framenumber), (850, 25), self.font, 1, (255,255,255), 3) cv2.putText(imcopy, str(np.around(target_distance_TTM, 2)) + 'mm', (10,950), self.font, 1, (100,255,255), 3) cv2.putText(imcopy, str(np.around(approach_angle_TTM, 2)) + 'deg', (450, 950), self.font, 1, (100,255,255), 3) #cv2.putText(imcopy, timestring, (850, 950), self.font, 1, (255,255,255), 3) cv2.imwrite(self._tempdir+'_tmp%05d.png'%(framenumber), imcopy) cv2.destroyAllWindows() #print framenumber, "\tL: ", ("%.2f" % np.degrees(leftWingAngle)), ("%.2f" % leftWingLength), '\tR: ', ("%.2f" % (-1.0*np.degrees(rightWingAngle))), ("%.2f" % rightWingLength), '\t',("%.2f" % distance), '\t', str(self.dTarget.asof(timestamp_FMT)), '\t', self.flipped self.wingData.loc[framenumber] = [body_angle, leftWing.Theta, leftWing.Length, body_length, rightWing.Theta, rightWing.Length, approach_angle_TTM, target_distance_TTM, timestamp, WIDTH] self.tracking_info.loc[framenumber] = [leftWing.Theta, leftWing.Area, rightWing.Theta, rightWing.Area, head[0], head[1], body_angle, centroid[0], centroid[1], target_distance_TTM, approach_angle_TTM] self.wingMetrics.loc[framenumber] = [leftWing.Area, leftWing.Length, leftWing.Theta, rightWing.Area, rightWing.Length, rightWing.Theta] if framenumber % 100 == 0: self.wingData.to_pickle(self.DEBUGGING_DIR + 'wingdata_cache.pickle') self.wingMetrics.to_pickle(self.DEBUGGING_DIR + 'wingMetrics_cache.pickle') return body_angle, leftWing.Length, leftWing.Theta, body_length, rightWing.Length, rightWing.Theta, timestamp, WIDTH def get_targets(self, fly_erased_img, headpoint, centroidpoint, _bodyAxis): kernel = np.ones((5,5),np.uint8) _, mask = cv2.threshold(fly_erased_img, 60, 255, cv2.THRESH_BINARY) mask = cv2.erode(mask, kernel, iterations=1) contourImage = mask.copy() contourImage = np.pad(contourImage,((2,2),(2,2)), mode='maximum') contours, hierarchy1 = cv2.findContours(contourImage, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) hierarchy = hierarchy1[0] for x in hierarchy: if x[3] <0: parent = x #headpoint = (int(track.loc[framenumber, 'c_head_location_x']), int(track.loc[framenumber, 'c_head_location_y'])) candidateTargets = [] for component in zip(contours, hierarchy): c = component[0] h = component[1] centroidCheck = cv2.pointPolygonTest(c,centroidpoint,True) if centroidCheck <=0: if np.array_equal(hierarchy[h[3]], parent) : #is in outer hierarchy (parent is edge.) 
if h[2] > 0: # has child (targets have inner and outer edge) if (cv2.contourArea(c) <= 150000) & (cv2.contourArea(c) >= 20000): ellipse = cv2.fitEllipse(c) if not self.pointInEllipse(centroidpoint[0],centroidpoint[1],ellipse[0][0],ellipse[0][1],ellipse[1][0],ellipse[1][1],ellipse[2]): candidateTargets.append(c) areas = [] if len(candidateTargets) >0: for T in range(len(candidateTargets)): areas.append(cv2.contourArea(candidateTargets[T])) TARGET = cv2.convexHull(candidateTargets[areas.index(max(areas))] ) M = cv2.moments(TARGET) targCentre = (int(M['m10']/M['m00']), int(M['m01']/M['m00'])) distance = -1.0*cv2.pointPolygonTest(TARGET,headpoint,True) / 135.5 # based on 135.5 pixels per mm angle= self.angle_from_vertical(headpoint, targCentre) approachAngle= angle - _bodyAxis #track.loc[framenumber, 'd_bodyAxis'] if approachAngle < 0: approachAngle *= -1.0 if approachAngle >=180.0: approachAngle -= 180.0 else: distance = np.nan approachAngle = np.nan TARGET = None return TARGET, distance, approachAngle def get_candidate_wings(self, imgray, kernel, headLine, centroid, backPoint, body_length, abd_length, axisLine, wingTips, wholeWings, wingArea,timestamp_FMT, distance, targ_dist): """ self.adjust_tracking_parameters = ((0,0,0),(0,0,0),(0,0,0)) bodyThresh, wingThresh, ellThresh = self.get_tracking_thresholds(timestamp_FMT, distance, targ_dist) paramchanges = [((0,0,0),(0,0,0),(0,0,0)), ((10,0,0),(10,0,0),(0,0,0)), ((-10,0,0),(-10,0,0),(0,0,0)), ((10,1,1),(10,0,-1),(0,0,0)), ((-10,1,1),(-10,0,-1),(0,0,0))] for p in paramchanges: self.adjust_tracking_parameters = p bodyThresh, wingThresh, ellThresh = self.get_tracking_thresholds(timestamp_FMT, distance, targ_dist) """ edge = self.get_edge(imgray) if edge > 115: wingThresh = int(0.75*edge + 10.0) bodyThresh = int(0.45*edge + 2.5) else: wingThresh = int(0.75*edge + 13.0) bodyThresh = int(0.55*edge + 2.5) if distance >= 170: adjustments = [-10,-5,0] else: adjustments = [-5,0,5] for a in adjustments: #DEFINE bodyNotWings AS BODY PORTION PLUS LEGS ETC, USEFUL FOR FINDING WINGS. ret1, bodyNotWings = cv2.threshold(imgray, bodyThresh,255,cv2.THRESH_BINARY) bodyNotWings = cv2.dilate(bodyNotWings, kernel, iterations=1) bodyNotWings = cv2.erode(bodyNotWings, kernel, iterations=1) #DEFINE wings AS WINGS AND TARGETS BUT NOT BODY. 
ret2, wings = cv2.threshold(imgray, wingThresh+a,1,cv2.THRESH_BINARY_INV) test = wings*bodyNotWings dilated = cv2.erode(test, kernel, iterations=2) #eroded = cv2.dilate(dilated, kernel, iterations=wingThresh[1]) #dilatedCopy = eroded.copy() wingCont, hierarchy = cv2.findContours(dilated, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_NONE) for c in wingCont: area = cv2.contourArea(c) #WINGS MUST BE APPROPRIATE SIZE if (area >= 3000): M = cv2.moments(c) cx, cy = int(M['m10']/M['m00']), int(M['m01']/M['m00']) #WINGS MUST BE BEHIND HEAD if self.check_laterality(centroid, (cx,cy), headLine[0], headLine[1], headLine[2]): checkSpot = (c[0][0][0], c[0][0][1]) pointSet1 = [] pointSet2 = [] pointSetTARGET = [] for x in c: if self.check_laterality((x[0][0], x[0][1]), centroid, headLine[0], headLine[1], headLine[2]): if self.check_laterality((x[0][0], x[0][1]), checkSpot, axisLine[0], axisLine[1], axisLine[2]): pointSet1.append(x.tolist()) else: pointSet2.append(x.tolist()) else: if targ_dist <=20.0: pointSetTARGET.append(x.tolist()) pointSet1 = np.array(pointSet1).reshape((-1,1,2)).astype(np.int32) pointSet2 = np.array(pointSet2).reshape((-1,1,2)).astype(np.int32) pointSetTARGET = np.array(pointSetTARGET).reshape((-1,1,2)).astype(np.int32) if (len(pointSet1) > 0): if cv2.contourArea(pointSet1) >=833:#(2500/(wingThresh[2]+1)): near, far = self.get_nearest_and_furthest_from_centroid(pointSet1, centroid) if self.get_distance_between_coords(near, centroid) <= 150: winglength = self.get_distance_between_coords(far, backPoint) if (winglength <= 2.0*(body_length)) and (winglength >= abd_length): wingTips.append(far) wholeWings.append(pointSet1)#(cv2.convexHull(pointSet1)) wingArea.append(cv2.contourArea(pointSet1)) if (len(pointSet2) > 0): if cv2.contourArea(pointSet2) >=833:#(2500/(wingThresh[2]+1)): near, far = self.get_nearest_and_furthest_from_centroid(pointSet2, centroid) if self.get_distance_between_coords(near, centroid) <= 150: winglength = self.get_distance_between_coords(far, backPoint) if (winglength <= 2.0*(body_length)) and (winglength >= abd_length): wingTips.append(far) wholeWings.append(pointSet2)#(cv2.convexHull(pointSet2)) wingArea.append(cv2.contourArea(pointSet2)) return wingTips, wholeWings, wingArea def closestpair(self, L): def square(x): return x*x def sqdist(p,q): return square(p[0]-q[0])+square(p[1]-q[1]) # Work around ridiculous Python inability to change variables in outer scopes # by storing a list "best", where best[0] = smallest sqdist found so far and # best[1] = pair of points giving that value of sqdist. Then best itself is never # changed, but its elements best[0] and best[1] can be. # # We use the pair L[0],L[1] as our initial guess at a small distance. 
best = [sqdist(L[0],L[1]), (L[0],L[1])] # check whether pair (p,q) forms a closer pair than one seen already def testpair(p,q): d = sqdist(p,q) if d < best[0]: best[0] = d best[1] = p,q # merge two sorted lists by y-coordinate def merge(A,B): i = 0 j = 0 while i < len(A) or j < len(B): if j >= len(B) or (i < len(A) and A[i][1] <= B[j][1]): yield A[i] i += 1 else: yield B[j] j += 1 # Find closest pair recursively; returns all points sorted by y coordinate def recur(L): if len(L) < 2: return L split = len(L)/2 splitx = L[split][0] L = list(merge(recur(L[:split]), recur(L[split:]))) # Find possible closest pair across split line # E = [p for p in L if abs(p[0]-splitx) < best[0]] for i in range(len(E)): for j in range(1,8): if i+j < len(E): testpair(E[i],E[i+j]) return L L.sort() recur(L) return best[1] def get_edge(self, frame): top = frame[0:5].mean() bottom = frame[-5:-1].mean() left = frame[:,0:5].mean() right = frame[:,-5:-1].mean() values = sorted([top, bottom, left, right])[1:] mean = sum(values) / 3.0 return mean def get_distance_between_coords(self, A, B): return np.sqrt((A[0]-B[0])**2 + (A[1]-B[1])**2) def get_nearest(self, POINT, list_of_points): nearest = 1000000000 for x in list_of_points: d = self.get_distance_between_coords(POINT, x) if d < nearest: nearest = d winner = x return winner def get_furthest(self, POINT, list_of_points): furthest = 0.0 for x in list_of_points: d = self.get_distance_between_coords(POINT, x) if d > furthest: furthest = d winner = x return winner def get_distance_from_body_ellipse(self, bodyCentroid, headPoint, POINT): perp_to_centroid = self.compute_perpendicular_from_points(bodyCentroid, headPoint) perp_x = bodyCentroid[0] + 10.0 perp_y = perp_to_centroid[0]*(bodyCentroid[0] + 10.0) + perp_to_centroid[1] perpPoint = (perp_x, perp_y) THETA = self.compute_angle_given_three_points(bodyCentroid, headPoint, perpPoint) POINT[0] = a*np.cos(THETA)*np.cos(t) - b*np.sin(THETA)*np.sin(t) POINT[1] = a*np.sin(THETA)*np.cos(t) + b*np.cos(THETA)*np.sin(t) pass #INCOMPLETE def get_centroid_and_head(self, _timestamp): centroid = (int(self.bagdf['Bx'].asof(_timestamp)),int(self.bagdf['By'].asof(_timestamp)))#[self.bagdf.Time >= _timestamp].iloc[0] head = (int(self.bagdf['Hx'].asof(_timestamp)),int(self.bagdf['Hy'].asof(_timestamp))) return centroid, head def get_tracking_thresholds(self, _timestamp, _distance, _dTarget): if _dTarget <= 4: vals = (65,1,1), (95,1,2), (35,1,1) elif _distance <=120: vals = (65,1,1), (95,1,2), (40,1,1) elif _distance <=150: vals = (65,1,1), (95,1,2), (40,1,1) elif _distance <=185: vals = (60,1,1), (80,1,2), (30,1,1) #(60,1,1), (80,1,2), (30,1,1) else: vals = (40,1,2), (65,1,2), (35,1,1) #(50,1,1), (79,1,2), (35,1,1) foo = self.add_nested_tuples(vals, self.adjust_tracking_parameters) return foo def add_nested_tuples(self, set1, set2): return tuple(map(lambda x, y: tuple(map(lambda w,z: w+z, x,y)), set1, set2)) def get_nearest_and_furthest_from_centroid(self, hullset, centroid): #PASS A SET OF POINTS DEFINING A SINGLE CONTOUR, IDEALLY OUTPUT FROM cv2.convexHull lowest_distance = 1000000 lowest_coords = (0,0) highest_distance = 0 highest_coords = (0,0) for a in hullset: b = (a[0][0], a[0][1]) distance = self.get_distance_between_coords(centroid, b) if distance > highest_distance: highest_coords = b highest_distance = distance if distance < lowest_distance: lowest_coords = b lowest_distance = distance return lowest_coords, highest_coords def compute_axis_from_points(self, POINT1, POINT2): if float(float(POINT1[0]) - float(POINT2[0]) ) == 0.0: 
XINT = POINT1[0] YINT = np.nan SLOPE = np.inf else: SLOPE = ( float(POINT1[1]) - float(POINT2[1])) / ( float(float(POINT1[0]) - float(POINT2[0]) )) YINT = POINT1[1] - (SLOPE*POINT1[0]) if abs(SLOPE) >= 1000000: XINT = POINT1[0] elif SLOPE == 0.0: XINT = np.nan else: XINT = -1*YINT / SLOPE return SLOPE, YINT, XINT def convert_ellipseAngle_to_slope(self, _degs): #OPENCV makes silly angles, where up is 0deg, and right is 90deg. degs = float(1.0*_degs + 90.0) return float(math.tan(math.radians(degs))) def pointOfIntersection(self, SLOPE1, YINT1, SLOPE2, YINT2): if float(SLOPE1 - SLOPE2) == 0.0: return else: px = float(YINT2 - YINT1) / float(SLOPE1 - SLOPE2) py = SLOPE1*px + float(YINT1) return (px, py) def pointInEllipse(self, x,y,xp,yp,d,D,angle): #tests if a point[xp,yp] is within #boundaries defined by the ellipse #of center[x,y], diameters d D, and tilted at angle cosa=math.cos(angle) sina=math.sin(angle) dd=d/2*d/2 DD=D/2*D/2 a =math.pow(cosa*(xp-x)+sina*(yp-y),2) b =math.pow(sina*(xp-x)-cosa*(yp-y),2) ellipse=(a/dd)+(b/DD) if ellipse <= 1: return True else: return False def fociOfEllipse(self, x,y,d,D,angle): #returns coordinates of foci #defined by the ellipse #of center[x,y], diameters d D, and tilted at angle cosa=math.cos(math.radians(angle-90.0)) sina=math.sin(math.radians(angle-90.0)) dd=d/2*d/2 DD=D/2*D/2 c = np.sqrt(DD-dd) slope = self.convert_ellipseAngle_to_slope(angle) c_x = cosa*c c_y = sina*c F1 = ((x+c_x),(y+c_y)) F2 = ((x-c_x),(y-c_y)) return (F1, F2) def compute_perpendicular_from_points(self, POINT1, POINT2): #perpendicular line through POINT1 if float(float(POINT1[1]) - float(POINT2[1]) ) == 0.0: XINT = np.nan YINT = POINT1[1] SLOPE = 0.0 else: SLOPE = -1.0*( float(POINT1[0]) - float(POINT2[0])) / ( float(float(POINT1[1]) - float(POINT2[1]) )) YINT = float(POINT1[1]) - (float(POINT1[0])*SLOPE) if abs(SLOPE) >= 1000000: XINT = POINT1[0] elif SLOPE == 0.0: XINT = np.nan else: XINT = -1.0*YINT / SLOPE return SLOPE, YINT, XINT def compute_angle_given_three_points(self, VERTEX, POINT1, POINT2): A = np.array(POINT1) B = np.array(VERTEX) C = np.array(POINT2) BA = A - B BC = C - B s = np.arctan2(*BA) e = np.arctan2(*BC) return e-s def check_laterality(self, POINT1, POINT2, SLOPE, YINT, XINT): #TRUE IF TWO POINTS ARE ON THE SAME SIDE OF THE LINE. if abs(SLOPE) == np.inf: SIGN = (POINT1[0]-XINT)*(POINT2[0]-XINT) #JUST COMPARE X VALUES TO X-INTERCEPT else: SIGN = (SLOPE*POINT1[0] + YINT - POINT1[1])*(SLOPE*POINT2[0] + YINT - POINT2[1]) if SIGN > 0: match = 1 elif SIGN <= 0: match = 0 return match def extend_vector(self, BACKPOINT, FRONTPOINT): delta_x, delta_y = (FRONTPOINT[0]-BACKPOINT[0]), (FRONTPOINT[1] - BACKPOINT[1]) new_x = FRONTPOINT[0] + delta_x/abs(delta_x)*1000 new_y = FRONTPOINT[1] + delta_y/abs(delta_y)*1000 return (new_x, new_y) def angle_from_vertical(self, point1, point2): """ RETURNS A VALUE IN DEGREES BETWEEN 0 AND 360, WHERE 0 AND 360 ARE NORTH ORIENTATION. """ x = point1[0] - point2[0] y = point1[1] - point2[1] return 180.0 + math.atan2(x,y)*180.0/np.pi
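# --- Illustrative sketch (not part of the original class): the helpers above
# represent a line as a (slope, y-intercept, x-intercept) tuple and test
# laterality by comparing the signs of signed residuals. A minimal standalone
# version, with hypothetical names line_through/same_side, shows the same convention.
import numpy as np

def line_through(p1, p2):
    # Mirrors compute_axis_from_points: vertical lines get slope=inf, yint=nan.
    if p1[0] == p2[0]:
        return np.inf, np.nan, p1[0]
    slope = (p1[1] - p2[1]) / (p1[0] - p2[0])
    yint = p1[1] - slope * p1[0]
    xint = np.nan if slope == 0 else -yint / slope
    return slope, yint, xint

def same_side(a, b, line):
    # Mirrors check_laterality: a positive product of signed residuals means
    # both points lie on the same side of the line.
    slope, yint, xint = line
    if np.isinf(abs(slope)):
        return (a[0] - xint) * (b[0] - xint) > 0
    return (slope * a[0] + yint - a[1]) * (slope * b[0] + yint - b[1]) > 0

axis = line_through((0, -1), (0, 1))        # the y-axis
assert not same_side((-2, 0), (3, 0), axis) # opposite sides
assert same_side((1, 5), (4, -2), axis)     # same side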
from pathlib import Path
from itertools import chain, repeat

from pandas import DataFrame
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize

corpus = DataFrame(columns=['is_negative', 'tokens'])
for is_negative, review_path in chain(
    zip(repeat(0), Path('aclImdb/train/pos').iterdir()),
    zip(repeat(1), Path('aclImdb/train/neg').iterdir()),
    zip(repeat(0), Path('aclImdb/test/pos').iterdir()),
    zip(repeat(1), Path('aclImdb/test/neg').iterdir()),
):
    with review_path.open(encoding='UTF-8') as review_file:
        rel_path = str(review_path.relative_to('aclImdb'))  # avoid shadowing the built-in 'file'
        tokens = word_tokenize(BeautifulSoup(review_file.read(), 'html.parser').text)
        corpus.loc[rel_path] = is_negative, tokens
print(len(corpus))
corpus.to_pickle('corpus.pkl')
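# A quick round-trip check on the pickle written above -- minimal sketch,
# assuming corpus.pkl exists in the working directory.
import pandas as pd

corpus = pd.read_pickle('corpus.pkl')
print(corpus.shape)                            # one row per review file
print(corpus['is_negative'].value_counts())    # pos/neg balance
print(corpus['tokens'].str.len().describe())   # tokens per review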
class LearnObject: def __init__(self,FeatureObject,LabelsObject,LabelsObject2='notDefined'): self.FeaturesDF=FeatureObject.FeaturesDF self.LabelsObject=LabelsObject self.LabelsObject2=LabelsObject2 self.Details={'LabelDetails':LabelsObject.LabelingDetails,'stratifiedKFold':FeatureObject.details,'FeatureMethod':FeatureObject.method,'PieceLength':FeatureObject.details['PieceLength']} self.BestFeatures={} self.N=LabelsObject.N self.model='notDefined' class BestFeaturesForLabel(): #class of the best features for certain Labeling method (PatientsVsContols, mentalStatus, PANSS, etc.) def __init__(self,FeatureTypeList,LabelingList,n_features): self.df=DF(np.zeros([len(FeatureTypeList),n_features]),index=MultiIndex.from_tuples(FeatureTypeList),columns=range(n_features)) def add(self,bestNfeatures): #adds a feature to best features list (length n_features) BestFeaturesList=[j for j in bestNfeatures] FeatureTypeList=self.df.index for feature in FeatureTypeList: if feature in BestFeaturesList: isFeature=1 FeatureLoc=BestFeaturesList.index(feature) self.df.loc[feature][FeatureLoc] +=1 """def analyzeFeaturesWeight(BestFeaturesDF,weights,ByLevel=0): #after having n features, this analyzes the wheighted mean of the use in each feature type. df=BestFeaturesDF #N=df.sum().sum() dfSum=df.sum(level=ByLevel) self.Mean=dfSum.sum(axis=1) weights=self.weights#[1.0/(x+1) for x in df.columns] wSum=dfSum.mul(weights) wN=wSum.sum().sum() self.WeightedMean=wSum.sum(axis=1)/wN return WeightedMean""" #TODO -> add analysis according to facial part (according to excel..) #TODO - > add analysis according to learning weights (and not 0.1 : 0.9) def run(self,Model='svc',kernel='linear',is_cross_validation=True, cross_validationMethod='LOO', DecompositionMethod='PCA',decompositionLevel='FeatureType',n_components=30, FeatureSelection='TopExplainedVarianceComponents', n_features=10, isPerm=0,isBetweenSubjects=True,isConcatTwoLabels=False,isSaveCsv=None, isSavePickle=None, isSaveFig=None,isSelectSubFeatures=False,SubFeatures='ExpressionLevel'): # -- TODO : # -- # Greedy selection on features + Other feature selection types... # -- # Make sure featuers are Best only based on train data!!! # -- # Keep a list of n_train, n_test from each Label and scoring (accuracy, f1..) in each cross validation iteration # -- # Plot results summary (see CARS paper for desired results for Ein Gedi Poster 22-1-2015) # -- # remove irelevant data using 'Tracking Success' and consider 'TimeStamps' for feature calculation # -- # add f feature analysis by facial part (see excel) # -- # select best model (svm, otherwise ridge regression) # -- # compare svc results with regerssion results (using LOO and different Params for regression - params for unbalanced data, different kernels, etc.), model evaluation - http://scikit-learn.org/stable/modules/model_evaluation.html) # -- # check how the model weights behave - feature selection analysis # -- # calc model error # -- # divide data to subparts for training and testing - try within/ between subject, and analyze distribution of features when data is divided # -- # LOO - also on bool labels (patients vs controls and mental status bool) # -- # add mental status rank scores (0-4) # -- # make sure p-val returns the right value in 'scores' # -- # run it over random data (permutation test) # -- # continoue here - check regression results-Make sure regression works (not so good).. 
check what happens in svc for G7 (high train R, negative test R) ## init if isSelectSubFeatures: print('Features : ' + SubFeatures) f=self.FeaturesDF.copy() featureNames=self.FeaturesDF.index.names try: f=f.loc[SubFeatures] f.index=MultiIndex.from_product([[SubFeatures],f.index], names=featureNames) except KeyError: f.index=f.index.swaplevel(0,1) f=f.loc[SubFeatures] f.index=MultiIndex.from_product([f.index,[SubFeatures]], names=featureNames) self.FeaturesDF=f.copy() else: SubFeatures='allFeatureTypes' FeatureTypeList=[j for j in tuple(self.FeaturesDF.index)] self.FullResults=DF() # set learning params (cross validation method, and model for learning) isBoolLabel=self.LabelsObject.isBoolLabel isBoolScores=isBoolLabel if DecompositionMethod==None and (FeatureSelection == 'TopExplainedVarianceComponents' or FeatureSelection == 'TopNComponents'): print("ERROR- feature selection method cannot be '"+ FeatureSelection +"' when X is not decomposed") FeatureSelection=raw_input("Choose a different feature selection method ('RFE','f_regression','dPrime','AllFeatures'): ") model, isBoolModel= learningUtils.setModel(Model) selectFeatures =learningUtils.setFeatureSelection(FeatureSelection,n_features) n_components=min(n_features,n_features) #cannot have more components than features. decompositionTitle, decomposeFunction= learningUtils.setDecomposition(DecompositionMethod,n_components,decompositionLevel) isDecompose= decompositionTitle!='noDecomposition' # save learning params self.Learningdetails={'Model':Model,'Kernel':kernel,'CrossVal':cross_validationMethod,'FeatureSelection':FeatureSelection,'Decomposition':decompositionTitle,'LabelBy':self.Details['LabelDetails'].keys()[0],'FeatureMethod':self.Details['FeatureMethod'],'PieceLength':self.Details['PieceLength']} print('\n------------Learning Details------------') print(DF.from_dict(self.Learningdetails,orient='index')) print('\n----' + cross_validationMethod + ' Cross validation Results:----') #define global variables over modules (to be used in myUtils) globalVars.transformMargins=0#lambda x:x globalVars.isBoolLabel=isBoolLabel globalVars.isBoolModel=isBoolModel global trainLabels_all, testLabels_all, TrueLabels,isAddDroppedSubjects trainLabels_all, testLabels_all, TrueLabels,isAddDroppedSubjects=labelUtils.initTrainTestLabels_all(self.LabelsObject) trainLabels_all2, testLabels_all2, TrueLabels2,isAddDroppedSubjects2=labelUtils.initTrainTestLabels_all(self.LabelsObject2) LabelingList=trainLabels_all.columns #['N1'] self.ResultsDF=DF() self.BestFeatures=DF(columns=LabelingList) #dict of BestFeaturesDF according to Labeling methods YpredictedOverAllLabels=pandas.Panel(items=range(len(trainLabels_all)),major_axis=LabelingList,minor_axis=TrueLabels.index) #panel: items=cv_ind, major=labels, minor=#TODO ## Create train and test sets according to LabelBy, repeat learning each time on different Labels from LabelingList isMultivarLabels=False LabelingIndex=enumerate(LabelingList) if isMultivarLabels: LabelingIndex=enumerate([LabelingList]) for label_ind, Labeling in LabelingIndex: """if isPerm: #TODO - fix this to work with continous / bool data try: trainLabels=self.LabelsObject.permedLabelsDF[Labeling] except AttributeError: self.LabelsObject.permLabels() trainLabels=self.LabelsObject.permedLabelsDF[Labeling]""" #set subjects list according to labels and features X,SubjectsList,droppedSubjects,Xdropped=featuresUtils.initX(self.FeaturesDF,trainLabels_all,Labeling) 
X2,SubjectsList2,droppedSubjects2,Xdropped2=featuresUtils.initX(self.FeaturesDF,trainLabels_all2,Labeling,is2=1) #init train and test labels trainLabels, testLabels, LabelRange = labelUtils.initTrainTestLabels(Labeling,SubjectsList,trainLabels_all, testLabels_all) trainLabels2, testLabels2, LabelRange2 = labelUtils.initTrainTestLabels(Labeling,SubjectsList2,trainLabels_all2, testLabels_all2) #make sure only labeled subjects are used for classification X=X.query('subject == '+ str(list(trainLabels.index)) ) X.index.get_level_values(X.index.names[0]) SubjectIndex=list(set(X.index.get_level_values('subject'))) X2=X2.query('subject == '+ str(list(trainLabels2.index)) ) X2.index.get_level_values(X2.index.names[0]) SubjectIndex2=list(set(X2.index.get_level_values('subject'))) #init vars if isBetweenSubjects: cv_param=len(SubjectIndex) self.Learningdetails['CrossValSubjects']='between' isWithinSubjects=False else: isWithinSubjects=True X=X.swaplevel(0,1) PieceIndex=list(set(X.index.get_level_values('Piece_ind'))) cv_param=len(PieceIndex) self.Learningdetails['CrossValSubjects']='within' self.Learningdetails['NumOfFeatures']=n_features try: print('\n**' + Labeling + '**') except TypeError: print('\n*******') print(Labeling) cv, crossValScores= learningUtils.setCrossValidation(cross_validationMethod,cv_param,trainLabels,isWithinSubjects) ## Learning - feature selection for different scoring types, with cross validation - BestFeaturesForLabel=self.BestFeaturesForLabel(FeatureTypeList,LabelingList,n_features) #saves dataframe with best features for each label, for later analysis cv_ind=0 #used for transforming from margins returned from svm to continouse labels (e.g . PANSS) trainScores=DF() test_index=X.index testScores=concat([DF(index=test_index),DF(index=['std_train_err'])]) testScores2=concat([DF(index=testLabels.index),DF(index=['std_train_err'])]) testProbas=DF(index=X.index) testProbas2=DF(index=SubjectIndex) #impt=Imputer(missing_values='NaN', strategy='median', axis=0) globalVars.LabelRange=LabelRange ModelWeights1=DF(columns=range(len(cv)),index=X.columns) Components=pandas.Panel(items=range(len(cv)),major_axis=X.columns,minor_axis=range(n_features)) #todo fix this for 1st and second learning ExplainedVar=DF(columns=range(len(cv))) ModelWeights2=DF(columns=range(len(cv))) bestNfeaturesPanel=Panel(items=LabelingList,minor_axis=range(len(cv)),major_axis=range(n_features)) #bestNfeaturesPanel=Panel(items=LabelingList,major_axis=range(len(cv)),minor_axis=MultiIndex.from_tuples(('a','b'))) for train, test in cv: if not is_cross_validation: train=np.append(train,test) #test=np.append(train,test) self.Learningdetails['CrossVal']='NONE' #if cv_ind>0: # break if isBetweenSubjects: #set X and Y train_subjects=trainLabels.iloc[train].index test_subjects=testLabels.iloc[test].index Xtrain,Xtest, Ytrain, YtrainTrue, Ytest=learningUtils.setXYTrainXYTest(X,Labeling,trainLabels,testLabels,TrueLabels,train_subjects,test_subjects) Xtrain2,Xtest2, Ytrain2, YtrainTrue2, Ytest2=learningUtils.setXYTrainXYTest(X2,Labeling,trainLabels2,testLabels2,TrueLabels2,train_subjects,test_subjects) if isConcatTwoLabels: #used when there is more than one doctor Xtrain=concat([Xtrain,Xtrain2]) Xtest=concat([Xtest,Xtest2]) Ytrain=concat([Ytrain,Ytrain2]) YtrainTrue=concat([YtrainTrue,YtrainTrue2]) Ytest=concat([Ytest,Ytest2]) Xdropped=concat([Xdropped,Xdropped2]) SubjectsList=list(set(SubjectsList).intersection(set(SubjectsList2))) 
droppedSubjects=list(set(droppedSubjects).union(set(droppedSubjects2)).difference(set(SubjectsList)))#diff from SubjectsList to make sure no subjects are both in train and test. #select N best features: Xtrain, Xtest, bestNfeatures, components, explainedVar = learningUtils.decomposeAndSelectBestNfeatures(Xtrain,Xtest,Ytrain,n_features,selectFeatures,decomposeFunction) BestFeaturesForLabel.add(bestNfeatures) #todo - delete this?? bestNfeaturesPanel[Labeling][cv_ind]=bestNfeatures """for feature_ind,feature_name in enumerate(bestNfeatures): try: bestNfeaturesPanel[Labeling][feature_name].loc[cv_ind]=feature_ind except KeyError: bestNfeaturesPanel[Labeling].columns=bestNfeaturesPanel[Labeling].columns.append(feature_name)#continue here!! use bestNfeaturesPanel[Labeling][feature_name].loc[cv_ind]=feature_ind [bestNfeatures].iloc[cv_ind]=range(len(bestNfeatures))""" #train 1 TrainModel=model TrainModel.fit(Xtrain.sort_index(),Ytrain.T.sort_index()) """try: #Components[cv_ind]=components.T #ExplainedVar[cv_ind]=explainedVar isDecompose=True""" if cv_ind==0: ModelWeights1=DF(columns=range(len(cv)),index=range(len(bestNfeatures))) ModelWeights1[cv_ind]=TrainModel.coef_.flatten() #get ROC scores without cross validation: #train 2 if isBoolLabel: PiecePrediction_train=DF(TrainModel.predict_proba(Xtrain).T[1],index=Xtrain.index,columns=['prediction']) TrainModel2=svm.SVC(kernel='linear', probability=True,class_weight={0:1,1:1}) else: PiecePrediction_train=DF(TrainModel.decision_function(Xtrain),index=Xtrain.index,columns=['prediction']) TrainModel2=linear_model.LinearRegression() Xtrain2, Ytrain2, YtrainTrue2=learningUtils.getX2Y2(Xtrain,Ytrain,YtrainTrue,PiecePrediction_train, isBoolLabel) TrainModel2.fit(Xtrain2, Ytrain2) if cv_ind==0: ModelWeights2=DF(columns=range(len(cv)),index= Xtrain2.columns) ModelWeights2[cv_ind]=TrainModel2.coef_.flatten() #test 1 if isAddDroppedSubjects: #take test subjects from cv + subjects that were dropped for labeling used for test if isDecompose: dXdropped=DF(decomposeFunc(Xdropped).values,index=Xdropped.index) XtestDropped=dXdropped[bestNfeatures] YtestDropped=Series(XtestDropped.copy().icol(0)) #YTrueDropped=Series(Xdropped.copy().icol(0)) for subject in droppedSubjects: YtestDropped[subject]=testLabels_all[Labeling].loc[subject] #YTrueAll.loc[subject]=TrueLabels[Labeling].loc[subject] Ytest=concat([Ytest,YtestDropped]).sort_index() Xtest=concat([Xtest,XtestDropped]).sort_index() if isPerm: #TODO- Check this!! Ytest=y_perms.loc[Ytest.index] Xtest=Xtest.fillna(0.) 
elif isWithinSubjects: #train 1 train_pieces=PieceIndex[train] test_pieces=PieceIndex[test] #TODO - make sure that if test/train> piece index, it ignores it and repeate the process XtrainAllFeatures=X.query('Piece_ind == '+ str(list(train_pieces))) Ytrain=Series(index=X.index) Ytest=Series(index=X.index) YtrainTrue=Series(index=X.index) for subject in PieceIndex: for piece in train_pieces: Ytrain.loc[piece].loc[subject]=trainLabels[subject] YtrainTrue.loc[piece].loc[subject]=TrueLabels[Labeling].loc[subject] Ytest.loc[piece].loc[subject]=testLabels[subject] Ytrain=Ytrain.dropna() YtrainTrue=YtrainTrue.dropna() for subject in test_subjects: Ytest.loc[piece].loc[subject]=testLabels[subject] #train scores 1 if cv_ind==0: trainScores,YtrainPredicted=learningUtils.getTrainScores(Ytrain,Xtrain,YtrainTrue,TrainModel) plt.figure(1) if len(LabelingList)>1: plt.subplot(round(len(LabelingList)/2),2,label_ind+1) if isBoolLabel: testScores,testProbas=learningUtils.getTestScores(Ytest,Xtest,TrainModel) else: testScores[cv_ind],testProbas=learningUtils.getTestScores(Ytest,Xtest,TrainModel) plt.title(Labeling,fontsize=10) else: plt.figure(3) new_trainScores,YtrainPredicted=learningUtils.getTrainScores(Ytrain,Xtrain,YtrainTrue,TrainModel) trainScores=concat([trainScores,new_trainScores],axis=1) #test 1 testScores[cv_ind],testProbas_new=learningUtils.getTestScores(Ytest,Xtest,TrainModel) testProbas=concat([testProbas,testProbas_new]) #train2 if isBoolLabel: PiecePrediction_test=DF(TrainModel.predict_proba(Xtest).T[1],index=Xtest.index,columns=['prediction']) else: PiecePrediction_test=DF(TrainModel.decision_function(Xtest),index=Xtest.index,columns=['prediction']) Xtest2, Ytest2 , YtestTrue2 =learningUtils.getX2Y2(Xtest,Ytest,Ytest,PiecePrediction_test,isBoolLabel) if cv_ind==0: trainScores2,YtrainPredicted2=learningUtils.getTrainScores(Ytrain2,Xtrain2,YtrainTrue2,TrainModel2) YpredictedOverAllLabels[cv_ind].loc[Labeling]=YtrainPredicted2 #plt.figure(1) #if len(LabelingList)>1: #plt.subplot(round(len(LabelingList)/2),2,label_ind+1) #test2 if isBoolLabel: testScores2,testProbas2=learningUtils.getTestScores(Ytest2,Xtest2,TrainModel2) else: testScores2[cv_ind],testProbas2=learningUtils.getTestScores(Ytest2,Xtest2,TrainModel2) #plt.title(Labeling,fontsize=10) else: new_trainScores2,YtrainPredicted2=learningUtils.getTrainScores(Ytrain2,Xtrain2,YtrainTrue2,TrainModel2) YpredictedOverAllLabels[cv_ind].loc[Labeling]=YtrainPredicted2 trainScores2=concat([trainScores2,new_trainScores2],axis=1) if len(Xtest2)>0: # if there is more than one segment for subject testScores2[cv_ind],testProbas2_new=learningUtils.getTestScores(Ytest2,Xtest2,TrainModel2) testProbas2=concat([testProbas2,testProbas2_new]) cv_ind+=1 #crossValScores=crossValScores.append(CVscoresDF,ignore_index=True) #information about entire train test data. fig2=plt.figure(2) if len(LabelingList)>1: plt.subplot(round(len(LabelingList)/2),2,label_ind+1) #if isAddDroppedSubjects: # testLabelsSummary=testLabels_all[Labeling].loc[AllSubjects] # else: # testLabelsSummary=testLabels scoresSummary,rocDF = learningUtils.getScoresSummary(trainScores2,testScores2,testProbas2,TrueLabels[Labeling]) # reset global vars globalVars.fitYscale='notDefined' globalVars.beta=DF() plt.title(Labeling,fontsize=10) plt.xlabel('Ytrue',fontsize=8) plt.ylabel('Ypredicted',fontsize=8) plt.tick_params(labelsize=6) #print(crossValScores.T) scores=scoresSummary.fillna(0.) 
#analyze feature weights ModelWeights1=ModelWeights1.dropna(how='all') WeightedFeatures1_index0=analysisUtils.getFeaturesWeights(0,bestNfeaturesPanel[Labeling],ModelWeights1) #FeatureAnalysisIndex=0 for featureType, 1= au's (if not decomposed) or component rank (if decomposed) WeightedFeatures1_index1=analysisUtils.getFeaturesWeights(1,bestNfeaturesPanel[Labeling],ModelWeights1) WeightedFeatures1=concat([DF(index=['-------(A) Index0-------']),WeightedFeatures1_index0,DF(index=['-------(B) Index1 -------']),WeightedFeatures1_index1]) WeightedFeatures2=DF(ModelWeights2.mean(axis=1)).fillna(0) #WeightedFeatures2=DF([ModelWeights2.mean(axis=1),ModelWeights2.std(axis=1)],index=['mean','std']).T.fillna(0) BestFeatures=concat([DF(index=['------------- Learning 1 -------------']),WeightedFeatures1,DF(index=['------------- Learning 2 -------------']),WeightedFeatures2]) self.BestFeatures[Labeling]=Series(BestFeatures.values.flatten(),index=BestFeatures.index) #analyze decomposition if isDecompose: Components_mean = Components.mean(axis=0) Components_std = Components.std(axis=0) normalize=lambda df:DF(StandardScaler().fit_transform(df.T).T,index=df.index,columns=df.columns) """#componentsMeanFeatureType=normalize(Components.mean(axis=1,level='FeatureType')) #componentsMeanFeatureTypeABS=normalize(componentsDF.abs().mean(axis=1,level='FeatureType')) #componentsMeanFSsignal=normalize(componentsDF.mean(axis=1,level='fs-signal')) #componentsMeanFSsignalABS=normalize(componentsDF.abs().mean(axis=1,level='fs-signal')) #ExplainedVar_mean = DF(ExplainedVar.mean(axis=1)).T#todo- check! #ExplainedVar_mean.index=['ExplainedVar_mean'] #ExplainedVar_std = DF(ExplainedVar.std(axis=1)).T#todo- check! #ExplainedVar_std.index=['ExplainedVar_std'] #componentsToCSV=concat([DF(index='---meanFeatureType----'),componentsMeanFeatureType,DF(index='---meanFeatureType - abs ----'),componentsMeanFeatureTypeABS,DF(index='---mean fs-signal ----'),componentsMeanFSsignal,DF(index='---mean fs-signal - abs ----'),componentsMeanFSsignalABS]) try: self.LabelComponents[Labeling]=concat([DF(index=['---components mean---']),Components_mean,ExplainedVar_mean,DF(index=['---components std over cross validation---']),Components_std,ExplainedVar_std]) except AttributeError: self.LabelComponents=dict.fromkeys(LabelingList) self.LabelComponents[Labeling]=concat([DF(index=['---components mean---']),Components_mean,ExplainedVar_mean,DF(index=['---components std over cross validation---']),Components_std,ExplainedVar_std])""" """print(Components_mean) print(ExplainedVar_mean) print(WeightedFeatures1)""" #BestFeaturesForLabel.analyze(ByLevel=0) #TODO change to regression coeff LabelFullResults=concat([DF(index=[Labeling]),scores]) self.FullResults=concat([self.FullResults,LabelFullResults]) self.ResultsDF=concat([self.ResultsDF,DF(scores[0],columns=[Labeling])],axis=1) #self.BestFeatures[Labeling]=BestFeaturesForLabel.WeightedMean #plt.savefig('C:\\Users\\taliat01\\Desktop\\TALIA\\Code-Python\\Results\\'+Labeling+'png') testScores3=pandas.Panel(items=range(len(X2.index))) #for each cv score... 
FullSubjectsList=YpredictedOverAllLabels[0].columns YdroppNans=YpredictedOverAllLabels.dropna(axis=0,how='all') YdroppNans=YdroppNans.dropna(axis=1,how='all') YpredictedOverAllLabels=YdroppNans.dropna(axis=2,how='all') notNans_cv_ind=YpredictedOverAllLabels.items notNans_trainSubjects=YpredictedOverAllLabels.minor_axis notNans_LabelsList=YpredictedOverAllLabels.major_axis notNans_TrueLabels=TrueLabels.T[notNans_trainSubjects].loc[notNans_LabelsList] cv_ind=0 for train, test in cv: if cv_ind in notNans_cv_ind: print(test) train=list(set(FullSubjectsList[train]).intersection(set(notNans_trainSubjects))) test=list(set(FullSubjectsList[test]).intersection(set(notNans_trainSubjects))) if len(train)>0 and len(test)>0: AllLabelsYTrainPredicted=YpredictedOverAllLabels[cv_ind][train] AllLabelsYTrainPredicted=AllLabelsYTrainPredicted.fillna(0) AllLabelsYTrainTrue=notNans_TrueLabels[train] AllLabelsYTestPredicted=YpredictedOverAllLabels[cv_ind][test] AllLabelsYTestTrue=notNans_TrueLabels[test] pseudoInverse_AllLabelsYTrainTrue=DF(np.linalg.pinv(AllLabelsYTrainTrue),columns=AllLabelsYTrainTrue.index,index=AllLabelsYTrainTrue.columns) global AllLabelsTransformationMatrix AllLabelsTransformationMatrix=DF(AllLabelsYTrainPredicted.dot(pseudoInverse_AllLabelsYTrainTrue),columns=pseudoInverse_AllLabelsYTrainTrue.columns)#change to real code!! TrainModel3=lambda y: y.T.dot(AllLabelsTransformationMatrix) #testscores3[cv_ind]=learningUtils.getTestScores(AllLabelsYTrainTrue,AllLabelsYTrainPredicted,TrainModel3) cv_ind+=1 self.BestNFeaturesAll=bestNfeaturesPanel self.ResultsDF=self.ResultsDF.fillna(0.) ## Print and save results print('\n') print(self.ResultsDF) print('\n') D=self.Learningdetails savePath=resultsPath+'\\'+D['Model']+'_'+D['CrossVal']+'_LabelBy'+D['LabelBy']+ '_FSelection'+FeatureSelection+'_Decompostion'+D['Decomposition']+'PieceSize'+D['PieceLength']+'_'+SubFeatures if isPerm: savePath=savePath+'_PERMStest' saveName=savePath+'\\'+str(n_features)+'_features' self.Learningdetails['saveDir']=savePath dir=os.path.dirname(saveName) if not os.path.exists(dir): os.makedirs(dir) if isSavePickle is None: isSavePickle=int(raw_input('Save Results to pickle? ')) if isSaveCsv is None: isSaveCsv= int(raw_input('save Results to csv? ')) if isSaveFig is None: isSaveFig=int(raw_input('save Results to figure? ')) if isSavePickle: self.ResultsDF.to_pickle(saveName+'.pickle') self.BestFeatures.to_pickle(saveName+'_bestFeatures.pickle') if isSaveCsv: DetailsDF=DF.from_dict(self.Learningdetails,orient='index') ResultsCSV=concat([self.ResultsDF,DF(index=['-------Label Details-------']),self.N,DF(index=['-------Learning Details-------']),DetailsDF,DF(index=['-------Selected Features Analysis------']),self.BestFeatures]) ResultsCSV.to_csv(saveName+'.csv') if isBoolLabel: ROCfig=learningUtils.save_plotROC(rocDF,isSave=True,saveName=saveName,title=SubFeatures) if isSaveCsv or isSavePickle: print('successfully saved as:\n' + saveName) if isSaveFig: plt.figure(1) plt.savefig(saveName + 'Train.png') plt.figure(2) plt.savefig(saveName + 'Test.png') plt.close() plt.close()
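# The cross-label step above estimates a linear map from true label vectors to
# predicted ones via np.linalg.pinv. A minimal numpy sketch of that least-squares
# idea, with made-up shapes (labels x subjects) standing in for the panels used above:
import numpy as np

rng = np.random.default_rng(0)
n_labels, n_subjects = 4, 30
Y_true = rng.normal(size=(n_labels, n_subjects))           # true labels
A = rng.normal(size=(n_labels, n_labels))                  # hidden linear relation
Y_pred = A @ Y_true + 0.01 * rng.normal(size=(n_labels, n_subjects))

# T solves Y_pred ~= T @ Y_true in the least-squares sense, analogous to
# AllLabelsTransformationMatrix = YTrainPredicted . pinv(YTrainTrue):
T = Y_pred @ np.linalg.pinv(Y_true)
print(np.allclose(T, A, atol=0.1))                         # recovers A up to noise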
theta = 0.00001
softmax = False
skip = True
if not skip:
    game = pg.PredatorGame((0,0), (5,5), (11,11))
    notused, rmse['Sarsa'] = sarsaresults(samples, episodes, discount, epsilon, alpha, initValue, softmax, theta)
    notused, rmse['Q-learning'] = qlearningresults(samples, episodes, discount, epsilon, alpha, initValue, softmax, theta)
    notused, rmse['Q-learning with SoftMax'] = qlearningsoftmaxresults(samples, episodes, discount, tau, alpha, initValue, softmax, theta)
    notused, notused, rmse['On Policy Monte Carlo'] = montecarloOnPolicyresults(samples, episodes, discount, epsilon, 0, theta)
    notused, notused, notused, rmse['Off Policy Monte Carlo'] = montecarloOffPolicyresults(samples, episodes, discount, epsilon, 0, theta)
    rmse['episode'] = range(0, episodes)
    dataF = DataFrame(rmse)
    dataF.to_pickle('data/rmse' + str(episodes))
else:
    dataF = pd.read_pickle('data/rmse' + str(episodes))

episodeData = pd.melt(dataF, id_vars=['episode'], var_name='Learning algorithm')

# for key, value in rmse.items():
#     plt.figure()
#     plt.plot(value, 'b')
#     plt.xlabel('Episodes')
#     plt.ylabel('Root Mean Square Error (' + key + ')')
#     plt.legend()
#     plt.show()

p = ggplot(episodeData, aes('episode', 'value', color='Learning algorithm')) +\
    geom_line() +\
N = 1
if not skip:
    for i in range(N):
        print(i)
        averageQ, predwinsratioQ = getIndependentQLearning()
        averageS, predwinsratioS = getIndependentSarsa()
    data['IndependentQLearning'] = predwinsratioQ
    data['IndependentSarsa'] = predwinsratioS
    data['episode'] = range(1, episodes + 1)
    dataF = DataFrame(data)
    dataF.to_pickle('data/comparison')
else:
    dataF = pd.read_pickle('data/comparison')

for a in alg:
    dataF[a] = scipy.ndimage.filters.gaussian_filter(dataF[a], 5 * (episodes / 4000), 0)

episodeData = pd.melt(dataF, id_vars=['episode'], var_name='Algorithm')

p = ggplot(episodeData, aes('episode', 'value', color='Algorithm')) +\
    geom_line() +\
    theme_bw() + theme() + ylab("Win ratio") + xlab("Episodes")
print(p)
ggsave(p, "plots/comparison.png")
ggsave(p, "plots/comparison.pdf")
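# The gaussian_filter call above smooths each win-ratio column with a sigma that
# scales with run length (5 * episodes/4000). A self-contained sketch on toy data:
import numpy as np
from scipy.ndimage import gaussian_filter1d

episodes = 4000
noisy = 0.5 + 0.4 * np.tanh(np.linspace(-2, 2, episodes)) \
        + 0.05 * np.random.randn(episodes)     # toy learning curve
smooth = gaussian_filter1d(noisy, sigma=5 * (episodes / 4000))
print(noisy.std(), smooth.std())               # smoothing reduces variance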
class ZopeRequestPlotter(object): def __init__(self, requests): self.requests = requests self.df = DataFrame(self.requests) self.df.to_pickle('data_frame.pickle') #Normalize Timestamp to hours min_timestamp_row = self.df.ix[self.df['timestamp'].idxmin()] min_timestamp_value = min_timestamp_row['timestamp'] self.min_timestamp = min_timestamp_row['timestamp_text'] self.max_timestamp = self.df.ix[self.df['timestamp'].idxmax()]['timestamp_text'] self.df['timestamp'] = (self.df['timestamp'] - min_timestamp_value) / 3600 def plot_call_summary(self): """ """ # Gets the most expensive 1000 calls top_n_expensive_calls = self.df.sort_index(by='elapsed', ascending=False)[:1000] calls = [ call for call, group in top_n_expensive_calls.groupby('call') ] data = [] for call in calls: call_info = self.df[self.df['call']==call]['elapsed'] call_data = {} call_data['call'] = call call_data['mean'] = call_info.mean() call_data['count'] = call_info.count() call_data['max'] = call_info.max() call_data['min'] = call_info.min() data.append(call_data) call_data_df = DataFrame(data) fig = plt.figure() fig.suptitle('{0} - {1}'.format(self.min_timestamp, self.max_timestamp), fontsize=16) ax = fig.add_subplot(3, 1, 1) #ax.get_xaxis().set_visible(False) self.df[self.df['elapsed']>=5].sort_index(by='timestamp').plot(title='Response time > 5 seconds', ax=ax, x='timestamp', y='elapsed') ax = fig.add_subplot(3, 1, 2) call_data_df[['call', 'min', 'max', 'mean']].set_index('call').plot(title='Response Time', ax=ax, kind='barh') ax = fig.add_subplot(3, 1, 3) call_data_df.plot(title='Call Count', ax=ax, x='call', y='count', kind='barh') fig.show() self.plot_calls_distribution(calls) def plot_calls_distribution(self, calls_to_plot): # plots call distribution for 2 calls #calls_to_plot = [ 'EventsRouter.query', 'EventsRouter.queryArchive', 'MessagingRouter.setBrowserState' ] #calls_to_plot = [ 'IncidentManagementRouter.runNotification', 'IncidentManagementRouter.associateIncident', 'EventsRouter.queryArchive' ] fig = plt.figure() fig.suptitle('{0} - {1}'.format(self.min_timestamp, self.max_timestamp), fontsize=16) graph_rows = 4 graph_cols = len(calls_to_plot)/graph_rows if len(calls_to_plot)%graph_rows != 0: graph_cols = graph_cols + 1 plot_n = 1 for call in calls_to_plot: data = self.df[self.df['call']==call] ax = fig.add_subplot(graph_rows, graph_cols, plot_n) data.plot(title=call, ax=ax, x='timestamp', y='elapsed', style='.', fontsize=10) plot_n = plot_n + 1 fig.show() def plot_user_call_data(self, fmean=False, fcount=False): function = None if fmean: function = mean else: function = count if function: fig = plt.figure() # call analysis (mean) graph_rows = 3 graph_cols = 1 plot_n = 1 # top 3 users data top_users = users_call_count.index[:3] for top_user in top_users: #top_user = users_call_count.index[0] top_user_calls = self.df[ self.df.user == top_user ] ax = fig.add_subplot(graph_rows, graph_cols, plot_n) top_user_calls_count = top_user_calls.groupby('call').call.function() top_user_calls_count.sort() top_user_calls_count.plot(title='Call count for {0}'.format(top_user), ax = ax, kind='barh') plot_n = plot_n + 1 fig.show() def plot_user_data(self): # call analysis per user (count) users_call_count = self.df.groupby('user')['timestamp'].count() users_call_count.sort(ascending=False) count_fig = plt.figure() count_fig.suptitle('{0} - {1}'.format(self.min_timestamp, self.max_timestamp), fontsize=16) graph_rows = 4 graph_cols = 1 plot_n = 1 ax = count_fig.add_subplot(graph_rows, graph_cols, plot_n) 
users_call_count[:10].plot(title='Top 10 users. Number of calls', ax = ax, kind='barh') plot_n = plot_n + 1 top_users = users_call_count.index[:3] for top_user in top_users: top_user_calls = self.df[ self.df.user == top_user ] ax = count_fig.add_subplot(graph_rows, graph_cols, plot_n) top_user_calls_count = top_user_calls.groupby('call').call.count() top_user_calls_count.sort() top_user_calls_count.plot(title='Call count for {0}'.format(top_user), ax = ax, kind='barh') plot_n = plot_n + 1 count_fig.show() # call analysis per user (mean) mean_fig = plt.figure() mean_fig.suptitle('{0} - {1}'.format(self.min_timestamp, self.max_timestamp), fontsize=16) users_call_mean = self.df.groupby('user')['elapsed'].mean() users_call_mean.sort(ascending=False) graph_rows = 4 graph_cols = 1 plot_n = 1 ax = mean_fig.add_subplot(graph_rows, graph_cols, plot_n) users_call_mean[:10].plot(title='Top 10 users: mean elapsed time per call', ax = ax, kind='barh') plot_n = plot_n + 1 # top 3 users data top_users = users_call_mean.index[:3] for top_user in top_users: top_user_calls = self.df[ self.df.user == top_user ] ax = mean_fig.add_subplot(graph_rows, graph_cols, plot_n) top_user_calls_count = top_user_calls.groupby('call').elapsed.mean() top_user_calls_count.sort() top_user_calls_count.plot(title='Call mean for {0}'.format(top_user), ax = ax, kind='barh') plot_n = plot_n + 1 mean_fig.show() def plot_archive_calls(self): archive_calls = self.df[ self.df.call == 'EventsRouter.queryArchive' ] archive_calls_count = archive_calls.groupby('user')['elapsed'].count() archive_calls_count.sort(ascending=False) archive_fig = plt.figure() archive_fig.suptitle('{0} - {1}'.format(self.min_timestamp, self.max_timestamp), fontsize=16) graph_rows = 2 graph_cols = 1 plot_n = 1 # Archive call count per user ax = archive_fig.add_subplot(graph_rows, graph_cols, plot_n) archive_calls_count.plot(title='Archive call count per user', ax = ax, kind='barh') plot_n = plot_n + 1 ''' # Archive call mean per user ax = archive_fig.add_subplot(graph_rows, graph_cols, plot_n) archive_calls_mean = archive_calls.groupby('user')['elapsed'].mean() archive_calls_mean.plot(title='Archive call mean elapsed time per user', ax = ax, kind='barh') plot_n = plot_n + 1 ''' # Archive call distribution for user with more calls to archive user_pegging_archive = archive_calls_count.index[0] pegger_df = archive_calls[archive_calls.user=='zec'][['elapsed', 'timestamp']] pegger_df.sort_index(by='timestamp') ax = archive_fig.add_subplot(graph_rows, graph_cols, plot_n) pegger_df.plot(title='Top archive user call distribution vs elapsed time', ax=ax, x='timestamp', y='elapsed', style='.', fontsize=10) archive_fig.show() def plot_zec_user_calls(self): zec_calls = self.df[ self.df.user == 'zec' ] zec_calls_count = zec_calls.groupby('call')['elapsed'].count() zec_calls_count.sort(ascending=False) # Call count fig = plt.figure() fig.suptitle('{0} - {1}'.format(self.min_timestamp, self.max_timestamp), fontsize=16) ax = fig.add_subplot(2, 1, 1) zec_calls_count.plot(title='Zec User calls', ax = ax, kind='barh') # Call Distribution data_to_plot = DataFrame() for call, group in zec_calls.groupby('call'): data_to_plot = data_to_plot.append(group[['call','elapsed','timestamp']]) ax = fig.add_subplot(2, 1, 2) data_to_plot.plot(title='Zec user call distribution vs elapsed time', ax=ax, x='timestamp', y='elapsed', style='.', fontsize=10) fig.show() def plot_requests_info(self): self.plot_call_summary() self.plot_user_data() #self.plot_archive_calls() self.plot_zec_user_calls()
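# Because __init__ persists the raw requests to data_frame.pickle, later sessions
# can skip log parsing. A minimal sketch of reloading and summarizing it, assuming
# the pickle written above exists:
import pandas as pd

df = pd.read_pickle('data_frame.pickle')
summary = (df.groupby('call')['elapsed']
             .agg(['count', 'mean', 'min', 'max'])   # same stats as plot_call_summary
             .sort_values('mean', ascending=False))
print(summary.head(10))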
        for epsilon in parametersFor(category):
            results[epsilon] = getResults(samples, episodes, discount, epsilon, decay)
    elif category == 'decay':
        for decay in parametersFor(category):
            results[decay] = getResults(samples, episodes, discount, epsilon, decay)
    elif category == 'discount':
        for discount in parametersFor(category):
            print(discount)
            results[discount] = getResults(samples, episodes, discount, epsilon, decay)
    else:
        sys.exit()
    print(results)
    results['episode'] = range(1, episodes + 1)
    dataF = DataFrame(results)
    dataF.to_pickle('data/' + str(episodes) + category + "small")
    #pickle.dump(randomReturnValues, open('data/values'+str(episodes)+category+str(softmax), 'w+'))
else:
    dataF = pd.read_pickle('data/' + str(episodes) + category)
    #randomReturnValues = pickle.load(open('data/values'+str(episodes)+category+str(softmax), 'r+'))

print(dataF)

if smoothing:
    for par in parametersFor(category):
        dataF[par] = scipy.ndimage.filters.gaussian_filter(dataF[par], 5 * (episodes / 4000), 0)

episodeData = pd.melt(dataF, id_vars=['episode'], var_name=category)
ylabel = "Steps"
p = ggplot(episodeData, aes('episode', 'value', color=category)) +\
    geom_line(alpha=0.6) +\
if category == "epsilon": for epsilon in [0.05, 0.1, 0.3, 0.9]: results[epsilon], avgRMS = getResults(samples, episodes, discount, epsilon, alpha, initValue, softmax) elif category == "alpha": for alpha in [0.1, 0.2, 0.3, 0.6, 1]: results[alpha], avgRMS = getResults(samples, episodes, discount, epsilon, alpha, initValue, softmax) elif category == "discount": for discount in [0.1, 0.4, 0.7, 0.8, 0.9]: print discount results[discount], avgRMS = getResults(samples, episodes, discount, epsilon, alpha, initValue, softmax) else: sys.exit() print results results["episode"] = range(0, episodes) dataF = DataFrame(results) dataF.to_pickle("data/" + category + str(softmax)) else: dataF = pd.read_pickle("data/" + category + str(softmax)) episodeData = pd.melt(dataF, id_vars=["episode"], var_name=category) p = ( ggplot(episodeData, aes("episode", "value", color=category)) + geom_line() + theme_bw() + theme() + ylab("Steps") + xlab("Episodes") + ylim(0, 60) )
"avgICDMPaperCount", "maxICDMPaperCount", "primaryICDMPaperCount", "maxConnectivity", "maxPageRank", "maxDegCentrality", "numAuthors" ] for i in toLog: il = i+"Log" df[il] = df[i] df.loc[df[il] == 0, il] = .1 df.loc[:, il] = np.log(df.loc[:, il]) df.to_pickle("savedFrames/predictionFeatures/paperTable") print "Constructing Review Table" i = 0 reviewTable = [] for id, review in loader.reviews.iteritems(): paper = review.paper reviewer = review.user reviewTable.append({ "paperId": paper.id, "userId": reviewer.id, "rating": review.overallRating,
def process_matebook_data(directory, paramlist, storage_location):
    vidname = parse_screen_filename(directory)
    for filename in find_files(directory, 'track.tsv'):
        vidpath, flyID = parse_filename(filename)
        tag = vidname + "_" + flyID
        if not os.path.exists(os.path.join(storage_location, tag + '_arena.pickle')):
            fi = pd.read_table(filename, sep='\t', header=[0,1], skiprows=[2,3])
            tempdf = DataFrame(index=fi.index)
            if fi['Unnamed: 8_level_0', 'isMissegmented'].mean() >= 0.2:
                print("arena dropped for poor quality: ", tag)
                continue
            elif fi['Unnamed: 8_level_0', 'isMissegmented'].mean() == 0.0:
                print("arena dropped because quality = 1: ", tag)
                continue
            elif len(set(fi['Unnamed: 3_level_0', 'courtship'])) <= 1:
                print("arena dropped because courtship = nan: ", tag)
                continue
            else:
                for j in paramlist:
                    tempdf[j[1]] = fi[j[0], j[1]]
                    if 'movedAbs_u' in j:
                        tempdf[j[1]] = tempdf[j[1]] * FPS
                tempdf['Time'] = tempdf.index / FPS
                time_ID = vidpath.split('_', 1)[-1].split('.', 1)[0]
                tempdf = merge_jvision_data(tempdf.reset_index(), time_ID)
                tempdf.to_pickle(os.path.join(storage_location, tag + '_arena.pickle'))
                print(".....", tag, " processed to pickling.")
    return
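# process_matebook_data writes one pickle per arena, so downstream analysis
# usually reassembles them. A minimal sketch (load_all_arenas is an illustrative
# helper, assuming the _arena.pickle files written above):
import glob
import os
import pandas as pd

def load_all_arenas(storage_location):
    # Concatenate every per-arena pickle, keeping the arena tag as a column.
    frames = []
    for path in sorted(glob.glob(os.path.join(storage_location, '*_arena.pickle'))):
        df = pd.read_pickle(path)
        df['arena'] = os.path.basename(path).replace('_arena.pickle', '')
        frames.append(df)
    return pd.concat(frames, ignore_index=True)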