Example #1
def clean_file(filename):
    print("{0}/{1}: {2}".format(
        files.index(filename) + 1, len(files), filename))

    with open(filename, 'r') as f:
        lines = f.readlines()

    all_data = []

    for line in lines:
        data = json.loads(line)
        data_clean = {}
        data_clean['c'] = data['countrycode']
        data_clean['w'] = data['word']
        data_clean['s_id'] = int(data['key_id'])
        data_clean['r'] = data['recognized']
        data_clean['d'] = []
        for drawing in data['drawing']:
            drawing = np.array(drawing)
            drawing[:, 1:] = drawing[:, 1:] - drawing[:, :-1]
            data_clean['d'].append(drawing.tolist())

        all_data.append(data_clean)

    df = DataFrame(all_data)
    new_filename = (os.path.split(filename)[1].split('.')[0] + '.p')
    new_filename = new_filename.replace(' ', '_').lower()
    df.to_pickle(os.path.join(DATA_DIR, 'clean', new_filename))
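
The stroke arrays get delta-encoded before pickling (each point after the first is stored as an offset from its predecessor), so anything reading these files back has to undo that. A small sketch of the inverse transform, assuming numpy/pandas imports and a hypothetical cleaned-file name:

import numpy as np
import pandas as pd

df = pd.read_pickle('some_word.p')  # hypothetical path to one of the cleaned files
strokes = [np.array(s) for s in df.loc[0, 'd']]
# a cumulative sum along the point axis undoes the delta encoding above
restored = [np.cumsum(s, axis=1).tolist() for s in strokes]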
Example #2
 def load(data_frame: DataFrame, file_path: str) -> None:
     """Load (save) the data to the file system.
     :param data_frame: DataFrame to write.
     :param file_path: File to write to.
     """
     os.makedirs(os.path.dirname(file_path), exist_ok=True)
     data_frame.to_pickle(file_path, compression='gzip')
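
Because load() writes with compression='gzip' while file_path carries no .gz suffix, pandas cannot infer the codec when reading the file back; a matching read, under that assumption, would be:

import pandas as pd

data_frame = pd.read_pickle(file_path, compression='gzip')  # the extension alone would not reveal gzip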
Example #3
def save_weather_df(weather_df: pd.DataFrame, output_proc_file: str = None):
    if output_proc_file is None:
        output_proc_file = 'weather_data.pkl'
    path_to_output = os.path.join(PROC_DATA_DIR, output_proc_file)
    logging.info(
        "saving weather dataframe as pickle in {f}".format(f=output_proc_file))
    weather_df.to_pickle(path_to_output)
Example #4
def _write_df(df: pd.DataFrame, fpath: str, **kwargs):
    print("Writing dataframe '{}'".format(fpath))
    ext = os.path.splitext(fpath)[-1]
    if ext == ".h5":
        info = create_file_info(fpath)
        store = pd.HDFStore(fpath)

        try:
            store.put(HDF_NAMESPACE,
                      df,
                      format='table',
                      data_columns=list(df.columns))
        except IndexError:
            print(
                f"WARNING: Somehow this file could not be saved in a clean way. Trying dirty way."
            )
            store.put(HDF_NAMESPACE, df, format='table')
        try:
            store.get_storer(HDF_NAMESPACE).attrs.metadata = info
        except KeyError:
            print(f"WARNING: You might write empty data to {fpath}.")
        store.close()
        return
    if ext == ".csv":
        df.to_csv(fpath, **kwargs)
        return
    if ext == ".pickle":
        df.to_pickle(fpath, **kwargs)
        return
    if ext == "":
        raise Exception("No file extension was provided.")
    raise Exception("No writer for: " + ext)
Example #5
    def test_to_csv_with_dst_transitions_with_pickle(self):
        # GH11619
        idx = date_range("2015-01-01",
                         "2015-12-31",
                         freq="H",
                         tz="Europe/Paris")
        idx = idx._with_freq(None)  # freq does not round-trip
        idx._data._freq = None  # otherwise there is trouble on unpickle
        df = DataFrame({"values": 1, "idx": idx}, index=idx)
        with tm.ensure_clean("csv_date_format_with_dst") as path:
            df.to_csv(path, index=True)
            result = read_csv(path, index_col=0)
            result.index = to_datetime(result.index,
                                       utc=True).tz_convert("Europe/Paris")
            result["idx"] = to_datetime(
                result["idx"], utc=True).astype("datetime64[ns, Europe/Paris]")
            tm.assert_frame_equal(result, df)

        # assert working
        df.astype(str)

        with tm.ensure_clean("csv_date_format_with_dst") as path:
            df.to_pickle(path)
            result = pd.read_pickle(path)
            tm.assert_frame_equal(result, df)
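
The pickle half of the test works because to_pickle/read_pickle round-trip the tz-aware dtypes exactly, with none of the reconversion the CSV path needs; a condensed sketch of that contrast, assuming only pandas:

import pandas as pd

idx = pd.date_range("2015-01-01", periods=3, freq="H", tz="Europe/Paris")
df = pd.DataFrame({"values": 1, "idx": idx}, index=idx)
df.to_pickle("roundtrip.pkl")
back = pd.read_pickle("roundtrip.pkl")
assert back.index.equals(df.index)            # tz-aware index preserved as-is
assert back["idx"].dtype == df["idx"].dtype   # datetime64[ns, Europe/Paris]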
Example #6
 def upload_dataframe(self, df: pd.DataFrame, blob_path:str):
     temp_path = 'df.pkl'
     try:
         df.to_pickle('df.pkl')
         self.upload_blob(blob_path, temp_path)
     finally:
         os.remove(temp_path)
Example #7
 def __to_pkl(data: pd.DataFrame, path_to_pkl_base: str,
              path_to_pkl_augmented: str, pid: str):
     data.to_pickle(os.path.join(path_to_pkl_base, pid + ".pkl"))
     for i, chunk in enumerate(np.array_split(data, 4)):
         chunk.to_pickle(
             os.path.join(path_to_pkl_augmented,
                          pid + "-" + str(i) + ".pkl"))
Example #8
def calculate_routeplan(graph, point_from, point_to, path_weight_func='avg_cost'):
    # find all paths from all nodes to all nodes and save the result in a summary table
    # (path table by direction)
   
    pairs = [pair for pair in product(point_from, point_to)]
    pairs_matrix = []
    for pair in pairs:
        try:
            # TODO: think about finding all shortest paths and choosing the optimal one from the full table of variants
            # TODO: think about finding alternative paths with a limit on edge capacity
            path = dijkstra_path(graph, pair[0], pair[1], weight=path_weight_func)
                       
            row = []
            row.append(pair)
            row.append(path)
            row.append(path_amount(graph, path, 'time'))
            row.append(path_amount(graph, path, 'dist'))
            #row.append(path_amount(graph, path, 'cost'))
            pairs_matrix.append(row)
        except Exception as e:
            print(e)

    route_plan = DataFrame(pairs_matrix, columns=['pair', 'path', 'path_time', 'path_dist']) 
    route_plan['edges'] =  route_plan['path'].apply(edges_from_path)
    route_plan[['from', 'to']] = route_plan.pair.apply(lambda row: Series(row))
    
    logging.info('Route path found. Start saving...')
    # direction plan
    date = datetime.today().strftime('%Y-%m-%d_%H:%M')
    path = '../result/'
    file_name = path + 'route_table_' + date
    route_plan.to_pickle(file_name)
    return route_plan
Example #9
    def generate_metafile(self, metafile_path):
        DATABASE_DIR = self.dataset_dir
        IMAGE_META = 'Data/AllImages_release.mat'
        MOS_META = 'Data/AllMOS_release.mat'
        STD_META = 'Data/AllStdDev_release.mat'
        img_names = loadmat(join(DATABASE_DIR,
                                 IMAGE_META))['AllImages_release']
        img_names = list(map(lambda_0, img_names))
        img_types = list(map(lambda_1, img_names))
        img_pathes = [
            join('Images', item) if item[0] != 't' else join(
                'Images', 'trainingImages', item) for item in img_names
        ]
        img_dummy_refs = [
            'dummy_' + ''.join(item.split('.')[:-1]) for item in img_names
        ]

        mos = loadmat(join(DATABASE_DIR,
                           MOS_META))['AllMOS_release'].squeeze().tolist()
        std = loadmat(join(DATABASE_DIR,
                           STD_META))['AllStdDev_release'].squeeze().tolist()
        dataframe = DataFrame()
        dataframe['DIS_PATH'] = img_pathes
        dataframe['REF_PATH'] = img_dummy_refs
        dataframe['REF'] = img_dummy_refs
        dataframe['INDEX'] = mos
        dataframe['TYPE'] = img_types
        dataframe['STD'] = std
        dataframe.to_pickle(metafile_path)
Example #10
def test_pickle_options(fsspectest):
    df = DataFrame({"a": [0]})
    df.to_pickle("testmem://afile", storage_options={"test": "pickle_write"})
    assert fsspectest.test[0] == "pickle_write"
    out = read_pickle("testmem://afile", storage_options={"test": "pickle_read"})
    assert fsspectest.test[0] == "pickle_read"
    tm.assert_frame_equal(df, out)
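
storage_options here is simply forwarded to the underlying fsspec filesystem (testmem:// is a protocol the fsspectest fixture registers); with a stock fsspec install the same call pattern works against the built-in in-memory filesystem, roughly:

import pandas as pd

df = pd.DataFrame({"a": [0]})
df.to_pickle("memory://afile.pkl")           # requires fsspec to be installed
out = pd.read_pickle("memory://afile.pkl")
pd.testing.assert_frame_equal(df, out)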
Example #11
def nice_charact(results_table1, **kwargs):
	"""Returns nicely formatted DataFrame with process characteristics calculated from experimental data"""
	miu_high = estiamte_miu_high(results_table1['h'].values, results_table1['X'].values)
	Yxs = estiamte_Yxs(results_table1['X'].values, results_table1['S'].values)
	Yps = estiamte_Yxs(results_table1['P'].values, results_table1['S'].values)

	data_ch = DataFrame({'Parameters':['miu_high', 'Yxs', 'Yps']})
	data_ch[kwargs['expno']] = [miu_high, Yxs, Yps]

	if 'path' in kwargs:
		# print("Saving requested...")
		if os.path.isfile(kwargs['path']+'.pickle'):
			# print("File exists...")
			all_data_ch = pd.read_pickle(kwargs['path']+'.pickle')
			all_data_ch[kwargs['expno']] = 0
			all_data_ch.drop(kwargs['expno'], axis=1, inplace=True)
			all_data_ch = all_data_ch.merge(data_ch, on='Parameters')
			all_data_ch.to_pickle(kwargs['path']+'.pickle')
			all_data_ch.to_html(kwargs['path']+'.html')
			# print("File saved...")
			return all_data_ch
		else:
			print("File for characteristics does not exist...creating a new one.")
			data_ch.to_pickle(kwargs['path']+'.pickle')
			data_ch.to_html(kwargs['path']+'.html')
			# print("New file saved...")
	# print("Saving none...")
	return data_ch
Example #12
    def run(self):
        import pickle
        from pandas import DataFrame

        self.output().makedirs()

        with self.input().open('r') as f:
            rosters = pickle.load(f)

        cleaned_rosters = []

        for team_id, roster in rosters.items():
            goalie_ids = [player['person']['id']
                          for player in roster
                          if player['position']['code'] == 'G'
                          ]

            cleaned_rosters.extend([(team_id, self.season, goalie_id)
                                    for goalie_id in goalie_ids])

        cleaned_rosters = DataFrame(cleaned_rosters)
        cleaned_rosters.columns = ['team_id', 'season', 'goalie_id']

        with self.output().temporary_path() as temp_output_path:
            cleaned_rosters.to_pickle(temp_output_path, compression=None)
Example #13
def process_matebook_data(directory, paramlist, storage_location):
    vidname = parse_screen_filename(directory)
    for filename in find_files(directory, 'track.tsv'):
        vidpath, flyID = parse_filename(filename)
        tag = vidname + "_" + flyID
        if not os.path.exists(storage_location + '/' + tag + '_arena.pickle'):
            fi = pd.read_table(filename,
                               sep='\t',
                               header=[0, 1],
                               skiprows=[2, 3])
            tempdf = DataFrame(index=fi.index)
            if fi['Unnamed: 8_level_0', 'isMissegmented'].mean() >= 0.2:
                print "arena dropped for poor quality: ", tag
                continue
            elif fi['Unnamed: 8_level_0', 'isMissegmented'].mean() == 0.0:
                print "arena dropped because quality = 1: ", tag
                continue
            elif len(set(fi['Unnamed: 3_level_0', 'courtship'])) <= 1:
                print "arena dropped because courtship = nan: ", tag
                continue
            else:
                for j in paramlist:
                    tempdf[j[1]] = fi[j[0], j[1]]
                    if 'movedAbs_u' in j:
                        tempdf[j[1]] = tempdf[j[1]] * FPS
            tempdf['Time'] = tempdf.index / FPS
            time_ID = vidpath.split('_', 1)[-1].split('.', 1)[0]
            tempdf = merge_jvision_data(tempdf.reset_index(), time_ID)
            tempdf.to_pickle(storage_location + '/' + tag + '_arena.pickle')
            print ".....", tag, " processed to pickling."
    return
Example #14
    def save_ticker_data(self,
                         ticker : str,
                         data_source : str, # ie - IB, MW ...
                         data : pd.DataFrame,
                         data_type : str):
        """
        @ ticker - ticker name
        @ data_source - (str) represents where the data came from, i.e. IB = Interactive Brokers, MW = MarketWatch ...
        @ data - the data itself
        @ data_type - (str) the postfix of the file - pkl, csv ...

        """

        # if no dir for the ticker exists, create it
        tickerdir_path = "./" + TICKERS_DIR_NAME + "/" + ticker
        file_name = ticker +  FILENAME_PREFIXES[data_source]
        df = data
        if (ticker not in os.listdir(os.getcwd() + "/" + TICKERS_DIR_NAME)):
            os.mkdir(tickerdir_path)

        file_path = tickerdir_path + "/" + file_name + "." + data_type
        if (data_type == "pkl"):
            data.to_pickle(file_path)

        elif (data_type == "csv"):
            data.to_csv(file_path)

        print("saved {}.{} at: {}".format(file_name, data_type, tickerdir_path))
Example #15
    def run(self):
        import pickle
        from pandas import DataFrame

        self.output().makedirs()

        with self.input()[0].open('r') as f:
            songs = pickle.load(f)

        songs = [song['track'] for song in songs]

        song_data = [(song['id'], song['name'], song['artists'][0]['id'], 'US'
                      in song['available_markets'], song['duration_ms'],
                      song['explicit'], song['uri'], song['preview_url'])
                     for song in songs]

        song_data_df = DataFrame(song_data,
                                 columns=[
                                     'id', 'name', 'main_artist',
                                     'available_in_us', 'duration_ms',
                                     'explicit', 'uri', 'preview_url'
                                 ])

        with self.output().temporary_path() as temp_path:
            song_data_df.to_pickle(temp_path, compression=None)
Example #16
 def store(self, name: str, data: pd.DataFrame):
     """Adds named dataframe to collection and stores its contents on disk."""
     if name in self._table_ids:
         raise TableExists(f'Table {name} already present in the DFC.')
     with self._create_file(name) as fd:
         data.to_pickle(fd)
         self._table_ids[name] = self._instance_id
Example #17
def write(df: pd.DataFrame, path: str, **kwargs) -> None:
    """Read file to DataFrame by file's extension.

    Args:
        df (DataFrame): DataFrame to write to disk.
        path (str): Path to write the file to. Supported file suffixes are:
            - csv
            - pkl (pickle)
            - hdf (HDF5)
            - dta (Stata)
        **kwargs: Arbitrary keyword arguments to pass to the ``pandas`` write
            method.
    Returns:
        None:
    """

    file_type = path.split('.')[-1]

    if file_type == 'csv':
        df.to_csv(path, **kwargs)
    elif file_type in PICKLE_EXT:
        df.to_pickle(path, **kwargs)
    elif file_type in HDF5_EXT:
        mode = kwargs.pop('mode', 'w')
        df.to_hdf(path, 'df', mode=mode, **kwargs)
    elif file_type == 'dta':
        df.to_stata(path, **kwargs)
    else:
        err_str = 'File type {} is not yet supported.'
        raise NotImplementedError(err_str.format(file_type))
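
PICKLE_EXT and HDF5_EXT are assumed to be collections of accepted suffixes defined elsewhere in the module; assuming 'pkl' and 'hdf' are among them, calls would dispatch on the suffix of the path, roughly:

write(df, 'out.csv', index=False)  # forwarded to DataFrame.to_csv
write(df, 'out.pkl')               # forwarded to DataFrame.to_pickle
write(df, 'out.hdf')               # forwarded to DataFrame.to_hdf under key 'df'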
Example #18
def gather_data(filelist):
    datadf = DataFrame()

    intvals = np.array([0, 200, 2000, 20000])  #6310
    for x in filelist:
        FLY_ID = x.split('/')[-1].split('_fly.')[0]
        EXP_ID, DATE, TIME = FLY_ID.split('_', 4)[0:3]
        fx = pd.read_pickle(x)
        fx = fx[fx.columns]
        try:
            number_of_bouts, bout_duration, first_TS, last_TS = utilities.detect_stim_bouts(
                fx, 'Laser2_state')
        except:
            number_of_bouts = 1

        stim_duration = find_nearest(intvals, fx['stim_duration'][0])
        PC_wing = fx[
            (fx.index >= pd.to_datetime(THRESH_ON * NANOSECONDS_PER_SECOND))
            & (fx.index <= pd.to_datetime(THRESH_OFF * NANOSECONDS_PER_SECOND)
               )]['maxWingAngle']
        WEI = float(PC_wing[PC_wing >= 0.524].count()) / float(PC_wing.count())
        if WEI < WEI_THRESHOLD:
            print FLY_ID, " excluded from analysis, with wing extension index: ", WEI, "."
            continue

        fx['group'] = str(number_of_bouts) + 'x_' + str(stim_duration) + 'ms'
        print str(number_of_bouts) + 'x_' + str(stim_duration) + 'ms'
        fx['FlyID'] = FLY_ID
        datadf = pd.concat([datadf, fx])
    datadf.to_csv(JAABA + HANDLE + '_rawdata_' + binsize + '.csv', sep=',')
    datadf.to_pickle(JAABA + 'JAR/' + HANDLE + '_rawdata_' + binsize +
                     '.pickle')
Example #19
def store_data(df: pd.DataFrame, station: str):
    out_file = Path().absolute() / "downloads" / f"{station}_hourly_discharge.p"
    #if not out_file.parent.is_dir():
    #    out_file.parent.mkdir(parents=True)
    #arr = xarray.Dataset.from_dataframe(df)
    #arr.to_netcdf(out_file)
    df.to_pickle(out_file, compression='gzip')
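
The output gets a plain .p suffix even though it is written gzip-compressed, so the codec cannot be inferred on read; a matching load under that assumption:

import pandas as pd

df = pd.read_pickle(out_file, compression='gzip')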
Example #20
def process_data(filename, paramlist):
    fi = pd.read_table(filename, sep='\t', header=[0, 1], skiprows=[2, 3])
    tempdf = DataFrame(index=fi.index)
    vidname, flyID = parse_filename(filename)
    tag = vidname + "_" + flyID
    if fi['Unnamed: 8_level_0', 'isMissegmented'].mean() >= 0.2:
        print "arena dropped for poor quality: ", tag
        return
    elif fi['Unnamed: 8_level_0', 'isMissegmented'].mean() == 0.0:
        print "arena dropped because quality = 1: ", tag
        return
    elif len(set(fi['Unnamed: 3_level_0', 'courtship'])) <= 1:
        print "arena dropped because courtship = nan: ", tag
        return
    else:
        for j in paramlist:
            tempdf[j[1]] = fi[j[0], j[1]]
            if 'movedAbs_u' in j:
                tempdf[j[1]] = tempdf[j[1]] * FPS
            if 'copulating' not in j:
                pass  #tempdf[j[1]][fi['0', 'copulating'] == 1] = np.nan
    tempdf['Time'] = tempdf.index / FPS
    tempdf.to_pickle(JAR + tag + '_tempdf.pickle')
    print ".....", tag, "processed to pickling."
    return
Example #22
def save_experiment_scorings(output_path: Path, method_id: str,
                             scorings: pd.DataFrame):
    scorings_path = output_path / "scorings"
    # making sure that the folder is there
    scorings_path.mkdir(parents=True, exist_ok=True)
    scorings_filepath = scorings_path / (method_id + ".pickled")
    scorings.to_pickle(scorings_filepath)
Example #23
def create_df(db='parking.min.db', save_as='parking.df.pickle'):
    conn = sqlite3.connect(db)
    rows = conn.execute('''select updated, park_id, free_places
                        from parking_min''').fetchall()
    ids = list(set([t[1] for t in rows]))
    data = {}
    for x in ids:
        dates = [np.datetime64(r[0], 's')
                 for r in rows if r[1] == x]   # updated
        y = [r[2] for r in rows if r[1] == x]  # free_places (target)
        data[x] = Series(y, index=dates)

    # convert data to DataFrame
    df = DataFrame(data)
    # get the names
    nr = conn.execute('''SELECT DISTINCT name
                      FROM parking ORDER BY park_id''').fetchall()
    # replace non ascii chars
    names = [unicodedata.normalize('NFKD', x[0]).encode('ascii', 'ignore')
             for x in nr]
    # remove dots
    names = [x.replace(u'.', '') for x in names]
    # assign to columns
    df.columns = names

    # drop rows where all values are NaN
    df = df[pd.notnull(df).any(axis=1)]

    # save
    if save_as is not None:
        df.to_pickle(save_as)

    return df
Example #24
    def run(self):
        import pickle
        from pandas import DataFrame

        self.output().makedirs()

        with self.input().open('r') as f:
            teams = pickle.load(f)

        cleaned_teams = []

        for team in teams:
            # skip teams that aren't active anymore
            if not team['active']:
                continue

            team_id = team['id']
            team_name = team['name']
            team_shortname = team['abbreviation']
            cleaned_teams.append((team_id,
                                  team_name,
                                  team_shortname,
                                  ))

        cleaned_teams = DataFrame(cleaned_teams)
        cleaned_teams.columns = ['team_id', 'team_name', 'team_shortname']

        with self.output().temporary_path() as temp_output_path:
            cleaned_teams.to_pickle(temp_output_path, compression=None)
Example #25
def save_pickle(df: pd.DataFrame, out:Path) -> None:
    m = df.loc[0, "method"]
    p = Path(out / "raw" / re.sub(" ", "_", m[:-3]))
    if not p.is_dir(): p.mkdir(parents=True)
    f = p / f"{re.sub(' ', '_', m)}.pkl"
    df.to_pickle(f)
    print(f"Saved results to file at: {str(f)}")
Example #26
 def exportQuesAcceptedAns(self):
     file = "../../data/Ques-AcceptedAnswers"
     data = DataFrame([self.QuesId, self.UserId, self.AcceptedAnswerId],
                      index=['Question', 'Questioner', 'AnswerId'])
     data = data.T
     data['Accepted'] = 1
     data.to_pickle(file + '.data')
Example #27
def calculate_nodes_cost(graph, nodes_load):
    # calculate node cost time series
    # each cell of the dataframe contains a time series (1-hour sampling) with loads
    # we need to calculate the costs of processing these volumes
    
    logging.info('Get graph edges for getting time...')
    graph_edges = DataFrame(graph.edges(data=True), columns = ['from', 'to', 'info'])
    graph_edges['edge'] = Series(zip(graph_edges['from'], graph_edges['to']))
    graph_edges['time'] = graph_edges['info'].apply(lambda row: get_value(row, 'time')).replace(inf, nan) # return inf for inf edges
    graph_edges['dist'] = graph_edges['info'].apply(lambda row: get_value(row, 'dist')).replace(inf, nan) # return inf for inf edges
    graph_edges['type'] = graph_edges['info'].apply(lambda row: get_value(row, 'type')).replace(inf, nan) # return inf for inf edges
    # DONE: changed 'cost' to 'avg_cost'; the previous field was not correct (data control change)
    graph_edges['avg_cost'] = graph_edges['info'].apply(lambda row: get_value(row, 'avg_cost')).replace(inf, nan)
    # convert time in seconds to hours
    graph_edges['time'] = graph_edges['time'].apply(lambda row: ceil(row/3600))
       
    graph_edges = graph_edges[['edge', 'info', 'time', 'dist','type', 'avg_cost']]
    
    
    logging.info('Start calculating costs...')
    result = []
    for edge in nodes_load['edge'].unique():
        # get dataframe with volumes
        total_cost = nodes_load[nodes_load['edge'] == edge]['total_volumes'].iloc[0].resample('D').sum()
        dist = graph_edges[graph_edges['edge'] == edge]['dist'].iloc[0]
        time = graph_edges[graph_edges['edge'] == edge]['time'].iloc[0]
        edge_type = graph_edges[graph_edges['edge'] == edge]['type'].iloc[0]
        if edge_type == 'auto':
            # FIXME: the calculation needs avg_cost to be initialized on the first iteration
            total_cost['cost'] = total_cost['sum_mass_kg']*graph_edges[graph_edges['edge'] == edge]['avg_cost'].iloc[0]
            total_cost[['total_cost', 'avg_loads', 'min_capacity', 'amount', 'types_list']] = \
                    total_cost.apply(lambda row: \
                                        Series(minimal_auto_cost_func(dist, time, row['sum_mass_kg'])), axis=1)
        
        if edge_type == 'avia':
            total_cost['cost'] = total_cost['sum_mass_kg']*graph_edges[graph_edges['edge'] == edge]['avg_cost'].iloc[0]
            total_cost[['total_cost', 'avg_loads', 'min_capacity', 'amount', 'types_list']] = \
                    total_cost.apply(lambda row: \
                                        Series(avia_cost(time, row['sum_mass_kg'])), axis=1)
        
        if edge_type == 'sort_center':
            total_cost['cost'] = total_cost['sum_mass_kg']*graph_edges[graph_edges['edge'] == edge]['avg_cost'].iloc[0]
            total_cost[['total_cost', 'avg_loads', 'min_capacity', 'amount', 'types_list']] = \
                    total_cost.apply(lambda row: \
                                        Series(sort_center_cost(time, row['sum_mass_kg'])), axis=1)
        
        
        result.append((edge, total_cost))
        
    nodes_cost = DataFrame(result, columns=['edge', 'total_cost'])
           
    logging.info('Edge cost calculated. Start saving...')
    date = datetime.today().strftime('%Y-%m-%d_%H:%M')
    path = '../result/'
    file_name = path + 'nodes_cost_' + date
    nodes_cost.to_pickle(file_name)
    logging.info('Graph edges prepared. Start saving...')
    file_name = path + 'graph_edges_' + date
    graph_edges.to_pickle(file_name)
    return nodes_cost
Example #28
def gather_data(filelist):
    datadf = DataFrame()
    
    intvals = np.array([0, 200, 2000, 20000]) #6310
    for x in filelist:
        FLY_ID = x.split('/')[-1].split('_fly.')[0]
        EXP_ID, DATE, TIME = FLY_ID.split('_', 4)[0:3]
        fx = pd.read_pickle(x)
        fx = fx[fx.columns]
        try:
            number_of_bouts, bout_duration, first_TS, last_TS = utilities.detect_stim_bouts(fx, 'Laser2_state')
        except:
            number_of_bouts = 1
        
        stim_duration = find_nearest(intvals, fx['stim_duration'][0])
        PC_wing = fx[(fx.index >= pd.to_datetime(THRESH_ON*NANOSECONDS_PER_SECOND)) & (fx.index <= pd.to_datetime(THRESH_OFF*NANOSECONDS_PER_SECOND))]['maxWingAngle']
        WEI = float(PC_wing[PC_wing >= 0.524].count()) / float(PC_wing.count())
        if WEI < WEI_THRESHOLD:
            print FLY_ID, " excluded from analysis, with wing extension index: " , WEI , "."
            continue
            
        
        fx['group'] = str(number_of_bouts) + 'x_' + str(stim_duration) + 'ms'
        print str(number_of_bouts) + 'x_' + str(stim_duration) + 'ms'
        fx['FlyID'] = FLY_ID
        datadf = pd.concat([datadf, fx])
    datadf.to_csv(JAABA + HANDLE + '_rawdata_' + binsize + '.csv', sep=',')
    datadf.to_pickle(JAABA + 'JAR/'+ HANDLE + '_rawdata_' + binsize + '.pickle')
Example #29
def get_all_artist_lyrics(artist_url):
    df = DataFrame(np.empty(0, dtype=[('artist_url', object),
                                      ('album_url', object),
                                      ('song_url', object),
                                      ('lyrics', object)]))
    album_urls = get_album_urls(artist_url)

    print "found album urls " + ", ".join(album_urls)
    row_num = 0
    for album_url in album_urls:
        try:
            song_urls = get_song_urls(album_url)
        except:
            print "failed to get songs for " + album_url
            continue
        else:
            print "got songs for " + album_url
        for song_url in song_urls:
            try:
                lyrics = extract_lyrics(song_url)
            except:
                print "failed to get lyrics for " + song_url
            else:
                print "got lyrics for " + song_url
                df.loc[row_num] = [artist_url, album_url, song_url, lyrics]
                row_num += 1

    # todo save line by line
    df.to_pickle(artist_url.split("/")[-1] + '.pkl')
    return df
Example #30
def append_and_save(OG: pd.DataFrame, new: pd.DataFrame, path=PICKLE_PATH):
    # with open('replacements.csv', 'r') as replacements_file:
    #     l = [x.strip().split(',') for x in replacements_file.readlines()]
    # replacements = {line[0]: line[1] for line in l}
    # # Drop the columns we really don't care about
    # new.drop(BLACKLIST, axis=1, errors='ignore', inplace=True)

    # for from_name, to_name in replacements.items():
    #     if from_name in new.columns and to_name in new.columns:
    #         new[to_name].where(new[to_name].notnull(), new[from_name], inplace=True)
    #         new.drop(columns=[from_name], axis=1, inplace=True)
    #
    #     elif from_name in new.columns:
    #         new.rename(columns={from_name: to_name}, inplace=True)
    #         # new.drop(columns=[from_name], axis=1)
    # new = new.applymap(lambda x: x if type(x) is not str else x.lower().strip().replace(', ', ','))
    # print('Saving, but this wont work for different websites')

    if len(OG) > 0:
        OG = OG.append(new, ignore_index=True, sort=True)
        # OG.drop(BLACKLIST, axis=1, errors='ignore', inplace=True)
        # OG = OG.apply(clean_column)
        OG.to_pickle(path)
        return OG
    else:
        # new = new.apply(clean_column)
        new.to_pickle(path)
        return new
Example #31
    def writeLog(
            self,
            sender_id: int,
            log_df: pd.DataFrame,
            filename: Optional[Union[str, PathLike, Path]] = None) -> None:
        # Called by any agent, usually at the very end of the simulation just before
        # kernel shutdown, to write to disk any log dataframe it has been accumulating
        # during simulation. The format can be decided by the agent, although changes
        # will require a special tool to read and parse the logs. The Kernel places
        # the log in a unique directory per run, with one filename per agent, also
        # decided by the Kernel using agent type, id, etc.

        # If there are too many agents, placing all these files in a directory might
        # be unfortunate. Also if there are too many agents, or if the logs are too
        # large, memory could become an issue. In this case, we might have to take
        # a speed hit to write logs incrementally.

        # If filename is not None, it will be used as the filename. Otherwise,
        # the Kernel will construct a filename based on the name of the Agent
        # requesting log archival.

        if self.skip_log:
            return

        path = joinpath("..", "log", self.log_dir)
        file = f"{filename or self.agents[sender_id].name.replace(' ', '')}.bz2"

        makedirs(path, exist_ok=True)
        log_df.to_pickle(joinpath(path, file), compression='bz2')
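
Since the file is written with compression='bz2' and carries a matching .bz2 suffix, reading an archived log back lets pandas infer the codec from the extension; a small sketch with hypothetical run-directory and agent names, assuming joinpath is os.path.join:

import pandas as pd
from os.path import join as joinpath

log_df = pd.read_pickle(joinpath("..", "log", "run_001", "ExchangeAgent.bz2"))  # hypothetical paths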
Example #32
def make_test_dataset(pkl_filepath):
    """Creates a synthetic classification dataset to use for testing.

    Dataset is a pandas DataFrame written to pickle at the given path.
    """
    # Create a synthetic classification dataset
    X, y = make_classification(
        n_samples=100,
        # 5 features, 1 will be pure noise
        n_features=5,
        n_informative=3,
        n_redundant=1,
        n_repeated=0,
        # Assign 10% of labels at random to add noise
        flip_y=0.1,
        shuffle=False,
        random_state=543,
    )

    df = DataFrame(X, columns=["a", "b", "c", "d", "e"])
    # Convert one of the informative columns to categorical (string):
    # Round values to integer and map integers to letters
    to_categ = df["c"].astype("int")
    to_categ_uniq = sorted(to_categ.unique())
    categ = to_categ.map(
        dict(zip(to_categ_uniq, list(ascii_uppercase[:len(to_categ_uniq)]))))
    df["c"] = categ
    # Append the label column
    df["label"] = y
    df.to_pickle(pkl_filepath)
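
Downstream test code would typically read the pickle back and split features from the label column; a brief sketch, assuming pandas and the pkl_filepath passed above:

import pandas as pd

df = pd.read_pickle(pkl_filepath)
X = df.drop(columns=["label"])   # column 'c' comes back as letter-coded strings
y = df["label"]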
Example #33
def export_dataframe(df: pd.DataFrame, workdir: str, name: str):
    logger.info(f"{name}:\n{df}")
    df.to_latex(os.path.join(workdir, f"{name}.tex"))
    with open(os.path.join(workdir, f"{name}.txt"), "w") as f:
        df.to_string(f)
    with open(os.path.join(workdir, f"{name}.csv"), "w") as f:
        df.to_csv(f)
    df.to_pickle(os.path.join(workdir, f"{name}.pickle"))
Example #34
def write_frame(
    frame: pd.DataFrame,
    base_path: PathLike,
    name: str
) -> None:
    frame.to_pickle(os.path.join(base_path, name + '.pkl'))
    with open(os.path.join(base_path, name + '.csv'), 'w+') as handle:
        frame.to_csv(handle)
Example #35
 def add_to_cache(self, parameter_id: str, dt_range: DateTimeRange,
                  df: pds.DataFrame):
     fname = self.data_folder + '/' + str(uuid.uuid4())
     if df is not None:
         df.to_pickle(fname)
         self.cache.add_entry(parameter_id, CacheEntry(dt_range, fname))
     else:
         self.cache.add_entry(parameter_id, CacheEntry(dt_range, None))
Example #36
def gather_data(filelist):
    datadf = DataFrame()
    for x in filelist:
        FLY_ID, FMF_TIME, GROUP = parse_fmftime(x)
        fx = pd.read_pickle(x)
        rel = fx[['Laser_state', 'maxWingAngle', 'Length', 'Width']]
        rel['group'] = GROUP
        rel['FlyID'] = FLY_ID
        datadf = pd.concat([datadf, rel])
    datadf.to_csv(JAABA + 'rawdata_' + binsize + '.csv', sep=',')
    datadf.to_pickle(JAABA + 'JAR/rawdata_' + binsize + '.pickle')
Example #37
def preeditimage(input_file, output_dir, params):
    """
    Segment the specified grayscale images, and save the binary image to file.
    First, clean the image by removing the background and filtering it, then 
    find the edges and threshold it to convert it to a binary image. Extract 
    and verify the data from this image.

    args:
        input_file (file): input file of raw data
        output_dir (path): output directory to save file
        params (dict): input parameters

    """

    # Do not overwrite existing output
    output_file = os.path.join(output_dir, os.path.basename(input_file))
    if os.path.isfile(output_file):
        img = imread(output_file)
    else:
        # Segment the grayscale image and save to file
        img = segment.main(imread(input_file), params['segment'])
        imsave(output_file, img)

    print ' - segment: ' + time.asctime()

    # Do not overwrite existing output
    output_file2 = os.path.splitext(output_file)[0] + '.pickle'
    if os.path.isfile(output_file2):
        return

    # Extract properties from the labeled image and save as a DataFrame
    data = extract.preedit(img, params['extract'])
    columns = ('Area', 'BoundingBox', 'Centroid', 'EdgeSpline', 'FourierFit',
               'Length', 'MidSpline', 'Perimeter', 'StalkedPole', 'SwarmerPole')

    f = read.getframenum(input_file, params['segment']['pattern'])
    if data:
        # Make MultiIndex with frame and label info
        j = [f] * len(data)
        k = [v['Label'] for v in data]
    else:
        # Create empty DataFrame
        data = [dict.fromkeys(columns, np.nan)]
        j = [f]
        k = [-1]
    index = MultiIndex.from_arrays((j, k), names=('Frame', 'Label'))
    df = DataFrame(data, columns=columns, index=index)
    verify.preedit(df, params['verify'])
    df.to_pickle(output_file2)

    print ' - extract: ' + time.asctime()
Example #38
def pagetest2():
    import numpy as np
    from matplotlib import pyplot as plt
    from pandas import Series, DataFrame
    import pandas as pd
    from io import StringIO

    df = DataFrame(np.random.rand(6,4), index=["One", "Two", "Three", "Four", "Five", "Six"], columns=pd.Index(["A", "B", "C", "D"], name="Genus"))
    buf = StringIO()
    df.to_pickle(buf)
    response = make_response(buf.getvalue())
    response.headers['Content-Type'] = 'Image/png'
    #response.headers['Content-Type'] = 'text/html;charset=utf8'
    return response
Example #39
def compile_data(files):
    print 'compiling...'
    rawfile = DataFrame({'Time':[]})
    dflist = []
    vidlist = []
    flyIDlist = []
    for x in files:
        tempdf = pd.read_pickle(x)
        dflist.append(tempdf)
        vidname, flyID = parse_tempdf_name(x)
        vidlist.append(vidname)
        flyIDlist.append(flyID)
    rawfile = pd.concat(dflist, keys=zip(vidlist,flyIDlist), names=['Video','Arena'])
    rawfile.to_csv(OUTPUT + 'rawfile.csv', sep=',')
    rawfile.to_pickle(JAR + 'rawfile.pickle')
    return rawfile
Example #40
class Pickle(BaseIO):

    def setup(self):
        self.fname = '__test__.pkl'
        N = 100000
        C = 5
        self.df = DataFrame(np.random.randn(N, C),
                            columns=['float{}'.format(i) for i in range(C)],
                            index=date_range('20000101', periods=N, freq='H'))
        self.df['object'] = tm.makeStringIndex(N)
        self.df.to_pickle(self.fname)

    def time_read_pickle(self):
        read_pickle(self.fname)

    def time_write_pickle(self):
        self.df.to_pickle(self.fname)
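
asv calls setup() before the timed methods run; outside of asv, a rough equivalent of the same measurement (omitting the object column) can be sketched with timeit, assuming only numpy and pandas:

import timeit

import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.randn(100000, 5),
                  columns=['float{}'.format(i) for i in range(5)],
                  index=pd.date_range('20000101', periods=100000, freq='H'))
df.to_pickle('__test__.pkl')
print('read :', timeit.timeit(lambda: pd.read_pickle('__test__.pkl'), number=10))
print('write:', timeit.timeit(lambda: df.to_pickle('__test__.pkl'), number=10))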
Example #41
def compile_data(pickle_jar):
    print 'compiling...'
    rawfile = DataFrame({'Time':[]})
    dflist = []
    vidlist = []
    flyIDlist = []
    for x in glob.glob(pickle_jar + '/*arena.pickle'):
        tempdf = pd.read_pickle(x)
        dflist.append(tempdf)
        vidname, flyID = parse_tempdf_name(x)
        vidlist.append(vidname)
        flyIDlist.append(flyID)
    rawfile = pd.concat(dflist, keys=flyIDlist, names=['Arena'])
    rawfile = rawfile.reset_index()
    #rawfile.to_csv(OUTPUT + 'rawfile.csv', sep=',')
    rawfile.to_pickle(pickle_jar + '/' + vidname + '_compiled.pickle')
    return rawfile
Example #42
def gather_data(filelist):
    datadf = DataFrame()
    for x in filelist:
        print x
        FLY_ID = x.split('/')[-1].split('_fly.')[0]
        EXP_ID, DATE, TIME = FLY_ID.split('_', 4)[0:3]
        fx = pd.read_pickle(x)
        fx = fx[fx.columns]
        PC_wing = fx[(fx.index >= pd.to_datetime(THRESH_ON*NANOSECONDS_PER_SECOND)) & (fx.index <= pd.to_datetime(THRESH_OFF*NANOSECONDS_PER_SECOND))]['maxWingAngle']
        WEI = float(PC_wing[PC_wing >= 0.524].count()) / float(PC_wing.count())
        if WEI < WEI_THRESHOLD:
            print FLY_ID, " excluded from analysis, with wing extension index: " , WEI , "."
            continue
        fx['group'] = EXP_ID
        fx['FlyID'] = FLY_ID
        datadf = pd.concat([datadf, fx])
    datadf.to_csv(JAABA + HANDLE + '_rawdata_' + binsize + '.csv', sep=',')
    datadf.to_pickle(JAABA + 'JAR/'+ HANDLE + '_rawdata_' + binsize + '.pickle')
Example #43
    def test_to_csv_with_dst_transitions(self):

        with ensure_clean('csv_date_format_with_dst') as path:
            # make sure we are not failing on transitions
            times = pd.date_range("2013-10-26 23:00", "2013-10-27 01:00",
                                  tz="Europe/London",
                                  freq="H",
                                  ambiguous='infer')

            for i in [times, times + pd.Timedelta('10s')]:
                time_range = np.array(range(len(i)), dtype='int64')
                df = DataFrame({'A': time_range}, index=i)
                df.to_csv(path, index=True)

                # we have to reconvert the index as we
                # don't parse the tz's
                result = read_csv(path, index_col=0)
                result.index = to_datetime(result.index).tz_localize(
                    'UTC').tz_convert('Europe/London')
                assert_frame_equal(result, df)

        # GH11619
        idx = pd.date_range('2015-01-01', '2015-12-31',
                            freq='H', tz='Europe/Paris')
        df = DataFrame({'values': 1, 'idx': idx},
                       index=idx)
        with ensure_clean('csv_date_format_with_dst') as path:
            df.to_csv(path, index=True)
            result = read_csv(path, index_col=0)
            result.index = to_datetime(result.index).tz_localize(
                'UTC').tz_convert('Europe/Paris')
            result['idx'] = to_datetime(result['idx']).astype(
                'datetime64[ns, Europe/Paris]')
            assert_frame_equal(result, df)

        # assert working
        df.astype(str)

        with ensure_clean('csv_date_format_with_dst') as path:
            df.to_pickle(path)
            result = pd.read_pickle(path)
            assert_frame_equal(result, df)
Example #44
            # save the intensity plot:
            E_out = f_E_out()
            E2_out = f_E2_out()
            ax.imshow(E2_out[300:400,300:400], vmin=0, vmax=1, **plot_args)
            ax.set_title('Intensity');
            ax2.imshow(E_out[0][300:400,300:400], vmin=-1, vmax=1, **plot_args)
            ax2.set_title('Re(E)');
            fig_name = os.path.join(plotdir, '{n:06d}.png'.format(n=n))
            plt.savefig(fig_name)
            
        if n % update_frequency == 0:
            # also renormalise the update rate:
            phi_rate_avg = np.mean(np.abs(f_phi_updates()))
            l_rate = np.min([update_rate_target / phi_rate_avg, 1.2*l_rate])  # can go up by 20% at the most.
            updates = ((slmOpt.phi, slmOpt.phi - l_rate * slmOpt.phi_rate),
                       (slmOpt.phi_rate, momentum*slmOpt.phi_rate + (1.-momentum)*grad))
            update = theano.function([], 
                                    cost, 
                                    updates=updates,
                                    on_unused_input='warn')
    
    print 'Finished gradient descent, saving summary.'
    # create and save the dataframe with the learning curves:
    df = DataFrame({'Cost_SE': l_cost_SE,
                    'Cost_QE': l_cost_QE,
                    'Mean_update': l_mean_update,
                    'Max_update': l_max_update})
    df.to_pickle(os.path.join(outputdir, 'summary.pkl'))
    
    sys.exit()
Example #45
def authorization(request):
    client = Client()
    code = request.GET['code']
    access_token = client.exchange_code_for_token(client_id=MY_STRAVA_CLIENT_ID, client_secret=MY_STRAVA_CLIENT_SECRET, code=code)   
    
    # making a global variable to be used across views. don't know how this will work in practice
    
    client = Client(access_token=access_token)
    athlete = client.get_athlete() # Get current athlete details
    
    global athleteId 
    athleteId = athlete.id
    
    # if athlete doesn't exist, add them
    if len(Athlete.objects.filter(athleteId=athleteId)) == 0:
        ath = Athlete.objects.create(name=str(athlete.firstname+' '+athlete.lastname), athleteId=athleteId, profilePic=athlete.profile, city=athlete.city, country=athlete.country, sex=athlete.sex, premium=athlete.premium, created_at=athlete.created_at, updated_at=athlete.updated_at, followers=athlete.follower_count, friends=athlete.friend_count, email=athlete.email, weight=athlete.weight, meas_pref=athlete.measurement_preference, runsSummary = DataFrame({}).to_json(orient='records'), fitLines = DataFrame({}).to_json(orient='records'), masterList = DataFrame({}).to_json(orient='records'))

        ath.profilePic.name = "rudyzPic"
        ath.save(update_fields=['profilePic'])
 
    # if athlete already exists, draw their file
    elif len(Athlete.objects.filter(athleteId=athleteId)) == 1:
        ath = Athlete.objects.get(athleteId=athleteId)
           
    ############################################ 
    ##### compiling new runs, updating summary
        
    # athlete's existing runs summary   
    existingSummary = DataFrame(pd.read_json(ath.runsSummary))
    existingFitlines = DataFrame(pd.read_json(ath.fitLines)) 
    masterList = DataFrame(pd.read_json(ath.masterList))
     
    activities = list(client.get_activities()) 
    
    # activity IDs of runs already in the system
    try:
        ids = existingSummary.activityId
    except AttributeError:
        ids = []
         
    for i in range(len(activities)):   
    #for i in range(30,37):
        # Ignoring activities already in the system 
        if (len(ids) == 0) or (float(activities[i].id) not in list(ids)):
            
            try:
                # compiling df for raw json-ization
                activityId = activities[i].id
                run = client.get_activity_streams(activityId, types=['time','latlng','distance','heartrate','altitude','cadence'])
                latlng = run['latlng'].data
                time = run['time'].data
                distance = run['distance'].data
                heartrate = run['heartrate'].data
                altitude = run['altitude'].data
                cadence = run['cadence'].data
                date = activities[i].start_date_local 
                activity = activityId   
                dfi = thresher.assemble(date, activityId, heartrate, distance, time, altitude, latlng, cadence) 
                
                
                # basic cleanup, only removing totally unreasonable values
                dfi = thresher.basicClean(dfi)


                # if we ever want to try our hand at improving strava's speed data (ie by predicting speed when GPS blanks), intervene here:
                    
                #dfi = thresher.addDistDeltas(dfi)
                             
                                        
                try: 
                    fitline = thresher.getFitlineLws(dfi) # this adds speed-shifted columns
                except:
                    fitline = pd.DataFrame({})
                    
                try:
                    mafScore = fitline[fitline.hr == 140.0].avgSpeed.iloc[0]
                    print "MAF "
                    print mafScore
                except:
                    mafScore = np.nan
                    
                fitline_json = fitline.to_json(orient='records')
                
                 # getting summary info for run (as one-entry dict)
                runSummary = thresher.getSingleSummaryDf(dfi)
                
                # adding mafScore to summary
                runSummary['mafScore'] = mafScore
                
                print runSummary
                
                # adding predicted hr and speed values
                #dfi = thresher.getPred(dfi)

                # saving entry to database
                Activity.objects.create(act_id = activityId, name=str(activities[i].name), description=activities[i].description, act_type=activities[i].type, date=activities[i].start_date_local, timezone=activities[i].timezone, df=dfi.to_json(orient='records'), avgHr=runSummary['avgHr'], hrVar=runSummary['variation'], realMiles=runSummary['realMiles'], recovery=runSummary['recovery'], easy=runSummary['easy'], stamina=runSummary['stamina'], impulse=runSummary['impulse'], totalTime=runSummary['totalTime'], totalDist=runSummary['totalDist'], climb=runSummary['climb'], fitline=fitline_json, mafScore=mafScore, athlete=ath)
                
                # updating runs summary
                existingSummary = existingSummary.append(runSummary, ignore_index=True)
                existingFitlines = existingFitlines.append(fitline, ignore_index=True)
                masterList = masterList.append(dfi, ignore_index=True)
                
            except:
                continue    
    
    
    # saving updated runs summary to athlete profile
    ath.runsSummary = existingSummary.to_json(orient='records')
    ath.save(update_fields=['runsSummary'])
    
    existingSummary.to_pickle("runsSummary.txt")
    
    # saving updated runs summary to athlete profile
    ath.fitLines = existingFitlines.to_json(orient='records')
    ath.save(update_fields=['fitLines'])
    
    ath.masterList = masterList.to_json(orient='records')
    ath.save(update_fields=['masterList'])
    
    # testing...
    existingSummary = pd.read_json(ath.runsSummary)
    #print(existingSummary)
    
    existingFitlines = pd.read_json(ath.fitLines)
    #print(existingFitlines)

    
    global path
    path = os.path.dirname(__file__)
    # updating dataframe, pickling for use in other views
    #global df
    #df = thresher.masterAssemble(client) 
    
    masterDf = pd.read_json(ath.masterList)
    #print(masterDf)
    masterDf.to_pickle(str(path)+"/"+str(athlete.id)+"masterDf.txt")

    return render(request, 'stravaChimp/authorization.html', {'code':code, 'access_token':access_token, 'athleteId':athleteId})
Example #46
initValue = 15
softmax = True
skip = True
category = 'Softmax or epsilon-greedy2'

game = pg.PredatorGame((0,0), (5,5), (11,11))

if not skip:

	results = dict()
	results['epsilon-greedy'], avgRMS = getResults(samples, episodes, discount, epsilon, alpha, initValue, False)
	results['softmax'], avgRMS = getResults(samples, episodes, discount, epsilon, alpha, initValue, True)
	
	results['episode'] = range(0,episodes)
	dataF = DataFrame(results)
	dataF.to_pickle('data/'+category+str(softmax))
else:
	dataF = pd.read_pickle('data/'+category+str(softmax))

category = 'Softmax or epsilon-greedy'
episodeData = pd.melt(dataF, id_vars=['episode'], var_name=category)


p = ggplot(episodeData, aes('episode', 'value', color=category)) +\
    geom_line() +\
    theme_bw() + theme() + ylab("Steps") + xlab("Episodes") + ylim(0,60)
print p
category = 'Softmax or epsilon-greedy2'
ggsave(p, "plots/"+category+str(softmax)+".png")
ggsave(p, "plots/"+category+str(softmax)+".pdf")
Example #47
def run_bl_analysis( pickles_folder = 0 ):
    import matplotlib.pyplot as plt
    from os                           import listdir
    from os.path                      import join
    from pandas                       import read_pickle, DataFrame
    from article2_time_resolved_routines import find_nearest

    if not pickles_folder:
        pickles_folder = '/home/carlos/Documents/PhD/Articles/'+\
                'Article_3/Scripts/time_resolved/averaged_data'

    case_pickles = [
        f for f in listdir( pickles_folder ) if f.endswith(".p") \
        if not 'Slit' in f and 'alpha0' in f and 'phi0' in f\
        and not "mean_flow_rotated" in f
    ]

    bl_df = DataFrame()

    fig, ax = plt.subplots( 1, 1 )

    for cp in case_pickles:
        case_bl_df = DataFrame()

        df = read_pickle( join( pickles_folder, cp ) )
        df = df.sort_values( by = [ 'x', 'y' ] )

        if 'loc00' in cp and not 'STE' in cp:
            x_bl_loc = 38
        elif 'loc05' in cp:
            x_bl_loc = 18
        elif 'loc10' in cp or 'STE' in cp:
            x_bl_loc = -2

        available_x_loc = find_nearest( x_bl_loc, df.x.values )

        trailing_edge,phi,alpha,U,z = \
                decript_case_name(cp)

        case_name = "{0}_a{1}_p{2}_U20_z{3:02.0f}_tr".\
                format( trailing_edge, alpha, phi, float(z)*20 )

        print "   Running {0}".format(case_name)

        # First get the edge velocity, because it needs to be cleaned up a bit #
        ue_df = DataFrame()
        for x in df.x.unique():
            local_x_df = df[ ( df.x == x ) & ( df.y >= 0 ) ]

            ue_df = ue_df.append(
                { 'U_e' : get_edge_velocity( local_x_df ),
                 'x' : x}, ignore_index = True
            )
        # ######################################################################

        ue_df = clean_data( ue_df, 'U_e' , window = 10, threshold = 1.0 )

        for x , U_e_x in zip( ue_df.x.values, ue_df.U_e.values ):
            local_x_df = df[ ( df.x == x ) & ( df.y >= 0 ) & ( df.y < 20 ) ]

            if x == available_x_loc:
                ax.plot( 
                    local_x_df.u / U_e_x,
                    local_x_df.y, 
                    label = cp.replace("_"," ") 
                )

            U_e_loc, delta_99, delta_displacement, delta_momentum = \
                    get_boundary_layer_values( local_x_df, U_e_x )

            data = {
                'case':               case_name,
                'Ue':                 U_e_x,
                'delta_99':           delta_99,
                'delta_displacement': delta_displacement,
                'delta_momentum':     delta_momentum,
                'x':                  x,
                'trailing_edge':      trailing_edge,
                'phi':                phi,
                'alpha':              alpha,
                'z':                  z
            }

            case_bl_df = case_bl_df.append( 
                DataFrame( data, index = [0] ),
                ignore_index = True
            )

        if 'delta_99' in case_bl_df.columns:
            case_bl_df = clean_data( case_bl_df, 'delta_99', window = 10 , 
                                    threshold = 1.0 )

        bl_df = bl_df.append( case_bl_df, ignore_index = True )

    bl_df.to_pickle("BLData_staged.p")

    plt.legend( loc = 'best' )
    plt.xlim( 0, 1 )
    plt.savefig( "InterestingBLs.png" )
Example #48
    def test_detect_chained_assignment(self):

        pd.set_option('chained_assignment', 'raise')

        # work with the chain
        expected = DataFrame([[-5, 1], [-6, 3]], columns=list('AB'))
        df = DataFrame(np.arange(4).reshape(2, 2),
                       columns=list('AB'), dtype='int64')
        assert df.is_copy is None

        df['A'][0] = -5
        df['A'][1] = -6
        tm.assert_frame_equal(df, expected)

        # test with the chaining
        df = DataFrame({'A': Series(range(2), dtype='int64'),
                        'B': np.array(np.arange(2, 4), dtype=np.float64)})
        assert df.is_copy is None

        with pytest.raises(com.SettingWithCopyError):
            df['A'][0] = -5

        with pytest.raises(com.SettingWithCopyError):
            df['A'][1] = np.nan

        assert df['A'].is_copy is None

        # Using a copy (the chain), fails
        df = DataFrame({'A': Series(range(2), dtype='int64'),
                        'B': np.array(np.arange(2, 4), dtype=np.float64)})

        with pytest.raises(com.SettingWithCopyError):
            df.loc[0]['A'] = -5

        # Doc example
        df = DataFrame({'a': ['one', 'one', 'two', 'three',
                              'two', 'one', 'six'],
                        'c': Series(range(7), dtype='int64')})
        assert df.is_copy is None

        with pytest.raises(com.SettingWithCopyError):
            indexer = df.a.str.startswith('o')
            df[indexer]['c'] = 42

        expected = DataFrame({'A': [111, 'bbb', 'ccc'], 'B': [1, 2, 3]})
        df = DataFrame({'A': ['aaa', 'bbb', 'ccc'], 'B': [1, 2, 3]})

        with pytest.raises(com.SettingWithCopyError):
            df['A'][0] = 111

        with pytest.raises(com.SettingWithCopyError):
            df.loc[0]['A'] = 111

        df.loc[0, 'A'] = 111
        tm.assert_frame_equal(df, expected)

        # gh-5475: Make sure that is_copy is picked up reconstruction
        df = DataFrame({"A": [1, 2]})
        assert df.is_copy is None

        with tm.ensure_clean('__tmp__pickle') as path:
            df.to_pickle(path)
            df2 = pd.read_pickle(path)
            df2["B"] = df2["A"]
            df2["B"] = df2["A"]

        # gh-5597: a spurious raise as we are setting the entire column here
        from string import ascii_letters as letters

        def random_text(nobs=100):
            df = []
            for i in range(nobs):
                idx = np.random.randint(len(letters), size=2)
                idx.sort()

                df.append([letters[idx[0]:idx[1]]])

            return DataFrame(df, columns=['letters'])

        df = random_text(100000)

        # Always a copy
        x = df.iloc[[0, 1, 2]]
        assert x.is_copy is not None

        x = df.iloc[[0, 1, 2, 4]]
        assert x.is_copy is not None

        # Explicitly copy
        indexer = df.letters.apply(lambda x: len(x) > 10)
        df = df.loc[indexer].copy()

        assert df.is_copy is None
        df['letters'] = df['letters'].apply(str.lower)

        # Implicitly take
        df = random_text(100000)
        indexer = df.letters.apply(lambda x: len(x) > 10)
        df = df.loc[indexer]

        assert df.is_copy is not None
        df['letters'] = df['letters'].apply(str.lower)

        # Implicitly take 2
        df = random_text(100000)
        indexer = df.letters.apply(lambda x: len(x) > 10)

        df = df.loc[indexer]
        assert df.is_copy is not None
        df.loc[:, 'letters'] = df['letters'].apply(str.lower)

        # Should be ok even though it's a copy!
        assert df.is_copy is None

        df['letters'] = df['letters'].apply(str.lower)
        assert df.is_copy is None

        df = random_text(100000)
        indexer = df.letters.apply(lambda x: len(x) > 10)
        df.loc[indexer, 'letters'] = (
            df.loc[indexer, 'letters'].apply(str.lower))

        # an identical take, so no copy
        df = DataFrame({'a': [1]}).dropna()
        assert df.is_copy is None
        df['a'] += 1

        # Inplace ops, originally from:
        # http://stackoverflow.com/questions/20508968/series-fillna-in-a-multiindex-dataframe-does-not-fill-is-this-a-bug
        a = [12, 23]
        b = [123, None]
        c = [1234, 2345]
        d = [12345, 23456]
        tuples = [('eyes', 'left'), ('eyes', 'right'), ('ears', 'left'),
                  ('ears', 'right')]
        events = {('eyes', 'left'): a,
                  ('eyes', 'right'): b,
                  ('ears', 'left'): c,
                  ('ears', 'right'): d}
        multiind = MultiIndex.from_tuples(tuples, names=['part', 'side'])
        zed = DataFrame(events, index=['a', 'b'], columns=multiind)

        with pytest.raises(com.SettingWithCopyError):
            zed['eyes']['right'].fillna(value=555, inplace=True)

        df = DataFrame(np.random.randn(10, 4))
        s = df.iloc[:, 0].sort_values()

        tm.assert_series_equal(s, df.iloc[:, 0].sort_values())
        tm.assert_series_equal(s, df[0].sort_values())

        # see gh-6025: false positives
        df = DataFrame({'column1': ['a', 'a', 'a'], 'column2': [4, 8, 9]})
        str(df)

        df['column1'] = df['column1'] + 'b'
        str(df)

        df = df[df['column2'] != 8]
        str(df)

        df['column1'] = df['column1'] + 'c'
        str(df)

        # from SO:
        # http://stackoverflow.com/questions/24054495/potential-bug-setting-value-for-undefined-column-using-iloc
        df = DataFrame(np.arange(0, 9), columns=['count'])
        df['group'] = 'b'

        with pytest.raises(com.SettingWithCopyError):
            df.iloc[0:5]['group'] = 'a'

        # Mixed type setting but same dtype & changing dtype
        df = DataFrame(dict(A=date_range('20130101', periods=5),
                            B=np.random.randn(5),
                            C=np.arange(5, dtype='int64'),
                            D=list('abcde')))

        with pytest.raises(com.SettingWithCopyError):
            df.loc[2]['D'] = 'foo'

        with pytest.raises(com.SettingWithCopyError):
            df.loc[2]['C'] = 'foo'

        with pytest.raises(com.SettingWithCopyError):
            df['C'][2] = 'foo'
Example #49
0
            elif nPreds == 2:
                game.predCoords = game.initPredCoords = [(0, 0), (10, 10)]
            elif nPreds == 3:
                game.predCoords = game.initPredCoords = [(0, 0), (10, 10), (0,10)]
            elif nPreds == 4:
                game.predCoords = game.initPredCoords = [(0, 0), (10, 10), (0,10), (10,0)]
            results[nPreds], avgRMS, randomReturnValues[nPreds] = getResults(samples, episodes, discount, epsilon, alpha, initValue, softmax)
            winRatioDict[nPreds] = randomReturnValues[nPreds]['winratio']
    else:
        sys.exit()

    results['episode'] = range(1,episodes+1)
    winRatioDict['episode'] = range(1,episodes+1)

    dataF_steps = DataFrame(results)
    dataF_steps.to_pickle('data/Q_steps'+str(samples)+str(episodes)+category+str(softmax))

    dataF_winratio = DataFrame(winRatioDict)
    dataF_winratio.to_pickle('data/Q_winratio'+str(samples)+str(episodes)+category+str(softmax))
else:
    dataF_steps = pd.read_pickle('data/Q_steps'+str(samples)+str(episodes)+category+str(softmax))
    dataF_winratio = pd.read_pickle('data/Q_winratio'+str(samples)+str(episodes)+category+str(softmax))

if graphtype == 'steps':
    dataToPlot = dataF_steps
    ylabel = 'Steps'
elif graphtype == 'winratio':
    dataToPlot = dataF_winratio
    ylabel = 'Win Ratio'

if smoothing:
Example #50
0
initValue = 15
theta=0.00001
softmax = False
skip = False

game = pg.PredatorGame((0,0), (5,5), (11,11))
if not skip:


	results = dict()
	for initValue in [0, 1, 10, 15]:
		results[initValue] = getResults(samples, episodes, discount, epsilon, alpha, initValue, softmax)
		print initValue
	results['episode'] = range(0,episodes)
	dataF = DataFrame(results)
	dataF.to_pickle('data/initValues'+str(episodes))
else:
	dataF = pd.read_pickle('data/initValues'+str(episodes))

episodeData = pd.melt(dataF, id_vars=['episode'], var_name='initValue')

# plt.ioff()
#x = qplot(range(0,4), [0.68834, 0.76024, 0.82407, 0.82113], geom = ["point", "line"])
#print x
# print qplot([0,1], [0.68834, 0.76024])
p = ggplot(episodeData, aes('episode', 'value', color='initValue')) +\
    geom_line() +\
    theme_bw() + theme() + ylab("Steps") + xlab("Episodes") + ylim(0,60)
print p
ggsave(p, "plots/initValues"+str(episodes)+".png")
ggsave(p, "plots/initValues"+str(episodes)+".pdf")
Example #51
0
class WingDetector(object):

    def __init__(self, zoomFMF_filepath, bag_filepath, dTarget, arena_centre, RETRACK, tempdir=None ):
        
        
        self.fmf_file = zoomFMF_filepath
        self.fmf = FMF.FlyMovie(self.fmf_file)
        
        self.bag_fn = bag_filepath
        self.bagdf = self.get_data_from_bag(self.bag_fn)
        self.bagdf = self.compute_body_axes(self.bagdf)
        self.positions = self.get_positions_from_bag(self.bag_fn)
        self.positions.loc[self.positions['Px'] == 1000000, 'Px'] = np.nan
        self.positions.loc[self.positions['Py'] == 1000000, 'Py'] = np.nan
        
        self.dTarget = dTarget
        self.arena_centre = arena_centre
        if tempdir is not None:
            self.saveImage = True
            if not tempdir.endswith('/'):
                tempdir = tempdir + '/'
            self._tempdir = tempdir
        else:
            self.saveImage = False
        
        self.DEBUGGING_DIR = self.fmf_file.rsplit('/',1)[0] + '/tracking_cache'
        
        if not os.path.exists(self.DEBUGGING_DIR):
            os.makedirs(self.DEBUGGING_DIR)
        
        self.DEBUGGING_DIR = self.DEBUGGING_DIR + '/'
        self.error_count = 0
        self.ERROR_REPORTING = False
        self.retrack = RETRACK
        self.font = cv2.FONT_HERSHEY_SIMPLEX
        
        self.previous_head_extended = None
        self.flipped = 0
        self.adjust_tracking_parameters = ((0,0,0),(0,0,0),(0,0,0))
        self.total_errors = 0
       
        self.wingData = DataFrame({'BodyAxis':[],  'leftAngle':[], 'leftWingLength':[], 'Length':[],  'rightAngle':[],'rightWingLength':[],'target_angle_TTM':[], 'target_distance_TTM':[], 'Timestamp':[],'Width':[]}, dtype=np.float64)            

        self.tracking_info = DataFrame({'a_wingAngle_left':[],'a_wingArea_left':[],'b_wingAngle_right':[], 'b_wingArea_right':[], 'c_head_location_x':[],'c_head_location_y':[], 'd_bodyAxis':[], 'e_centroid_x':[], 'e_centroid_y':[], 'f_dTarget_TTM':[], 'g_approachAngle_TTM':[]}, dtype=np.float64)
        
        self.wingMetrics = DataFrame({'leftArea':[],'leftLength':[],'leftTheta':[],
                                     'rightArea':[],'rightLength':[],'rightTheta':[]}, dtype=np.float64)
                                     
                                     
    def execute(self):
    
        total_frames = self.fmf.get_n_frames()
        
        if not self.ERROR_REPORTING:
            progress = self.get_progress_bar("TRACKED", total_frames)
        
        if (os.path.exists(self.DEBUGGING_DIR + 'wingdata_cache.pickle')) and not (self.retrack):
            self.wingData = pd.read_pickle(self.DEBUGGING_DIR + 'wingdata_cache.pickle')
            self.wingData.columns= ['BodyAxis','leftAngle','leftWingLength','Length','rightAngle','rightWingLength','target_angle_TTM',
                                     'target_distance_TTM','Timestamp','Width']
            startframe = self.wingData.index[-1]
            print self.fmf_file.split('/')[-1], ': beginning from cache at: ', startframe
        else:
            startframe = 0
            
        for frame_number in range(startframe,total_frames,1):
            if self.ERROR_REPORTING:
                progress = self.get_progress_bar("ERROR_RATE", 2*frame_number+1) 
                progress.update(self.total_errors+1)   
            else:
                progress.update(frame_number) 
            self.ERROR_DETECTED= False
            self.error_count = 0
            self.adjust_tracking_parameters = ((0,0,0),(0,0,0),(0,0,0))
            try:
                self.detectWings(self.saveImage, False, frame_number)  #MAKE FIRST OPTION TRUE TO SAVE TRACKING MOVIES.
            except:
                continue
        print self.fmf_file.split('/')[-1], 100.0*self.total_errors/total_frames, '% error rate'
        return



    def make_movie(self,imagepath,filename,mp4fps):

        #write x264 mp4
        tmpmov = "%s/movie.y4m" % imagepath

        sh.mplayer("mf://%s/*.png" % imagepath,
                   "-mf", "fps=%d" % mp4fps,
                   "-vo", "yuv4mpeg:file=%s" % tmpmov,
                   "-ao", "null",
                   "-nosound", "-noframedrop", "-benchmark", "-nolirc"
        )

        sh.x264("--output=%s" % filename,
                "%s" % tmpmov,
        )


        try:
            os.unlink(tmpmov)
            shutil.rmtree(self._tempdir)
        except OSError:
            pass

        
    def get_progress_bar(self, name, maxval):
        widgets = ["%s: " % name, progressbar.Percentage(),
                   progressbar.Bar(), progressbar.ETA()]
        pbar = progressbar.ProgressBar(widgets=widgets,maxval=maxval).start()
        return pbar


    def get_wingAngle(self, frame_number):
        t, L, R = self.detectWings(frame_number)
        return t, L, R
    
    def devignette(self, frame):
        
        if int(self.fmf_file.rsplit('_')[-2]) >= 151006:
            V_coeff =[ 0.608421,0.000660594,0.00071838,
                       -6.83654e-07,2.29008e-07,-6.11814e-07,
                       -8.79999e-11,-1.63231e-10,-2.10072e-11,-2.10298e-10]
        else:  
            
            V_coeff = [  5.198890393267561e-01,
                         1.217460251226269e-03,
                         1.189236244172212e-03,
                        -1.476571361684494e-06,
                        -6.157281314884152e-07,
                        -1.611555274365404e-06,
                         2.521929214022170e-10,
                         4.392272775279915e-10,
                         2.268726532499034e-10,
                         4.244172315090120e-10]



        mask = np.ones(frame.shape)  # constant-term grid matching the frame dimensions

        xx, yy = np.meshgrid(np.arange(0,len(frame[0]),1), np.arange(0,len(frame),1))

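        # Evaluate the fitted third-order 2D polynomial vignetting surface over the pixel grid.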
        V_fit = mask*V_coeff[0] + xx*V_coeff[1] + yy*V_coeff[2] + xx**2*V_coeff[3] + xx*yy*V_coeff[4] + yy**2*V_coeff[5] + xx**3*V_coeff[6] + xx**2*yy*V_coeff[7] + xx*yy**2*V_coeff[8] + yy**3*V_coeff[9]                                         

        devign = (frame / V_fit).astype(np.uint8)
        
        return devign



    def get_data_from_bag(self, bagfile):
        bag = rosbag.Bag(bagfile)
        head_x = []
        head_y = []
        body_x = []
        body_y = []
        times = []
        for topic, msg, t in bag.read_messages('/flymad/laser_head_delta'):
            head_x.append(msg.head_x)
            head_y.append(msg.head_y)
            body_x.append(msg.body_x)
            body_y.append(msg.body_y)
            times.append((t.secs + t.nsecs*1e-9))
            
        newdf = pd.DataFrame({'Timestamp':times, 
                              'Hx':np.around(head_x), 
                              'Hy':np.around(head_y),
                              'Bx':np.around(body_x), 
                              'By':np.around(body_y)})
                              
        newdf = newdf[newdf.Hx < 1000000]    #failed detection msgs are filled with value 1e6.
        newdf = utilities.convert_timestamps(newdf)
        return newdf

    def get_positions_from_bag(self, bagfile):
        bag = rosbag.Bag(bagfile)
        px = []
        py = []
        times = []
        for topic, msg, t in bag.read_messages('/flymad/raw_2d_positions'):
            try:
                px.append(msg.points[0].x)
                py.append(msg.points[0].y)
            except:
                px.append(1000000)
                py.append(1000000)
            times.append((t.secs + t.nsecs*1e-9)) 
        newdf = pd.DataFrame({'Timestamp':times, 
                          'Px':np.around(px), 
                          'Py':np.around(py)})   
        newdf = utilities.convert_timestamps(newdf)
        return newdf

    def compute_body_axes(self, newdf):
        # calculate 'norm' the distance between body and head points:
        newdf['norm'] = np.sqrt((newdf.Hx-newdf.Bx)**2 + (newdf.Hy-newdf.By)**2)

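        # slope of the body-to-head axis, the perpendicular slope, and the intercepts of both lines through the head point: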
        newdf['slope'] = (newdf.Hy-newdf.By) / (newdf.Hx-newdf.Bx)
        newdf['perp'] = -1*(newdf.Hx-newdf.Bx) / (newdf.Hy-newdf.By)
        newdf['yint'] = newdf.Hy - (newdf.slope * newdf.Hx)
        newdf['perpInt'] = newdf.Hy - (newdf.perp * newdf.Hx)  
        return newdf




    def detectWings(self, saveImage, debugging=False, framenumber=0):#, bodyThresh, wingThresh):

        frame, timestamp = self.fmf.get_frame(framenumber)
        
        timestamp_FMT = pd.to_datetime(timestamp, unit='s', utc=True).tz_convert('US/Eastern')
        timestring = "%.2f" % (pd.to_datetime(timestamp) - pd.to_datetime(0)).total_seconds()
        
        # COMPUTER VISION:
        frame = self.devignette(frame)
        im = cv2.cvtColor(frame, cv2.COLOR_GRAY2BGR) #must be uint8 array
        imgray = cv2.cvtColor(im,cv2.COLOR_BGR2GRAY)
        
        kernel = np.ones((5,5),np.uint8)


        Px = self.positions.Px.asof(timestamp_FMT)    #SILLY HACK FOR 'MISMATCHING' INDICES. STUPID PANDAS.
        Py = self.positions.Py.asof(timestamp_FMT)
        
        if np.isnan(Px) or np.isnan(Py):
            if self.saveImage == True:
                imcopy = im.copy()
                cv2.imwrite(self._tempdir+'_tmp%05d.png'%(framenumber), imcopy) 
            self.wingData.loc[framenumber] = [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, timestamp, np.nan]
            return np.nan, np.nan, np.nan,  np.nan,   np.nan, np.nan, timestamp, np.nan
        
        distance = self.get_distance_between_coords((Px,Py), self.arena_centre)
        targ_dist = self.dTarget.asof(timestamp_FMT)        


        #FLY FEATURES DERIVED FROM BAG FILE:
        try:
            centroid, head = self.get_centroid_and_head(timestamp_FMT)
            backPoint = tuple(sum(y) / len(y) for y in zip(centroid, head))
            headLine = self.compute_perpendicular_from_points(head, centroid)
            axisLine = self.compute_axis_from_points(head, centroid)
            bodyThresh, wingThresh, ellThresh = self.get_tracking_thresholds(timestamp_FMT, distance, targ_dist)
            BagData = True
        except:
            centroid, head = (0,0),(0,0)
            bodyThresh, wingThresh, ellThresh = self.get_tracking_thresholds(timestamp_FMT, distance, targ_dist) 
            BagData = False          


        #FIT ELLIPSE TO BODY:
        ret2, body = cv2.threshold(imgray, ellThresh[0], 255, cv2.THRESH_BINARY)
        #ellipseFitter = cv2.dilate(body, kernel, iterations=ellThresh[1])
        ellipseFitter = cv2.erode(body, kernel, iterations=ellThresh[2])
        contourImage = ellipseFitter.copy()
        bodyCont, hierarchy1 = cv2.findContours(contourImage, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
        

        bodyEllipse=None
        bodyContour=None
        
        if BagData:
            for cnt in bodyCont:
                if cv2.contourArea(cnt) <=900000:
                    if cv2.contourArea(cnt) >= 7000:
                        ellipse= cv2.fitEllipse(cnt)
                        if self.pointInEllipse(centroid[0],centroid[1],ellipse[0][0],ellipse[0][1],ellipse[1][0],ellipse[1][1],ellipse[2]):
                            bodyEllipse = ellipse
                            bodyContour=cnt
                            slope = self.convert_ellipseAngle_to_slope(bodyEllipse[2])
                            yint = -1.0*slope*bodyEllipse[0][0] + bodyEllipse[0][1]
                            xint = (-1.0*yint / slope)
                            axisLine = slope, yint, xint
                            head = self.pointOfIntersection(headLine[0],headLine[1], axisLine[0], axisLine[1])
        if bodyEllipse is None:
            for cnt in bodyCont:
                if cv2.contourArea(cnt) <=900000:
                    if cv2.contourArea(cnt) >= 7000:
                        ellipse= cv2.fitEllipse(cnt)
                        bodyEllipse = ellipse
                        bodyContour=cnt
        
        if bodyEllipse is None:
            #print "ERROR: cannot detect body ellipse in frame: ", framenumber
            
            imcopy = im.copy()
            cv2.putText(imcopy, "ERROR", (480,530), self.font, 1, (255,255,255), 3)
            try:
                self.wingData.loc[framenumber] = self.wingData.loc[framenumber-1]#[np.nan, np.nan, np.nan,  np.nan, np.nan. np.nan]
            except:
                self.wingData.loc[framenumber] = [np.nan, np.nan, np.nan,  np.nan,   np.nan, np.nan, np.nan, np.nan, timestamp, np.nan]

            if self.saveImage == True:
                cv2.imwrite(self._tempdir+'_tmp%05d.png'%(framenumber), imcopy)  
            return timestamp, np.nan, np.nan, np.nan, np.nan, np.nan             
        
        
        
        (f1, f2) = self.fociOfEllipse(bodyEllipse[0][0],bodyEllipse[0][1],bodyEllipse[1][0],bodyEllipse[1][1],bodyEllipse[2])

        head = self.get_nearest(head, [f1,f2])
        tail = self.get_furthest(head, [f1,f2])
        centroid = (bodyEllipse[0][0],bodyEllipse[0][1])
        backPoint = tuple(sum(y) / len(y) for y in zip(centroid, head))
        backPoint = tuple(sum(y) / len(y) for y in zip(centroid, backPoint))
        slope = self.convert_ellipseAngle_to_slope(bodyEllipse[2])
        yint = -1.0*slope*bodyEllipse[0][0] + bodyEllipse[0][1]
        xint = (-1.0*yint / slope)
        axisLine = slope, yint, xint

        centroid = bodyEllipse[0]
        headLine = self.compute_perpendicular_from_points(head, centroid)
        midline = self.compute_perpendicular_from_points(centroid, head)
        tailLine = self.compute_perpendicular_from_points(tail, centroid)
    
    

        
        #FLIP BODY AXIS BASED ON PREVIOUS FRAMES
        
        
        
        ########################################################################################
        
        try:
            if self.previous_head_extended is not None:
            
                #print framenumber, ': ', self.wingData.ix[framenumber-1].BodyAxis, bodyEllipse[2], np.cos(np.radians(self.wingData.ix[framenumber-1].BodyAxis - bodyEllipse[2])), '\t', self.flipped
            
                if not self.check_laterality(self.previous_head_extended, self.extend_vector(centroid,head), midline[0], midline[1], midline[2]):
                    #debugging = True
                    if self.flipped == 100:
                        for x in range(-101,1):
                            self.previous_head_extended = None
                            self.detectWings(True, True, framenumber+x)
                        self.flipped = 0
                        return
                    head, tail = tail, head
                    headLine, tailLine = tailLine, headLine
                    backPoint = tuple(sum(y) / len(y) for y in zip(centroid, head))
                    backPoint = tuple(sum(y) / len(y) for y in zip(centroid, backPoint))

                    self.flipped += 1
                else: self.flipped = 0
        except:
            pass
            #print framenumber, ": Unable to assess body orientation."
        self.previous_head_extended = self.extend_vector(centroid,head)
        
        
        body_length = self.get_distance_between_coords(head,tail)
        abd_length =  self.get_distance_between_coords(backPoint, tail)        
        body_angle = self.angle_from_vertical(tail, head)


        if body_length >= 425:
            imcopy = im.copy()
            cv2.putText(imcopy, "ERROR", (480,530), self.font, 1, (255,255,255), 3)
            if self.saveImage == True:
                cv2.imwrite(self._tempdir+'_tmp%05d.png'%(framenumber), imcopy)
            self.wingData.loc[framenumber] = [np.nan, np.nan, np.nan,  np.nan,   np.nan, np.nan, np.nan, np.nan, timestamp, np.nan]
            return np.nan, np.nan, np.nan,  np.nan,   np.nan, np.nan, timestamp, np.nan

        WIDTH = bodyEllipse[1][0]

        ########################### TARGET TOUCH #############################################
        
        if bodyContour is not None and targ_dist <= 5.0:
            imcopy = imgray.copy()
            bodymask = np.zeros(imcopy.shape)  # zeros, with the dimensions of the image
            cv2.fillPoly(bodymask, [bodyContour],(255-imcopy.max()))
            bodymask = cv2.dilate(bodymask, kernel, iterations=4)
            imcopy = (imcopy + bodymask).astype(np.uint8)
            ret, trunk = cv2.threshold(imcopy, 80, 90, cv2.THRESH_TRUNC)
            targetContour, target_distance_TTM, approach_angle_TTM = self.get_targets(trunk, head, centroid, body_angle)
        else:
            targetContour, target_distance_TTM, approach_angle_TTM = [], np.nan, np.nan




        ############################# DEFINE WINGS #########################################################

        wingTips, wholeWings, wingArea = [],[],[]

        wingTips, wholeWings, wingArea = self.get_candidate_wings(imgray, kernel, headLine, centroid, backPoint, body_length, abd_length, axisLine, wingTips, wholeWings, wingArea, timestamp_FMT, distance, targ_dist)

        polynomial = np.poly1d([  -15392.02683546,  29209.68050119,   3237.47165583])
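        # Empirical quadratic relating wing area to wing angle; used below to reject contours whose area is implausible for their angle.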

        wingSets = pd.DataFrame({'Tips':wingTips, 'Shape':wholeWings, 'Area':wingArea, 'Theta':np.zeros(len(wingTips))})
        wingSets['Theta'] = np.nan
        wingSets['Side'] = np.nan
        wingSets['Length'] = np.nan
        
        #wingSets.to_pickle('/groups/dickson/home/bathd/Desktop/wingsets.pickle')
        
        for x in np.arange(len(wingSets)):
            wingSets.loc[x,'Theta'] = self.compute_angle_given_three_points(backPoint, wingSets.loc[x,'Tips'], centroid)
            wingSets.loc[x,'Length'] = self.get_distance_between_coords(backPoint, wingSets.loc[x,'Tips'])
        wingSets.loc[wingSets['Theta'] >=np.pi,'Theta'] -= 2.0*np.pi
        wingSets.loc[wingSets['Theta'] <=-1.0*np.pi,'Theta'] += 2.0*np.pi
        wingSets.loc[wingSets['Theta'] < 0.0, 'Side'] = 'Right'
        wingSets.loc[wingSets['Theta'] >= 0.0, 'Side'] = 'Left'
        wingSets.loc[wingSets['Side'] == 'Right','Theta'] *= -1.0
        #wingSets['polydif'] = (wingSets['Theta'] - polynomial(wingSets['Area'])) / (wingSets['Area']/20000.0)
        wingSets = wingSets[wingSets['Area'] > ((polynomial(wingSets['Theta'])/1.5) -2000.0*wingSets['Theta'] -5000.0)]
        wingSets = wingSets[wingSets['Area'] < ((polynomial(wingSets['Theta'])*1.3) +2000.0*wingSets['Theta'] + 3000.0)]
        wingSets = wingSets[(wingSets['Length'] >=250) & (wingSets['Length'] < 375)]
        wingSets = wingSets[(wingSets['Theta'] <= ((np.pi)/1.75) )]
        #wingSets = wingSets[(wingSets['polydif'] >= -0.8) & (wingSets['polydif'] <= 1.0)]
        wingSets = wingSets[(wingSets['Area'] >= 1000) & (wingSets['Area'] <= 30000)]

        
        try:
            #leftWing = wingSets.ix[wingSets[wingSets['Side']=='Left']['polydif'].abs().idxmin()]
            leftWing = wingSets.ix[wingSets[wingSets['Side']=='Left']['Area'].abs().idxmax()]
        except:
            leftWing = wingSets[0:0]
            leftWing.ix[0] = np.nan
            leftWing.set_value(0,'Tips',tuple(tail))
            leftWing.set_value(0,'Shape',[[0,0]])
            leftWing = leftWing.ix[0]
            self.total_errors += 1
        try:
            #rightWing = wingSets.ix[wingSets[wingSets['Side']=='Right']['polydif'].abs().idxmin()]
            rightWing = wingSets.ix[wingSets[wingSets['Side']=='Right']['Area'].abs().idxmax()]
        except:
            rightWing = wingSets[0:0]
            rightWing.ix[0] = np.nan
            rightWing.set_value(0,'Tips',tuple(tail))
            rightWing.set_value(0,'Shape',[[0,0]])
            rightWing = rightWing.ix[0]
            self.total_errors +=1


        if saveImage == True:
            imcopy = im.copy()
            try:
                cv2.drawContours(imcopy,[leftWing.Shape],0,(255,0,0),1)
            except: 
                pass
            try:
                cv2.drawContours(imcopy,[rightWing.Shape],0,(0,255,255),1)
            except: 
                pass
            try:
                cv2.drawContours(imcopy,[targetContour],0,(255,128,128),6)
            except: 
                pass
            cv2.line(imcopy, (int(head[0]),int(head[1])), (int(tail[0]),int(tail[1])), (255,255,255), 1)
            cv2.line(imcopy, (int(backPoint[0]),int(backPoint[1])), (int(leftWing.Tips[0]),int(leftWing.Tips[1])), (20,20,255),2)
            cv2.line(imcopy, (int(backPoint[0]),int(backPoint[1])), (int(rightWing.Tips[0]),int(rightWing.Tips[1])), (20,255,20),2)
            cv2.circle(imcopy, (int(head[0]),int(head[1])), 3, (255,255,255), -1)
            cv2.circle(imcopy, (int(backPoint[0]),int(backPoint[1])), 5, (255,255,255), -1)
            #cv2.circle(imcopy, (int(centroid[0]),int(centroid[1])), 3, (255,0,255), -1)
            cv2.putText(imcopy, str(np.around(np.degrees(leftWing.Theta), 2))+ 'deg', (10,25), self.font, 1, (20,20,255), 3)
            cv2.putText(imcopy, str(np.around(rightWing.Area, 2)), (450, 65), self.font, 1, (20,255,20), 3)
            cv2.putText(imcopy, str(np.around(leftWing.Area, 2)), (10,65), self.font, 1, (20,20,255), 3)
            cv2.putText(imcopy, str(np.around(np.degrees(rightWing.Theta), 2))+ 'deg', (450, 25), self.font, 1, (20,255,20), 3)
            cv2.putText(imcopy, str(framenumber), (850, 25), self.font, 1, (255,255,255), 3)
            cv2.putText(imcopy, str(np.around(target_distance_TTM, 2)) + 'mm', (10,950), self.font, 1, (100,255,255), 3)
            cv2.putText(imcopy, str(np.around(approach_angle_TTM, 2)) + 'deg', (450, 950), self.font, 1, (100,255,255), 3)
            #cv2.putText(imcopy, timestring, (850, 950), self.font, 1, (255,255,255), 3)
            cv2.imwrite(self._tempdir+'_tmp%05d.png'%(framenumber), imcopy) 
        cv2.destroyAllWindows()

        
        #print framenumber,  "\tL: ", ("%.2f" % np.degrees(leftWingAngle)), ("%.2f" % leftWingLength), '\tR: ', ("%.2f" % (-1.0*np.degrees(rightWingAngle))), ("%.2f" % rightWingLength), '\t',("%.2f" % distance), '\t', str(self.dTarget.asof(timestamp_FMT)), '\t', self.flipped
        
        self.wingData.loc[framenumber] = [body_angle, leftWing.Theta, leftWing.Length,  body_length,
                                          rightWing.Theta, rightWing.Length, 
                                          approach_angle_TTM, target_distance_TTM, timestamp, WIDTH]
        self.tracking_info.loc[framenumber] = [leftWing.Theta, leftWing.Area, rightWing.Theta, 
                                               rightWing.Area, head[0], head[1], body_angle, centroid[0], centroid[1], target_distance_TTM, approach_angle_TTM]

        self.wingMetrics.loc[framenumber] = [leftWing.Area, leftWing.Length, leftWing.Theta, 
                                             rightWing.Area, rightWing.Length, rightWing.Theta]
        if framenumber % 100 == 0:
            self.wingData.to_pickle(self.DEBUGGING_DIR + 'wingdata_cache.pickle')
            self.wingMetrics.to_pickle(self.DEBUGGING_DIR + 'wingMetrics_cache.pickle')
        return body_angle, leftWing.Length, leftWing.Theta, body_length, rightWing.Length,  rightWing.Theta, timestamp, WIDTH

    def get_targets(self, fly_erased_img, headpoint, centroidpoint, _bodyAxis):

        kernel = np.ones((5,5),np.uint8)
        _, mask = cv2.threshold(fly_erased_img, 60, 255, cv2.THRESH_BINARY)
        mask = cv2.erode(mask, kernel, iterations=1)
        contourImage = mask.copy()
        contourImage = np.pad(contourImage,((2,2),(2,2)), mode='maximum')
        contours, hierarchy1 = cv2.findContours(contourImage, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
        hierarchy = hierarchy1[0]
        
        for x in hierarchy:
            if x[3] <0:
                parent = x
        
        #headpoint = (int(track.loc[framenumber, 'c_head_location_x']), int(track.loc[framenumber, 'c_head_location_y']))
        candidateTargets = []
        
        for component in zip(contours, hierarchy):
            c = component[0]
            h = component[1]
            centroidCheck = cv2.pointPolygonTest(c,centroidpoint,True)
            if centroidCheck <=0:
                if np.array_equal(hierarchy[h[3]], parent) :  #is in outer hierarchy (parent is edge.)  
                    if h[2] > 0:   # has child (targets have inner and outer edge)
                        if (cv2.contourArea(c) <= 150000) & (cv2.contourArea(c) >= 20000):
                            ellipse = cv2.fitEllipse(c)
                            if not self.pointInEllipse(centroidpoint[0],centroidpoint[1],ellipse[0][0],ellipse[0][1],ellipse[1][0],ellipse[1][1],ellipse[2]):
                                candidateTargets.append(c)
            
        # After scanning all contours, keep the largest candidate target.
        areas = []
        if len(candidateTargets) > 0:
            for T in range(len(candidateTargets)):
                areas.append(cv2.contourArea(candidateTargets[T]))

            TARGET = cv2.convexHull(candidateTargets[areas.index(max(areas))])
            M = cv2.moments(TARGET)
            targCentre = (int(M['m10']/M['m00']), int(M['m01']/M['m00']))

            distance = -1.0*cv2.pointPolygonTest(TARGET, headpoint, True) / 135.5  # based on 135.5 pixels per mm
            angle = self.angle_from_vertical(headpoint, targCentre)
            approachAngle = angle - _bodyAxis  # track.loc[framenumber, 'd_bodyAxis']
            if approachAngle < 0:
                approachAngle *= -1.0
            if approachAngle >= 180.0:
                approachAngle -= 180.0
        else:
            distance = np.nan
            approachAngle = np.nan
            TARGET = None
        return TARGET, distance, approachAngle
    
    def get_candidate_wings(self, imgray, kernel, headLine, centroid, backPoint, body_length, abd_length, axisLine, wingTips, wholeWings, wingArea,timestamp_FMT, distance, targ_dist):
        
        """
        self.adjust_tracking_parameters = ((0,0,0),(0,0,0),(0,0,0))
        bodyThresh, wingThresh, ellThresh = self.get_tracking_thresholds(timestamp_FMT, distance, targ_dist)
        
        paramchanges =  [((0,0,0),(0,0,0),(0,0,0)),
                         ((10,0,0),(10,0,0),(0,0,0)),
                         ((-10,0,0),(-10,0,0),(0,0,0)),
                         ((10,1,1),(10,0,-1),(0,0,0)),
                         ((-10,1,1),(-10,0,-1),(0,0,0))]
        
        for p in paramchanges:
                         
            self.adjust_tracking_parameters = p

            bodyThresh, wingThresh, ellThresh = self.get_tracking_thresholds(timestamp_FMT, distance, targ_dist)
        """
        
        edge = self.get_edge(imgray)
        if edge > 115:
            wingThresh = int(0.75*edge + 10.0)
            bodyThresh = int(0.45*edge + 2.5) 
        else:
            wingThresh = int(0.75*edge + 13.0)
            bodyThresh = int(0.55*edge + 2.5)
        if distance >= 170:
            adjustments = [-10,-5,0]
        else:
            adjustments = [-5,0,5]
        for a in adjustments:   
            #DEFINE bodyNotWings AS BODY PORTION PLUS LEGS ETC, USEFUL FOR FINDING WINGS.
            ret1, bodyNotWings = cv2.threshold(imgray, bodyThresh,255,cv2.THRESH_BINARY)
            bodyNotWings = cv2.dilate(bodyNotWings, kernel, iterations=1)
            bodyNotWings = cv2.erode(bodyNotWings, kernel, iterations=1)

            
            #DEFINE wings AS WINGS AND TARGETS BUT NOT BODY.
            ret2, wings = cv2.threshold(imgray, wingThresh+a,1,cv2.THRESH_BINARY_INV)
            test = wings*bodyNotWings
            dilated = cv2.erode(test, kernel, iterations=2)
            #eroded = cv2.dilate(dilated, kernel, iterations=wingThresh[1])
            #dilatedCopy = eroded.copy()
            
            wingCont, hierarchy = cv2.findContours(dilated, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_NONE)
            
            
            
            for c in wingCont:
                area = cv2.contourArea(c)
                #WINGS MUST BE APPROPRIATE SIZE
                if (area >= 3000):
                    M = cv2.moments(c)
                    cx, cy = int(M['m10']/M['m00']), int(M['m01']/M['m00'])
                    #WINGS MUST BE BEHIND HEAD
                    if self.check_laterality(centroid, (cx,cy), headLine[0], headLine[1], headLine[2]):
                        checkSpot = (c[0][0][0], c[0][0][1])
                        pointSet1 = []
                        pointSet2 = []
                        pointSetTARGET = []
                        for x in c:
                            if self.check_laterality((x[0][0], x[0][1]), centroid, headLine[0], headLine[1], headLine[2]):
                                if self.check_laterality((x[0][0], x[0][1]), checkSpot, axisLine[0], axisLine[1], axisLine[2]):
                                    pointSet1.append(x.tolist())
                                else:
                                    pointSet2.append(x.tolist())
                            else:
                                if targ_dist <=20.0:
                                    pointSetTARGET.append(x.tolist())
                        pointSet1 = np.array(pointSet1).reshape((-1,1,2)).astype(np.int32)
                        pointSet2 = np.array(pointSet2).reshape((-1,1,2)).astype(np.int32)
                        pointSetTARGET = np.array(pointSetTARGET).reshape((-1,1,2)).astype(np.int32)
                        if (len(pointSet1) > 0):
                            if cv2.contourArea(pointSet1) >=833:#(2500/(wingThresh[2]+1)):
                                near, far = self.get_nearest_and_furthest_from_centroid(pointSet1, centroid)
                                if self.get_distance_between_coords(near, centroid) <= 150:
                                    winglength = self.get_distance_between_coords(far, backPoint)
                                    if (winglength <= 2.0*(body_length)) and (winglength >= abd_length):
                                        wingTips.append(far)
                                        wholeWings.append(pointSet1)#(cv2.convexHull(pointSet1))
                                        wingArea.append(cv2.contourArea(pointSet1))
                        if (len(pointSet2) > 0):
                            if cv2.contourArea(pointSet2) >=833:#(2500/(wingThresh[2]+1)):
                                near, far = self.get_nearest_and_furthest_from_centroid(pointSet2, centroid)
                                if self.get_distance_between_coords(near, centroid) <= 150:
                                    winglength = self.get_distance_between_coords(far, backPoint)
                                    if (winglength <= 2.0*(body_length)) and (winglength >= abd_length):
                                        wingTips.append(far)
                                        wholeWings.append(pointSet2)#(cv2.convexHull(pointSet2))
                                        wingArea.append(cv2.contourArea(pointSet2))
        return wingTips, wholeWings, wingArea
    





    def closestpair(self, L):
	    def square(x): return x*x
	    def sqdist(p,q): return square(p[0]-q[0])+square(p[1]-q[1])
	
	    # Work around ridiculous Python inability to change variables in outer scopes
	    # by storing a list "best", where best[0] = smallest sqdist found so far and
	    # best[1] = pair of points giving that value of sqdist.  Then best itself is never
	    # changed, but its elements best[0] and best[1] can be.
	    #
	    # We use the pair L[0],L[1] as our initial guess at a small distance.
	    best = [sqdist(L[0],L[1]), (L[0],L[1])]
	
	    # check whether pair (p,q) forms a closer pair than one seen already
	    def testpair(p,q):
		    d = sqdist(p,q)
		    if d < best[0]:
			    best[0] = d
			    best[1] = p,q
			
	    # merge two sorted lists by y-coordinate
	    def merge(A,B):
		    i = 0
		    j = 0
		    while i < len(A) or j < len(B):
			    if j >= len(B) or (i < len(A) and A[i][1] <= B[j][1]):
				    yield A[i]
				    i += 1
			    else:
				    yield B[j]
				    j += 1

	    # Find closest pair recursively; returns all points sorted by y coordinate
	    def recur(L):
		    if len(L) < 2:
			    return L
		    split = len(L)/2
		    splitx = L[split][0]
		    L = list(merge(recur(L[:split]), recur(L[split:])))

		    # Find possible closest pair across split line
		    #
		    E = [p for p in L if abs(p[0]-splitx) < best[0]]
		    for i in range(len(E)):
			    for j in range(1,8):
				    if i+j < len(E):
					    testpair(E[i],E[i+j])
		    return L
	
	    L.sort()
	    recur(L)
	    return best[1]
                
    def get_edge(self, frame):
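        # Estimate background brightness from 5-pixel strips along the four image borders, averaging the three brightest strips.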
        top = frame[0:5].mean()
        bottom = frame[-5:-1].mean()
        left = frame[:,0:5].mean()
        right = frame[:,-5:-1].mean()
        values = sorted([top, bottom, left, right])[1:]
        mean = sum(values) / 3.0
        return mean                           

    def get_distance_between_coords(self, A, B):
        return np.sqrt((A[0]-B[0])**2 + (A[1]-B[1])**2)

    def get_nearest(self, POINT, list_of_points):
        nearest = 1000000000
        for x in list_of_points:
            d = self.get_distance_between_coords(POINT, x)
            if d < nearest:
                nearest = d
                winner = x
        return winner

    def get_furthest(self, POINT, list_of_points):
        furthest = 0.0
        for x in list_of_points:
            d = self.get_distance_between_coords(POINT, x)
            if d > furthest:
                furthest = d
                winner = x
        return winner

    def get_distance_from_body_ellipse(self, bodyCentroid, headPoint, POINT):
        perp_to_centroid = self.compute_perpendicular_from_points(bodyCentroid, headPoint)
        perp_x = bodyCentroid[0] + 10.0
        perp_y = perp_to_centroid[0]*(bodyCentroid[0] + 10.0) + perp_to_centroid[1]
        perpPoint = (perp_x, perp_y)
        THETA = self.compute_angle_given_three_points(bodyCentroid, headPoint, perpPoint)
        
        POINT[0] = a*np.cos(THETA)*np.cos(t) - b*np.sin(THETA)*np.sin(t)
        POINT[1] = a*np.sin(THETA)*np.cos(t) + b*np.cos(THETA)*np.sin(t)
        pass   #INCOMPLETE  
        
    def get_centroid_and_head(self, _timestamp):
        centroid = (int(self.bagdf['Bx'].asof(_timestamp)),int(self.bagdf['By'].asof(_timestamp)))#[self.bagdf.Time >= _timestamp].iloc[0]
        head = (int(self.bagdf['Hx'].asof(_timestamp)),int(self.bagdf['Hy'].asof(_timestamp)))
        return centroid, head



    def get_tracking_thresholds(self, _timestamp, _distance, _dTarget):
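        # Each tuple appears to be (threshold, dilate iterations, erode iterations); ellThresh is used this way when fitting the body ellipse in detectWings.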

        if _dTarget <= 4:
            vals = (65,1,1), (95,1,2), (35,1,1)
        elif _distance <=120:
            vals =  (65,1,1), (95,1,2), (40,1,1)
        elif _distance <=150:
            vals =  (65,1,1), (95,1,2), (40,1,1)
        elif _distance <=185:
            vals = (60,1,1), (80,1,2), (30,1,1) #(60,1,1), (80,1,2), (30,1,1)
        else:
            vals = (40,1,2), (65,1,2), (35,1,1) #(50,1,1), (79,1,2), (35,1,1)   
            
        foo = self.add_nested_tuples(vals, self.adjust_tracking_parameters)
        
        return foo   

    def add_nested_tuples(self, set1, set2):
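        # Element-wise sum of two equally shaped nested tuples, e.g. ((1,2),(3,4)) and ((0,1),(1,0)) -> ((1,3),(4,4)).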
        return tuple(map(lambda x, y: tuple(map(lambda w,z: w+z, x,y)), set1, set2))

    def get_nearest_and_furthest_from_centroid(self, hullset, centroid):
        #PASS A SET OF POINTS DEFINING A SINGLE CONTOUR, IDEALLY OUTPUT FROM cv2.convexHull
        lowest_distance = 1000000
        lowest_coords = (0,0)
        highest_distance = 0
        highest_coords = (0,0)
        for a in hullset:
            b = (a[0][0], a[0][1])
            distance = self.get_distance_between_coords(centroid, b)
            if distance > highest_distance:
                highest_coords = b
                highest_distance = distance
            if distance < lowest_distance:
                lowest_coords = b
                lowest_distance = distance 
        return lowest_coords, highest_coords
        
    def compute_axis_from_points(self, POINT1, POINT2):
        if float(float(POINT1[0]) - float(POINT2[0]) ) == 0.0:
            XINT = POINT1[0]
            YINT = np.nan
            SLOPE = np.inf
        else:
            SLOPE = ( float(POINT1[1]) - float(POINT2[1])) / ( float(float(POINT1[0]) - float(POINT2[0]) ))
            YINT = POINT1[1] - (SLOPE*POINT1[0])
            if abs(SLOPE) >= 1000000:
                XINT = POINT1[0]
            elif SLOPE == 0.0:
                XINT = np.nan
            else:
                XINT = -1*YINT / SLOPE
        return SLOPE, YINT, XINT

    def convert_ellipseAngle_to_slope(self, _degs): #OPENCV makes silly angles, where up is 0deg, and right is 90deg.
        degs = float(1.0*_degs + 90.0)
        return float(math.tan(math.radians(degs)))

    def pointOfIntersection(self, SLOPE1, YINT1, SLOPE2, YINT2):
        if float(SLOPE1 - SLOPE2) == 0.0:
            return None
        else:
            px = float(YINT2 - YINT1) / float(SLOPE1 - SLOPE2) 
            py = SLOPE1*px + float(YINT1)
        return (px, py)

    def pointInEllipse(self, x,y,xp,yp,d,D,angle):
        #tests if a point[xp,yp] is within
        #boundaries defined by the ellipse
        #of center[x,y], diameters d D, and tilted at angle

        cosa=math.cos(angle)
        sina=math.sin(angle)
        dd=d/2*d/2
        DD=D/2*D/2

        a =math.pow(cosa*(xp-x)+sina*(yp-y),2)
        b =math.pow(sina*(xp-x)-cosa*(yp-y),2)
        ellipse=(a/dd)+(b/DD)

        if ellipse <= 1:
            return True
        else:
            return False

    def fociOfEllipse(self, x,y,d,D,angle):
        #returns coordinates of foci
        #defined by the ellipse
        #of center[x,y], diameters d D, and tilted at angle

        cosa=math.cos(math.radians(angle-90.0))
        sina=math.sin(math.radians(angle-90.0))
        dd=d/2*d/2
        DD=D/2*D/2

      
        c = np.sqrt(DD-dd)
        slope = self.convert_ellipseAngle_to_slope(angle)
        c_x = cosa*c
        c_y = sina*c
        F1 = ((x+c_x),(y+c_y))
        F2 = ((x-c_x),(y-c_y))
        return (F1, F2)
        
        
    def compute_perpendicular_from_points(self, POINT1, POINT2): #perpendicular line through POINT1
        if  float(float(POINT1[1]) - float(POINT2[1]) ) == 0.0:
            XINT = np.nan
            YINT = POINT1[1]
            SLOPE = 0.0
        else:    
            SLOPE = -1.0*( float(POINT1[0]) - float(POINT2[0])) / ( float(float(POINT1[1]) - float(POINT2[1]) ))
            YINT = float(POINT1[1]) - (float(POINT1[0])*SLOPE)
            if abs(SLOPE) >= 1000000:
                XINT = POINT1[0]
            elif SLOPE == 0.0:
                XINT = np.nan
            else:
                XINT = -1.0*YINT / SLOPE
        return SLOPE, YINT, XINT

    def compute_angle_given_three_points(self, VERTEX, POINT1, POINT2):
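        # Signed angle (radians) at VERTEX between the directions to POINT1 and POINT2, computed as a difference of arctan2 headings.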
        A = np.array(POINT1)
        B = np.array(VERTEX)
        C = np.array(POINT2)
        BA = A - B
        BC = C - B
        s = np.arctan2(*BA)
        e = np.arctan2(*BC)
        return e-s
        
    def check_laterality(self, POINT1, POINT2, SLOPE, YINT, XINT): #TRUE IF TWO POINTS ARE ON THE SAME SIDE OF THE LINE.
        if abs(SLOPE) == np.inf:
            SIGN = (POINT1[0]-XINT)*(POINT2[0]-XINT)  #JUST COMPARE X VALUES TO X-INTERCEPT     
        else:
            SIGN = (SLOPE*POINT1[0] + YINT - POINT1[1])*(SLOPE*POINT2[0] + YINT - POINT2[1])
        
        if SIGN > 0:
            match = 1
        elif SIGN <= 0:
            match = 0
        return match
    
        
    def extend_vector(self, BACKPOINT, FRONTPOINT):
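        # Push FRONTPOINT 1000 px further away from BACKPOINT along each axis (sign only), giving a far point along the head direction.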
        delta_x, delta_y = (FRONTPOINT[0]-BACKPOINT[0]), (FRONTPOINT[1] - BACKPOINT[1])
        new_x = FRONTPOINT[0] + delta_x/abs(delta_x)*1000
        new_y = FRONTPOINT[1] + delta_y/abs(delta_y)*1000
        return (new_x, new_y)

    def angle_from_vertical(self, point1, point2):
        """
        RETURNS A VALUE IN DEGREES BETWEEN 0 AND 360, WHERE 0 AND 360 ARE NORTH ORIENTATION.
        """
        x = point1[0] - point2[0]
        y = point1[1] - point2[1]
        return 180.0 + math.atan2(x,y)*180.0/np.pi
Example #52
0
from pathlib import Path
from itertools import chain, repeat
from pandas import DataFrame
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize

corpus = DataFrame(columns=['is_negative', 'tokens'])
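# is_negative is 1 for reviews from the neg folders and 0 for pos ones; train and test splits are pooled into one corpus indexed by the file's relative path.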

for is_negative, review_path in chain(
    zip(repeat(0), Path('aclImdb/train/pos').iterdir()),
    zip(repeat(1), Path('aclImdb/train/neg').iterdir()),
    zip(repeat(0), Path('aclImdb/test/pos').iterdir()),
    zip(repeat(1), Path('aclImdb/test/neg').iterdir()),
    ):
    with review_path.open(encoding='UTF-8') as review_file:
        file = str(review_path.relative_to('aclImdb'))
        tokens = word_tokenize(BeautifulSoup(review_file.read()).text)
        corpus.loc[file] = is_negative, tokens 
        print(len(corpus))

corpus.to_pickle('corpus.pkl')
        
Example #53
0
class LearnObject: 

    def __init__(self,FeatureObject,LabelsObject,LabelsObject2='notDefined'):
        self.FeaturesDF=FeatureObject.FeaturesDF
        self.LabelsObject=LabelsObject
        self.LabelsObject2=LabelsObject2
        self.Details={'LabelDetails':LabelsObject.LabelingDetails,'stratifiedKFold':FeatureObject.details,'FeatureMethod':FeatureObject.method,'PieceLength':FeatureObject.details['PieceLength']}
        self.BestFeatures={}
        self.N=LabelsObject.N
        self.model='notDefined'
        
    
    class BestFeaturesForLabel(): #class of the best features for certain Labeling method (PatientsVsContols, mentalStatus, PANSS, etc.)
        def __init__(self,FeatureTypeList,LabelingList,n_features):
            self.df=DF(np.zeros([len(FeatureTypeList),n_features]),index=MultiIndex.from_tuples(FeatureTypeList),columns=range(n_features))            
            
        def add(self,bestNfeatures): #adds a feature to best features list (length n_features)   
            BestFeaturesList=[j for j in bestNfeatures]
            FeatureTypeList=self.df.index
            for feature in FeatureTypeList:
                if feature in BestFeaturesList:
                    isFeature=1
                    FeatureLoc=BestFeaturesList.index(feature)
                    self.df.loc[feature, FeatureLoc] += 1
                 
    """def analyzeFeaturesWeight(BestFeaturesDF,weights,ByLevel=0): #after having n features, this analyzes the wheighted mean of the use in each feature type. 
        df=BestFeaturesDF 
        #N=df.sum().sum()
        dfSum=df.sum(level=ByLevel)
        self.Mean=dfSum.sum(axis=1)
            
        weights=self.weights#[1.0/(x+1) for x in df.columns]            
        wSum=dfSum.mul(weights)
        wN=wSum.sum().sum()
        self.WeightedMean=wSum.sum(axis=1)/wN
        return WeightedMean""" 

        #TODO -> add analysis according to facial part (according to excel..)
            #TODO - > add analysis according to learning weights (and not 0.1 : 0.9)
                 
    def run(self,Model='svc',kernel='linear',is_cross_validation=True, cross_validationMethod='LOO', DecompositionMethod='PCA',decompositionLevel='FeatureType',n_components=30, FeatureSelection='TopExplainedVarianceComponents', n_features=10, isPerm=0,isBetweenSubjects=True,isConcatTwoLabels=False,isSaveCsv=None, isSavePickle=None, isSaveFig=None,isSelectSubFeatures=False,SubFeatures='ExpressionLevel'):       
        # -- TODO :
        # --  # Greedy selection on features + Other feature selection types...
        # --  # Make sure features are Best only based on train data!!!
        # --  # Keep a list of n_train, n_test from each Label and scoring (accuracy, f1..) in each cross validation iteration
        # --  # Plot results summary (see CARS paper for desired results for Ein Gedi Poster 22-1-2015)
        # --  # remove irrelevant data using 'Tracking Success' and consider 'TimeStamps' for feature calculation
        # --  # add f feature analysis by facial part (see excel) 
        # --  # select best model (svm, otherwise ridge regression) 
        # --  # compare svc results with regression results (using LOO and different Params for regression  - params for unbalanced data, different kernels, etc.), model evaluation - http://scikit-learn.org/stable/modules/model_evaluation.html) 
        # --  # check how the model weights behave - feature selection analysis
        # --  # calc model error
        # --  # divide data to subparts for training and testing - try within/ between subject, and analyze distribution of features when data is divided
        # --  # LOO - also on bool labels (patients vs controls and mental status bool)
        # --  # add mental status rank scores (0-4)
        # --  # make sure p-val returns the right value in 'scores'
        # --  # run it over random data (permutation test) 
        # --  # continue here - check regression results - make sure regression works (not so good).. check what happens in svc for G7 (high train R, negative test R)

        ## init 
        if isSelectSubFeatures:
            print('Features : ' + SubFeatures)
            f=self.FeaturesDF.copy()
            featureNames=self.FeaturesDF.index.names
            try:
               f=f.loc[SubFeatures]
               f.index=MultiIndex.from_product([[SubFeatures],f.index], names=featureNames)
            except KeyError:
               f.index=f.index.swaplevel(0,1)
               f=f.loc[SubFeatures]
               f.index=MultiIndex.from_product([f.index,[SubFeatures]], names=featureNames)
            self.FeaturesDF=f.copy()
        else:
            SubFeatures='allFeatureTypes'

        FeatureTypeList=[j for j in tuple(self.FeaturesDF.index)]
        self.FullResults=DF()
           
        # set learning params (cross validation method, and model for learning)
        isBoolLabel=self.LabelsObject.isBoolLabel
        isBoolScores=isBoolLabel
        if DecompositionMethod==None and (FeatureSelection == 'TopExplainedVarianceComponents' or FeatureSelection == 'TopNComponents'):
            print("ERROR- feature selection method cannot be '"+ FeatureSelection +"' when X is not decomposed")
            FeatureSelection=raw_input("Choose a different feature selection method ('RFE','f_regression','dPrime','AllFeatures'): ")

        model, isBoolModel= learningUtils.setModel(Model)
        selectFeatures =learningUtils.setFeatureSelection(FeatureSelection,n_features)
        n_components=min(n_components,n_features) #cannot have more components than features. 
        decompositionTitle, decomposeFunction= learningUtils.setDecomposition(DecompositionMethod,n_components,decompositionLevel)
        isDecompose=  decompositionTitle!='noDecomposition'


        # save learning params
        self.Learningdetails={'Model':Model,'Kernel':kernel,'CrossVal':cross_validationMethod,'FeatureSelection':FeatureSelection,'Decomposition':decompositionTitle,'LabelBy':self.Details['LabelDetails'].keys()[0],'FeatureMethod':self.Details['FeatureMethod'],'PieceLength':self.Details['PieceLength']}
        print('\n------------Learning Details------------')
        print(DF.from_dict(self.Learningdetails,orient='index'))
        print('\n----' + cross_validationMethod + ' Cross validation Results:----')
        
        #define global variables over modules (to be used in myUtils)

        globalVars.transformMargins=0#lambda x:x         
        globalVars.isBoolLabel=isBoolLabel
        globalVars.isBoolModel=isBoolModel
        global trainLabels_all, testLabels_all, TrueLabels,isAddDroppedSubjects 
        trainLabels_all, testLabels_all, TrueLabels,isAddDroppedSubjects=labelUtils.initTrainTestLabels_all(self.LabelsObject)
        trainLabels_all2, testLabels_all2, TrueLabels2,isAddDroppedSubjects2=labelUtils.initTrainTestLabels_all(self.LabelsObject2)

        
        LabelingList=trainLabels_all.columns #['N1']
        self.ResultsDF=DF()
        self.BestFeatures=DF(columns=LabelingList) #dict of BestFeaturesDF according to Labeling methods
        YpredictedOverAllLabels=pandas.Panel(items=range(len(trainLabels_all)),major_axis=LabelingList,minor_axis=TrueLabels.index) #panel: items=cv_ind, major=labels, minor=#TODO 
       
                                              
        ## Create train and test sets according to LabelBy, repeat learning each time on different Labels from LabelingList
        
        isMultivarLabels=False      
        LabelingIndex=enumerate(LabelingList)
        if isMultivarLabels:
            LabelingIndex=enumerate([LabelingList])

        for label_ind, Labeling in LabelingIndex:
            """if isPerm: #TODO - fix this to work with continous / bool data
                try:
                    trainLabels=self.LabelsObject.permedLabelsDF[Labeling]
                except AttributeError:
                    self.LabelsObject.permLabels()
                    trainLabels=self.LabelsObject.permedLabelsDF[Labeling]"""
            #set subjects list according to labels and features
            X,SubjectsList,droppedSubjects,Xdropped=featuresUtils.initX(self.FeaturesDF,trainLabels_all,Labeling)
            X2,SubjectsList2,droppedSubjects2,Xdropped2=featuresUtils.initX(self.FeaturesDF,trainLabels_all2,Labeling,is2=1)
            
            #init train and test labels
            trainLabels, testLabels, LabelRange = labelUtils.initTrainTestLabels(Labeling,SubjectsList,trainLabels_all, testLabels_all)
            trainLabels2, testLabels2, LabelRange2 = labelUtils.initTrainTestLabels(Labeling,SubjectsList2,trainLabels_all2, testLabels_all2)
            
            #make sure only labeled subjects are used for classification
            X=X.query('subject == '+ str(list(trainLabels.index)) ) 
            X.index.get_level_values(X.index.names[0]) 
            SubjectIndex=list(set(X.index.get_level_values('subject')))

            X2=X2.query('subject == '+ str(list(trainLabels2.index)) )  
            X2.index.get_level_values(X2.index.names[0]) 
            SubjectIndex2=list(set(X2.index.get_level_values('subject')))                       
            #init vars
            if isBetweenSubjects:
                cv_param=len(SubjectIndex)
                self.Learningdetails['CrossValSubjects']='between'
                isWithinSubjects=False
            else:
                isWithinSubjects=True
                X=X.swaplevel(0,1)
                PieceIndex=list(set(X.index.get_level_values('Piece_ind')))
                cv_param=len(PieceIndex)
                self.Learningdetails['CrossValSubjects']='within'
            
            self.Learningdetails['NumOfFeatures']=n_features
            
            
            try:
                print('\n**' + Labeling + '**')
            except TypeError:
                print('\n*******')
                print(Labeling)
            
            cv, crossValScores= learningUtils.setCrossValidation(cross_validationMethod,cv_param,trainLabels,isWithinSubjects) 
            
            ## Learning - feature selection for different scoring types, with cross validation - 

            BestFeaturesForLabel=self.BestFeaturesForLabel(FeatureTypeList,LabelingList,n_features) #saves dataframe with best features for each label, for later analysis
            cv_ind=0
            # used for transforming from margins returned from svm to continuous labels (e.g. PANSS)
            trainScores=DF()
            test_index=X.index
            testScores=concat([DF(index=test_index),DF(index=['std_train_err'])])
            testScores2=concat([DF(index=testLabels.index),DF(index=['std_train_err'])]) 
            testProbas=DF(index=X.index)
            testProbas2=DF(index=SubjectIndex)

            #impt=Imputer(missing_values='NaN', strategy='median', axis=0)

            globalVars.LabelRange=LabelRange

            ModelWeights1=DF(columns=range(len(cv)),index=X.columns)
            Components=pandas.Panel(items=range(len(cv)),major_axis=X.columns,minor_axis=range(n_features)) #todo fix this for 1st and second learning
            ExplainedVar=DF(columns=range(len(cv)))
            ModelWeights2=DF(columns=range(len(cv)))
            bestNfeaturesPanel=Panel(items=LabelingList,minor_axis=range(len(cv)),major_axis=range(n_features))
            
            #bestNfeaturesPanel=Panel(items=LabelingList,major_axis=range(len(cv)),minor_axis=MultiIndex.from_tuples(('a','b')))
            

            for train, test in cv:

                if not is_cross_validation:
                   train=np.append(train,test)
                   #test=np.append(train,test)
                   self.Learningdetails['CrossVal']='NONE'
                   #if cv_ind>0:
                    #    break

                if isBetweenSubjects:
                    #set X and Y
                    train_subjects=trainLabels.iloc[train].index
                    test_subjects=testLabels.iloc[test].index 
                    Xtrain,Xtest, Ytrain, YtrainTrue, Ytest=learningUtils.setXYTrainXYTest(X,Labeling,trainLabels,testLabels,TrueLabels,train_subjects,test_subjects)
                    Xtrain2,Xtest2, Ytrain2, YtrainTrue2, Ytest2=learningUtils.setXYTrainXYTest(X2,Labeling,trainLabels2,testLabels2,TrueLabels2,train_subjects,test_subjects)

                    
                    if isConcatTwoLabels: #used when there is more than one doctor
                        Xtrain=concat([Xtrain,Xtrain2])
                        Xtest=concat([Xtest,Xtest2])
                        Ytrain=concat([Ytrain,Ytrain2])
                        YtrainTrue=concat([YtrainTrue,YtrainTrue2])
                        Ytest=concat([Ytest,Ytest2])
                        Xdropped=concat([Xdropped,Xdropped2])
                        SubjectsList=list(set(SubjectsList).intersection(set(SubjectsList2)))
                        droppedSubjects=list(set(droppedSubjects).union(set(droppedSubjects2)).difference(set(SubjectsList)))#diff from SubjectsList to make sure no subjects are both in train and test.
                 

                    #select N best features:
                    Xtrain, Xtest, bestNfeatures, components, explainedVar = learningUtils.decomposeAndSelectBestNfeatures(Xtrain,Xtest,Ytrain,n_features,selectFeatures,decomposeFunction)
                    BestFeaturesForLabel.add(bestNfeatures) #todo - delete this??  
                    bestNfeaturesPanel[Labeling][cv_ind]=bestNfeatures   
                    """for  feature_ind,feature_name in enumerate(bestNfeatures):
                         
                         try:
                            bestNfeaturesPanel[Labeling][feature_name].loc[cv_ind]=feature_ind
                        except KeyError:
                            bestNfeaturesPanel[Labeling].columns=bestNfeaturesPanel[Labeling].columns.append(feature_name)#continue here!! use 
                            bestNfeaturesPanel[Labeling][feature_name].loc[cv_ind]=feature_ind



                    [bestNfeatures].iloc[cv_ind]=range(len(bestNfeatures))"""
                    #train 1 
                    TrainModel=model
                    TrainModel.fit(Xtrain.sort_index(),Ytrain.T.sort_index())
                    """try:
                        #Components[cv_ind]=components.T
                        #ExplainedVar[cv_ind]=explainedVar
                        isDecompose=True"""
                    if cv_ind==0:
                        ModelWeights1=DF(columns=range(len(cv)),index=range(len(bestNfeatures)))    
                    ModelWeights1[cv_ind]=TrainModel.coef_.flatten()
                  
                    #get ROC scores without cross validation:
                                           
                    #train 2
                    if isBoolLabel:
                       PiecePrediction_train=DF(TrainModel.predict_proba(Xtrain).T[1],index=Xtrain.index,columns=['prediction'])
                       TrainModel2=svm.SVC(kernel='linear', probability=True,class_weight={0:1,1:1})
                    else:
                       PiecePrediction_train=DF(TrainModel.decision_function(Xtrain),index=Xtrain.index,columns=['prediction'])
                       TrainModel2=linear_model.LinearRegression()

                    Xtrain2, Ytrain2, YtrainTrue2=learningUtils.getX2Y2(Xtrain,Ytrain,YtrainTrue,PiecePrediction_train, isBoolLabel)                 
                    TrainModel2.fit(Xtrain2, Ytrain2)
                    if cv_ind==0:
                        ModelWeights2=DF(columns=range(len(cv)),index= Xtrain2.columns)
                    ModelWeights2[cv_ind]=TrainModel2.coef_.flatten()         

                              
                    #test 1
                    if isAddDroppedSubjects: #take test subjects from the cv split plus subjects whose labels were dropped, and use them for testing
                        if isDecompose:
                            dXdropped=DF(decomposeFunc(Xdropped).values,index=Xdropped.index)
                        XtestDropped=dXdropped[bestNfeatures]
                        YtestDropped=Series(XtestDropped.copy().icol(0))
                        #YTrueDropped=Series(Xdropped.copy().icol(0))
                        for subject in droppedSubjects:
                            YtestDropped[subject]=testLabels_all[Labeling].loc[subject]
                            #YTrueAll.loc[subject]=TrueLabels[Labeling].loc[subject]
                        Ytest=concat([Ytest,YtestDropped]).sort_index()
                        Xtest=concat([Xtest,XtestDropped]).sort_index()


                    if isPerm: #TODO- Check this!!
                        Ytest=y_perms.loc[Ytest.index]
                    Xtest=Xtest.fillna(0.)
                    
                    
                elif isWithinSubjects:
                    #train 1
                    train_pieces=PieceIndex[train]
                    test_pieces=PieceIndex[test] #TODO - make sure that if test/train > piece index, it is ignored and the process repeats
                    
                    XtrainAllFeatures=X.query('Piece_ind == '+ str(list(train_pieces)))
                    Ytrain=Series(index=X.index)
                    Ytest=Series(index=X.index)
                    YtrainTrue=Series(index=X.index)
                    
                    for subject in PieceIndex: 
                        for piece in train_pieces:
                            Ytrain.loc[piece].loc[subject]=trainLabels[subject]
                            YtrainTrue.loc[piece].loc[subject]=TrueLabels[Labeling].loc[subject] 
                            Ytest.loc[piece].loc[subject]=testLabels[subject]   
                    Ytrain=Ytrain.dropna()
                    YtrainTrue=YtrainTrue.dropna() 
                    for subject in test_subjects:
                        Ytest.loc[piece].loc[subject]=testLabels[subject]
                #train scores 1       
                if cv_ind==0:
                    trainScores,YtrainPredicted=learningUtils.getTrainScores(Ytrain,Xtrain,YtrainTrue,TrainModel)
                    plt.figure(1)
                    if len(LabelingList)>1:
                        plt.subplot(round(len(LabelingList)/2),2,label_ind+1)
                    if isBoolLabel:
                        testScores,testProbas=learningUtils.getTestScores(Ytest,Xtest,TrainModel)
                    else:
                        testScores[cv_ind],testProbas=learningUtils.getTestScores(Ytest,Xtest,TrainModel)
                        plt.title(Labeling,fontsize=10)
                else:
                    plt.figure(3)
                    new_trainScores,YtrainPredicted=learningUtils.getTrainScores(Ytrain,Xtrain,YtrainTrue,TrainModel)
                    trainScores=concat([trainScores,new_trainScores],axis=1)
                #test 1   
                    testScores[cv_ind],testProbas_new=learningUtils.getTestScores(Ytest,Xtest,TrainModel)
                    testProbas=concat([testProbas,testProbas_new])
                
                #train2

                if isBoolLabel:
                    PiecePrediction_test=DF(TrainModel.predict_proba(Xtest).T[1],index=Xtest.index,columns=['prediction'])
                else:
                    PiecePrediction_test=DF(TrainModel.decision_function(Xtest),index=Xtest.index,columns=['prediction'])
                Xtest2, Ytest2 , YtestTrue2 =learningUtils.getX2Y2(Xtest,Ytest,Ytest,PiecePrediction_test,isBoolLabel)
                
                if cv_ind==0:
                    trainScores2,YtrainPredicted2=learningUtils.getTrainScores(Ytrain2,Xtrain2,YtrainTrue2,TrainModel2)
                    YpredictedOverAllLabels[cv_ind].loc[Labeling]=YtrainPredicted2
                    #plt.figure(1)
                    #if len(LabelingList)>1:
                        #plt.subplot(round(len(LabelingList)/2),2,label_ind+1)
                #test2
                    if isBoolLabel:
                        testScores2,testProbas2=learningUtils.getTestScores(Ytest2,Xtest2,TrainModel2)
                    else:
                        testScores2[cv_ind],testProbas2=learningUtils.getTestScores(Ytest2,Xtest2,TrainModel2)
                    #plt.title(Labeling,fontsize=10)
                else:
                    new_trainScores2,YtrainPredicted2=learningUtils.getTrainScores(Ytrain2,Xtrain2,YtrainTrue2,TrainModel2)
                    YpredictedOverAllLabels[cv_ind].loc[Labeling]=YtrainPredicted2
                    trainScores2=concat([trainScores2,new_trainScores2],axis=1)
                    if len(Xtest2)>0: # if there is more than one segment for subject
                        testScores2[cv_ind],testProbas2_new=learningUtils.getTestScores(Ytest2,Xtest2,TrainModel2)     
                        testProbas2=concat([testProbas2,testProbas2_new])
                cv_ind+=1

                #crossValScores=crossValScores.append(CVscoresDF,ignore_index=True) #information about entire train test data. 
            fig2=plt.figure(2)
            if len(LabelingList)>1:
                plt.subplot(round(len(LabelingList)/2),2,label_ind+1)
            #if isAddDroppedSubjects:
            #    testLabelsSummary=testLabels_all[Labeling].loc[AllSubjects]
            #else:
            #    testLabelsSummary=testLabels
            scoresSummary,rocDF = learningUtils.getScoresSummary(trainScores2,testScores2,testProbas2,TrueLabels[Labeling])

            # reset global vars
            globalVars.fitYscale='notDefined'
            globalVars.beta=DF()

            plt.title(Labeling,fontsize=10)
            plt.xlabel('Ytrue',fontsize=8)
            plt.ylabel('Ypredicted',fontsize=8)
            plt.tick_params(labelsize=6)
            #print(crossValScores.T)    
            scores=scoresSummary.fillna(0.)
            
            #analyze feature weights             
            ModelWeights1=ModelWeights1.dropna(how='all')
            WeightedFeatures1_index0=analysisUtils.getFeaturesWeights(0,bestNfeaturesPanel[Labeling],ModelWeights1) #FeatureAnalysisIndex=0 for featureType, 1= au's (if not decomposed) or component rank (if decomposed)
            WeightedFeatures1_index1=analysisUtils.getFeaturesWeights(1,bestNfeaturesPanel[Labeling],ModelWeights1)
            WeightedFeatures1=concat([DF(index=['-------(A) Index0-------']),WeightedFeatures1_index0,DF(index=['-------(B) Index1 -------']),WeightedFeatures1_index1])
            
            WeightedFeatures2=DF(ModelWeights2.mean(axis=1)).fillna(0)
            #WeightedFeatures2=DF([ModelWeights2.mean(axis=1),ModelWeights2.std(axis=1)],index=['mean','std']).T.fillna(0)
            BestFeatures=concat([DF(index=['------------- Learning 1 -------------']),WeightedFeatures1,DF(index=['------------- Learning 2 -------------']),WeightedFeatures2])
            self.BestFeatures[Labeling]=Series(BestFeatures.values.flatten(),index=BestFeatures.index)

            #analyze decomposition
            if isDecompose:
                Components_mean = Components.mean(axis=0)
                Components_std = Components.std(axis=0)
                normalize=lambda df:DF(StandardScaler().fit_transform(df.T).T,index=df.index,columns=df.columns) 

                """#componentsMeanFeatureType=normalize(Components.mean(axis=1,level='FeatureType'))
                #componentsMeanFeatureTypeABS=normalize(componentsDF.abs().mean(axis=1,level='FeatureType'))
                #componentsMeanFSsignal=normalize(componentsDF.mean(axis=1,level='fs-signal'))
                #componentsMeanFSsignalABS=normalize(componentsDF.abs().mean(axis=1,level='fs-signal'))
                #ExplainedVar_mean = DF(ExplainedVar.mean(axis=1)).T#todo- check!
                #ExplainedVar_mean.index=['ExplainedVar_mean']
                #ExplainedVar_std = DF(ExplainedVar.std(axis=1)).T#todo- check!
                #ExplainedVar_std.index=['ExplainedVar_std']
                #componentsToCSV=concat([DF(index='---meanFeatureType----'),componentsMeanFeatureType,DF(index='---meanFeatureType - abs ----'),componentsMeanFeatureTypeABS,DF(index='---mean fs-signal ----'),componentsMeanFSsignal,DF(index='---mean fs-signal - abs ----'),componentsMeanFSsignalABS])
                try:
                    self.LabelComponents[Labeling]=concat([DF(index=['---components mean---']),Components_mean,ExplainedVar_mean,DF(index=['---components std over cross validation---']),Components_std,ExplainedVar_std])
                except AttributeError:
                    self.LabelComponents=dict.fromkeys(LabelingList)
                    self.LabelComponents[Labeling]=concat([DF(index=['---components mean---']),Components_mean,ExplainedVar_mean,DF(index=['---components std over cross validation---']),Components_std,ExplainedVar_std])"""

                """print(Components_mean)
                print(ExplainedVar_mean)
                print(WeightedFeatures1)"""

                        
            #BestFeaturesForLabel.analyze(ByLevel=0) #TODO change to regression coeff
            LabelFullResults=concat([DF(index=[Labeling]),scores]) 
  
            self.FullResults=concat([self.FullResults,LabelFullResults])            
            self.ResultsDF=concat([self.ResultsDF,DF(scores[0],columns=[Labeling])],axis=1)

            #self.BestFeatures[Labeling]=BestFeaturesForLabel.WeightedMean

            #plt.savefig('C:\\Users\\taliat01\\Desktop\\TALIA\\Code-Python\\Results\\'+Labeling+'png')
        testScores3=pandas.Panel(items=range(len(X2.index))) #for each cv score...
        FullSubjectsList=YpredictedOverAllLabels[0].columns
        YdroppNans=YpredictedOverAllLabels.dropna(axis=0,how='all')
        YdroppNans=YdroppNans.dropna(axis=1,how='all')
        YpredictedOverAllLabels=YdroppNans.dropna(axis=2,how='all')
        notNans_cv_ind=YpredictedOverAllLabels.items
        notNans_trainSubjects=YpredictedOverAllLabels.minor_axis
        notNans_LabelsList=YpredictedOverAllLabels.major_axis
        notNans_TrueLabels=TrueLabels.T[notNans_trainSubjects].loc[notNans_LabelsList]
        cv_ind=0
        for train, test in cv:
            if cv_ind in notNans_cv_ind:
                print(test)
                train=list(set(FullSubjectsList[train]).intersection(set(notNans_trainSubjects)))
                test=list(set(FullSubjectsList[test]).intersection(set(notNans_trainSubjects)))
                if len(train)>0 and len(test)>0: 
                    AllLabelsYTrainPredicted=YpredictedOverAllLabels[cv_ind][train]
                    AllLabelsYTrainPredicted=AllLabelsYTrainPredicted.fillna(0)
                    AllLabelsYTrainTrue=notNans_TrueLabels[train]
                    AllLabelsYTestPredicted=YpredictedOverAllLabels[cv_ind][test]
                    AllLabelsYTestTrue=notNans_TrueLabels[test]

                    pseudoInverse_AllLabelsYTrainTrue=DF(np.linalg.pinv(AllLabelsYTrainTrue),columns=AllLabelsYTrainTrue.index,index=AllLabelsYTrainTrue.columns)
                    global AllLabelsTransformationMatrix
                    AllLabelsTransformationMatrix=DF(AllLabelsYTrainPredicted.dot(pseudoInverse_AllLabelsYTrainTrue),columns=pseudoInverse_AllLabelsYTrainTrue.columns)#change to real code!!
                TrainModel3=lambda y: y.T.dot(AllLabelsTransformationMatrix)
                #testscores3[cv_ind]=learningUtils.getTestScores(AllLabelsYTrainTrue,AllLabelsYTrainPredicted,TrainModel3)
            cv_ind+=1

        self.BestNFeaturesAll=bestNfeaturesPanel 
        self.ResultsDF=self.ResultsDF.fillna(0.)  
        
        ## Print and save results  
        print('\n')
        print(self.ResultsDF)
        print('\n')
        D=self.Learningdetails 
        savePath=resultsPath+'\\'+D['Model']+'_'+D['CrossVal']+'_LabelBy'+D['LabelBy']+ '_FSelection'+FeatureSelection+'_Decompostion'+D['Decomposition']+'PieceSize'+D['PieceLength']+'_'+SubFeatures
        if isPerm:
            savePath=savePath+'_PERMStest'
        saveName=savePath+'\\'+str(n_features)+'_features'        
        self.Learningdetails['saveDir']=savePath
        dir=os.path.dirname(saveName)
        if not os.path.exists(dir):
            os.makedirs(dir)
        if isSavePickle is None:
            isSavePickle=int(raw_input('Save results to pickle? '))
        if isSaveCsv is None:
            isSaveCsv=int(raw_input('Save results to csv? '))
        if isSaveFig is None:
            isSaveFig=int(raw_input('Save results to figure? '))

       
        if isSavePickle:        
            self.ResultsDF.to_pickle(saveName+'.pickle')
            self.BestFeatures.to_pickle(saveName+'_bestFeatures.pickle')
                
        if isSaveCsv:
            DetailsDF=DF.from_dict(self.Learningdetails,orient='index')
            ResultsCSV=concat([self.ResultsDF,DF(index=['-------Label Details-------']),self.N,DF(index=['-------Learning Details-------']),DetailsDF,DF(index=['-------Selected Features Analysis------']),self.BestFeatures])
            ResultsCSV.to_csv(saveName+'.csv')
            if isBoolLabel:
                ROCfig=learningUtils.save_plotROC(rocDF,isSave=True,saveName=saveName,title=SubFeatures)

        if isSaveCsv or isSavePickle:
            print('successfully saved as:\n' + saveName)
        
        if isSaveFig:
            plt.figure(1)
            plt.savefig(saveName + 'Train.png')
            plt.figure(2)
            plt.savefig(saveName + 'Test.png')
        plt.close()
        plt.close()
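A minimal standalone sketch of the pseudo-inverse step used near the end of the example above (AllLabelsTransformationMatrix): per-label predictions are related to the true labels by a least-squares transformation computed with numpy's Moore-Penrose pseudo-inverse. Every name and value below is made up for illustration.

import numpy as np
import pandas as pd

# Rows are labels, columns are training subjects (hypothetical data).
Y_true = pd.DataFrame(np.random.rand(3, 5),
                      index=['label_a', 'label_b', 'label_c'],
                      columns=['s1', 's2', 's3', 's4', 's5'])
Y_pred = Y_true + 0.1 * np.random.randn(3, 5)  # noisy per-label predictions

# pinv(Y_true) has shape (subjects x labels); keep the axis labels explicit.
pinv_true = pd.DataFrame(np.linalg.pinv(Y_true),
                         index=Y_true.columns, columns=Y_true.index)

# Labels x labels transformation chosen so that T.dot(Y_true) ~= Y_pred
# in the least-squares sense, mirroring the snippet above.
T = Y_pred.dot(pinv_true)
reconstructed = T.dot(Y_true)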
Example #54
0
theta = 0.00001
softmax = False
skip = True

if not skip:
    game = pg.PredatorGame((0,0), (5,5), (11,11))

    notused, rmse['Sarsa'] = sarsaresults(samples, episodes, discount, epsilon, alpha, initValue, softmax, theta)
    notused, rmse['Q-learning'] = qlearningresults(samples, episodes, discount, epsilon, alpha, initValue, softmax, theta)
    notused, rmse['Q-learning with SoftMax'] = qlearningsoftmaxresults(samples, episodes, discount, tau, alpha, initValue, softmax, theta)
    notused, notused, rmse['On Policy Monte Carlo'] = montecarloOnPolicyresults(samples, episodes, discount, epsilon, 0, theta)
    notused, notused, notused, rmse['Off Policy Monte Carlo'] = montecarloOffPolicyresults(samples, episodes, discount, epsilon, 0, theta)

    rmse['episode'] = range(0,episodes)
    dataF = DataFrame(rmse)
    dataF.to_pickle('data/rmse'+str(episodes))
else:
    dataF = pd.read_pickle('data/rmse'+str(episodes))

episodeData = pd.melt(dataF, id_vars=['episode'], var_name='Learning algorithm')
# for key, value in rmse.items():
#     plt.figure()
#     plt.plot(value, 'b')
#     plt.xlabel('Episodes')
#     plt.ylabel('Root Mean Square Error ('+key+')')
#     plt.legend()

# plt.show()   

p = ggplot(episodeData, aes('episode', 'value', color='Learning algorithm')) +\
    geom_line()
N = 1
if not skip:
	for i in range(N):
		print i

		averageQ, predwinsratioQ = getIndependentQLearning()

		averageS, predwinsratioS = getIndependentSarsa()

	data['IndependentQLearning'] = predwinsratioQ
	data['IndependentSarsa'] = predwinsratioS
	data['episode'] = range(1,episodes+1)

	dataF = DataFrame(data)
	dataF.to_pickle('data/comparison')
else:
    dataF = pd.read_pickle('data/comparison')
       
for a in alg:
	dataF[a] = scipy.ndimage.filters.gaussian_filter(dataF[a],5*(episodes/4000),0)
    

episodeData = pd.melt(dataF, id_vars=['episode'], var_name='Algorithm')

p = ggplot(episodeData, aes('episode', 'value', color='Algorithm')) +\
	 geom_line() +\
     theme_bw() + theme() + ylab("Win ratio") + xlab("Episodes")
print p
ggsave(p, "plots/comparison.png")
ggsave(p, "plots/comparison.pdf")
Example #56
0
class ZopeRequestPlotter(object):

	def __init__(self, requests):
		self.requests = requests
		self.df = DataFrame(self.requests)
		self.df.to_pickle('data_frame.pickle')
		#Normalize Timestamp to hours
		min_timestamp_row = self.df.ix[self.df['timestamp'].idxmin()]
		min_timestamp_value = min_timestamp_row['timestamp']
		self.min_timestamp = min_timestamp_row['timestamp_text']
		self.max_timestamp = self.df.ix[self.df['timestamp'].idxmax()]['timestamp_text']
		self.df['timestamp'] = (self.df['timestamp'] - min_timestamp_value) / 3600

	def plot_call_summary(self):
		""" """
		# Gets the most expensive 1000 calls
		top_n_expensive_calls = self.df.sort_index(by='elapsed', ascending=False)[:1000]
		calls = [ call  for call, group in top_n_expensive_calls.groupby('call') ]

		data = []
		for call in calls:
			call_info = self.df[self.df['call']==call]['elapsed']
			call_data = {}
			call_data['call'] = call
			call_data['mean'] = call_info.mean()
			call_data['count'] = call_info.count()
			call_data['max'] = call_info.max()
			call_data['min'] = call_info.min()
			data.append(call_data)

		call_data_df = DataFrame(data)

		fig = plt.figure()
		fig.suptitle('{0}   -   {1}'.format(self.min_timestamp, self.max_timestamp), fontsize=16)

		ax = fig.add_subplot(3, 1, 1)
		#ax.get_xaxis().set_visible(False)
		self.df[self.df['elapsed']>=5].sort_index(by='timestamp').plot(title='Response time > 5 seconds', ax=ax, x='timestamp', y='elapsed')

		ax = fig.add_subplot(3, 1, 2)
		call_data_df[['call', 'min', 'max', 'mean']].set_index('call').plot(title='Response Time', ax=ax, kind='barh')

		ax = fig.add_subplot(3, 1, 3)
		call_data_df.plot(title='Call Count', ax=ax, x='call', y='count', kind='barh')

		fig.show()

		self.plot_calls_distribution(calls)

	def plot_calls_distribution(self, calls_to_plot):

		# plots call distribution for 2 calls
		#calls_to_plot = [ 'EventsRouter.query', 'EventsRouter.queryArchive', 'MessagingRouter.setBrowserState' ]
		#calls_to_plot = [ 'IncidentManagementRouter.runNotification', 'IncidentManagementRouter.associateIncident', 'EventsRouter.queryArchive' ]

		fig = plt.figure()
		fig.suptitle('{0}   -   {1}'.format(self.min_timestamp, self.max_timestamp), fontsize=16)

		graph_rows = 4
		graph_cols = len(calls_to_plot)/graph_rows
		if len(calls_to_plot)%graph_rows != 0:
			graph_cols = graph_cols + 1

		plot_n = 1
		for call in calls_to_plot:
			data = self.df[self.df['call']==call]
			ax = fig.add_subplot(graph_rows, graph_cols, plot_n)
			data.plot(title=call, ax=ax, x='timestamp', y='elapsed', style='.', fontsize=10)
			plot_n = plot_n + 1

		fig.show()

	def plot_user_call_data(self, fmean=False, fcount=False):
		# pick the aggregation to apply per call: mean elapsed time or call count
		function = None
		if fmean:
			function = 'mean'
		elif fcount:
			function = 'count'

		if function:
			fig = plt.figure()
			graph_rows = 3
			graph_cols = 1
			plot_n = 1

			# rank users by how many calls they made
			users_call_count = self.df.groupby('user')['timestamp'].count()
			users_call_count.sort(ascending=False)

			# top 3 users data
			top_users = users_call_count.index[:3]
			for top_user in top_users:
				top_user_calls = self.df[ self.df.user == top_user ]
				ax = fig.add_subplot(graph_rows, graph_cols, plot_n)
				top_user_calls_count = getattr(top_user_calls.groupby('call').call, function)()
				top_user_calls_count.sort()
				top_user_calls_count.plot(title='Call {0} for {1}'.format(function, top_user), ax = ax, kind='barh')
				plot_n = plot_n + 1
			fig.show()

	def plot_user_data(self):

		# call analysis per user (count)

		users_call_count = self.df.groupby('user')['timestamp'].count()
		users_call_count.sort(ascending=False)

		count_fig = plt.figure()
		count_fig.suptitle('{0}   -   {1}'.format(self.min_timestamp, self.max_timestamp), fontsize=16)
		graph_rows = 4
		graph_cols = 1
		plot_n = 1

		ax = count_fig.add_subplot(graph_rows, graph_cols, plot_n)
		users_call_count[:10].plot(title='Top 10 users. Number of calls', ax = ax, kind='barh')
		plot_n = plot_n + 1

		top_users = users_call_count.index[:3]
		for top_user in top_users:
			top_user_calls = self.df[ self.df.user == top_user ]
			ax = count_fig.add_subplot(graph_rows, graph_cols, plot_n)
			top_user_calls_count = top_user_calls.groupby('call').call.count()
			top_user_calls_count.sort()
			top_user_calls_count.plot(title='Call count for {0}'.format(top_user), ax = ax, kind='barh')
			plot_n = plot_n + 1

		count_fig.show()

		# call analysis per user (mean)

		mean_fig = plt.figure()
		mean_fig.suptitle('{0}   -   {1}'.format(self.min_timestamp, self.max_timestamp), fontsize=16)
		users_call_mean = self.df.groupby('user')['elapsed'].mean()
		users_call_mean.sort(ascending=False)

		graph_rows = 4
		graph_cols = 1
		plot_n = 1

		ax = mean_fig.add_subplot(graph_rows, graph_cols, plot_n)
		users_call_mean[:10].plot(title='Top 10 users: mean elapsed time per call', ax = ax, kind='barh')
		plot_n = plot_n + 1

		# top 3 users data
		top_users = users_call_mean.index[:3]
		for top_user in top_users:
			top_user_calls = self.df[ self.df.user == top_user ]
			ax = mean_fig.add_subplot(graph_rows, graph_cols, plot_n)
			top_user_calls_count = top_user_calls.groupby('call').elapsed.mean()
			top_user_calls_count.sort()
			top_user_calls_count.plot(title='Call mean for {0}'.format(top_user), ax = ax, kind='barh')
			plot_n = plot_n + 1
		mean_fig.show()
		

	def plot_archive_calls(self):

		archive_calls = self.df[ self.df.call == 'EventsRouter.queryArchive' ]
		archive_calls_count = archive_calls.groupby('user')['elapsed'].count()
		archive_calls_count.sort(ascending=False)

		archive_fig = plt.figure()

		archive_fig.suptitle('{0}   -   {1}'.format(self.min_timestamp, self.max_timestamp), fontsize=16)

		graph_rows = 2
		graph_cols = 1
		plot_n = 1

		# Archive call count per user
		ax = archive_fig.add_subplot(graph_rows, graph_cols, plot_n)
		archive_calls_count.plot(title='Archive call count per user', ax = ax, kind='barh')
		plot_n = plot_n + 1
		'''
		# Archive call mean per user
		ax = archive_fig.add_subplot(graph_rows, graph_cols, plot_n)
		archive_calls_mean = archive_calls.groupby('user')['elapsed'].mean()
		archive_calls_mean.plot(title='Archive call mean elapsed time per user', ax = ax, kind='barh')
		plot_n = plot_n + 1
		'''
		# Archive call distribution for the user with the most calls to archive
		user_pegging_archive = archive_calls_count.index[0]

		pegger_df = archive_calls[archive_calls.user==user_pegging_archive][['elapsed', 'timestamp']]
		pegger_df = pegger_df.sort_index(by='timestamp')
		ax = archive_fig.add_subplot(graph_rows, graph_cols, plot_n)
		pegger_df.plot(title='Top archive user call distribution vs elapsed time', ax=ax, x='timestamp', y='elapsed', style='.', fontsize=10)

		archive_fig.show()

	def plot_zec_user_calls(self):

		zec_calls = self.df[ self.df.user == 'zec' ]
		zec_calls_count = zec_calls.groupby('call')['elapsed'].count()
		zec_calls_count.sort(ascending=False)

		# Call count
		fig = plt.figure()
		fig.suptitle('{0}   -   {1}'.format(self.min_timestamp, self.max_timestamp), fontsize=16)

		ax = fig.add_subplot(2, 1, 1)
		zec_calls_count.plot(title='Zec User calls', ax = ax, kind='barh')

		# Call Distribution
		data_to_plot = DataFrame()
		for call, group in zec_calls.groupby('call'):
			data_to_plot = data_to_plot.append(group[['call','elapsed','timestamp']])

		ax = fig.add_subplot(2, 1, 2)

		data_to_plot.plot(title='Zec user call distribution vs elapsed time', ax=ax, x='timestamp', y='elapsed', style='.', fontsize=10)
		
		fig.show()


	def plot_requests_info(self):

		self.plot_call_summary()
		self.plot_user_data()
		#self.plot_archive_calls()
		self.plot_zec_user_calls()
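A condensed sketch of the per-call summary that plot_call_summary assembles by hand above, written with current pandas aggregation (agg/sort_values) rather than the older sort_index(by=...) and in-place Series.sort() calls; the request data here is fabricated.

import pandas as pd

requests = pd.DataFrame({
    'call':    ['EventsRouter.query', 'EventsRouter.query',
                'MessagingRouter.setBrowserState'],
    'elapsed': [0.8, 1.4, 0.2],
})

# count/mean/min/max of elapsed time per call, slowest calls first
call_summary = (requests.groupby('call')['elapsed']
                .agg(['count', 'mean', 'min', 'max'])
                .sort_values('mean', ascending=False))
print(call_summary)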
Example #57
0
        for epsilon in parametersFor(category):
            results[epsilon] = getResults(samples, episodes, discount, epsilon, decay)
    elif category == 'decay':
        for decay in parametersFor(category):
            results[decay] = getResults(samples, episodes, discount, epsilon, decay)
    elif category == 'discount':
        for discount in parametersFor(category):
            print(discount)
            results[discount] = getResults(samples, episodes, discount, epsilon, decay)
    else:
        sys.exit()
    print(results)
    results['episode'] = range(1,episodes+1)

    dataF = DataFrame(results)
    dataF.to_pickle('data/'+str(episodes)+category+"small")
    #pickle.dump(randomReturnValues, open('data/values'+str(episodes)+category+str(softmax), 'w+'))
else:
    dataF = pd.read_pickle('data/'+str(episodes)+category)
    #randomReturnValues = pickle.load(open('data/values'+str(episodes)+category+str(softmax), 'r+'))

print dataF
if smoothing:
    for par in parametersFor(category):
        dataF[par] = scipy.ndimage.filters.gaussian_filter(dataF[par],5*(episodes/4000),0)
episodeData = pd.melt(dataF, id_vars=['episode'], var_name=category)

ylabel = "Steps"

p = ggplot(episodeData, aes('episode', 'value', color=category)) +\
    geom_line(alpha=0.6)
Example #58
0
    if category == "epsilon":
        for epsilon in [0.05, 0.1, 0.3, 0.9]:
            results[epsilon], avgRMS = getResults(samples, episodes, discount, epsilon, alpha, initValue, softmax)
    elif category == "alpha":
        for alpha in [0.1, 0.2, 0.3, 0.6, 1]:
            results[alpha], avgRMS = getResults(samples, episodes, discount, epsilon, alpha, initValue, softmax)
    elif category == "discount":
        for discount in [0.1, 0.4, 0.7, 0.8, 0.9]:
            print discount
            results[discount], avgRMS = getResults(samples, episodes, discount, epsilon, alpha, initValue, softmax)
    else:
        sys.exit()
    print results
    results["episode"] = range(0, episodes)
    dataF = DataFrame(results)
    dataF.to_pickle("data/" + category + str(softmax))
else:
    dataF = pd.read_pickle("data/" + category + str(softmax))

episodeData = pd.melt(dataF, id_vars=["episode"], var_name=category)


p = (
    ggplot(episodeData, aes("episode", "value", color=category))
    + geom_line()
    + theme_bw()
    + theme()
    + ylab("Steps")
    + xlab("Episodes")
    + ylim(0, 60)
)
    "avgICDMPaperCount",
    "maxICDMPaperCount",
    "primaryICDMPaperCount",
    "maxConnectivity",
    "maxPageRank",
    "maxDegCentrality",
    "numAuthors"
]

for i in toLog:
    il = i+"Log"
    df[il] = df[i]
    df.loc[df[il] == 0, il] = .1
    df.loc[:, il] = np.log(df.loc[:, il])

df.to_pickle("savedFrames/predictionFeatures/paperTable")


print "Constructing Review Table"
i = 0
reviewTable = []

for id, review in loader.reviews.iteritems():
    paper = review.paper
    reviewer = review.user

    reviewTable.append({
        "paperId": paper.id,
        "userId": reviewer.id,

        "rating": review.overallRating,
Example #60
-1
def process_matebook_data(directory, paramlist, storage_location):
    vidname = parse_screen_filename(directory)
    for filename in find_files(directory, 'track.tsv'):
        vidpath, flyID = parse_filename(filename)
        tag = vidname + "_" + flyID
        if not os.path.exists(storage_location + '/' + tag + '_arena.pickle'):
            fi = pd.read_table(filename, sep='\t', header = [0,1], skiprows=[2,3])
            tempdf = DataFrame(index = fi.index)
            if fi['Unnamed: 8_level_0', 'isMissegmented'].mean() >= 0.2:
                print "arena dropped for poor quality: ", tag
                continue
            elif fi['Unnamed: 8_level_0', 'isMissegmented'].mean() == 0.0:
                print "arena dropped because quality = 1: ", tag
                continue
            elif len(set(fi['Unnamed: 3_level_0', 'courtship'])) <=1:
                print "arena dropped because courtship = nan: ", tag
                continue
            else:
                for j in paramlist:
                    tempdf[j[1]] = fi[j[0],j[1]]
                    if 'movedAbs_u' in j:
                        tempdf[j[1]] = tempdf[j[1]] * FPS
            tempdf['Time'] = tempdf.index/FPS
            time_ID = vidpath.split('_',1)[-1].split('.',1)[0]
            tempdf = merge_jvision_data(tempdf.reset_index(), time_ID)
            tempdf.to_pickle(storage_location + '/'+ tag + '_arena.pickle')
            print ".....", tag, " processed to pickling."
    return
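The last example reads each track.tsv with header=[0, 1], so fi has two-level columns and fi[j[0], j[1]] picks out one (group, field) pair. A tiny sketch of that selection with fabricated column names and an assumed frame rate:

import pandas as pd

# two-level columns mimic pd.read_table(..., header=[0, 1])
fi = pd.DataFrame({('quality', 'isMissegmented'): [0, 0, 1],
                   ('movement', 'movedAbs_u'): [0.10, 0.25, 0.40]})

FPS = 25.0  # assumed frames per second
tempdf = pd.DataFrame(index=fi.index)
tempdf['movedAbs_u'] = fi[('movement', 'movedAbs_u')] * FPS  # per-frame to per-second
tempdf['Time'] = tempdf.index / FPS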